├── LICENSE
├── OCR
├── PDF_XChange-OCRed.pdf
├── README.md
├── easyocr1.py
├── images-to-ocr-pdf.py
├── ocr-ed.pdf
├── ocr-ed.txt
├── ocrpages.py
├── scanned.pdf
├── tesseract1.py
├── tesseract2.py
└── v110-changes.pdf
├── README.md
├── advanced-toc
├── README.md
├── colored-toc.pdf
├── colored-toc.png
├── colorize.py
├── example.pdf
└── replaced-toc.pdf
├── alias-changer.py
├── animations
├── README.md
├── morph-demo1.jpg
├── morph-demo1.py
├── morph-demo2.py
├── morph-demo3.py
├── quad-show1.py
├── quad-show2.jpg
└── quad-show2.py
├── annotations
├── freetext-annot-lang.pdf
├── freetext-annot-lang.py
├── new-annots-0.pdf
├── new-annots.py
├── opacity.pdf
├── opacity.py
├── show-no-annots.py
├── with-annots.png
└── without-annots.png
├── cloud-interactions
├── README.md
├── from-aws-s3.py
├── from-google.py
├── from-ms-azure.py
├── to-aws-s3.py
└── to-ms-azure.py
├── conversion
├── README.md
├── images-to-ocr-pdf.py
├── make-cbz.py
├── make-imagepdf.py
└── make-page-images.py
├── examples
├── .gitignore
├── DeDRM-ebook.py
├── README.md
├── anonymize-document
│ ├── anonymize.py
│ ├── input.pdf
│ └── output.pdf
├── attach-images
│ ├── attach.py
│ ├── input
│ │ ├── erik-jan-leusink-s2mkB4WOl9k-unsplash.jpg
│ │ └── joe-caione-qO-PIF84Vxg-unsplash.jpg
│ └── output.pdf
├── browse-document
│ ├── browse.py
│ └── input.pdf
├── combine-pages
│ ├── combine.py
│ ├── input.pdf
│ └── output.pdf
├── convert-document
│ ├── convert.py
│ ├── input.epub
│ └── output.pdf
├── convert-image
│ ├── convert.py
│ ├── input.jpg
│ └── output.png
├── convert-pixmap
│ ├── convert.py
│ ├── input.png
│ └── output.jpg
├── convert-text
│ ├── convert.py
│ ├── input.txt
│ └── output.pdf
├── copy-embedded
│ ├── copy.py
│ ├── input.pdf
│ └── output.pdf
├── decrypt-document
│ ├── decrypt.py
│ ├── input.pdf
│ └── output.pdf
├── display-document
│ ├── display.py
│ └── input.pdf
├── draw-cardioid
│ ├── draw.py
│ └── output.pdf
├── draw-caustic
│ ├── draw.py
│ ├── output.pdf
│ ├── output.png
│ ├── output.svg
│ └── output.svgz
├── draw-fractal
│ ├── carpet.py
│ ├── output_carpet.png
│ ├── output_punch.png
│ ├── output_triangle.pdf
│ ├── punch.py
│ └── triangle.py
├── draw-polygon
│ ├── draw.py
│ ├── output.pdf
│ └── output.svg
├── draw-rgb-area
│ ├── draw.py
│ ├── output_PIL.png
│ └── output_fitz.png
├── draw-sines
│ ├── draw.py
│ └── output.pdf
├── edit-images
│ ├── README.md
│ ├── edit.py
│ ├── figure-01.jpg
│ └── input.pdf
├── edit-links
│ ├── edit.py
│ └── input.pdf
├── edit-toc
│ ├── edit.py
│ └── input.pdf
├── embed-images
│ ├── embed.py
│ ├── input
│ │ ├── erik-jan-leusink-s2mkB4WOl9k-unsplash.jpg
│ │ └── joe-caione-qO-PIF84Vxg-unsplash.jpg
│ └── output.pdf
├── export-embedded
│ ├── export.py
│ ├── input.pdf
│ └── output.pdf
├── export-metadata
│ ├── export.py
│ ├── input.pdf
│ └── output.csv
├── export-toc
│ ├── export.py
│ ├── input.pdf
│ └── output.csv
├── extract-images
│ ├── extract-from-pages.py
│ ├── extract-from-xref.py
│ ├── input.pdf
│ └── output
│ │ ├── .gitkeep
│ │ ├── img00005.png
│ │ └── img00011.png
├── extract-table
│ ├── ParseTab.py
│ ├── README.md
│ ├── extract.py
│ ├── input.pdf
│ └── wx-extract.py
├── extract-vector-graphics
│ └── detect_graphics.py
├── extract-xobj
│ ├── extract.py
│ ├── input.pdf
│ └── output.pdf
├── filmfestival-2tables
│ ├── README.md
│ ├── filmfestival.db
│ ├── filmfestival.py
│ └── output.pdf
├── icons
│ ├── PyMuPDF.ico
│ ├── __init__.py
│ ├── ico_pdf.py
│ ├── pdf.py
│ └── pymupdf.png
├── import-embedded
│ ├── import.py
│ ├── input.pdf
│ ├── joe-caione-qO-PIF84Vxg-unsplash.jpg
│ └── output.pdf
├── import-metadata
│ ├── import.py
│ ├── input.csv
│ └── input.pdf
├── import-toc
│ ├── import.py
│ ├── input.csv
│ └── input.pdf
├── insert-images
│ ├── input
│ │ ├── erik-jan-leusink-s2mkB4WOl9k-unsplash.jpg
│ │ └── joe-caione-qO-PIF84Vxg-unsplash.jpg
│ ├── insert.py
│ └── output.pdf
├── insert-logo
│ ├── file.py
│ ├── input.pdf
│ ├── logo.png
│ ├── logo.svg
│ ├── output_file.pdf
│ ├── output_svg.pdf
│ └── svg.py
├── join-documents
│ ├── input
│ │ ├── made-with-cc.pdf
│ │ └── thinkpython2.pdf
│ ├── join.py
│ └── output.pdf
├── list-embedded
│ ├── input.pdf
│ └── list.py
├── make-calendar
│ ├── make.py
│ └── output.pdf
├── optimize-document
│ ├── input.pdf
│ └── optimize.py
├── posterize-document
│ ├── input.pdf
│ ├── output.pdf
│ └── posterize.py
├── print-hsv
│ ├── output.pdf
│ └── print.py
├── print-page-format
│ └── print.py
├── print-rgb
│ ├── output.pdf
│ └── print.py
├── replace-image
│ ├── README.md
│ ├── input.jpg
│ ├── input.pdf
│ ├── output_remove.pdf
│ ├── output_replace.pdf
│ ├── remove.py
│ └── replace.py
├── split-document
│ ├── input.pdf
│ ├── output
│ │ ├── .gitkeep
│ │ ├── input-0.pdf
│ │ ├── input-1.pdf
│ │ ├── input-10.pdf
│ │ ├── input-100.pdf
│ │ ├── input-101.pdf
│ │ ├── input-102.pdf
│ │ ├── input-103.pdf
│ │ ├── input-104.pdf
│ │ ├── input-105.pdf
│ │ ├── input-106.pdf
│ │ ├── input-107.pdf
│ │ ├── input-108.pdf
│ │ ├── input-109.pdf
│ │ ├── input-11.pdf
│ │ ├── input-110.pdf
│ │ ├── input-111.pdf
│ │ ├── input-112.pdf
│ │ ├── input-113.pdf
│ │ ├── input-114.pdf
│ │ ├── input-115.pdf
│ │ ├── input-116.pdf
│ │ ├── input-117.pdf
│ │ ├── input-118.pdf
│ │ ├── input-119.pdf
│ │ ├── input-12.pdf
│ │ ├── input-120.pdf
│ │ ├── input-121.pdf
│ │ ├── input-122.pdf
│ │ ├── input-123.pdf
│ │ ├── input-124.pdf
│ │ ├── input-125.pdf
│ │ ├── input-126.pdf
│ │ ├── input-127.pdf
│ │ ├── input-128.pdf
│ │ ├── input-129.pdf
│ │ ├── input-13.pdf
│ │ ├── input-130.pdf
│ │ ├── input-131.pdf
│ │ ├── input-132.pdf
│ │ ├── input-133.pdf
│ │ ├── input-134.pdf
│ │ ├── input-135.pdf
│ │ ├── input-136.pdf
│ │ ├── input-137.pdf
│ │ ├── input-138.pdf
│ │ ├── input-139.pdf
│ │ ├── input-14.pdf
│ │ ├── input-140.pdf
│ │ ├── input-141.pdf
│ │ ├── input-142.pdf
│ │ ├── input-143.pdf
│ │ ├── input-144.pdf
│ │ ├── input-145.pdf
│ │ ├── input-146.pdf
│ │ ├── input-147.pdf
│ │ ├── input-148.pdf
│ │ ├── input-149.pdf
│ │ ├── input-15.pdf
│ │ ├── input-150.pdf
│ │ ├── input-151.pdf
│ │ ├── input-152.pdf
│ │ ├── input-153.pdf
│ │ ├── input-154.pdf
│ │ ├── input-155.pdf
│ │ ├── input-156.pdf
│ │ ├── input-157.pdf
│ │ ├── input-158.pdf
│ │ ├── input-159.pdf
│ │ ├── input-16.pdf
│ │ ├── input-160.pdf
│ │ ├── input-161.pdf
│ │ ├── input-162.pdf
│ │ ├── input-163.pdf
│ │ ├── input-164.pdf
│ │ ├── input-165.pdf
│ │ ├── input-166.pdf
│ │ ├── input-167.pdf
│ │ ├── input-168.pdf
│ │ ├── input-169.pdf
│ │ ├── input-17.pdf
│ │ ├── input-170.pdf
│ │ ├── input-171.pdf
│ │ ├── input-172.pdf
│ │ ├── input-173.pdf
│ │ ├── input-174.pdf
│ │ ├── input-175.pdf
│ │ ├── input-18.pdf
│ │ ├── input-19.pdf
│ │ ├── input-2.pdf
│ │ ├── input-20.pdf
│ │ ├── input-21.pdf
│ │ ├── input-22.pdf
│ │ ├── input-23.pdf
│ │ ├── input-24.pdf
│ │ ├── input-25.pdf
│ │ ├── input-26.pdf
│ │ ├── input-27.pdf
│ │ ├── input-28.pdf
│ │ ├── input-29.pdf
│ │ ├── input-3.pdf
│ │ ├── input-30.pdf
│ │ ├── input-31.pdf
│ │ ├── input-32.pdf
│ │ ├── input-33.pdf
│ │ ├── input-34.pdf
│ │ ├── input-35.pdf
│ │ ├── input-36.pdf
│ │ ├── input-37.pdf
│ │ ├── input-38.pdf
│ │ ├── input-39.pdf
│ │ ├── input-4.pdf
│ │ ├── input-40.pdf
│ │ ├── input-41.pdf
│ │ ├── input-42.pdf
│ │ ├── input-43.pdf
│ │ ├── input-44.pdf
│ │ ├── input-45.pdf
│ │ ├── input-46.pdf
│ │ ├── input-47.pdf
│ │ ├── input-48.pdf
│ │ ├── input-49.pdf
│ │ ├── input-5.pdf
│ │ ├── input-50.pdf
│ │ ├── input-51.pdf
│ │ ├── input-52.pdf
│ │ ├── input-53.pdf
│ │ ├── input-54.pdf
│ │ ├── input-55.pdf
│ │ ├── input-56.pdf
│ │ ├── input-57.pdf
│ │ ├── input-58.pdf
│ │ ├── input-59.pdf
│ │ ├── input-6.pdf
│ │ ├── input-60.pdf
│ │ ├── input-61.pdf
│ │ ├── input-62.pdf
│ │ ├── input-63.pdf
│ │ ├── input-64.pdf
│ │ ├── input-65.pdf
│ │ ├── input-66.pdf
│ │ ├── input-67.pdf
│ │ ├── input-68.pdf
│ │ ├── input-69.pdf
│ │ ├── input-7.pdf
│ │ ├── input-70.pdf
│ │ ├── input-71.pdf
│ │ ├── input-72.pdf
│ │ ├── input-73.pdf
│ │ ├── input-74.pdf
│ │ ├── input-75.pdf
│ │ ├── input-76.pdf
│ │ ├── input-77.pdf
│ │ ├── input-78.pdf
│ │ ├── input-79.pdf
│ │ ├── input-8.pdf
│ │ ├── input-80.pdf
│ │ ├── input-81.pdf
│ │ ├── input-82.pdf
│ │ ├── input-83.pdf
│ │ ├── input-84.pdf
│ │ ├── input-85.pdf
│ │ ├── input-86.pdf
│ │ ├── input-87.pdf
│ │ ├── input-88.pdf
│ │ ├── input-89.pdf
│ │ ├── input-9.pdf
│ │ ├── input-90.pdf
│ │ ├── input-91.pdf
│ │ ├── input-92.pdf
│ │ ├── input-93.pdf
│ │ ├── input-94.pdf
│ │ ├── input-95.pdf
│ │ ├── input-96.pdf
│ │ ├── input-97.pdf
│ │ ├── input-98.pdf
│ │ └── input-99.pdf
│ └── split.py
├── test-blendmode
│ ├── output.pdf
│ └── test.py
├── tile-image
│ ├── input.jpg
│ ├── output
│ │ ├── .gitkeep
│ │ ├── target-00.png
│ │ ├── target-01.png
│ │ ├── target-02.png
│ │ ├── target-10.png
│ │ ├── target-11.png
│ │ ├── target-12.png
│ │ ├── target-20.png
│ │ ├── target-21.png
│ │ ├── target-22.png
│ │ ├── target-30.png
│ │ ├── target-31.png
│ │ └── target-32.png
│ └── tile.py
├── view-document
│ ├── input.pdf
│ └── view.py
└── zerofy-rotation
│ ├── derotate.py
│ ├── input.pdf
│ └── zerofy-rotation.py
├── fields
├── date-field.py
├── form-fields.py
├── interfield-calculation.py
├── list-fields.py
├── switch-text-on-off.py
├── widgettest-alt.pdf
└── widgettest.py
├── font-replacement
├── multi-language.jpg
├── page-17-after.png
├── page-17-before.png
├── readme.md
├── repl-font.py
├── repl-fontnames.py
└── run-log.txt
├── jupyter-notebooks
├── 1page-snap.log
├── 1page-snap.pdf
├── 1page.pdf
├── README-OCR.md
├── README.md
├── blacked.pdf
├── dehyphenate-flag.ipynb
├── detect-hidden.ipynb
├── input.pdf
├── input.pdf-status.log
├── journalling1.ipynb
├── journalling2.ipynb
├── new_circle_annot.ipynb
├── object-algebra.ipynb
├── ocr-illegible.ipynb
├── optional-content.ipynb
├── page-rectangles.ipynb
├── partial-ocr.ipynb
├── partial-ocr.pdf
├── show_image.py
└── testpage-performance.ipynb
├── optional-content
├── readme.md
├── source-ocmd.pdf
├── source-ocmd.py
├── source-radio.pdf
├── source-radio.py
└── source.pdf
├── pdf-names-resolution
├── README.md
├── find_names.py
└── list_names.py
├── reporting
├── README.md
├── documentation-draft.md
├── examples
│ ├── filmfestival-2tables
│ │ ├── README.md
│ │ ├── Reports.py
│ │ ├── filmfestival.db
│ │ ├── filmfestival.py
│ │ └── output.pdf
│ ├── invoice
│ │ ├── README.md
│ │ ├── Reports.py
│ │ ├── header.html
│ │ ├── invoice-parms.db
│ │ ├── invoicer.py
│ │ ├── items.html
│ │ ├── logo.png
│ │ ├── output.pdf
│ │ └── prolog.html
│ ├── multi-format
│ │ ├── README.md
│ │ ├── Reports.py
│ │ ├── national-capitals.csv
│ │ ├── national-capitals.py
│ │ └── output.pdf
│ ├── row-with-images
│ │ ├── README.md
│ │ ├── Reports.py
│ │ ├── flags.zip
│ │ ├── items.csv
│ │ ├── output.pdf
│ │ └── rows-with-images.py
│ ├── simple-article
│ │ ├── README.md
│ │ ├── Reports.py
│ │ ├── output.pdf
│ │ ├── simple-article.py
│ │ ├── springer.html
│ │ └── springer.jpg
│ └── user-fonts
│ │ ├── DejaVuSansCondensed-Bold.ttf
│ │ ├── DejaVuSansCondensed.ttf
│ │ ├── README.md
│ │ ├── Reports.py
│ │ ├── dejavu.py
│ │ ├── filmfestival.db
│ │ ├── kenpixel.py
│ │ ├── kenpixel.ttf
│ │ ├── output-dejavu.pdf
│ │ └── output-kenpixel.pdf
├── pymupdf-reporting.pdf
└── pymupdf-reporting.pptx
├── shapes
├── piechart1.pdf
├── piechart1.png
├── piechart1.py
├── piechart2.pdf
├── piechart2.py
├── shapes_and_symbols.py
├── symbol-list.pdf
└── symbol-list.py
├── table-analysis
├── README.md
├── XPS-table.pdf
├── XPS-table.xlsx
├── XPS-table.xps
├── chinese-table.pdf
├── clean_graphics.py
├── compare-xps-pdf.ipynb
├── find_tables.ipynb
├── gridlines-to-pandas.py
├── input1-bbox.json
├── input1.pdf
├── input2.pdf
├── join_tables.ipynb
├── national-capitals.pdf
├── show_image.py
└── span-analysis-to-pandas.py
├── text-documents
├── README.md
├── any-file.ipynb
├── basic.ipynb
├── multi-language.ipynb
└── test.pdf
├── text-extraction
├── 1page-text.jpg
├── 1page.pdf
├── Dart-text.jpg
├── Dart.pdf
├── PDF2Text.py
├── PDF2TextBlocks.py
├── Petresume-text.jpg
├── Petresume.pdf
├── README.md
├── demo1-text.jpg
├── demo1.pdf
├── extend-dicts.pdf
├── extend-dicts.py
├── fitzcli.py
├── garbled-text.jpg
├── garbled.pdf
├── invoice-simple.pdf
├── layout-analyzer.py
├── layout-demo1.pdf
├── lookup-keywords.py
├── multi_column.py
├── shadows.pdf
├── textmaker.pdf
├── textmaker.py
├── textmaker2.pdf
└── textmaker2.py
├── textbox-extraction
├── readme.md
├── search.pdf
├── search.png
├── textbox-extract-1.py
└── textbox-extract-2.py
├── textwriter
├── cff-test.pdf
├── demo.pdf
├── demo.py
├── new-annots-tw-0.pdf
├── new-annots-tw.py
├── test-droid.pdf
├── test.pdf
├── textwriter-textbox.pdf
└── textwriter-textbox.py
└── word&line-marking
├── mark-lines.png
├── mark-lines.py
├── mark-lines2.jpg
├── mark-lines2.py
├── mark-words.pdf
├── mark-words.py
├── readme.md
└── search.pdf
/OCR/PDF_XChange-OCRed.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/OCR/PDF_XChange-OCRed.pdf
--------------------------------------------------------------------------------
/OCR/images-to-ocr-pdf.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility to OCR a list of images and output them as one PDF
3 |
4 | License: GNU AGPL 3.0
5 | Author: (c) Harald Lieder, harald.lieder@outlook.com
6 | Date: 2021-10-26
7 | """
8 | import os
9 | import sys
10 |
11 | import fitz
12 |
13 | if tuple(map(int, fitz.VersionBind.split("."))) < (1, 19, 0):
14 | raise ValueError("Need at least PyMuPDF v1.19.0")
15 |
16 | doc = fitz.open() # output PDF
17 | img_folder = sys.argv[1] # example: image folder name provided
18 | dirname = os.path.dirname(img_folder)
19 | img_list = os.listdir(img_folder) # some list of image filenames
20 | for img in img_list:
21 | imgfile = os.path.join(dirname, img)
22 | pix = fitz.Pixmap(imgfile) # make a pixmap form the image file
23 | pdfbytes = pix.pdfocr_tobytes(language="eng") # 1-page PDF with the OCRed image
24 | imgpdf = fitz.open("pdf", pdfbytes) # open it as a PDF
25 | doc.insert_pdf(imgpdf) # append the image page to output
26 |
27 | doc.ez_save("ocr-pdf.pdf") # save output
28 |
--------------------------------------------------------------------------------
/OCR/ocr-ed.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/OCR/ocr-ed.pdf
--------------------------------------------------------------------------------
/OCR/ocr-ed.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | PyMuPDF— the Python
6 | bindings for MuPDF
7 |
8 | PyMuPDF Documentation
9 | Release 1.18.19
10 |
11 | Jorj X. McKie
12 |
13 | Sep 17, 2021
14 |
--------------------------------------------------------------------------------
/OCR/ocrpages.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a basic script demonstrating the use of OCRmyPDF together with PyMuPDF.
3 |
4 | It reads a PDF's pages and passes them to ocrmypdf one by one. One could at this
5 | point insert some checks as to whether the page is actually an image, contains no text,
6 | or text with many unrecognized characters or the like.
7 |
8 | Each page is then converted to a 1-page temporary PDF which is
9 | - passed to ocrmypdf for OCR-ing it
10 | - the 1-page output PDF of the previous step is then text-extracted
11 | - return the extracted text
12 |
13 | Instead of extracting simple naive text format, one could also use all other
14 | text extraction formats like "dict" to get text position information.
15 |
16 | Requires
17 | ---------
18 | ocrmypdf
19 | """
20 | import fitz
21 | import ocrmypdf
22 | import sys
23 | import io
24 |
25 |
26 | def ocr_the_page(page):
27 | """Extract the text from passed-in PDF page."""
28 | src = page.parent # the page's document
29 | doc = fitz.open() # make temporary 1-pager
30 | doc.insert_pdf(src, from_page=page.number, to_page=page.number)
31 | pdfbytes = doc.tobytes()
32 | inbytes = io.BytesIO(pdfbytes) # transform to BytesIO object
33 | outbytes = io.BytesIO() # let ocrmypdf store its result pdf here
34 | ocrmypdf.ocr(
35 | inbytes, # input 1-pager
36 | outbytes, # ouput 1-pager
37 | language="eng", # modify as required e.g. ("eng", "ger")
38 | output_type="pdf", # only need simple PDF format
39 | # add more paramneters, e.g. to enforce OCR-ing, etc., e.g.
40 | # force_ocr=True, redo_ocr=True
41 | )
42 | ocr_pdf = fitz.open("pdf", outbytes.getvalue()) # read output as fitz PDF
43 | text = ocr_pdf[0].get_text() # ...and extract text from the page
44 | return text # return it
45 |
46 |
47 | if __name__ == "__main__":
48 | doc = fitz.open(sys.argv[1])
49 | for page in doc:
50 | text = ocr_the_page(page)
51 | print("Text from page %i:" % page.number)
52 | print(text)
53 |
--------------------------------------------------------------------------------
/OCR/scanned.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/OCR/scanned.pdf
--------------------------------------------------------------------------------
/OCR/v110-changes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/OCR/v110-changes.pdf
--------------------------------------------------------------------------------
/advanced-toc/colored-toc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/advanced-toc/colored-toc.pdf
--------------------------------------------------------------------------------
/advanced-toc/colored-toc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/advanced-toc/colored-toc.png
--------------------------------------------------------------------------------
/advanced-toc/colorize.py:
--------------------------------------------------------------------------------
1 | import fitz
2 |
3 | doc = fitz.open("example.pdf")
4 | toc = doc.get_toc(False)
5 | for i, item in enumerate(toc):
6 | lvl, title, pno, ddict = item
7 | ddict["collapse"] = False
8 | if lvl == 1:
9 | ddict["color"] = (1, 0, 0)
10 | ddict["bold"] = True
11 | ddict["italic"] = False
12 | elif lvl == 2:
13 | ddict["color"] = (0, 0, 1)
14 | ddict["bold"] = False
15 | ddict["italic"] = True
16 | else:
17 | ddict["color"] = (0, 1, 0)
18 | ddict["bold"] = ddict["italic"] = False
19 | doc.set_toc_item(i, dest_dict=ddict)
20 | doc.save("new-toc.pdf")
21 |
--------------------------------------------------------------------------------
/advanced-toc/example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/advanced-toc/example.pdf
--------------------------------------------------------------------------------
/advanced-toc/replaced-toc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/advanced-toc/replaced-toc.pdf
--------------------------------------------------------------------------------
/animations/README.md:
--------------------------------------------------------------------------------
1 | This folder contains a few scripts which may best be characterized as "fun" or "entertainment" ... using PyMuPDF of course.
2 |
3 | They all work following the same basic approach:
4 |
5 | 1. Draw or write something on an empty page of a new PDF
6 | 2. Convert the page to an image
7 | 3. Show this image in a GUI (using PySimpleGUI)
8 | 4. Destroy image, page and PDF document
9 | 5. Modify some parameters
10 | 6. Start over with step 1 above in an endless loop.
11 |
12 | Because of the excellent performance of PyMuPDF (😉), this process is fast enough to be shown like a little video clip - mostly achieving more than 100 frames per second.
13 |
14 | Scripts `morph-demo1.py`, `morph-demo2.py` and `morph-demo3.py` show the effect of morphing a text box given some fixpoint.
15 |
16 | Scripts `quad-show1.py` and `quad-show2.py` simply draw quadrilaterals to demonstrate what happens when their corners are modified following certain patterns.
17 |
--------------------------------------------------------------------------------
/animations/morph-demo1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/animations/morph-demo1.jpg
--------------------------------------------------------------------------------
/animations/quad-show2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/animations/quad-show2.jpg
--------------------------------------------------------------------------------
/annotations/freetext-annot-lang.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/annotations/freetext-annot-lang.pdf
--------------------------------------------------------------------------------
/annotations/freetext-annot-lang.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division, print_function
3 |
4 | import os
5 | import sys
6 |
7 | import fitz
8 |
9 | print(fitz.__doc__)
10 | if fitz.VersionBind.split(".") < ["1", "17", "0"]:
11 | sys.exit("Need PyMuPDF v1.17.0 or later.")
12 |
13 | outfile = os.path.abspath(__file__).replace(".py", ".pdf")
14 |
15 |
16 | doc = fitz.open() # new PDF
17 | page = doc.new_page() # new page
18 |
19 | text = r"""This is a text of mixed languages to generate FreeText annotations with automatic font selection - a feature new in MuPDF v1.17.
20 | Euro: €, general Latin and other signs: | ~ ° ² ³ ñ ä ö ü ß â ¿ ¡ µ ¶ œ ¼ ½ ¾ ‰
21 | Japan: 熊野三山本願所は、 15世紀末以降における熊野三山 (熊野本宮、 熊野新宮
22 | Greece: Στα ερείπια της πόλης, που ήταν ένα σημαντικό
23 | Korea: 에듀롬은 하나의 계정으로 전 세계 고등교육 기관의 인터넷에 접속할
24 | Russia: Ко времени восшествия на престол Якова I в значительной
25 | China: 北京作为城市的历史 可以追溯到 3,000 年前。西周初年, 周武王封召公奭于燕國。
26 | Devanagari (not supported): नि:शुल्क ज्ञानको लागी लाई धन्यबाद""".splitlines()
27 |
28 | blue = (0, 0, 1)
29 | red = (1, 0, 0)
30 | gold = (1, 1, 0)
31 | green = (0, 1, 0)
32 |
33 | # make the rectangles for filling in above text lines
34 | tl = page.rect.tl + (72, 144) # some distance from the page's corners
35 | br = page.rect.br - (72, 144)
36 | rect = fitz.Rect(tl, br) # put all annots inside this rectangle
37 | cells = fitz.make_table(rect, cols=1, rows=len(text))
38 | shrink = (0, 5, 0, 0) # makes distance between annots
39 | for i in range(len(text)):
40 | annot = page.add_freetext_annot(
41 | cells[i][0] + shrink,
42 | text[i],
43 | fontsize=16,
44 | fontname="tiro", # used for non-CJK characters only!
45 | align=fitz.TEXT_ALIGN_CENTER,
46 | text_color=blue,
47 | )
48 | annot.set_border(width=1.0)
49 | annot.update(fill_color=gold, border_color=green)
50 |
51 | doc.save(outfile, garbage=3, deflate=True)
52 |
--------------------------------------------------------------------------------
/annotations/new-annots-0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/annotations/new-annots-0.pdf
--------------------------------------------------------------------------------
/annotations/opacity.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | import fitz
5 |
6 | print(fitz.__doc__)
7 | doc = fitz.open()
8 | page = doc.new_page()
9 |
10 | annot1 = page.add_circle_annot((50, 50, 100, 100))
11 | annot1.set_colors(fill=(1, 0, 0), stroke=(1, 0, 0))
12 | annot1.set_opacity(2 / 3)
13 | annot1.update(blend_mode="Multiply")
14 |
15 | annot2 = page.add_circle_annot((75, 75, 125, 125))
16 | annot2.set_colors(fill=(0, 0, 1), stroke=(0, 0, 1))
17 | annot2.set_opacity(1 / 3)
18 | annot2.update(blend_mode="Multiply")
19 | outfile = os.path.abspath(__file__).replace(".py", ".pdf")
20 | doc.save(outfile, expand=True, pretty=True)
21 | print("saved", outfile)
22 |
--------------------------------------------------------------------------------
/annotations/show-no-annots.py:
--------------------------------------------------------------------------------
1 | import os
2 | import fitz
3 |
4 | """
5 | Render a page with and without annotations.
6 |
7 | Please note that starting with v1.16.0, pixmaps without annotations
8 | can be created directly.
9 | """
10 | print(fitz.__doc__)
11 | thisdir = os.path.dirname(__file__)
12 | infile = os.path.join(thisdir, "new-annots-0.pdf")
13 | src = fitz.open(infile) # a document with annotations
14 | p1 = src[0]
15 | pix1 = p1.get_pixmap(annots=True)
16 | pix1.save(os.path.join(thisdir, "with-annots.png")) # save page pixmap
17 | pix2 = p1.get_pixmap(annots=False)
18 | pix2.save(os.path.join(thisdir, "without-annots.png"))
19 |
--------------------------------------------------------------------------------
/annotations/with-annots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/annotations/with-annots.png
--------------------------------------------------------------------------------
/annotations/without-annots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/annotations/without-annots.png
--------------------------------------------------------------------------------
/cloud-interactions/README.md:
--------------------------------------------------------------------------------
1 | This is a set of code snippets showing how to download from or upload to cloud services offered by major providers.
2 |
3 | The focus of the scripts is to demonstrate how intermediate disk storage can be avoided by using PyMuPDF Document features.
4 |
5 | We are currently considering extending `Document` creation so that cloud access is covered too. Because each cloud service provider handles this differently, this is somewhat tedious. So please bear with us until we are clear about what we need to do.
6 |
--------------------------------------------------------------------------------
/cloud-interactions/from-aws-s3.py:
--------------------------------------------------------------------------------
1 | import fitz
2 | import boto3
3 |
4 | s3 = boto3.client("s3")
5 |
6 | # fill in your credentials to access the cloud
7 | response = s3.get_object(Bucket="string", Key="string")
8 | mime = response["ContentType"]
9 | body = response["Body"]
10 |
11 | # define Document with these data
12 | doc = fitz.open(mime, body.read())
13 |
--------------------------------------------------------------------------------
/cloud-interactions/from-google.py:
--------------------------------------------------------------------------------
1 | import os
2 | import fitz # pymupdf
3 | import gcsfs # google cloud storage file system
4 |
5 | # Access the google filesystem.
6 | # You will need to supply credentials - which is omitted here
7 | fs = gcsfs.GCSFileSystem(project="my-google-project")
8 |
9 | filename = fs.ls("my-bucket")[0] # first filename in bucket
10 | ext = os.path.splitext(filename)[1] # determine file extension
11 | f = fs.open(filename, "rb") # open with that filesystem
12 |
13 | # now open with PyMuPDF using the bytes object of "f"
14 | doc = fitz.open(ext[1:], f.read())
15 |
--------------------------------------------------------------------------------
/cloud-interactions/from-ms-azure.py:
--------------------------------------------------------------------------------
1 | import os
2 | import fitz # pymupdf
3 | from azure.storage.blob import BlobClient
4 |
5 | blob = BlobClient.from_connection_string(
6 | conn_str="my_connection_string",
7 | container_name="my_container",
8 | blob_name="my_blob",
9 | )
10 |
11 | with open("some-file.pdf", "wb") as my_blob:
12 | blob_data = blob.download_blob()
13 | blob_data.readinto(my_blob)
14 |
15 | # now open with PyMuPDF using the bytes object of "f"
16 | doc = fitz.open("pdf", my_blob.read())
17 |
--------------------------------------------------------------------------------
/cloud-interactions/to-aws-s3.py:
--------------------------------------------------------------------------------
1 | import fitz
2 | import boto3
3 |
4 | # process some PDF document
5 | doc = fitz.open("...")
6 | # then write / upload it directly to AWS S3
7 | # Instead of save, we use the tobytes(), which generates a bytes object
8 | pdfbytes = doc.tobytes( # optional 'save' parameters:
9 | garbage=3,
10 | deflate=True,
11 | owner_pw="owner-password",
12 | user_pw="user-pasword",
13 | )
14 |
15 | s3 = boto3.client("s3")
16 | request_route = "string"
17 | request_token = "string"
18 | s3.write_get_object_response(
19 | Body=pdfbytes,
20 | RequestRoute=request_route,
21 | RequestToken=request_token,
22 | )
23 |
--------------------------------------------------------------------------------
/cloud-interactions/to-ms-azure.py:
--------------------------------------------------------------------------------
1 | import fitz # pymupdf
2 | from azure.storage.blob import BlobClient
3 |
4 | # some PDF document
5 | doc = fitz.open("...")
6 |
7 | # access Azure blob client
8 | blob = BlobClient.from_connection_string(
9 | conn_str="my_connection_string",
10 | container_name="my_container",
11 | blob_name="my_blob",
12 | )
13 |
14 | # upload document
15 | blob.upload_blob(
16 | doc.tobytes(
17 | garbage=3,
18 | deflate=True,
19 | # more parameters
20 | )
21 | )
22 |
--------------------------------------------------------------------------------
/conversion/README.md:
--------------------------------------------------------------------------------
1 | This folder contains scripts for document conversions.
2 |
3 | Over time, more examples will be added. Currently there are:
4 |
5 | * `make-cbz.py` - convert any document to a Comic Book
6 | * `make-imagepdf.py` - convert any document to a PDF with original pages rendered to images.
7 | * `make-page-images.py` - convert the pages of any document to PNG images.
8 | * `images-to-ocr-pdf.py` - make PDF from a list of images (one image per page), where each page contains an OCR text layer.
9 |
10 |
11 | Your contribution is welcome. This may include more conversion types, or improvements like better handling / supporting parameters of existing scripts.
12 |
--------------------------------------------------------------------------------
/conversion/images-to-ocr-pdf.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility to OCR a list of images and output them as one PDF
3 |
4 | License: GNU AGPL 3.0
5 | Author: (c) Harald Lieder, harald.lieder@outlook.com
6 | Date: 2021-10-26
7 | """
8 | import os
9 | import sys
10 |
11 | import fitz
12 |
# Pixmap.pdfocr_tobytes() is used below, so refuse to run on older bindings.
if tuple(map(int, fitz.VersionBind.split("."))) < (1, 19, 0):
    raise ValueError("Need at least PyMuPDF v1.19.0")

doc = fitz.open()  # output PDF
img_folder = sys.argv[1]  # folder containing the images (one per page)

# os.listdir() returns names in arbitrary order - sort them so that the
# page sequence of the output is reproducible.
img_list = sorted(os.listdir(img_folder))
for img in img_list:
    # The names are relative to the image folder itself, not to its parent.
    imgfile = os.path.join(img_folder, img)
    pix = fitz.Pixmap(imgfile)  # make a pixmap from the image file
    pdfbytes = pix.pdfocr_tobytes(language="eng")  # 1-page PDF with the OCRed image
    imgpdf = fitz.open("pdf", pdfbytes)  # open it as a PDF
    doc.insert_pdf(imgpdf)  # append the image page to output

doc.ez_save("ocr-pdf.pdf")  # save output
28 |
--------------------------------------------------------------------------------
/conversion/make-cbz.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility to convert a supported document to a Comic Book archive.
3 |
4 | License: GNU AGPL 3.0
5 | Author: (c) Harald Lieder, harald.lieder@outlook.com
6 | Date: 2021-08-30
7 | """
8 |
9 | import os
10 | import sys
11 | import zipfile
12 |
13 | import fitz
14 |
15 |
def main(doc, outfile=None, pages=None, dpi=96):
    """
    Render pages of a document to PNG images and store them in a CBZ archive.

    Args:
        doc: an opened document (the script passes the result of fitz.open()).
        outfile: name of the archive to write. If None, the name is derived
            from doc.name, else from this script's file name, else "out.cbz".
        pages: iterable of 0-based page numbers; default is every page.
        dpi: resolution of the page images.
    """
    if outfile is None:
        if doc.name:
            filename, _ = os.path.splitext(doc.name)
            outfile = filename + ".cbz"
        elif __file__.endswith(".py"):
            # Swap the extension only: str.replace() would also rewrite a
            # ".py" that occurs somewhere else in the path.
            outfile = os.path.splitext(__file__)[0] + ".cbz"
        else:
            outfile = "out.cbz"
    if pages is None:
        pages = range(doc.page_count)
    zoom = dpi / 72  # zoom factor relative to 72 dpi
    mat = fitz.Matrix(zoom, zoom)
    # The context manager closes the archive even if rendering a page fails.
    with zipfile.ZipFile(
        outfile,
        "w",
        compression=zipfile.ZIP_STORED,
    ) as zipout:
        for pno in pages:
            page = doc[pno]
            pix = page.get_pixmap(matrix=mat)
            pix.set_dpi(dpi, dpi)  # store dpi info in the image
            pagename = "p%05i.png" % (pno + 1)  # 1-based member names
            zipout.writestr(pagename, pix.tobytes("png"))
41 |
42 |
if __name__ == "__main__":
    # The document to convert is named by the first command line argument;
    # all other parameters of main() keep their defaults.
    doc = fitz.open(sys.argv[1])
    main(doc)
47 |
--------------------------------------------------------------------------------
/conversion/make-imagepdf.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility to convert a supported document to an image-only PDF.
3 |
4 | License: GNU AGPL 3.0
5 | Author: (c) Harald Lieder, harald.lieder@outlook.com
6 | Date: 2021-08-30
7 | """
8 | import os
9 | import sys
10 |
11 | import fitz
12 |
13 |
def main(doc, outfile=None, pages=None, dpi=96):
    """
    Write a PDF whose pages are rendered images of the pages of a document.

    Args:
        doc: an opened document (the script passes the result of fitz.open()).
        outfile: name of the PDF to write. If None, the name is derived from
            doc.name, else from this script's file name, else "out.pdf".
        pages: iterable of 0-based page numbers; default is every page.
        dpi: resolution of the page images.
    """
    if outfile is None:
        if doc.name:
            filename, _ = os.path.splitext(doc.name)
            outfile = filename + ".pdf"
        elif __file__.endswith(".py"):
            # Swap the extension only: str.replace() would also rewrite a
            # ".py" that occurs somewhere else in the path.
            outfile = os.path.splitext(__file__)[0] + ".pdf"
        else:
            outfile = "out.pdf"
    # Never overwrite the input file - this also covers an explicitly
    # supplied outfile that equals the input name.
    if outfile == doc.name:
        outfile += ".pdf"
    if pages is None:
        pages = range(doc.page_count)
    zoom = dpi / 72  # zoom factor relative to 72 dpi
    mat = fitz.Matrix(zoom, zoom)
    pdfout = fitz.open()
    try:
        for pno in pages:
            page = doc[pno]
            pix = page.get_pixmap(matrix=mat)
            pix.set_dpi(dpi, dpi)  # store dpi info in the image
            # The output page keeps the dimensions of the source page and is
            # completely covered by the rendered image.
            opage = pdfout.new_page(width=page.rect.width, height=page.rect.height)
            opage.insert_image(opage.rect, pixmap=pix)
        pdfout.ez_save(outfile)
    finally:
        # Release the output document even if rendering or saving failed.
        pdfout.close()
38 |
39 |
if __name__ == "__main__":
    # The document to convert is named by the first command line argument;
    # all other parameters of main() keep their defaults.
    doc = fitz.open(sys.argv[1])
    main(doc)
44 |
--------------------------------------------------------------------------------
/conversion/make-page-images.py:
--------------------------------------------------------------------------------
1 | """
2 | Basic script to convert pages of an arbitrary document to PNG images.
3 |
4 | All MuPDF document types are supported: PDF, XPS, EPUB, etc.
5 | Page images are stored in the script's folder and named "page-0001.png",
6 | "page-0002.png".
7 |
8 | Desired resolution can be chosen by setting the "DPI" variable.
9 | """
10 | import sys
11 | import fitz
12 |
DPI = 300  # resolution wanted for the page images
ZOOM = DPI / 72  # scale factor, as 72 is the standard dpi
magnify = fitz.Matrix(ZOOM, ZOOM)  # matrix applying that scale to both axes

filename = sys.argv[1]
doc = fitz.open(filename)
for page in doc:
    # output names are 1-based: page-0001.png, page-0002.png, ...
    image_name = "page-%04i.png" % (page.number + 1)
    pix = page.get_pixmap(matrix=magnify)  # render the page
    pix.set_dpi(DPI, DPI)  # record the resolution inside the image
    pix.save(image_name)
24 |
--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
1 | /__pycache__
2 | /extract-table/__pycache__/
3 | /replace-image/__pycache__/
4 | /icons/__pycache__
5 |
--------------------------------------------------------------------------------
/examples/anonymize-document/anonymize.py:
--------------------------------------------------------------------------------
1 | """
2 | Remove all text from a document.
3 | -------------------------------------------------------------------------------
4 | License: GNU GPL V3
5 | (c) 2022 Jorj X. McKie
6 |
7 | Usage
8 | -----
9 | python anonymize.py input.pdf
10 |
11 | Description
12 | -----------
13 | Scan through all pages of a PDF and remove all text. The metadata dictionary
14 | will also be cleared with "none" values. Any XML-based metadata will also be
15 | deleted.
16 | """
17 |
18 | import sys
19 | import fitz
20 |
21 |
def remove_txt(cont):
    """
    Remove every text object from a PDF content stream.

    A text object is everything enclosed in a pair of "BT" / "ET" operators,
    including both. Assuming "cont" is the bytes of a PDF "/Contents" stream,
    this makes all text of the owning page disappear (permanent delete).

    Args:
        cont: the content stream as a bytes object.

    Returns:
        bytes: the tokens outside of text objects, separated by spaces.
    """
    # Treat NUL, tab, line feed, form feed and carriage return like a space.
    # Looking at "\n" only would turn streams using "\r\n" or tab separators
    # into tokens like b"BT\r", which never match, so the text would stay.
    # Each whitespace byte maps to exactly one space, hence streams that use
    # only spaces and "\n" give the same result as before.
    whitespace = b"\x00\t\n\x0c\r"
    normalized = cont.translate(bytes.maketrans(whitespace, b" " * len(whitespace)))

    kept = []
    intext = False  # True while between "BT" and the matching "ET"
    for word in normalized.split(b" "):
        if word == b"BT":
            intext = True
        elif word == b"ET":
            intext = False
        elif not intext:
            kept.append(word)
    return b" ".join(kept)
45 |
46 |
# Validate the command line with explicit checks: "assert" statements are
# removed when Python runs with -O and would then let bad input through.
if len(sys.argv) != 2:
    sys.exit("need input PDF file name")
fn = sys.argv[1]
if not fn.lower().endswith(".pdf"):  # accept ".PDF" as well
    sys.exit("expect a PDF file")

doc = fitz.open(fn)
doc.set_metadata({})  # set metadata values to "none"
doc.del_xml_metadata()  # delete any XML metadata
for page in doc:
    # a page may have several content streams - strip the text from each
    for xref in page.get_contents():
        cont = doc.xref_stream(xref)
        doc.update_stream(xref, remove_txt(cont))

doc.save("output.pdf", clean=True, garbage=4)
61 |
--------------------------------------------------------------------------------
/examples/anonymize-document/input.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/anonymize-document/input.pdf
--------------------------------------------------------------------------------
/examples/anonymize-document/output.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/anonymize-document/output.pdf
--------------------------------------------------------------------------------
/examples/attach-images/input/erik-jan-leusink-s2mkB4WOl9k-unsplash.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/attach-images/input/erik-jan-leusink-s2mkB4WOl9k-unsplash.jpg
--------------------------------------------------------------------------------
/examples/attach-images/input/joe-caione-qO-PIF84Vxg-unsplash.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/attach-images/input/joe-caione-qO-PIF84Vxg-unsplash.jpg
--------------------------------------------------------------------------------
/examples/attach-images/output.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/attach-images/output.pdf
--------------------------------------------------------------------------------
/examples/browse-document/input.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/browse-document/input.pdf
--------------------------------------------------------------------------------
/examples/combine-pages/combine.py:
--------------------------------------------------------------------------------
1 | """
2 | Copy a PDF document combining every 4 pages
3 | -------------------------------------------------------------------------------
4 | License: GNU GPL V3
5 | (c) 2018 Jorj X. McKie
6 |
7 | Usage
8 | -----
9 | python combine.py input.pdf
10 |
11 | Notes
12 | -----
13 | (1) Output file is chosen to have A4 portrait pages. Input pages are scaled
14 | maintaining side proportions. Both can be changed, e.g. based on input
15 | page size. However, note that not all pages need to have the same size, etc.
16 |
17 | (2) Easily adapt the example to combine just 2 pages (like for a booklet) or
18 | make the output page dimension dependent on input, or whatever.
19 |
20 | (3) This should run very fast: needed less than 25 sec on a Python 3.6 64bit,
21 | Windows 10, AMD 4.0 GHz for the 1'310 pages of the Adobe manual.
22 | Without save-options "garbage" and "deflate" this goes below 4 seconds, but
23 | results in a bigger file.
24 |
25 | Dependencies
26 | ------------
27 | PyMuPDF 1.12.1 or later
28 | """
29 |
30 | from __future__ import print_function
31 | import fitz, sys
32 |
infile = sys.argv[1]
src = fitz.open(infile)  # input document
doc = fitz.open()  # new, still empty output PDF

# every output page is A4 portrait
width, height = fitz.paper_size("a4")
r = fitz.Rect(0, 0, width, height)

# split the output page into four quarters
r1 = r * 0.5  # top left
r2 = r1 + (r1.width, 0, r1.width, 0)  # top right: r1 shifted right
r3 = r1 + (0, r1.height, 0, r1.height)  # bottom left: r1 shifted down
r4 = fitz.Rect(r1.br, r.br)  # bottom right
r_tab = [r1, r2, r3, r4]  # indexed by the position within a group of 4

# place each input page into its quarter of the current output page
for spage in src:
    slot = spage.number % 4
    if slot == 0:
        # every 4th input page opens a new output page
        page = doc.new_page(-1, width=width, height=height)
    page.show_pdf_page(r_tab[slot], src, spage.number)

# garbage collection and compression keep the output file small
doc.save("output.pdf", garbage=4, deflate=True)
62 |
--------------------------------------------------------------------------------
/examples/combine-pages/input.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/combine-pages/input.pdf
--------------------------------------------------------------------------------
/examples/combine-pages/output.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/combine-pages/output.pdf
--------------------------------------------------------------------------------
/examples/convert-document/input.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/convert-document/input.epub
--------------------------------------------------------------------------------
/examples/convert-document/output.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/convert-document/output.pdf
--------------------------------------------------------------------------------
/examples/convert-image/convert.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert an arbitrary image to a PNG pixmap using Pillow
3 | --------------------------------------------------------------------------------
4 | License: GNU GPL V3
5 | (c) 2022 Jorj X. McKie
6 |
7 | Usage
8 | -----
9 | python convert.py input.jpg
10 |
11 | Dependencies
12 | ------------
13 | Pillow
14 | """
15 |
16 | import sys
17 | import fitz
18 | from PIL import Image
19 |
print(fitz.__doc__)

# The image file name is the only command line argument; without exactly
# one argument the script does nothing.
pic_fn = sys.argv[1] if len(sys.argv) == 2 else None

if pic_fn:
    print("Reading %s" % pic_fn)
    # The "with" block closes the file even if decoding fails. convert()
    # returns a new image, which is used after the file has been closed.
    with open(pic_fn, "rb") as pic_f:
        img = Image.open(pic_f).convert("RGB")
    samples = img.tobytes()  # raw RGB pixel data
    # build a pixmap without alpha channel from the raw pixel data
    pix = fitz.Pixmap(fitz.csRGB, img.size[0], img.size[1], samples, 0)
    pix.save("output.png")
35 |
--------------------------------------------------------------------------------
/examples/convert-image/input.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/convert-image/input.jpg
--------------------------------------------------------------------------------
/examples/convert-image/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/convert-image/output.png
--------------------------------------------------------------------------------
/examples/convert-pixmap/convert.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert an arbitrary pixmap to JPEG format using Pillow
3 | --------------------------------------------------------------------------------
4 | License: GNU GPL V3
5 | (c) 2022 Jorj X. McKie
6 |
7 | Usage
8 | -----
9 | python convert.py input.png
10 |
11 | Dependencies
12 | ------------
13 | Pillow
14 | """
15 |
16 | import sys
17 | import fitz
18 | from PIL import Image
19 |
print(fitz.__doc__)
# Explicit check instead of "assert", which is removed under "python -O".
if len(sys.argv) != 2:
    sys.exit("Usage: %s " % sys.argv[0])

pix = fitz.Pixmap(sys.argv[1])
rgb = "RGB"
if pix.n - pix.alpha != 3:
    # Gray, CMYK, ... input: the samples are handed to Pillow as "RGB"
    # below, so bring the pixmap into the RGB colorspace first.
    pix = fitz.Pixmap(fitz.csRGB, pix)
if pix.alpha:  # JPEG cannot have alpha!
    pix = fitz.Pixmap(pix, 0)  # copy without the alpha channel

# wrap the raw samples in a Pillow image and let Pillow write the JPEG
img = Image.frombuffer(rgb, (pix.width, pix.height), pix.samples, "raw", rgb, 0, 1)
img.save("output.jpg")
31 |
--------------------------------------------------------------------------------
/examples/convert-pixmap/input.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/convert-pixmap/input.png
--------------------------------------------------------------------------------
/examples/convert-pixmap/output.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/convert-pixmap/output.jpg
--------------------------------------------------------------------------------
/examples/convert-text/output.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/convert-text/output.pdf
--------------------------------------------------------------------------------
/examples/copy-embedded/copy.py:
--------------------------------------------------------------------------------
1 | """
2 | Copy the embedded files in the input document to the output document
3 | -------------------------------------------------------------------------------
4 | License: GNU AGPL V3
5 | (c) 2021 Jorj X. McKie
6 |
7 | Usage
8 | -----
9 | python copy.py input.pdf output.pdf
10 |
11 | Notes
12 | -----
13 | The output.pdf file generated in examples/embed-images is renamed as input.pdf
14 | to be used as the input file in this example.
15 |
16 | Dependencies
17 | ------------
18 | PyMuPDF
19 | """
20 |
21 | from __future__ import print_function
22 | import sys
23 | import fitz
24 |
ifn = sys.argv[1]  # input PDF
ofn = sys.argv[2]  # output PDF (must exist, it is updated incrementally)
docin = fitz.open(ifn)
docout = fitz.open(ofn)
print("Copying embedded files from '%s' to '%s'" % (ifn, ofn))
for i in range(docin.embfile_count()):
    d = docin.embfile_info(i)  # file metadata
    b = docin.embfile_get(i)  # file content
    print("copying entry:", d["name"])
    try:
        # embfile_add() takes the entry name first and the content second.
        # Keywords are used for the optional values so they cannot be
        # mixed up. The metadata key is "filename" in current PyMuPDF;
        # "file" (used by earlier code) is kept as a fallback.
        docout.embfile_add(
            d["name"],
            b,
            filename=d.get("filename") or d.get("file"),
            ufilename=d.get("ufilename"),
            desc=d.get("desc"),
        )
    except ValueError as exc:
        # Safeguard against duplicate entries: report and continue rather
        # than silently hiding every possible error.
        print("skipping entry '%s': %s" % (d["name"], exc))

# save output incrementally, i.e. append the changes to the existing file
docout.save_incr()
41 |
--------------------------------------------------------------------------------
/examples/copy-embedded/input.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/copy-embedded/input.pdf
--------------------------------------------------------------------------------
/examples/copy-embedded/output.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymupdf/PyMuPDF-Utilities/4d266de74be4c4d6dfb9925007b0d1a3818bf78a/examples/copy-embedded/output.pdf
--------------------------------------------------------------------------------
/examples/decrypt-document/decrypt.py:
--------------------------------------------------------------------------------
1 | """
2 | Decrypt a PDF document with the password provided and save it as a new document
3 | --------------------------------------------------------------------------------
4 | License: GNU GPL V3+
5 | (c) 2022 Jorj X. McKie
6 |
7 | Usage
8 | -----
9 | python decrypt.py input.pdf password output.pdf
10 | """
11 |
12 | import sys
13 | import fitz
14 |
15 | print(fitz.__doc__)
16 | assert len(sys.argv) == 4, (
17 | "Usage: %s