├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── examples ├── 4up.py ├── README.txt ├── alter.py ├── booklet.py ├── cat.py ├── extract.py ├── fancy_watermark.py ├── poster.py ├── print_two.py ├── rl1 │ ├── 4up.py │ ├── README.txt │ ├── booklet.py │ ├── platypus_pdf_template.py │ └── subset.py ├── rl2 │ ├── README.txt │ ├── copy.py │ └── decodegraphics.py ├── rotate.py ├── subset.py ├── subset_booklets.py ├── unspread.py └── watermark.py ├── pdfrw ├── __init__.py ├── buildxobj.py ├── compress.py ├── crypt.py ├── errors.py ├── findobjs.py ├── objects │ ├── __init__.py │ ├── pdfarray.py │ ├── pdfdict.py │ ├── pdfindirect.py │ ├── pdfname.py │ ├── pdfobject.py │ └── pdfstring.py ├── pagemerge.py ├── pdfreader.py ├── pdfwriter.py ├── py23_diffs.py ├── tokens.py ├── toreportlab.py └── uncompress.py ├── releasing.txt ├── setup.cfg ├── setup.py └── tests ├── Render Bitmap.ipynb ├── __init__.py ├── basn0g08.png.log ├── basn2c08.png.log ├── checkdiffs.py ├── expected.py ├── expected.txt ├── f01n2c08.png.log ├── f02n2c08.png.log ├── f03n2c08.png.log ├── f04n2c08.png.log ├── myprofile.py ├── test_examples.py ├── test_flate_png.py ├── test_pdfdict.py ├── test_pdfreader_init.py ├── test_pdfstring.py ├── test_roundtrip.py └── update_expected.py /.gitignore: -------------------------------------------------------------------------------- 1 | # OSX 2 | .DS_Store 3 | .AppleDouble 4 | .LSOverride 5 | Icon 6 | 7 | # Thumbnails 8 | ._* 9 | 10 | # Files that might appear on external disk 11 | .Spotlight-V100 12 | .Trashes 13 | 14 | 15 | # Development artifacts 16 | diffs.txt 17 | examples/*.pdf 18 | examples/rl*/*.pdf 19 | tests/*.pdf 20 | examples/pdfrw 21 | examples/rl*/pdfrw 22 | tests/pdfrw 23 | tests/static_pdfs 24 | tests/ramdisk 25 | tests/saved_results 26 | tests/tmp_results 27 | wiki/ 28 | 29 | 30 | # Byte-compiled / optimized / DLL files 31 | __pycache__/ 32 | *.py[cod] 33 | 34 | # Distribution / packaging 35 | .Python 36 | env/ 37 | bin/ 38 | 
build/ 39 | develop-eggs/ 40 | dist/ 41 | eggs/ 42 | lib/ 43 | lib64/ 44 | lib64 45 | parts/ 46 | sdist/ 47 | var/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | pyvenv.cfg 52 | pip-selfcheck.json 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .coverage 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.3" 6 | - "3.4" 7 | - "3.5" 8 | - "3.6" 9 | - "nightly" 10 | # command to install dependencies 11 | before_install: 12 | - "git clone https://github.com/pmaupin/static_pdfs tests/static_pdfs" 13 | install: 14 | - "pip install ." 15 | - "pip install reportlab || true" 16 | - "pip install PyCrypto || true" 17 | - "pip install zlib || true" 18 | - "pip install unittest2 || true" 19 | # command to run tests 20 | script: "cd tests; /usr/bin/env PYTHONPATH=. py.test" 21 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | pdfrw (github.com/pmaupin/pdfrw) 2 | 3 | The majority of pdfrw was written by Patrick Maupin and is licensed 4 | under the MIT license (reproduced below). Other contributors include 5 | Attila Tajti and Nerijus Mika. It appears that some of the decompression 6 | code was based on the decompressor from PyPDF2, which was written by 7 | Mathieu Fenniak and licensed under the BSD license (also reproduced below). 8 | 9 | Please add any missing authors here: 10 | 11 | Copyright (c) 2006-2017 Patrick Maupin. All rights reserved. 12 | Copyright (c) 2006 Mathieu Fenniak. All rights reserved. 13 | Copyright (c) 2010 Attila Tajti. 
All rights reserved. 14 | Copyright (c) 2012 Nerijus Mika. All rights reserved. 15 | Copyright (c) 2015 Bastien Gandouet. All rights reserved. 16 | Copyright (c) 2015 Tzerjen Wei. All rights reserved. 17 | Copyright (c) 2015 Jorj X. McKie. All rights reserved. 18 | Copyright (c) 2015 Nicholas Devenish. All rights reserved. 19 | Copyright (c) 2015-2016 Jonatan Dellagostin. All rights reserved. 20 | Copyright (c) 2016-2017 Thomas Kluyver. All rights reserved. 21 | Copyright (c) 2016 James Laird-Wah. All rights reserved. 22 | Copyright (c) 2016 Marcus Brinkmann. All rights reserved. 23 | Copyright (c) 2016 Edward Betts. All rights reserved. 24 | Copyright (c) 2016 Patrick Mazulo. All rights reserved. 25 | Copyright (c) 2017 Haochen Wu. All rights reserved. 26 | Copyright (c) 2017 Jon Lund Steffensen. All rights reserved. 27 | Copyright (c) 2017 Henddher Pedroza. All rights reserved. 28 | 29 | 30 | MIT License: 31 | 32 | Permission is hereby granted, free of charge, to any person obtaining a copy 33 | of this software and associated documentation files (the "Software"), to deal 34 | in the Software without restriction, including without limitation the rights 35 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 36 | copies of the Software, and to permit persons to whom the Software is 37 | furnished to do so, subject to the following conditions: 38 | 39 | The above copyright notice and this permission notice shall be included in 40 | all copies or substantial portions of the Software. 41 | 42 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 43 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 44 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 45 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 46 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 47 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 48 | THE SOFTWARE. 49 | 50 | 51 | BSD License: 52 | 53 | Redistribution and use in source and binary forms, with or without 54 | modification, are permitted provided that the following conditions are 55 | met: 56 | 57 | * Redistributions of source code must retain the above copyright notice, 58 | this list of conditions and the following disclaimer. 59 | * Redistributions in binary form must reproduce the above copyright notice, 60 | this list of conditions and the following disclaimer in the documentation 61 | and/or other materials provided with the distribution. 62 | * The name of the author may not be used to endorse or promote products 63 | derived from this software without specific prior written permission. 64 | 65 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 69 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 75 | POSSIBILITY OF SUCH DAMAGE. 
#!/usr/bin/env python

'''
usage: 4up.py my.pdf

Creates 4up.my.pdf with a single output page for every
4 input pages.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge


def get4(srcpages):
    """Merge up to four source pages onto a single page at 50% scale.

    Pages are laid out in reading order: index 0 top-left, 1 top-right,
    2 bottom-left, 3 bottom-right.
    """
    half = 0.5
    merged = PageMerge() + srcpages
    dx, dy = (half * extent for extent in merged.xobj_box[2:])
    for idx, page in enumerate(merged):
        col, row = idx % 2, idx // 2
        page.scale(half)
        page.x = col * dx
        page.y = (1 - row) * dy
    return merged.render()


inpfn, = sys.argv[1:]
outfn = '4up.' + os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
writer = PdfWriter(outfn)
# Consume the input four pages at a time.
for start in range(0, len(pages), 4):
    writer.addpage(get4(pages[start:start + 4]))
writer.write()
#!/usr/bin/env python

'''
usage: alter.py my.pdf

Creates alter.my.pdf

Demonstrates making a small change to the metadata of a
preexisting PDF file and writing the result back out.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

(source_name,) = sys.argv[1:]
output_name = 'alter.' + os.path.basename(source_name)

document = PdfReader(source_name)
document.Info.Title = 'My New Title Goes Here'
PdfWriter(output_name, trailer=document).write()
#!/usr/bin/env python

'''
usage: booklet.py [-p] my.pdf

Creates booklet.my.pdf

Pages organized in a form suitable for booklet printing, e.g.
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).

With -p, the input is padded so every output sheet is the same
type (up to 3 blank sides possible).  Without it, the two middle
sides keep the original page size and at most one side is blank.
'''

import os
import argparse

from pdfrw import PdfReader, PdfWriter, PageMerge


def fixpage(*pages):
    """Merge the given pages side by side; None entries are skipped."""
    merged = PageMerge() + (p for p in pages if p is not None)
    merged[-1].x += merged[0].w
    return merged.render()


parser = argparse.ArgumentParser()
parser.add_argument("input", help="Input pdf file name")
parser.add_argument("-p", "--padding", action="store_true",
                    help="Padding the document so that all pages use the same type of sheet")
args = parser.parse_args()

inpfn = args.input
outfn = 'booklet.' + os.path.basename(inpfn)
ipages = PdfReader(inpfn).pages

# Pad with blanks so the side count divides evenly.
pad_to = 4 if args.padding else 2
ipages += [None] * (-len(ipages) % pad_to)

# Pair outermost pages with innermost pages, working inward.
opages = []
while len(ipages) > 2:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
    opages.append(fixpage(ipages.pop(0), ipages.pop()))
opages += ipages

PdfWriter(outfn).addpages(opages).write()
#!/usr/bin/env python

'''
usage: extract.py <source.pdf>

Locates Form XObjects and Image XObjects within the PDF,
and creates a new PDF containing these -- one per page.

Resulting file will be named extract.<source.pdf>
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter
from pdfrw.findobjs import page_per_xobj


inpfn, = sys.argv[1:]
outfn = 'extract.' + os.path.basename(inpfn)

# Half-inch margin around each extracted object (72 points per inch).
xobj_pages = list(page_per_xobj(PdfReader(inpfn).pages, margin=0.5 * 72))
if not xobj_pages:
    raise IndexError("No XObjects found")
PdfWriter(outfn).addpages(xobj_pages).write()
#!/usr/bin/env python

'''
Enhanced example of watermarking using form xobjects (pdfrw).

usage: fancy_watermark.py [-u] my.pdf single_page.pdf

Creates watermark.my.pdf, with every page overlaid with the
first page from single_page.pdf.  With -u, the watermark is
painted underneath the page content instead of on top.

Unlike the stock watermark.py, this example copes with documents
whose pages differ in size: the watermark is anchored to the top
right corner on odd pages and the top left corner on even pages,
and is scaled down if it is wider than the page.  A cache keyed
on (media box, parity) avoids building more than one intermediate
watermark object per distinct page size.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge

# Command-line handling: optional -u flag, then input and watermark files.
args = sys.argv[1:]
underneath = '-u' in args
if underneath:
    args.remove('-u')
inpfn, wmarkfn = args
outfn = 'watermark.' + os.path.basename(inpfn)

# Open both source documents.
wmark_trailer = PdfReader(wmarkfn)
trailer = PdfReader(inpfn)

# Memoization cache: one positioned watermark per (media box, parity).
wmark_page = wmark_trailer.pages[0]
wmark_cache = {}

for pagenum, page in enumerate(trailer.pages, 1):
    mbox = tuple(float(coord) for coord in page.MediaBox)
    odd = pagenum & 1
    key = mbox, odd
    wmark = wmark_cache.get(key)
    if wmark is None:
        # Build and cache a freshly positioned watermark object.
        wmark = wmark_cache[key] = PageMerge().add(wmark_page)[0]

        # The origin of most pages is (0, 0), but spell out the full
        # math anyway for illustration.
        page_x, page_y, page_x1, page_y1 = mbox
        page_w = page_x1 - page_x
        page_h = page_y1 - page_y  # For illustration, not used

        # Shrink the watermark if it is wider than the page.
        if wmark.w > page_w:
            wmark.scale(1.0 * page_w / wmark.w)

        # Pin the watermark to the top edge...
        wmark.y += page_y1 - wmark.h

        # ...at the left for odd pages, the right for even pages.
        if odd:
            wmark.x = page_x
        else:
            wmark.x += page_x1 - wmark.w

        # Full-width watermarks land identically for both parities,
        # so seed the cache entry for the other parity too.
        if page_w == wmark.w:
            wmark_cache[mbox, not odd] = wmark

    # Composite the watermark onto (or under) the page.
    PageMerge(page).add(wmark, prepend=underneath).render()

PdfWriter(outfn, trailer=trailer).write()
#!/usr/bin/env python

'''
usage: poster.py my.pdf

Shows how to change the size on a PDF.

Scales up the useful area of a letter-size page (0.5" margin all
around, so 7.5x10" of content) by 4.8 to produce a poster-size
page.  Also copies the Info dict to the new PDF.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict


def adjust(page, margin=36, scale=4.8):
    """Trim *margin* points from every edge of *page*, then scale it."""
    info = PageMerge().add(page)
    x1, y1, x2, y2 = info.xobj_box
    # viewrect -- presumably (x, y, width, height); matches how the
    # values are computed here.
    viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin)
    trimmed = PageMerge().add(page, viewrect=viewrect)
    trimmed[0].scale(scale)
    return trimmed.render()


inpfn, = sys.argv[1:]
outfn = 'poster.' + os.path.basename(inpfn)
reader = PdfReader(inpfn)
writer = PdfWriter(outfn)
writer.addpage(adjust(reader.pages[0]))
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write()
#!/usr/bin/env python

'''
usage: 4up.py my.pdf


Uses Form XObjects and reportlab to create 4up.my.pdf.

Demonstrates use of pdfrw with reportlab.

'''

import sys
import os

from reportlab.pdfgen.canvas import Canvas

from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl


def addpage(canvas, allpages):
    """Consume up to four xobjects from *allpages* and draw them 4-up."""
    batch = allpages[:4]
    del allpages[:4]

    x_max = max(xobj.BBox[2] for xobj in batch)
    y_max = max(xobj.BBox[3] for xobj in batch)
    canvas.setPageSize((x_max, y_max))

    for slot, xobj in enumerate(batch):
        # Slots 0/1 fill the top half, slots 2/3 the bottom half.
        x = x_max * (slot % 2) / 2.0
        y = y_max * (slot <= 1) / 2.0
        canvas.saveState()
        canvas.translate(x, y)
        canvas.scale(0.5, 0.5)
        canvas.doForm(makerl(canvas, xobj))
        canvas.restoreState()
    canvas.showPage()


def go(argv):
    """Convert the single PDF named in *argv* to a 4-up layout."""
    inpfn, = argv
    outfn = '4up.' + os.path.basename(inpfn)

    xobjs = [pagexobj(p) for p in PdfReader(inpfn).pages]
    canvas = Canvas(outfn)
    while xobjs:
        addpage(canvas, xobjs)
    canvas.save()


if __name__ == '__main__':
    go(sys.argv[1:])
#!/usr/bin/env python

'''
usage: booklet.py my.pdf


Uses Form XObjects and reportlab to create booklet.my.pdf.

Demonstrates use of pdfrw with reportlab.

'''

import sys
import os

from reportlab.pdfgen.canvas import Canvas

from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl


def read_and_double(inpfn):
    """Read the PDF and pair pages outermost-with-innermost."""
    xpages = [pagexobj(p) for p in PdfReader(inpfn).pages]
    if len(xpages) & 1:
        # Sentinel -- get same size for back as front
        xpages.append(xpages[0])

    pairs = []
    while len(xpages) > 2:
        pairs.append((xpages.pop(), xpages.pop(0)))
        pairs.append((xpages.pop(0), xpages.pop()))
    pairs.extend((x,) for x in xpages)
    return pairs


def make_pdf(outfn, xobjpairs):
    """Render each pair of xobjects side by side onto one page."""
    canvas = Canvas(outfn)
    for xobjlist in xobjpairs:
        # Page is wide enough for all members, tall enough for the
        # tallest one.
        x = y = 0
        for xobj in xobjlist:
            x += xobj.BBox[2]
            y = max(y, xobj.BBox[3])
        canvas.setPageSize((x, y))

        # Handle blank back page (the duplicated sentinel): draw only
        # one copy, shifted into the second slot.
        if len(xobjlist) > 1 and xobjlist[0] == xobjlist[-1]:
            xobjlist = xobjlist[:1]
            x = xobjlist[0].BBox[2]
        else:
            x = 0
        y = 0

        for xobj in xobjlist:
            canvas.saveState()
            canvas.translate(x, y)
            canvas.doForm(makerl(canvas, xobj))
            canvas.restoreState()
            x += xobj.BBox[2]
        canvas.showPage()
    canvas.save()


inpfn, = sys.argv[1:]
outfn = 'booklet.' + os.path.basename(inpfn)

make_pdf(outfn, read_and_double(inpfn))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
usage: platypus_pdf_template.py source.pdf

Creates platypus.source.pdf

Example of using pdfrw to use page 1 of a source PDF as the background
for other pages programmatically generated with Platypus.

Contributed by user asannes

"""
import sys
import os

from reportlab.platypus import PageTemplate, BaseDocTemplate, Frame
from reportlab.platypus import NextPageTemplate, Paragraph, PageBreak
from reportlab.platypus.tableofcontents import TableOfContents
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.rl_config import defaultPageSize
from reportlab.lib.units import inch
from reportlab.graphics import renderPDF

from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl

PAGE_WIDTH = defaultPageSize[0]
PAGE_HEIGHT = defaultPageSize[1]


class MyTemplate(PageTemplate):
    """The kernel of this example, where we use pdfrw to fill in the
    background of a page before writing to it.  This could be used to fill
    in a water mark or similar."""

    def __init__(self, pdf_template_filename, name=None):
        frames = [Frame(
            0.85 * inch,
            0.5 * inch,
            PAGE_WIDTH - 1.15 * inch,
            PAGE_HEIGHT - (1.5 * inch)
        )]
        PageTemplate.__init__(self, name, frames)
        # use first page as template
        page = PdfReader(pdf_template_filename).pages[0]
        self.page_template = pagexobj(page)
        # Scale it to fill the complete page
        self.page_xscale = PAGE_WIDTH / self.page_template.BBox[2]
        self.page_yscale = PAGE_HEIGHT / self.page_template.BBox[3]

    def beforeDrawPage(self, canvas, doc):
        """Draws the background before anything else"""
        canvas.saveState()
        rl_obj = makerl(canvas, self.page_template)
        canvas.scale(self.page_xscale, self.page_yscale)
        canvas.doForm(rl_obj)
        canvas.restoreState()


class MyDocTemplate(BaseDocTemplate):
    """Used to apply heading to table of contents."""

    def afterFlowable(self, flowable):
        """Adds Heading1 to table of contents"""
        if flowable.__class__.__name__ == 'Paragraph':
            style = flowable.style.name
            text = flowable.getPlainText()
            key = '%s' % self.seq.nextf('toc')
            if style == 'Heading1':
                self.canv.bookmarkPage(key)
                self.notify('TOCEntry', [1, text, self.page, key])


def create_toc():
    """Creates the table of contents"""
    table_of_contents = TableOfContents()
    table_of_contents.dotsMinLevel = 0
    header1 = ParagraphStyle(name='Heading1', fontSize=16, leading=16)
    header2 = ParagraphStyle(name='Heading2', fontSize=14, leading=14)
    table_of_contents.levelStyles = [header1, header2]
    return [table_of_contents, PageBreak()]


def create_pdf(filename, pdf_template_filename):
    """Create the pdf, with all the contents"""
    # Context manager guarantees the output file is closed even if
    # document building raises (the original leaked the handle on error).
    with open(filename, "wb") as pdf_report:
        document = MyDocTemplate(pdf_report)
        templates = [MyTemplate(pdf_template_filename, name='background')]
        document.addPageTemplates(templates)

        styles = getSampleStyleSheet()
        elements = [NextPageTemplate('background')]
        elements.extend(create_toc())

        # Dummy content (hello world x 200)
        for i in range(200):
            elements.append(Paragraph("Hello World" + str(i),
                                      styles['Heading1']))

        document.multiBuild(elements)


if __name__ == '__main__':
    template, = sys.argv[1:]
    output = 'platypus_pdf_template.' + os.path.basename(template)
    create_pdf(output, template)
#!/usr/bin/env python

'''
usage: subset.py my.pdf firstpage lastpage

Creates subset.my.pdf containing pages firstpage..lastpage
(1-based, inclusive) of the input.


Uses Form XObjects and reportlab to create output file.

Demonstrates use of pdfrw with reportlab.

'''

import sys
import os

from reportlab.pdfgen.canvas import Canvas

from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl


def go(inpfn, firstpage, lastpage):
    """Copy the requested 1-based page range of *inpfn* to a new PDF."""
    first, last = int(firstpage), int(lastpage)
    outfn = 'subset.' + os.path.basename(inpfn)

    xobjs = [pagexobj(p) for p in PdfReader(inpfn).pages[first - 1:last]]
    canvas = Canvas(outfn)
    for xobj in xobjs:
        canvas.setPageSize((xobj.BBox[2], xobj.BBox[3]))
        canvas.doForm(makerl(canvas, xobj))
        canvas.showPage()
    canvas.save()


if __name__ == '__main__':
    inpfn, firstpage, lastpage = sys.argv[1:]
    go(inpfn, firstpage, lastpage)
+ os.path.basename(inpfn) 23 | pages = PdfReader(inpfn, decompress=True).pages 24 | canvas = Canvas(outfn, pageCompression=0) 25 | 26 | for page in pages: 27 | box = [float(x) for x in page.MediaBox] 28 | assert box[0] == box[1] == 0, "demo won't work on this PDF" 29 | canvas.setPageSize(box[2:]) 30 | parsepage(page, canvas) 31 | canvas.showPage() 32 | canvas.save() 33 | -------------------------------------------------------------------------------- /examples/rl2/decodegraphics.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | ''' 6 | This file is an example parser that will parse a graphics stream 7 | into a reportlab canvas. 8 | 9 | Needs work on fonts and unicode, but works on a few PDFs. 10 | 11 | Better to use Form XObjects for most things (see the example in rl1). 12 | 13 | ''' 14 | from inspect import getargspec 15 | 16 | from pdfrw import PdfTokens 17 | from pdfrw.objects import PdfString 18 | 19 | ############################################################################# 20 | # Graphics parsing 21 | 22 | 23 | def parse_array(self, token='[', params=None): 24 | mylist = [] 25 | for token in self.tokens: 26 | if token == ']': 27 | break 28 | mylist.append(token) 29 | self.params.append(mylist) 30 | 31 | 32 | def parse_savestate(self, token='q', params=''): 33 | self.canv.saveState() 34 | 35 | 36 | def parse_restorestate(self, token='Q', params=''): 37 | self.canv.restoreState() 38 | 39 | 40 | def parse_transform(self, token='cm', params='ffffff'): 41 | self.canv.transform(*params) 42 | 43 | 44 | def parse_linewidth(self, token='w', params='f'): 45 | self.canv.setLineWidth(*params) 46 | 47 | 48 | def parse_linecap(self, token='J', params='i'): 49 | self.canv.setLineCap(*params) 50 | 51 | 52 | def parse_linejoin(self, token='j', params='i'): 53 | 
def parse_miterlimit(self, token='M', params='f'):
    ''' M -- set the miter limit. '''
    self.canv.setMiterLimit(*params)


def parse_dash(self, token='d', params='as'):  # Array, string
    ''' d -- set the dash pattern. '''
    self.canv.setDash(*params)


def parse_intent(self, token='ri', params='n'):
    ''' ri -- rendering intent (currently ignored). '''
    # TODO: add logging
    pass


def parse_flatness(self, token='i', params='i'):
    ''' i -- flatness tolerance (currently ignored). '''
    # TODO: add logging
    pass


def parse_gstate(self, token='gs', params='n'):
    ''' gs -- apply an ExtGState dictionary (currently ignored). '''
    # TODO: add logging
    # Could parse stuff we care about from here later
    pass


def parse_move(self, token='m', params='ff'):
    ''' m -- begin a new subpath at (x, y); lazily opens a path. '''
    if self.gpath is None:
        self.gpath = self.canv.beginPath()
    self.gpath.moveTo(*params)
    self.current_point = params


def parse_line(self, token='l', params='ff'):
    ''' l -- straight line segment to (x, y). '''
    self.gpath.lineTo(*params)
    self.current_point = params


def parse_curve(self, token='c', params='ffffff'):
    ''' c -- cubic Bezier with two explicit control points. '''
    self.gpath.curveTo(*params)
    self.current_point = params[-2:]


def parse_curve1(self, token='v', params='ffff'):
    ''' v -- Bezier whose first control point is the current point. '''
    parse_curve(self, token, tuple(self.current_point) + tuple(params))


def parse_curve2(self, token='y', params='ffff'):
    ''' y -- Bezier whose second control point equals the end point. '''
    parse_curve(self, token, tuple(params) + tuple(params[-2:]))


def parse_close(self, token='h', params=''):
    ''' h -- close the current subpath. '''
    self.gpath.close()


def parse_rect(self, token='re', params='ffff'):
    ''' re -- rectangle subpath (x, y, width, height); lazily opens
        a path if none is pending.
    '''
    if self.gpath is None:
        self.gpath = self.canv.beginPath()
    self.gpath.rect(*params)
    self.current_point = params[-2:]


def parse_stroke(self, token='S', params=''):
    ''' S -- stroke the pending path. '''
    finish_path(self, 1, 0, 0)


def parse_close_stroke(self, token='s', params=''):
    ''' s -- close then stroke the pending path. '''
    self.gpath.close()
    finish_path(self, 1, 0, 0)


def parse_fill(self, token='f', params=''):
    ''' f -- fill the pending path (nonzero winding rule). '''
    finish_path(self, 0, 1, 1)
def parse_fill_even_odd(self, token='f*', params=''):
    ''' f* -- fill the pending path using the even-odd rule. '''
    finish_path(self, 0, 1, 0)


def parse_fill_stroke_even_odd(self, token='B*', params=''):
    ''' B* -- fill (even-odd) then stroke. '''
    finish_path(self, 1, 1, 0)


def parse_fill_stroke(self, token='B', params=''):
    ''' B -- fill (nonzero) then stroke. '''
    finish_path(self, 1, 1, 1)


def parse_close_fill_stroke_even_odd(self, token='b*', params=''):
    ''' b* -- close, fill (even-odd), then stroke. '''
    self.gpath.close()
    finish_path(self, 1, 1, 0)


def parse_close_fill_stroke(self, token='b', params=''):
    ''' b -- close, fill (nonzero), then stroke. '''
    self.gpath.close()
    finish_path(self, 1, 1, 1)


def parse_nop(self, token='n', params=''):
    ''' n -- end the path without painting anything. '''
    finish_path(self, 0, 0, 0)


def finish_path(self, stroke, fill, fillmode):
    ''' Paint and discard the pending path, temporarily switching
        the canvas fill rule (fillmode 1 = nonzero, 0 = even-odd).
        No-op when no path is pending.
    '''
    if self.gpath is not None:
        canv = self.canv
        # NOTE(review): relies on reportlab's private _fillMode
        # attribute -- confirm it still exists on upgrade.
        canv._fillMode, oldmode = fillmode, canv._fillMode
        canv.drawPath(self.gpath, stroke, fill)
        canv._fillMode = oldmode
        self.gpath = None


def parse_clip_path(self, token='W', params=''):
    ''' W -- set the clip path (not implemented). '''
    # TODO: add logging
    pass


def parse_clip_path_even_odd(self, token='W*', params=''):
    ''' W* -- set the clip path, even-odd rule (not implemented). '''
    # TODO: add logging
    pass


def parse_stroke_gray(self, token='G', params='f'):
    ''' G -- stroke color in DeviceGray. '''
    self.canv.setStrokeGray(*params)


def parse_fill_gray(self, token='g', params='f'):
    ''' g -- fill color in DeviceGray. '''
    self.canv.setFillGray(*params)


def parse_stroke_rgb(self, token='RG', params='fff'):
    ''' RG -- stroke color in DeviceRGB. '''
    self.canv.setStrokeColorRGB(*params)


def parse_fill_rgb(self, token='rg', params='fff'):
    ''' rg -- fill color in DeviceRGB. '''
    self.canv.setFillColorRGB(*params)


def parse_stroke_cmyk(self, token='K', params='ffff'):
    ''' K -- stroke color in DeviceCMYK. '''
    self.canv.setStrokeColorCMYK(*params)


def parse_fill_cmyk(self, token='k', params='ffff'):
    ''' k -- fill color in DeviceCMYK. '''
    self.canv.setFillColorCMYK(*params)
#############################################################################
# Text parsing


def parse_begin_text(self, token='BT', params=''):
    ''' BT -- begin a text object (nesting is not allowed). '''
    assert self.tpath is None
    self.tpath = self.canv.beginText()


def parse_text_transform(self, token='Tm', params='ffffff'):
    ''' Tm -- set the text matrix. '''
    path = self.tpath

    # Stoopid optimization to remove nop
    try:
        code = path._code
    except AttributeError:
        pass
    else:
        if code[-1] == '1 0 0 1 0 0 Tm':
            code.pop()

    path.setTextTransform(*params)


def parse_setfont(self, token='Tf', params='nf'):
    ''' Tf -- select a font (by resource name) and size. '''
    fontinfo = self.fontdict[params[0]]
    self.tpath._setFont(fontinfo.name, params[1])
    self.curfont = fontinfo


def parse_text_out(self, token='Tj', params='t'):
    ''' Tj -- show a text string, decoded through the current font. '''
    text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
    self.tpath.textOut(text)

def parse_lf_text_out(self, token="'", params='t'):
    ''' ' -- move to the next line, then show text. '''
    self.tpath.textLine()
    text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
    self.tpath.textOut(text)


def parse_lf_text_out_with_spacing(self, token='"', params='fft'):
    ''' " -- set word and character spacing, next line, show text. '''
    self.tpath.setWordSpace(params[0])
    self.tpath.setCharSpace(params[1])
    self.tpath.textLine()
    text = params[2].decode(self.curfont.remap, self.curfont.twobyte)
    self.tpath.textOut(text)


def parse_TJ(self, token='TJ', params='a'):
    ''' TJ -- show strings from an array; numeric entries are kerning
        adjustments, which are currently validated but ignored.
    '''
    remap = self.curfont.remap
    twobyte = self.curfont.twobyte
    result = []
    for x in params[0]:
        if isinstance(x, PdfString):
            result.append(x.decode(remap, twobyte))
        else:
            # TODO: Adjust spacing between characters here
            int(x)
    text = ''.join(result)
    self.tpath.textOut(text)


def parse_end_text(self, token='ET', params=''):
    ''' ET -- end the text object and draw it onto the canvas. '''
    assert self.tpath is not None
    self.canv.drawText(self.tpath)
    self.tpath = None
def parse_move_cursor(self, token='Td', params='ff'):
    ''' Td -- move the text cursor; the y offset is negated to map
        PDF's upward-positive y onto reportlab's moveCursor.
    '''
    self.tpath.moveCursor(params[0], -params[1])


def parse_set_leading(self, token='TL', params='f'):
    ''' TL -- set text leading (line spacing). '''
    self.tpath.setLeading(*params)


def parse_text_line(self, token='T*', params=''):
    ''' T* -- move to the start of the next text line. '''
    self.tpath.textLine()


def parse_set_char_space(self, token='Tc', params='f'):
    ''' Tc -- set character spacing. '''
    self.tpath.setCharSpace(*params)


def parse_set_word_space(self, token='Tw', params='f'):
    ''' Tw -- set word spacing. '''
    self.tpath.setWordSpace(*params)


def parse_set_hscale(self, token='Tz', params='f'):
    ''' Tz -- horizontal scaling; PDF passes a percentage while
        reportlab wants an offset from 100, hence the subtraction.
    '''
    self.tpath.setHorizScale(params[0] - 100)


def parse_set_rise(self, token='Ts', params='f'):
    ''' Ts -- set text rise (superscript/subscript offset). '''
    self.tpath.setRise(*params)


def parse_xobject(self, token='Do', params='n'):
    ''' Do -- paint an external (X)Object (not implemented). '''
    # TODO: Need to do this
    pass


class FontInfo(object):
    ''' Pretty basic -- needs a lot of work to work right for all fonts
    '''
    # Map embedded font names to fonts reportlab knows about.
    lookup = {
        # WRONG -- have to learn about font stuff...
        'BitstreamVeraSans': 'Helvetica',
    }

    def __init__(self, source):
        # source is a PDF font dictionary; BaseFont is a /Name, so
        # strip the leading slash.
        name = source.BaseFont[1:]
        self.name = self.lookup.get(name, name)
        self.remap = chr
        self.twobyte = False
        info = source.ToUnicode
        if not info:
            return
        # Parse the beginbfchar section of the ToUnicode CMap into a
        # character-code -> unicode-character mapping.
        info = info.stream.split('beginbfchar')[1].split('endbfchar')[0]
        info = list(PdfTokens(info))
        assert not len(info) & 1
        info2 = []
        for x in info:
            # Each token is a hex string such as <41> or <0041>.
            assert x[0] == '<' and x[-1] == '>' and len(x) in (4, 6), x
            i = int(x[1:-1], 16)
            info2.append(i)
        self.remap = dict((x, chr(y)) for (x, y) in
                          zip(info2[::2], info2[1::2])).get
        # Two-byte codes are written as 4 hex digits (len 6 with <>).
        self.twobyte = len(info[0]) > 4
def findparsefuncs():
    ''' Build the operator dispatch table for the content-stream parser.

        Scans module globals for functions named parse_* and registers
        each under the PDF operator named by its ``token`` default.
        The ``params`` default is a string of one-letter operand type
        codes ('f' float, 'i' int, 'n' name, 'a' array, 's' string,
        't' text), or None for operators (like '[') that consume their
        own operands.

        Returns:
            dict mapping operator string -> (function, converter tuple
            or None).

        Raises:
            AssertionError if a parse_* function has an unexpected
            signature or two functions claim the same operator.
    '''
    # inspect.getargspec was removed in Python 3.11; prefer the modern
    # replacement (its first four fields are compatible) and fall back
    # only on interpreters too old to have it.
    try:
        from inspect import getfullargspec as _getargs
    except ImportError:  # pragma: no cover -- Python 2 only
        from inspect import getargspec as _getargs

    def checkname(n):
        assert n.startswith('/')
        return n

    def checkarray(a):
        assert isinstance(a, list), a
        return a

    def checktext(t):
        assert isinstance(t, PdfString)
        return t

    fixparam = dict(f=float, i=int, n=checkname, a=checkarray,
                    s=str, t=checktext)
    fixcache = {}

    def fixlist(params):
        # Convert a params type-code string into a tuple of converter
        # callables; cached because many operators share codes.
        try:
            result = fixcache[params]
        except KeyError:
            result = tuple(fixparam[x] for x in params)
            fixcache[params] = result
        return result

    dispatch = {}
    expected_args = 'self token params'.split()
    for key, func in globals().items():
        if key.startswith('parse_'):
            spec = _getargs(func)
            args, varargs, keywords, defaults = spec[:4]
            assert (args == expected_args and varargs is None and
                    keywords is None and len(defaults) == 2), (
                    key, args, varargs, keywords, defaults)
            token, params = defaults
            if params is not None:
                params = fixlist(params)
            value = func, params
            # A duplicate operator registration is a programming error.
            assert dispatch.setdefault(token, value) is value, repr(token)
    return dispatch
def debugparser(undisturbed=set('parse_array'.split())):
    ''' Return a parsepage function that prints every dispatched
        operator, for debugging content streams.

        Operators named in ``undisturbed`` keep their real
        implementation; parse_array must actually run because it
        consumes its own operands from the token stream.
    '''
    def debugdispatch():
        def getvalue(oldval):
            name = oldval[0].__name__

            def myfunc(self, token, params):
                print ('%s called %s(%s)' % (token, name,
                       ', '.join(str(x) for x in params)))
            # Keep the real implementation for whitelisted operators.
            if name in undisturbed:
                myfunc = oldval[0]
            return myfunc, oldval[1]
        return dict((x, getvalue(y))
                    for (x, y) in _ParseClass.dispatch.items())

    class _DebugParse(_ParseClass):
        dispatch = debugdispatch()

    return _DebugParse.parsepage

# Public entry point: replay a page's content stream onto a canvas.
parsepage = _ParseClass.parsepage
#!/usr/bin/env python

'''
usage: rotate.py my.pdf rotation [page[range] ...]
       eg. rotate.py my.pdf 270 1-3 5 7-9

Rotation must be a multiple of 90 degrees, clockwise.

Creates rotate.my.pdf with the selected pages rotated
(all pages by default).
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

input_name = sys.argv[1]
turn = int(sys.argv[2])
assert turn % 90 == 0

page_ranges = [[int(num) for num in chunk.split('-')]
               for chunk in sys.argv[3:]]
output_name = 'rotate.%s' % os.path.basename(input_name)
trailer = PdfReader(input_name)
pages = trailer.pages

# With no explicit ranges, rotate every page.
if not page_ranges:
    page_ranges = [[1, len(pages)]]

for bounds in page_ranges:
    # A bare page number acts as a one-page range.
    first, last = bounds[0], bounds[-1]
    for index in range(first - 1, last):
        page = pages[index]
        current = int(page.inheritable.Rotate or 0)
        page.Rotate = (current + turn) % 360

writer = PdfWriter(output_name)
writer.trailer = trailer
writer.write()
#!/usr/bin/env python

'''
usage: subset.py my.pdf page[range] [page[range]] ...
       eg. subset.py my.pdf 1-3 5 7-9

Creates subset.my.pdf containing only the selected pages.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

input_name = sys.argv[1]
range_args = sys.argv[2:]
assert range_args, "Expected at least one range"

output_name = 'subset.%s' % os.path.basename(input_name)
pages = PdfReader(input_name).pages
writer = PdfWriter(output_name)

for chunk in range_args:
    bounds = [int(num) for num in chunk.split('-')]
    # A bare page number acts as a one-page range.
    first, last = bounds[0], bounds[-1]
    for pagenum in range(first, last + 1):
        writer.addpage(pages[pagenum - 1])

writer.write()
#!/usr/bin/env python

'''
usage: subset_booklets.py my.pdf

Creates subset_booklets.my.pdf

Pages organized in a form suitable for booklet printing, e.g.
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
Instead of a large booklet, the pdf is divided into several mini
booklets. The reason is: professional printing works this way:
 - Print all of several mini booklets(subsets of booklet);
 - Sew each mini booklet individually;
 - glue them all together;
 - Insert the cover.

Take a look at http://www.wikihow.com/Bind-a-Book
'''

import sys
import os
import time

from pdfrw import PdfReader, PdfWriter, PageMerge

BOOKLET_SIZE = 20
START = time.time()


def fixpage(*pages):
    ''' Merge up to two source pages side by side onto one output
        sheet (None placeholders are skipped).
    '''
    result = PageMerge() + (x for x in pages if x is not None)
    result[-1].x += result[0].w
    return result.render()

INPFN, = sys.argv[1:]
OUTFN = 'booklet.' + os.path.basename(INPFN)
ALL_IPAGES = PdfReader(INPFN).pages
# Bug fix: the original used Python 2 `print` statements, which are a
# SyntaxError on Python 3; converted to print() calls (same output).
print('The pdf file %s has %s pages.' % (INPFN, len(ALL_IPAGES)))

# Make sure we have an even number of pages.
if len(ALL_IPAGES) & 1:
    ALL_IPAGES.append(None)
    print('Inserting one more blank page to make pages number even.')
NUM_OF_ITER, ITERS_LEFT = divmod(len(ALL_IPAGES), BOOKLET_SIZE)

print('Making %s subbooklets of %s pages each.' % (NUM_OF_ITER, BOOKLET_SIZE))
opages = []
for iteration in range(0, NUM_OF_ITER):
    # Fold each BOOKLET_SIZE chunk: pair last-with-first from the
    # outside in, alternating sheet sides.
    ipages = ALL_IPAGES[iteration * BOOKLET_SIZE:(iteration + 1) * BOOKLET_SIZE]
    while len(ipages) > 2:
        opages.append(fixpage(ipages.pop(), ipages.pop(0)))
        opages.append(fixpage(ipages.pop(0), ipages.pop()))

# Making one more subbooklet with the left-over pages.
ipages = ALL_IPAGES[len(ALL_IPAGES) - ITERS_LEFT:len(ALL_IPAGES)]
while len(ipages) > 2:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
    opages.append(fixpage(ipages.pop(0), ipages.pop()))
if len(ipages) >= 1:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))

PdfWriter(OUTFN).addpages(opages).write()
print('It took %s seconds to make the pdf subbooklets changes.'
      % round(time.time() - START, 2))
#!/usr/bin/env python

'''
usage: unspread.py my.pdf

Creates unspread.my.pdf

Chops each page in half, e.g. if a source were created in booklet
form, you could extract the individual pages.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge


def splitpage(src):
    ''' Yield the left half, then the right half, of a source page. '''
    for offset in (0, 0.5):
        yield PageMerge().add(src, viewrect=(offset, 0, 0.5, 1)).render()


input_name, = sys.argv[1:]
output_name = 'unspread.' + os.path.basename(input_name)
writer = PdfWriter(output_name)
for sheet in PdfReader(input_name).pages:
    writer.addpages(splitpage(sheet))
writer.write()
#!/usr/bin/env python

'''
Simple example of watermarking using form xobjects (pdfrw).

usage: watermark.py [-u] my.pdf single_page.pdf

Creates watermark.my.pdf, overlaying every page with the first page
of single_page.pdf.  With -u, the watermark is placed underneath the
page content (painted first).

NOTE 1: Assumes all pages (including the watermark page) are the
        same size; see fancy_watermark.py for other cases.

NOTE 2: Deliberately kept simple to show the basic principles of
        the library and to match the other examples.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge

argv = sys.argv[1:]
underneath = '-u' in argv
if underneath:
    argv.remove('-u')
input_name, watermark_name = argv
output_name = 'watermark.' + os.path.basename(input_name)

# Turn the first watermark page into a reusable Form XObject.
stamp = PageMerge().add(PdfReader(watermark_name).pages[0])[0]
trailer = PdfReader(input_name)
for page in trailer.pages:
    PageMerge(page).add(stamp, prepend=underneath).render()
PdfWriter(output_name, trailer=trailer).write()
class ViewInfo(object):
    ''' Parse an Adobe-style uri ("file.pdf#page=2&viewrect=...")
        into docname / page / viewrect / rotate attributes.

        viewrect follows the Adobe convention: an array of 4 numbers
        (distance from left, distance from TOP, width, height), in
        points.  For convenience, if every number is within [0, 1]
        inclusive, the rectangle is later interpreted as fractions of
        the mediabox instead.

        Set cacheable False when the resulting XObject will be
        modified after creation.
    '''
    doc = None
    docname = None
    page = None
    viewrect = None
    rotate = None
    cacheable = True

    def __init__(self, pageinfo='', **kw):
        parts = pageinfo.split('#', 1)
        if len(parts) == 2:
            # Adobe allows either '#' or '&' between parameters.
            parts[1:] = parts[1].replace('&', '#').split('#')
        # The leading piece is the filename unless it is itself a
        # page=/viewrect= option.
        starts_with_option = any(parts[0].startswith(name + '=')
                                 for name in ('page', 'viewrect'))
        if not starts_with_option:
            self.docname = parts.pop(0)
        for item in parts:
            key, value = item.split('=')
            key = key.strip()
            fields = value.replace(',', ' ').split()
            if key in ('page', 'rotate'):
                assert len(fields) == 1
                setattr(self, key, int(fields[0]))
            elif key == 'viewrect':
                assert len(fields) == 4
                setattr(self, key, [float(num) for num in fields])
            else:
                log.error('Unknown option: %s', key)
        # Explicit keyword arguments override anything parsed above.
        for key, value in iteritems(kw):
            assert hasattr(self, key), key
            setattr(self, key, value)
def rotate_rect(rect, rotation):
    ''' Rotate both points within the rectangle, then normalize
        the rectangle by returning the new lower left, then new
        upper right.
    '''
    rect = rotate_point(rect[:2], rotation) + rotate_point(rect[2:], rotation)
    return (min(rect[0], rect[2]), min(rect[1], rect[3]),
            max(rect[0], rect[2]), max(rect[1], rect[3]))


def getrects(inheritable, pageinfo, rotation):
    ''' Given the inheritable attributes of a page and
        the desired pageinfo rectangle, return the page's
        media box and the calculated boundary (clip) box.

        inheritable -- page.inheritable attributes (MediaBox etc.)
        pageinfo    -- a ViewInfo (viewrect may be None)
        rotation    -- rotation code (multiples of 90 degrees)
    '''
    mbox = tuple([float(x) for x in inheritable.MediaBox])
    # Clip against the CropBox when present, else the MediaBox.
    cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
    vrect = pageinfo.viewrect
    if vrect is not None:
        # Rotate the media box to match what the user sees,
        # figure out the clipping box, then rotate back
        mleft, mbot, mright, mtop = rotate_rect(cbox, rotation)
        x, y, w, h = vrect

        # Support operations in fractions of a page
        if 0 <= min(vrect) < max(vrect) <= 1:
            mw = mright - mleft
            mh = mtop - mbot
            x *= mw
            w *= mw
            y *= mh
            h *= mh

        # viewrect y is measured from the TOP of the page (Adobe
        # convention), so the clip top is mtop - y.
        cleft = mleft + x
        ctop = mtop - y
        cright = cleft + w
        cbot = ctop - h
        # Clamp the view rectangle to the page boundaries.
        cbox = (max(mleft, cleft), max(mbot, cbot),
                min(mright, cright), min(mtop, ctop))
        cbox = rotate_rect(cbox, -rotation)
    return mbox, cbox
def _build_cache(contents, allow_compressed):
    ''' Build a new dictionary holding the stream,
        and save it along with private cache info.
        Assumes validity has been pre-checked if
        we have a non-None xobj_copy.

        Also, the spec says nothing about nested arrays,
        so we assume those don't exist until we see one
        in the wild.
    '''
    try:
        xobj_copy = contents.xobj_copy
    except AttributeError:
        # Should have a PdfArray here...
        array = contents
        private = contents
    else:
        # Should have a PdfDict here -- might or might not have cache copy
        if xobj_copy is not None:
            return xobj_copy
        array = [contents]
        private = contents.private

    # If we don't allow compressed objects, OR if we have multiple compressed
    # objects, we try to decompress them, and fail if we cannot do that.

    if not allow_compressed or len(array) > 1:
        # Collect every key across the stream dicts; presumably more
        # than one distinct key means something beyond /Length is
        # present (i.e. a /Filter) -- TODO confirm.
        keys = set(x[0] for cdict in array for x in iteritems(cdict))
        was_compressed = len(keys) > 1
        if was_compressed:
            # Make copies of the objects before we uncompress them.
            array = [PdfDict(x) for x in array]
            if not uncompress(array):
                raise PdfNotImplementedError(
                    'Xobjects with these compression parameters not supported: %s' %
                    keys)

    xobj_copy = PdfDict(array[0])
    xobj_copy.private.xobj_cachedict = {}
    private.xobj_copy = xobj_copy

    # Multiple content streams: join them with newlines (the extra
    # len(array) - 1 accounts for the inserted separators).
    if len(array) > 1:
        newstream = '\n'.join(x.stream for x in array)
        newlength = sum(int(x.Length) for x in array) + len(array) - 1
        assert newlength == len(newstream)
        xobj_copy.stream = newstream
        # was_compressed is always bound here: len(array) > 1 implies
        # the decompression branch above ran.
        if was_compressed and allow_compressed:
            compress(xobj_copy)

    return xobj_copy
def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True):
    ''' Return a cached Form XObject, or create a new one and cache it.
        Adds private members x, y, w, h
    '''
    cachedict = contents.xobj_cachedict
    cachekey = mbox, bbox, rotation
    result = cachedict.get(cachekey) if cacheable else None
    if result is None:
        # If we are not getting a full page, or if we are going to
        # modify the results, first retrieve an underlying Form XObject
        # that represents the entire page, so that we are not copying
        # the full page data into the new file multiple times
        func = (_get_fullpage, _get_subpage)[mbox != bbox or not cacheable]
        result = PdfDict(
            func(contents, resources, mbox),
            Type=PdfName.XObject,
            Subtype=PdfName.Form,
            FormType=1,
            BBox=PdfArray(bbox),
        )
        rect = bbox
        if rotation:
            # Rotation is expressed through the XObject's Matrix; the
            # bounding rect must be rotated to match for x/y/w/h.
            matrix = (rotate_point((1, 0), rotation) +
                      rotate_point((0, 1), rotation))
            result.Matrix = PdfArray(matrix + (0, 0))
            rect = rotate_rect(rect, rotation)

        # Private placement info for PageMerge and friends.
        private = result.private
        private.x = rect[0]
        private.y = rect[1]
        private.w = rect[2] - rect[0]
        private.h = rect[3] - rect[1]
        if cacheable:
            cachedict[cachekey] = result
    return result


def _get_fullpage(contents, resources, mbox):
    ''' fullpage is easy.  Just copy the contents,
        set up the resources, and let _cache_xobj handle the
        rest.
    '''
    return PdfDict(contents, Resources=resources)
def _get_subpage(contents, resources, mbox):
    ''' subpages *could* be as easy as full pages, but we
        choose to complicate life by creating a Form XObject
        for the page, and then one that references it for
        the subpage, on the off-chance that we want multiple
        items from the page.
    '''
    return PdfDict(
        stream='/FullPage Do\n',
        Resources=PdfDict(
            XObject=PdfDict(
                # The full-page XObject itself is cached, so several
                # subpages of one page share a single copy of its data.
                FullPage=_cache_xobj(contents, resources, mbox, mbox, 0)
            )
        )
    )


def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' pagexobj creates and returns a Form XObject for
        a given view within a page (Defaults to entire page.)

        pagexobj is passed a page and a viewrect.

        NOTE(review): the default viewinfo=ViewInfo() is a single
        shared instance (evaluated once at def time); this is safe as
        long as it is only read, never mutated -- which is the case in
        the code below.
    '''
    inheritable = page.inheritable
    resources = inheritable.Resources
    rotation = get_rotation(inheritable.Rotate)
    mbox, bbox = getrects(inheritable, viewinfo, rotation)
    # Any user-requested rotation is applied on top of the page's own.
    rotation += get_rotation(viewinfo.rotate)
    contents = _build_cache(page.Contents, allow_compressed)
    return _cache_xobj(contents, resources, mbox, bbox, rotation,
                       viewinfo.cacheable)
class CacheXObj(object):
    ''' Caches parsed PDF documents so that repeated Form XObject
        loads neither reparse the same file nor bloat the output
        with duplicate object copies.

        This is a convenience class for things like rst2pdf that
        want to pass in textual filename/location descriptors and
        don't want to know about using PdfReader.
    '''
    def __init__(self, decompress=False):
        ''' Set decompress true if you need the Form XObjects to be
            decompressed.  Will decompress what it can and scream
            about the rest.
        '''
        self.cached_pdfs = {}
        self.decompress = decompress

    def load(self, sourcename):
        ''' Load a Form XObject from a uri '''
        info = ViewInfo(sourcename)
        cache = self.cached_pdfs
        filename = info.docname
        if filename not in cache:
            cache[filename] = PdfReader(filename,
                                        decompress=self.decompress)
        return docxobj(info, cache[filename],
                       allow_compressed=not self.decompress)
Maybe more later, but it's 8 | not a priority for me... 9 | ''' 10 | 11 | from .objects import PdfName 12 | from .uncompress import streamobjects 13 | from .py23_diffs import zlib, convert_load, convert_store 14 | 15 | 16 | def compress(mylist): 17 | flate = PdfName.FlateDecode 18 | for obj in streamobjects(mylist): 19 | ftype = obj.Filter 20 | if ftype is not None: 21 | continue 22 | oldstr = obj.stream 23 | newstr = convert_load(zlib.compress(convert_store(oldstr))) 24 | if len(newstr) < len(oldstr) + 30: 25 | obj.stream = newstr 26 | obj.Filter = flate 27 | obj.DecodeParms = None 28 | -------------------------------------------------------------------------------- /pdfrw/crypt.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2017 Jon Lund Steffensen 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | from __future__ import division 6 | 7 | import hashlib 8 | import struct 9 | 10 | try: 11 | from Crypto.Cipher import ARC4, AES 12 | HAS_CRYPTO = True 13 | except ImportError: 14 | HAS_CRYPTO = False 15 | 16 | from .objects import PdfDict, PdfName 17 | 18 | _PASSWORD_PAD = ( 19 | '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08' 20 | '..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz') 21 | 22 | 23 | def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict): 24 | for obj in mylist: 25 | if isinstance(obj, PdfDict) and obj.stream is not None: 26 | yield obj 27 | 28 | 29 | def create_key(password, doc): 30 | """Create an encryption key (Algorithm 2 in PDF spec).""" 31 | key_size = int(doc.Encrypt.Length or 40) // 8 32 | padded_pass = (password + _PASSWORD_PAD)[:32] 33 | hasher = hashlib.md5() 34 | hasher.update(padded_pass) 35 | hasher.update(doc.Encrypt.O.to_bytes()) 36 | hasher.update(struct.pack('= 3: 41 | for _ in range(50): 42 | temp_hash = hashlib.md5(temp_hash[:key_size]).digest() 43 | 44 | return temp_hash[:key_size] 45 | 46 | 47 | def create_user_hash(key, 
doc): 48 | """Create the user password hash (Algorithm 4/5).""" 49 | revision = int(doc.Encrypt.R or 0) 50 | if revision < 3: 51 | cipher = ARC4.new(key) 52 | return cipher.encrypt(_PASSWORD_PAD) 53 | else: 54 | hasher = hashlib.md5() 55 | hasher.update(_PASSWORD_PAD) 56 | hasher.update(doc.ID[0].to_bytes()) 57 | temp_hash = hasher.digest() 58 | 59 | for i in range(20): 60 | temp_key = ''.join(chr(i ^ ord(x)) for x in key) 61 | cipher = ARC4.new(temp_key) 62 | temp_hash = cipher.encrypt(temp_hash) 63 | 64 | return temp_hash 65 | 66 | 67 | def check_user_password(key, doc): 68 | """Check that the user password is correct (Algorithm 6).""" 69 | expect_user_hash = create_user_hash(key, doc) 70 | revision = int(doc.Encrypt.R or 0) 71 | if revision < 3: 72 | return doc.Encrypt.U.to_bytes() == expect_user_hash 73 | else: 74 | return doc.Encrypt.U.to_bytes()[:16] == expect_user_hash 75 | 76 | 77 | class AESCryptFilter(object): 78 | """Crypt filter corresponding to /AESV2.""" 79 | def __init__(self, key): 80 | self._key = key 81 | 82 | def decrypt_data(self, num, gen, data): 83 | """Decrypt data (string/stream) using key (Algorithm 1).""" 84 | key_extension = struct.pack('= 1 and ftype[0] == PdfName.Crypt: 143 | ftype = ftype[1:] 144 | parms = obj.DecodeParms or obj.DP 145 | filter = filters[parms.Name] 146 | 147 | num, gen = obj.indirect 148 | obj.stream = filter.decrypt_data(num, gen, obj.stream) 149 | obj.private.decrypted = True 150 | obj.Filter = ftype or None 151 | -------------------------------------------------------------------------------- /pdfrw/errors.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | ''' 6 | PDF Exceptions and error handling 7 | ''' 8 | 9 | import logging 10 | 11 | 12 | fmt = logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d 
%(message)s') 13 | 14 | handler = logging.StreamHandler() 15 | handler.setFormatter(fmt) 16 | 17 | log = logging.getLogger('pdfrw') 18 | log.setLevel(logging.WARNING) 19 | log.addHandler(handler) 20 | 21 | 22 | class PdfError(Exception): 23 | "Abstract base class of exceptions thrown by this module" 24 | 25 | def __init__(self, msg): 26 | self.msg = msg 27 | 28 | def __str__(self): 29 | return self.msg 30 | 31 | 32 | class PdfParseError(PdfError): 33 | "Error thrown by parser/tokenizer" 34 | 35 | 36 | class PdfOutputError(PdfError): 37 | "Error thrown by PDF writer" 38 | 39 | 40 | class PdfNotImplementedError(PdfError): 41 | "Error thrown on missing features" 42 | -------------------------------------------------------------------------------- /pdfrw/findobjs.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | ''' This module contains a function to find all the XObjects 6 | in a document, and another function that will wrap them 7 | in page objects. 8 | ''' 9 | 10 | from .objects import PdfDict, PdfArray, PdfName 11 | 12 | 13 | def find_objects(source, valid_types=(PdfName.XObject, None), 14 | valid_subtypes=(PdfName.Form, PdfName.Image), 15 | no_follow=(PdfName.Parent,), 16 | isinstance=isinstance, id=id, sorted=sorted, 17 | reversed=reversed, PdfDict=PdfDict): 18 | ''' 19 | Find all the objects of a particular kind in a document 20 | or array. Defaults to looking for Form and Image XObjects. 21 | 22 | This could be done recursively, but some PDFs 23 | are quite deeply nested, so we do it without 24 | recursion. 25 | 26 | Note that we don't know exactly where things appear on pages, 27 | but we aim for a sort order that is (a) mostly in document order, 28 | and (b) reproducible. 
For arrays, objects are processed in 29 | array order, and for dicts, they are processed in key order. 30 | ''' 31 | container = (PdfDict, PdfArray) 32 | 33 | # Allow passing a list of pages, or a dict 34 | if isinstance(source, PdfDict): 35 | source = [source] 36 | else: 37 | source = list(source) 38 | 39 | visited = set() 40 | source.reverse() 41 | while source: 42 | obj = source.pop() 43 | if not isinstance(obj, container): 44 | continue 45 | myid = id(obj) 46 | if myid in visited: 47 | continue 48 | visited.add(myid) 49 | if isinstance(obj, PdfDict): 50 | if obj.Type in valid_types and obj.Subtype in valid_subtypes: 51 | yield obj 52 | obj = [y for (x, y) in sorted(obj.iteritems()) 53 | if x not in no_follow] 54 | else: 55 | # TODO: This forces resolution of any indirect objects in 56 | # the array. It may not be necessary. Don't know if 57 | # reversed() does any voodoo underneath the hood. 58 | # It's cheap enough for now, but might be removeable. 59 | obj and obj[0] 60 | source.extend(reversed(obj)) 61 | 62 | 63 | def wrap_object(obj, width, margin): 64 | ''' Wrap an xobj in its own page object. 
65 | ''' 66 | fmt = 'q %s 0 0 %s %s %s cm /MyImage Do Q' 67 | contents = PdfDict(indirect=True) 68 | subtype = obj.Subtype 69 | if subtype == PdfName.Form: 70 | contents._stream = obj.stream 71 | contents.Length = obj.Length 72 | contents.Filter = obj.Filter 73 | contents.DecodeParms = obj.DecodeParms 74 | resources = obj.Resources 75 | mbox = obj.BBox 76 | elif subtype == PdfName.Image: # Image 77 | xoffset = margin[0] 78 | yoffset = margin[1] 79 | cw = width - margin[0] - margin[2] 80 | iw, ih = float(obj.Width), float(obj.Height) 81 | ch = 1.0 * cw / iw * ih 82 | height = ch + margin[1] + margin[3] 83 | p = tuple(('%.9f' % x).rstrip('0').rstrip('.') for x in (cw, ch, xoffset, yoffset)) 84 | contents.stream = fmt % p 85 | resources = PdfDict(XObject=PdfDict(MyImage=obj)) 86 | mbox = PdfArray((0, 0, width, height)) 87 | else: 88 | raise TypeError("Expected Form or Image XObject") 89 | 90 | return PdfDict( 91 | indirect=True, 92 | Type=PdfName.Page, 93 | MediaBox=mbox, 94 | Resources=resources, 95 | Contents=contents, 96 | ) 97 | 98 | 99 | def trivial_xobjs(maxignore=300): 100 | ''' Ignore XObjects that trivially contain other XObjects. 101 | ''' 102 | ignore = set('q Q cm Do'.split()) 103 | Image = PdfName.Image 104 | 105 | def check(obj): 106 | if obj.Subtype == Image: 107 | return False 108 | s = obj.stream 109 | if len(s) < maxignore: 110 | s = (x for x in s.split() if not x.startswith('/') and 111 | x not in ignore) 112 | s = (x.replace('.', '').replace('-', '') for x in s) 113 | if not [x for x in s if not x.isdigit()]: 114 | return True 115 | return check 116 | 117 | 118 | def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72, 119 | image_only=False, ignore=trivial_xobjs(), 120 | wrap_object=wrap_object): 121 | ''' page_per_xobj wraps every XObj found 122 | in its own page object. 123 | width and margin are used to set image sizes. 
124 | ''' 125 | try: 126 | iter(margin) 127 | except: 128 | margin = [margin] 129 | while len(margin) < 4: 130 | margin *= 2 131 | 132 | if isinstance(xobj_iter, (list, dict)): 133 | xobj_iter = find_objects(xobj_iter) 134 | for obj in xobj_iter: 135 | if not ignore(obj): 136 | if not image_only or obj.Subtype == PdfName.IMage: 137 | yield wrap_object(obj, width, margin) 138 | -------------------------------------------------------------------------------- /pdfrw/objects/__init__.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | ''' 6 | Objects that can occur in PDF files. The most important 7 | objects are arrays and dicts. Either of these can be 8 | indirect or not, and dicts could have an associated 9 | stream. 10 | ''' 11 | from .pdfname import PdfName 12 | from .pdfdict import PdfDict, IndirectPdfDict 13 | from .pdfarray import PdfArray 14 | from .pdfobject import PdfObject 15 | from .pdfstring import PdfString 16 | from .pdfindirect import PdfIndirect 17 | 18 | __all__ = """PdfName PdfDict IndirectPdfDict PdfArray 19 | PdfObject PdfString PdfIndirect""".split() 20 | -------------------------------------------------------------------------------- /pdfrw/objects/pdfarray.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | from .pdfindirect import PdfIndirect 6 | from .pdfobject import PdfObject 7 | 8 | 9 | def _resolved(): 10 | pass 11 | 12 | 13 | class PdfArray(list): 14 | ''' A PdfArray maps the PDF file array object into a Python list. 15 | It has an indirect attribute which defaults to False. 
16 | ''' 17 | indirect = False 18 | 19 | def __init__(self, source=[]): 20 | self._resolve = self._resolver 21 | self.extend(source) 22 | 23 | def _resolver(self, isinstance=isinstance, enumerate=enumerate, 24 | listiter=list.__iter__, PdfIndirect=PdfIndirect, 25 | resolved=_resolved, PdfNull=PdfObject('null')): 26 | for index, value in enumerate(list.__iter__(self)): 27 | if isinstance(value, PdfIndirect): 28 | value = value.real_value() 29 | if value is None: 30 | value = PdfNull 31 | self[index] = value 32 | self._resolve = resolved 33 | 34 | def __getitem__(self, index, listget=list.__getitem__): 35 | self._resolve() 36 | return listget(self, index) 37 | 38 | try: 39 | def __getslice__(self, i, j, listget=list.__getslice__): 40 | self._resolve() 41 | return listget(self, i, j) 42 | except AttributeError: 43 | pass 44 | 45 | def __iter__(self, listiter=list.__iter__): 46 | self._resolve() 47 | return listiter(self) 48 | 49 | def count(self, item): 50 | self._resolve() 51 | return list.count(self, item) 52 | 53 | def index(self, item): 54 | self._resolve() 55 | return list.index(self, item) 56 | 57 | def remove(self, item): 58 | self._resolve() 59 | return list.remove(self, item) 60 | 61 | def sort(self, *args, **kw): 62 | self._resolve() 63 | return list.sort(self, *args, **kw) 64 | 65 | def pop(self, *args): 66 | self._resolve() 67 | return list.pop(self, *args) 68 | 69 | def __reversed__(self): 70 | self._resolve() 71 | return list.__reversed__(self) 72 | -------------------------------------------------------------------------------- /pdfrw/objects/pdfdict.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | from .pdfname import PdfName, BasePdfName 6 | from .pdfindirect import PdfIndirect 7 | from .pdfobject import PdfObject 8 | from ..py23_diffs import 
iteritems 9 | from ..errors import PdfParseError 10 | 11 | 12 | class _DictSearch(object): 13 | ''' Used to search for inheritable attributes. 14 | ''' 15 | 16 | def __init__(self, basedict): 17 | self.basedict = basedict 18 | 19 | def __getattr__(self, name, PdfName=PdfName): 20 | return self[PdfName(name)] 21 | 22 | def __getitem__(self, name, set=set, getattr=getattr, id=id): 23 | visited = set() 24 | mydict = self.basedict 25 | while 1: 26 | value = mydict[name] 27 | if value is not None: 28 | return value 29 | myid = id(mydict) 30 | assert myid not in visited 31 | visited.add(myid) 32 | mydict = mydict.Parent 33 | if mydict is None: 34 | return 35 | 36 | 37 | class _Private(object): 38 | ''' Used to store private attributes (not output to PDF files) 39 | on PdfDict classes 40 | ''' 41 | 42 | def __init__(self, pdfdict): 43 | vars(self)['pdfdict'] = pdfdict 44 | 45 | def __setattr__(self, name, value): 46 | vars(self.pdfdict)[name] = value 47 | 48 | 49 | class PdfDict(dict): 50 | ''' PdfDict objects are subclassed dictionaries 51 | with the following features: 52 | 53 | - Every key in the dictionary starts with "/" 54 | 55 | - A dictionary item can be deleted by assigning it to None 56 | 57 | - Keys that (after the initial "/") conform to Python 58 | naming conventions can also be accessed (set and retrieved) 59 | as attributes of the dictionary. E.g. mydict.Page is the 60 | same thing as mydict['/Page'] 61 | 62 | - Private attributes (not in the PDF space) can be set 63 | on the dictionary object attribute dictionary by using 64 | the private attribute: 65 | 66 | mydict.private.foo = 3 67 | mydict.foo = 5 68 | x = mydict.foo # x will now contain 3 69 | y = mydict['/foo'] # y will now contain 5 70 | 71 | Most standard adobe dictionary keys start with an upper case letter, 72 | so to avoid conflicts, it is best to start private attributes with 73 | lower case letters. 
74 | 75 | - PdfDicts have the following read-only properties: 76 | 77 | - private -- as discussed above, provides write access to 78 | dictionary's attributes 79 | - inheritable -- this creates and returns a "view" attribute 80 | that will search through the object hierarchy for 81 | any desired attribute, such as /Rotate or /MediaBox 82 | 83 | - PdfDicts also have the following special attributes: 84 | - indirect is not stored in the PDF dictionary, but in the object's 85 | attribute dictionary 86 | - stream is also stored in the object's attribute dictionary 87 | and will also update the stream length. 88 | - _stream will store in the object's attribute dictionary without 89 | updating the stream length. 90 | 91 | It is possible, for example, to have a PDF name such as "/indirect" 92 | or "/stream", but you cannot access such a name as an attribute: 93 | 94 | mydict.indirect -- accesses object's attribute dictionary 95 | mydict["/indirect"] -- accesses actual PDF dictionary 96 | ''' 97 | indirect = False 98 | stream = None 99 | 100 | _special = dict(indirect=('indirect', False), 101 | stream=('stream', True), 102 | _stream=('stream', False), 103 | ) 104 | 105 | def __setitem__(self, name, value, setter=dict.__setitem__, 106 | BasePdfName=BasePdfName, isinstance=isinstance): 107 | if not isinstance(name, BasePdfName): 108 | raise PdfParseError('Dict key %s is not a PdfName' % repr(name)) 109 | if value is not None: 110 | setter(self, name, value) 111 | elif name in self: 112 | del self[name] 113 | 114 | def __init__(self, *args, **kw): 115 | if args: 116 | if len(args) == 1: 117 | args = args[0] 118 | self.update(args) 119 | if isinstance(args, PdfDict): 120 | self.indirect = args.indirect 121 | self._stream = args.stream 122 | for key, value in iteritems(kw): 123 | setattr(self, key, value) 124 | 125 | def __getattr__(self, name, PdfName=PdfName): 126 | ''' If the attribute doesn't exist on the dictionary object, 127 | try to slap a '/' in front of it and get it 
out 128 | of the actual dictionary itself. 129 | ''' 130 | return self.get(PdfName(name)) 131 | 132 | def get(self, key, dictget=dict.get, isinstance=isinstance, 133 | PdfIndirect=PdfIndirect): 134 | ''' Get a value out of the dictionary, 135 | after resolving any indirect objects. 136 | ''' 137 | value = dictget(self, key) 138 | if isinstance(value, PdfIndirect): 139 | # We used to use self[key] here, but that does an 140 | # unwanted check on the type of the key (github issue #98). 141 | # Python will keep the old key object in the dictionary, 142 | # so that check is not necessary. 143 | value = value.real_value() 144 | if value is not None: 145 | dict.__setitem__(self, key, value) 146 | else: 147 | del self[key] 148 | return value 149 | 150 | def __getitem__(self, key): 151 | return self.get(key) 152 | 153 | def __setattr__(self, name, value, special=_special.get, 154 | PdfName=PdfName, vars=vars): 155 | ''' Set an attribute on the dictionary. Handle the keywords 156 | indirect, stream, and _stream specially (for content objects) 157 | ''' 158 | info = special(name) 159 | if info is None: 160 | self[PdfName(name)] = value 161 | else: 162 | name, setlen = info 163 | vars(self)[name] = value 164 | if setlen: 165 | notnone = value is not None 166 | self.Length = notnone and PdfObject(len(value)) or None 167 | 168 | def iteritems(self, dictiter=iteritems, 169 | isinstance=isinstance, PdfIndirect=PdfIndirect, 170 | BasePdfName=BasePdfName): 171 | ''' Iterate over the dictionary, resolving any unresolved objects 172 | ''' 173 | for key, value in list(dictiter(self)): 174 | if isinstance(value, PdfIndirect): 175 | self[key] = value = value.real_value() 176 | if value is not None: 177 | if not isinstance(key, BasePdfName): 178 | raise PdfParseError('Dict key %s is not a PdfName' % 179 | repr(key)) 180 | yield key, value 181 | 182 | def items(self): 183 | return list(self.iteritems()) 184 | 185 | def itervalues(self): 186 | for key, value in self.iteritems(): 187 | 
yield value 188 | 189 | def values(self): 190 | return list((value for key, value in self.iteritems())) 191 | 192 | def keys(self): 193 | return list((key for key, value in self.iteritems())) 194 | 195 | def __iter__(self): 196 | for key, value in self.iteritems(): 197 | yield key 198 | 199 | def iterkeys(self): 200 | return iter(self) 201 | 202 | def copy(self): 203 | return type(self)(self) 204 | 205 | def pop(self, key): 206 | value = self.get(key) 207 | del self[key] 208 | return value 209 | 210 | def popitem(self): 211 | key, value = dict.pop(self) 212 | if isinstance(value, PdfIndirect): 213 | value = value.real_value() 214 | return value 215 | 216 | def inheritable(self): 217 | ''' Search through ancestors as needed for inheritable 218 | dictionary items. 219 | NOTE: You might think it would be a good idea 220 | to cache this class, but then you'd have to worry 221 | about it pointing to the wrong dictionary if you 222 | made a copy of the object... 223 | ''' 224 | return _DictSearch(self) 225 | inheritable = property(inheritable) 226 | 227 | def private(self): 228 | ''' Allows setting private metadata for use in 229 | processing (not sent to PDF file). 230 | See note on inheritable 231 | ''' 232 | return _Private(self) 233 | private = property(private) 234 | 235 | 236 | class IndirectPdfDict(PdfDict): 237 | ''' IndirectPdfDict is a convenience class. You could 238 | create a direct PdfDict and then set indirect = True on it, 239 | or you could just create an IndirectPdfDict. 
240 | ''' 241 | indirect = True 242 | -------------------------------------------------------------------------------- /pdfrw/objects/pdfindirect.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | 6 | class _NotLoaded(object): 7 | pass 8 | 9 | 10 | class PdfIndirect(tuple): 11 | ''' A placeholder for an object that hasn't been read in yet. 12 | The object itself is the (object number, generation number) tuple. 13 | The attributes include information about where the object is 14 | referenced from and the file object to retrieve the real object from. 15 | ''' 16 | value = _NotLoaded 17 | 18 | def real_value(self, NotLoaded=_NotLoaded): 19 | value = self.value 20 | if value is NotLoaded: 21 | value = self.value = self._loader(self) 22 | return value 23 | -------------------------------------------------------------------------------- /pdfrw/objects/pdfname.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | import re 6 | 7 | from ..errors import log 8 | 9 | warn = log.warning 10 | 11 | 12 | class BasePdfName(str): 13 | ''' A PdfName is an identifier that starts with 14 | a slash. 15 | 16 | If a PdfName has illegal space or delimiter characters, 17 | then it will be decorated with an "encoded" attribute that 18 | has those characters properly escaped as # 19 | 20 | The "encoded" attribute is what is sent out to a PDF file, 21 | the non-encoded main object is what is compared for equality 22 | in a PDF dictionary. 
23 | ''' 24 | 25 | indirect = False 26 | encoded = None 27 | 28 | whitespace = '\x00 \t\f\r\n' 29 | delimiters = '()<>{}[]/%' 30 | forbidden = list(whitespace) + list('\\' + x for x in delimiters) 31 | remap = dict((x, '#%02X' % ord(x)) for x in (whitespace + delimiters)) 32 | split_to_encode = re.compile('(%s)' % '|'.join(forbidden)).split 33 | split_to_decode = re.compile(r'\#([0-9A-Fa-f]{2})').split 34 | 35 | def __new__(cls, name, pre_encoded=True, remap=remap, 36 | join=''.join, new=str.__new__, chr=chr, int=int, 37 | split_to_encode=split_to_encode, 38 | split_to_decode=split_to_decode, 39 | ): 40 | ''' We can build a PdfName from scratch, or from 41 | a pre-encoded name (e.g. coming in from a file). 42 | ''' 43 | # Optimization for normal case 44 | if name[1:].isalnum(): 45 | return new(cls, name) 46 | encoded = name 47 | if pre_encoded: 48 | if '#' in name: 49 | substrs = split_to_decode(name) 50 | substrs[1::2] = (chr(int(x, 16)) for x in substrs[1::2]) 51 | name = join(substrs) 52 | else: 53 | encoded = split_to_encode(encoded) 54 | encoded[3::2] = (remap[x] for x in encoded[3::2]) 55 | encoded = join(encoded) 56 | self = new(cls, name) 57 | if encoded != name: 58 | self.encoded = encoded 59 | return self 60 | 61 | 62 | # We could have used a metaclass, but this matches what 63 | # we were doing historically. 

class PdfName(object):
    ''' Two simple ways to get a PDF name from a string:

            x = PdfName.FooBar
            x = PdfName('FooBar')

        Either technique will return "/FooBar"
    '''

    def __getattr__(self, name, BasePdfName=BasePdfName):
        # Attribute access: PdfName.Foo -> '/Foo'.  pre_encoded=False so
        # any illegal characters in the name get #-escaped.
        return BasePdfName('/' + name, False)

    def __call__(self, name, BasePdfName=BasePdfName):
        # Call syntax: PdfName('Foo') -> '/Foo', same escaping as above.
        return BasePdfName('/' + name, False)

# Replace the class with a singleton instance so both PdfName.Foo and
# PdfName('Foo') dispatch through the instance above.
PdfName = PdfName()
--------------------------------------------------------------------------------
/pdfrw/objects/pdfobject.py:
--------------------------------------------------------------------------------
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details


class PdfObject(str):
    ''' A PdfObject is a textual representation of any PDF file object
        other than an array, dict or string. It has an indirect attribute
        which defaults to False.
    '''
    indirect = False
--------------------------------------------------------------------------------
/pdfrw/pagemerge.py:
--------------------------------------------------------------------------------
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
This module contains code to edit pages. Sort of a canvas, I
suppose, but I wouldn't want to call it that and get people all
excited or anything.

No, this is just for doing basic things like merging/splitting
apart pages, watermarking, etc. All it does is allow converting
pages (or parts of pages) into Form XObject rectangles, and then
plopping those down on new or pre-existing pages.
14 | ''' 15 | 16 | from .objects import PdfDict, PdfArray, PdfName 17 | from .buildxobj import pagexobj, ViewInfo 18 | 19 | NullInfo = ViewInfo() 20 | 21 | 22 | class RectXObj(PdfDict): 23 | ''' This class facilitates doing positioning (moving and scaling) 24 | of Form XObjects within their containing page, by modifying 25 | the Form XObject's transformation matrix. 26 | 27 | By default, this class keeps the aspect ratio locked. For 28 | example, if your object is foo, you can write 'foo.w = 200', 29 | and it will scale in both the x and y directions. 30 | 31 | To unlock the aspect ration, you have to do a tiny bit of math 32 | and call the scale function. 33 | ''' 34 | def __init__(self, page, viewinfo=NullInfo, **kw): 35 | ''' The page is a page returned by PdfReader. It will be 36 | turned into a cached Form XObject (so that multiple 37 | rectangles can be extracted from it if desired), and then 38 | another Form XObject will be built using it and the viewinfo 39 | (which should be a ViewInfo class). The viewinfo includes 40 | source coordinates (from the top/left) and rotation information. 41 | 42 | Once the object has been built, its destination coordinates 43 | may be examined and manipulated by using x, y, w, h, and 44 | scale. The destination coordinates are in the normal 45 | PDF programmatic system (starting at bottom left). 
46 | ''' 47 | if kw: 48 | if viewinfo is not NullInfo: 49 | raise ValueError("Cannot modify preexisting ViewInfo") 50 | viewinfo = ViewInfo(**kw) 51 | viewinfo.cacheable = False 52 | base = pagexobj(page, viewinfo) 53 | self.update(base) 54 | self.indirect = True 55 | self.stream = base.stream 56 | private = self.private 57 | private._rect = [base.x, base.y, base.w, base.h] 58 | matrix = self.Matrix 59 | if matrix is None: 60 | matrix = self.Matrix = PdfArray((1, 0, 0, 1, 0, 0)) 61 | private._matrix = matrix # Lookup optimization 62 | # Default to lower-left corner 63 | self.x = 0 64 | self.y = 0 65 | 66 | @property 67 | def x(self): 68 | ''' X location (from left) of object in points 69 | ''' 70 | return self._rect[0] 71 | 72 | @property 73 | def y(self): 74 | ''' Y location (from bottom) of object in points 75 | ''' 76 | return self._rect[1] 77 | 78 | @property 79 | def w(self): 80 | ''' Width of object in points 81 | ''' 82 | return self._rect[2] 83 | 84 | @property 85 | def h(self): 86 | ''' Height of object in points 87 | ''' 88 | return self._rect[3] 89 | 90 | def __setattr__(self, name, value, next=PdfDict.__setattr__, 91 | mine=set('x y w h'.split())): 92 | ''' The underlying __setitem__ won't let us use a property 93 | setter, so we have to fake one. 94 | ''' 95 | if name not in mine: 96 | return next(self, name, value) 97 | if name in 'xy': 98 | r_index, m_index = (0, 4) if name == 'x' else (1, 5) 99 | self._rect[r_index], old = value, self._rect[r_index] 100 | self._matrix[m_index] += value - old 101 | else: 102 | index = 2 + (value == 'h') 103 | self.scale(value / self._rect[index]) 104 | 105 | def scale(self, x_scale, y_scale=None): 106 | ''' Current scaling deals properly with things that 107 | have been rotated in 90 degree increments 108 | (via the ViewMerge object given when instantiating). 
class PageMerge(list):
    ''' A PageMerge object can have 0 or 1 underlying pages
        (that get edited with the results of the merge)
        and 0-n RectXObjs that can be applied before or
        after the underlying page.

        The list itself holds the objects to merge; a single None
        entry acts as a placeholder marking where the original page
        contents belong relative to the added XObjects.
    '''
    page = None        # Underlying page dict (or None for a new page)
    mbox = None        # Inherited /MediaBox
    cbox = None        # Inherited /CropBox
    resources = None   # Inherited /Resources
    rotate = None      # Inherited /Rotate
    contents = None    # Original page /Contents

    def __init__(self, page=None):
        if page is not None:
            self.setpage(page)

    def setpage(self, page):
        ''' Attach an underlying page; caches its inheritable
            attributes and inserts the None placeholder.
        '''
        if page.Type != PdfName.Page:
            raise TypeError("Expected page")
        self.append(None)  # Placeholder
        self.page = page
        inheritable = page.inheritable
        self.mbox = inheritable.MediaBox
        self.cbox = inheritable.CropBox
        self.resources = inheritable.Resources
        self.rotate = inheritable.Rotate
        self.contents = page.Contents

    def __add__(self, other):
        # Accept either a single dict or an iterable of objects.
        if isinstance(other, dict):
            other = [other]
        for other in other:
            self.add(other)
        return self

    def add(self, obj, prepend=False, **kw):
        ''' Add one object; pages (or anything with placement kwargs)
            are wrapped in a RectXObj first.  prepend=True places the
            object under the existing content.
        '''
        if kw:
            obj = RectXObj(obj, **kw)
        elif obj.Type == PdfName.Page:
            obj = RectXObj(obj)
        if prepend:
            self.insert(0, obj)
        else:
            self.append(obj)
        return self

    def render(self):
        ''' Merge everything into a single page dict and return it.
            Builds /XObject resource entries named /pdfrw_<n> and
            content streams that invoke them with the Do operator.
        '''
        def do_xobjs(xobj_list, restore_first=False):
            # Build one content stream that draws each xobj in turn.
            # 'Q' first restores the graphics state pushed before the
            # original page contents (see below).
            content = ['Q'] if restore_first else []
            for obj in xobj_list:
                index = PdfName('pdfrw_%d' % (key_offset + len(xobjs)))
                if xobjs.setdefault(index, obj) is not obj:
                    raise KeyError("XObj key %s already in use" % index)
                content.append('%s Do' % index)
            return PdfDict(indirect=True, stream='\n'.join(content))

        mbox = self.mbox
        cbox = self.cbox
        page = self.page
        old_contents = self.contents
        resources = self.resources or PdfDict()

        key_offset = 0
        xobjs = resources.XObject
        if xobjs is None:
            xobjs = resources.XObject = PdfDict()
        else:
            # Continue numbering after any existing /pdfrw_<n> keys so
            # new names never collide.  key_offset compensates for the
            # pre-existing entries counted by len(xobjs) in do_xobjs.
            allkeys = xobjs.keys()
            if allkeys:
                keys = (x for x in allkeys if x.startswith('/pdfrw_'))
                keys = (x for x in keys if x[7:].isdigit())
                keys = sorted(keys, key=lambda x: int(x[7:]))
                key_offset = (int(keys[-1][7:]) + 1) if keys else 0
                key_offset -= len(allkeys)

        if old_contents is None:
            new_contents = do_xobjs(self)
        else:
            isdict = isinstance(old_contents, PdfDict)
            old_contents = [old_contents] if isdict else old_contents
            new_contents = PdfArray()
            # The None placeholder marks where the original page goes.
            index = self.index(None)
            if index:
                new_contents.append(do_xobjs(self[:index]))

            index += 1
            if index < len(self):
                # There are elements to add after the original page contents,
                # so push the graphics state to the stack. Restored below.
                new_contents.append(PdfDict(indirect=True, stream='q'))

            new_contents.extend(old_contents)

            if index < len(self):
                # Restore graphics state and add other elements.
                new_contents.append(do_xobjs(self[index:], restore_first=True))

        if mbox is None:
            # No underlying page: size the new page to hold everything,
            # anchored at (0, 0) or below/left if objects extend there.
            cbox = None
            mbox = self.xobj_box
            mbox[0] = min(0, mbox[0])
            mbox[1] = min(0, mbox[1])

        page = PdfDict(indirect=True) if page is None else page
        page.Type = PdfName.Page
        page.Resources = resources
        page.MediaBox = mbox
        page.CropBox = cbox
        page.Rotate = self.rotate
        page.Contents = new_contents
        return page

    @property
    def xobj_box(self):
        ''' Return the smallest box that encloses every object
            in the list.
        '''
        a, b, c, d = zip(*(xobj.box for xobj in self))
        return PdfArray((min(a), min(b), max(c), max(d)))
# Sentinel substituted for "killed" objects that are still referenced
# somewhere in the output tree.
NullObject = PdfObject('null')
NullObject.indirect = True
NullObject.Type = 'Null object'


def user_fmt(obj, isinstance=isinstance, float=float, str=str,
             basestring=(type(u''), type(b'')), encode=PdfString.encode):
    ''' Format a leaf (non-container) object for output.
        This function may be replaced by the user for
        specialized formatting requirements.
    '''

    if isinstance(obj, basestring):
        return encode(obj)

    # PDFs don't handle exponent notation
    if isinstance(obj, float):
        return ('%.9f' % obj).rstrip('0').rstrip('.')

    return str(obj)


def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
                  user_fmt=user_fmt, do_compress=do_compress,
                  convert_store=convert_store, iteritems=iteritems,
                  id=id, isinstance=isinstance, getattr=getattr, len=len,
                  sum=sum, set=set, str=str, hasattr=hasattr, repr=repr,
                  enumerate=enumerate, list=list, dict=dict, tuple=tuple,
                  PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject):
    ''' FormatObjects performs the actual formatting and disk write.
        Should be a class, was a class, turned into nested functions
        for performance (to reduce attribute lookups).

        The long default-argument list intentionally binds globals and
        builtins as fast locals; callers normally pass only the first
        five parameters.
    '''

    def f_write(s):
        # All output funnels through here so 2/3 str-vs-bytes
        # conversion happens in exactly one place.
        f.write(convert_store(s))

    def add(obj):
        ''' Add an object to our list, if it's an indirect
            object. Just format it if not.

            Returns either the formatted object text, or an
            "<objnum> 0 R" reference for indirect objects.
        '''
        # Can't hash dicts, so just hash the object ID
        objid = id(obj)

        # Automatically set stream objects to indirect
        if isinstance(obj, PdfDict):
            indirect = obj.indirect or (obj.stream is not None)
        else:
            indirect = getattr(obj, 'indirect', False)

        if not indirect:
            if objid in visited:
                # A direct object appearing twice would otherwise
                # recurse forever; copy it to break the cycle.
                log.warning('Replicating direct %s object, '
                            'should be indirect for optimal file size' %
                            type(obj))
                obj = type(obj)(obj)
                objid = id(obj)
            visiting(objid)
            result = format_obj(obj)
            leaving(objid)
            return result

        objnum = indirect_dict_get(objid)

        # If we haven't seen the object yet, we need to
        # add it to the indirect object list.
        if objnum is None:
            # Killed objects (old catalog/pages) are swapped for their
            # replacements before being assigned a number.
            swapped = swapobj(objid)
            if swapped is not None:
                old_id = objid
                obj = swapped
                objid = id(obj)
                objnum = indirect_dict_get(objid)
                if objnum is not None:
                    indirect_dict[old_id] = objnum
                    return '%s 0 R' % objnum
            objnum = len(objlist) + 1
            objlist_append(None)  # Reserve the slot; filled by format_deferred
            indirect_dict[objid] = objnum
            deferred.append((objnum - 1, obj))
        return '%s 0 R' % objnum

    def format_array(myarray, formatter):
        # Format array data into semi-readable ASCII
        if sum([len(x) for x in myarray]) <= 70:
            return formatter % space_join(myarray)
        return format_big(myarray, formatter)

    def format_big(myarray, formatter):
        # Wrap tokens so output lines stay around 71 characters.
        bigarray = []
        count = 1000000  # Force creation of the first subarray
        for x in myarray:
            lenx = len(x) + 1
            count += lenx
            if count > 71:
                subarray = []
                bigarray.append(subarray)
                count = lenx
            subarray.append(x)
        return formatter % lf_join([space_join(x) for x in bigarray])

    def format_obj(obj):
        ''' format PDF object data into semi-readable ASCII.
            May mutually recurse with add() -- add() will
            return references for indirect objects, and add
            the indirect object to the list.
        '''
        while 1:
            if isinstance(obj, (list, dict, tuple)):
                if isinstance(obj, PdfArray):
                    myarray = [add(x) for x in obj]
                    return format_array(myarray, '[%s]')
                elif isinstance(obj, PdfDict):
                    if compress and obj.stream:
                        do_compress([obj])
                    # Sort keys for deterministic output; prefer the
                    # original encoded form of each key when present.
                    pairs = sorted((getattr(x, 'encoded', None) or x, y)
                                   for (x, y) in obj.iteritems())
                    myarray = []
                    for key, value in pairs:
                        myarray.append(key)
                        myarray.append(add(value))
                    result = format_array(myarray, '<<%s>>')
                    stream = obj.stream
                    if stream is not None:
                        result = ('%s\nstream\n%s\nendstream' %
                                  (result, stream))
                    return result
                # Plain list/tuple/dict: promote to the matching Pdf
                # container type and loop to format it.
                obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
                continue

            # We assume that an object with an indirect
            # attribute knows how to represent itself to us.
            if hasattr(obj, 'indirect'):
                return str(getattr(obj, 'encoded', None) or obj)
            return user_fmt(obj)

    def format_deferred():
        # Fill in reserved objlist slots breadth-wise; doing it here
        # instead of recursing inside format_obj avoids hitting the
        # recursion limit on deep files.
        while deferred:
            index, obj = deferred.pop()
            objlist[index] = format_obj(obj)

    indirect_dict = {}
    indirect_dict_get = indirect_dict.get
    objlist = []
    objlist_append = objlist.append
    visited = set()
    visiting = visited.add
    leaving = visited.remove
    space_join = ' '.join
    lf_join = '\n '.join

    deferred = []

    # Don't reference old catalog or pages objects --
    # swap references to new ones.
    type_remap = {PdfName.Catalog: trailer.Root,
                  PdfName.Pages: trailer.Root.Pages, None: trailer}.get
    swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj)
               for objid, (obj, new_obj) in iteritems(killobj)]
    swapobj = dict((objid, obj is None and NullObject or obj)
                   for objid, obj in swapobj).get

    for objid in killobj:
        assert swapobj(objid) is not None

    # The first format of trailer gets all the information,
    # but we throw away the actual trailer formatting.
    format_obj(trailer)
    # Keep formatting until we're done.
    # (Used to recurse inside format_obj for this, but
    # hit system limit.)
    format_deferred()
    # Now we know the size, so we update the trailer dict
    # and get the formatted data.
    trailer.Size = PdfObject(len(objlist) + 1)
    trailer = format_obj(trailer)

    # Now we have all the pieces to write out to the file.
    # Keep careful track of the counts while we do it so
    # we can correctly build the cross-reference.

    # The second comment line becomes high-bit bytes after the
    # Latin-1 encode in f_write, marking the file as binary.
    header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
    f_write(header)
    offset = len(header)
    offsets = [(0, 65535, 'f')]  # xref entry 0 is always free
    offsets_append = offsets.append

    for i, x in enumerate(objlist):
        objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
        offsets_append((offset, 0, 'n'))
        offset += len(objstr)
        f_write(objstr)

    # Cross-reference table entries are fixed-width per the PDF spec.
    f_write('xref\n0 %s\n' % len(offsets))
    for x in offsets:
        f_write('%010d %05d %s\r\n' % x)
    f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
    def addpage(self, page):
        ''' Append one page (a /Type /Page PdfDict) to the output.
            Returns self so calls can be chained.
        '''
        self._trailer = None  # Invalidate any previously built trailer
        if page.Type != PdfName.Page:
            raise PdfOutputError('Bad /Type: Expected %s, found %s'
                                 % (PdfName.Page, page.Type))
        inheritable = page.inheritable  # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources=inheritable.Resources,
                MediaBox=inheritable.MediaBox,
                CropBox=inheritable.CropBox,
                Rotate=inheritable.Rotate,
            )
        )

        # Add parents in the hierarchy to objects we
        # don't want to output
        killobj = self.killobj
        obj, new_obj = page, self.pagearray[-1]
        while obj is not None:
            objid = id(obj)
            if objid in killobj:
                break
            killobj[objid] = obj, new_obj
            obj = obj.Parent
            new_obj = None  # Only the page itself has a replacement
        return self

    addPage = addpage  # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Append every page in pagelist; returns self. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        ''' Lazily build (and cache) the trailer dict with a fresh
            /Root catalog and /Pages tree over self.pagearray.
        '''
        trailer = self._trailer
        if trailer is not None:
            return trailer

        if self.canonicalize:
            self.make_canonical()

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root=IndirectPdfDict(
                Type=PdfName.Catalog,
                Pages=IndirectPdfDict(
                    Type=PdfName.Pages,
                    Count=PdfObject(len(self.pagearray)),
                    Kids=self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary and
        # ensure they are indirect references
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
            page.indirect = True
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname=None, trailer=None, user_fmt=user_fmt,
              disable_gc=True):
        ''' Write the PDF out.  fname may be a path or a binary
            file-like object, and must be given either here or at
            construction time -- exactly once.
        '''
        trailer = trailer or self.trailer

        # Support fname for legacy applications
        if (fname is not None) == (self.fname is not None):
            raise PdfOutputError(
                "PdfWriter fname must be specified exactly once")

        fname = fname or self.fname

        # Dump the data. We either have a filename or a preexisting
        # file object.
        preexisting = hasattr(fname, 'write')
        f = preexisting and fname or open(fname, 'wb')
        if disable_gc:
            gc.disable()

        try:
            FormatObjects(f, trailer, self.version, self.compress,
                          self.killobj, user_fmt=user_fmt)
        finally:
            if not preexisting:
                f.close()
            if disable_gc:
                # NOTE(review): unconditionally re-enables gc even if the
                # caller had it disabled before entry -- confirm intended.
                gc.enable()

    def make_canonical(self):
        ''' Canonicalizes a PDF. Assumes everything
            is a Pdf object already.

            Walks the page tree and marks every container (PdfArray /
            PdfDict) indirect and everything else direct.
        '''
        visited = set()
        workitems = list(self.pagearray)
        while workitems:
            obj = workitems.pop()
            objid = id(obj)
            if objid in visited:
                continue
            visited.add(objid)
            obj.indirect = False
            if isinstance(obj, (PdfArray, PdfDict)):
                obj.indirect = True
                if isinstance(obj, PdfArray):
                    workitems += obj
                else:
                    workitems += obj.values()

    # Names defined in the class body so far == attributes that may be
    # overridden via PdfWriter(**kwargs).
    replaceable = set(vars())
class PdfTokens(object):
    ''' Tokenizer for PDF data streams.

        An instance is an iterator over the tokens found in self.fdata
        starting at startloc; tokens are PdfObject, PdfString,
        BasePdfName, or interned plain strings (for delimiters).
    '''

    # Table 3.1, page 50 of reference, defines whitespace
    eol = '\n\r'
    whitespace = '\x00 \t\f' + eol

    # Text on page 50 defines delimiter characters
    # Escape the ]
    delimiters = r'()<>{}[\]/%'

    # "normal" stuff is all but delimiters or whitespace.

    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters,
                                             whitespace)

    p_comment = r'\%%[^%s]*' % eol

    # This will get the bulk of literal strings.
    p_literal_string = r'\((?:[^\\()]+|\\.)*[()]?'

    # This will get more pieces of literal strings
    # (Don't ask me why, but it hangs without the trailing ?.)
    p_literal_string_extend = r'(?:[^\\()]+|\\.)*[()]?'

    # A hex string. This one's easy.
    p_hex_string = r'\<[%s0-9A-Fa-f]*\>' % whitespace

    p_dictdelim = r'\<\<|\>\>'
    p_name = r'/[^%s%s]*' % (delimiters, whitespace)

    p_catchall = '[^%s]' % whitespace

    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim,
                        p_literal_string, p_comment, p_catchall])
    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace),
                         re.DOTALL).finditer
    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
                                          whitespace), re.DOTALL).finditer

    def _gettoks(self, startloc, intern=intern,
                 delimiters=delimiters, findtok=findtok,
                 findparen=findparen, PdfString=PdfString,
                 PdfObject=PdfObject, BasePdfName=BasePdfName):
        ''' Given a source data string and a location inside it,
            gettoks generates tokens.  Each token is a tuple of the form:
             <starting file loc>, <ending file loc>, <token string>
            The ending file loc is past any trailing whitespace.

            The main complication here is the literal strings, which
            can contain nested parentheses.  In order to cope with these
            we can discard the current iterator and loop back to the
            top to get a fresh one.

            We could use re.search instead of re.finditer, but that's slower.
        '''
        fdata = self.fdata
        current = self.current = [(startloc, startloc)]
        cache = {}
        get_cache = cache.get
        while 1:
            for match in findtok(fdata, current[0][1]):
                current[0] = tokspan = match.span()
                token = match.group(1)
                firstch = token[0]
                toktype = intern
                if firstch not in delimiters:
                    toktype = PdfObject
                elif firstch in '/<(%':
                    if firstch == '/':
                        # PDF Name
                        toktype = BasePdfName
                    elif firstch == '<':
                        # << dict delim, or < hex string >
                        if token[1:2] != '<':
                            toktype = PdfString
                    elif firstch == '(':
                        # Literal string
                        # It's probably simple, but maybe not
                        # Nested parentheses are a bear, and if
                        # they are present, we exit the for loop
                        # and get back in with a new starting location.
                        ends = None  # For broken strings
                        if fdata[match.end(1) - 1] != ')':
                            nest = 2
                            m_start, loc = tokspan
                            for match in findparen(fdata, loc):
                                loc = match.end(1)
                                ending = fdata[loc - 1] == ')'
                                nest += 1 - ending * 2
                                if not nest:
                                    break
                                if ending and ends is None:
                                    ends = loc, match.end(), nest
                            token = fdata[m_start:loc]
                            current[0] = m_start, match.end()
                            if nest:
                                # There is one possible recoverable error
                                # seen in the wild -- some stupid generators
                                # don't escape (.  If this happens, just
                                # terminate on first unescaped ).  The string
                                # won't be quite right, but that's a science
                                # fair project for another time.
                                (self.error, self.exception)[not ends](
                                    'Unterminated literal string')
                                loc, ends, nest = ends
                                token = fdata[m_start:loc] + ')' * nest
                                current[0] = m_start, ends
                        toktype = PdfString
                    elif firstch == '%':
                        # Comment
                        if self.strip_comments:
                            continue
                else:
                    self.exception(('Tokenizer logic incorrect -- '
                                    'should never get here'))

                newtok = get_cache(token)
                if newtok is None:
                    newtok = cache[token] = toktype(token)
                yield newtok
                if current[0] is not tokspan:
                    break
            else:
                if self.strip_comments:
                    break
                # BUGFIX: this was "raise StopIteration".  Under PEP 479
                # (mandatory since Python 3.7) raising StopIteration
                # inside a generator is turned into a RuntimeError
                # ("generator raised StopIteration"), crashing the
                # tokenizer.  A bare return ends the generator with
                # identical semantics on every Python version.
                return

    def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True):
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.iterator = iterator = self._gettoks(startloc)
        # msgs_dumped is a dedup set when not verbose, else None
        self.msgs_dumped = None if verbose else set()
        self.next = getattr(iterator, nextattr)
        self.current = [(startloc, startloc)]

    def setstart(self, startloc):
        ''' Change the starting location.
        '''
        current = self.current
        if startloc != current[0][1]:
            current[0] = startloc, startloc

    def floc(self):
        ''' Return the current file position
            (where the next token will be retrieved)
        '''
        return self.current[0][1]
    floc = property(floc, setstart)

    def tokstart(self):
        ''' Return the file position of the most
            recently retrieved token.
        '''
        return self.current[0][0]
    tokstart = property(tokstart, setstart)

    def __iter__(self):
        return self.iterator

    def multiple(self, count, islice=itertools.islice, list=list):
        ''' Retrieve multiple tokens
        '''
        return list(islice(self, count))

    def next_default(self, default='nope'):
        ''' Like next(), but returns default at end of input
            instead of raising StopIteration.
        '''
        for result in self:
            return result
        return default

    def msg(self, msg, *arg):
        ''' Format a diagnostic with line/column/token context.
            Returns None when the (non-verbose) dedup set has already
            seen this message.
        '''
        dumped = self.msgs_dumped
        if dumped is not None:
            if msg in dumped:
                return
            dumped.add(msg)
        if arg:
            msg %= arg
        fdata = self.fdata
        begin, end = self.current[0]
        if begin >= len(fdata):
            return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata))
        line, col = linepos(fdata, begin)
        if end > begin:
            tok = fdata[begin:end].rstrip()
            if len(tok) > 30:
                tok = tok[:26] + ' ...'
            return ('%s (line=%d, col=%d, token=%s)' %
                    (msg, line, col, repr(tok)))
        return '%s (line=%d, col=%d)' % (msg, line, col)

    def warning(self, *arg):
        s = self.msg(*arg)
        if s:
            log.warning(s)

    def error(self, *arg):
        s = self.msg(*arg)
        if s:
            log.error(s)

    def exception(self, *arg):
        raise PdfParseError(self.msg(*arg))
14 | 15 | Parameters: 16 | canv - a reportlab "canvas" (also accepts a "document") 17 | pdfobj - a pdfrw PDF object 18 | 19 | Returns: 20 | A corresponding reportlab object, or if the 21 | object is a PDF Form XObject, the name to 22 | use with reportlab for the object. 23 | 24 | Will recursively convert all necessary objects. 25 | Be careful when converting a page -- if /Parent is set, 26 | will recursively convert all pages! 27 | 28 | Notes: 29 | 1) Original objects are annotated with a 30 | derived_rl_obj attribute which points to the 31 | reportlab object. This keeps multiple reportlab 32 | objects from being generated for the same pdfobj 33 | via repeated calls to makerl. This is great for 34 | not putting too many objects into the 35 | new PDF, but not so good if you are modifying 36 | objects for different pages. Then you 37 | need to do your own deep copying (of circular 38 | structures). You're on your own. 39 | 40 | 2) ReportLab seems weird about FormXObjects. 41 | They pass around a partial name instead of the 42 | object or a reference to it. So we have to 43 | reach into reportlab and get a number for 44 | a unique name. I guess this is to make it 45 | where you can combine page streams with 46 | impunity, but that's just a guess. 47 | 48 | 3) Updated 1/23/2010 to handle multipass documents 49 | (e.g. with a table of contents). These have 50 | a different doc object on every pass. 
# Short aliases for the reportlab object classes we emit.
RLStream = rldocmodule.PDFStream
RLDict = rldocmodule.PDFDictionary
RLArray = rldocmodule.PDFArray


def _makedict(rldoc, pdfobj):
    ''' Convert a (non-stream) PdfDict to a reportlab dictionary,
        registering it in pdfobj.derived_rl_obj BEFORE recursing so
        circular references terminate.
    '''
    rlobj = rldict = RLDict()
    if pdfobj.indirect:
        rlobj.__RefOnly__ = 1
        rlobj = rldoc.Reference(rlobj)
    pdfobj.derived_rl_obj[rldoc] = rlobj, None

    for key, value in pdfobj.iteritems():
        # Strip the leading '/' from pdfrw names for reportlab keys.
        rldict[key[1:]] = makerl_recurse(rldoc, value)

    return rlobj


def _makestream(rldoc, pdfobj, xobjtype=PdfName.XObject):
    ''' Convert a stream-bearing PdfDict to a reportlab stream.
        Form XObjects additionally get a unique reportlab name
        (see module notes -- reportlab passes names around, not
        object references).
    '''
    rldict = RLDict()
    rlobj = RLStream(rldict, convert_store(pdfobj.stream))

    if pdfobj.Type == xobjtype:
        shortname = 'pdfrw_%s' % (rldoc.objectcounter + 1)
        fullname = rldoc.getXObjectName(shortname)
    else:
        shortname = fullname = None
    result = rldoc.Reference(rlobj, fullname)
    # Cache before recursing to break circular references.
    pdfobj.derived_rl_obj[rldoc] = result, shortname

    for key, value in pdfobj.iteritems():
        rldict[key[1:]] = makerl_recurse(rldoc, value)

    return result


def _makearray(rldoc, pdfobj):
    ''' Convert a PdfArray to a reportlab array; caches before
        recursing, like _makedict.
    '''
    rlobj = rlarray = RLArray([])
    if pdfobj.indirect:
        rlobj.__RefOnly__ = 1
        rlobj = rldoc.Reference(rlobj)
    pdfobj.derived_rl_obj[rldoc] = rlobj, None

    mylist = rlarray.sequence
    for value in pdfobj:
        mylist.append(makerl_recurse(rldoc, value))

    return rlobj


def _makestr(rldoc, pdfobj):
    ''' Convert a leaf (string/number) object to its string form. '''
    assert isinstance(pdfobj, (float, int, str)), repr(pdfobj)
    # TODO: Add fix for float like in pdfwriter
    return str(getattr(pdfobj, 'encoded', None) or pdfobj)


def makerl_recurse(rldoc, pdfobj):
    ''' Return the reportlab object derived from pdfobj for rldoc,
        creating and caching it (keyed by rldoc, to support multipass
        documents) on first sight.
    '''
    docdict = getattr(pdfobj, 'derived_rl_obj', None)
    if docdict is not None:
        value = docdict.get(rldoc)
        if value is not None:
            return value[0]
    if isinstance(pdfobj, PdfDict):
        if pdfobj.stream is not None:
            func = _makestream
        else:
            func = _makedict
        if docdict is None:
            pdfobj.private.derived_rl_obj = {}
    elif isinstance(pdfobj, PdfArray):
        func = _makearray
        if docdict is None:
            pdfobj.derived_rl_obj = {}
    else:
        func = _makestr
    return func(rldoc, pdfobj)


def makerl(canv, pdfobj):
    ''' Public entry point: convert pdfobj for a reportlab canvas
        (or document).  For Form XObjects, returns the reportlab
        name to use; otherwise returns the converted object.
    '''
    try:
        rldoc = canv._doc
    except AttributeError:
        # Caller passed a document rather than a canvas.
        rldoc = canv
    rlobj = makerl_recurse(rldoc, pdfobj)
    try:
        name = pdfobj.derived_rl_obj[rldoc][1]
    except AttributeError:
        # Leaf objects have no derived_rl_obj cache.
        name = None
    return name or rlobj
def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
    ''' Yield the members of mylist that are stream-bearing PdfDicts. '''
    for obj in mylist:
        if isinstance(obj, PdfDict) and obj.stream is not None:
            yield obj

# Hack so we can import if zlib not available
decompressobj = zlib if zlib is None else zlib.decompressobj


def uncompress(mylist, leave_raw=False, warnings=set(),
               flate=PdfName.FlateDecode, decompress=decompressobj,
               isinstance=isinstance, list=list, len=len):
    ''' Uncompress (in place) the streams of the PDF objects in mylist.

        Only a single /FlateDecode filter (optionally with PNG
        predictors) is handled; anything else is logged and skipped.
        Returns True iff every stream encountered was either already
        uncompressed or was successfully uncompressed.

        NOTE: the mutable default "warnings" is deliberate -- it is a
        cross-call cache so each distinct warning is logged only once.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms or obj.DP
        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            try:
                data = dco.decompress(convert_store(obj.stream))
            except Exception as s:
                error = str(s)
            else:
                error = None
            # DecodeParms may be an array of parameter dicts; merge them.
            if isinstance(parms, PdfArray):
                oldparms = parms
                parms = PdfDict()
                for x in oldparms:
                    parms.update(x)
            # BUGFIX: only attempt predictor post-processing when the
            # inflate step succeeded.  Previously a failed decompress
            # left "data" unbound, so reaching flate_png() here raised
            # UnboundLocalError and masked the real error message.
            if error is None and parms:
                predictor = int(parms.Predictor or 1)
                columns = int(parms.Columns or 1)
                colors = int(parms.Colors or 1)
                bpc = int(parms.BitsPerComponent or 8)
                if 10 <= predictor <= 15:
                    data, error = flate_png(data, predictor,
                                            columns, colors, bpc)
                elif predictor != 1:
                    error = ('Unsupported flatedecode predictor %s' %
                             repr(predictor))
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data if leave_raw else convert_load(data)
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False
    return ok
def flate_png_impl(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' Reverse the PNG prediction filters on a decompressed stream.

        Returns (data, error): on success an array.array('B') of the
        reconstructed bytes (filter-type bytes removed) and None; on
        failure (None, message).

        References:
        http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
        https://www.w3.org/TR/2003/REC-PNG-20031110/#9Filters

        Reconstruction nomenclature:
        x: the byte being filtered;
        a: the byte corresponding to x in the pixel immediately before
           the pixel containing x (or the byte immediately before x,
           when the bit depth is less than 8);
        b: the byte corresponding to x in the previous scanline;
        c: the byte corresponding to b in the pixel immediately before
           the pixel containing b (or the byte immediately before b,
           when the bit depth is less than 8).
    '''

    def subfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 1: Sub
        # Recon(x) = Filt(x) + Recon(a)
        for i in xrange(pixel_size, length):
            left = data[start + i - pixel_size]
            data[start + i] = (data[start + i] + left) % 256

    def upfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 2: Up
        # Recon(x) = Filt(x) + Recon(b)
        for i in xrange(length):
            up = prior_row_data[i]
            data[start + i] = (data[start + i] + up) % 256

    def avgfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 3: Avg
        # Recon(x) = Filt(x) + floor((Recon(a) + Recon(b)) / 2)
        for i in xrange(length):
            left = data[start + i - pixel_size] if i >= pixel_size else 0
            up = prior_row_data[i]
            floor = math.floor((left + up) / 2)
            data[start + i] = (data[start + i] + int(floor)) % 256

    def paethfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 4: Paeth
        # Recon(x) = Filt(x) + PaethPredictor(Recon(a), Recon(b), Recon(c))
        def paeth_predictor(a, b, c):
            p = a + b - c
            pa = abs(p - a)
            pb = abs(p - b)
            pc = abs(p - c)
            if pa <= pb and pa <= pc:
                return a
            elif pb <= pc:
                return b
            else:
                return c
        for i in xrange(length):
            left = data[start + i - pixel_size] if i >= pixel_size else 0
            up = prior_row_data[i]
            up_left = prior_row_data[i - pixel_size] if i >= pixel_size else 0
            data[start + i] = (data[start + i] +
                               paeth_predictor(left, up, up_left)) % 256

    columnbytes = ((columns * colors * bpc) + 7) // 8
    pixel_size = (colors * bpc + 7) // 8
    data = array.array('B', data)
    rowlen = columnbytes + 1  # each row is prefixed by a filter-type byte
    if predictor == 15:
        # Predictor 15 ("optimum") streams seen in the wild are
        # sometimes truncated; pad to a whole number of rows.
        padding = (rowlen - len(data)) % rowlen
        data.extend([0] * padding)
    if len(data) % rowlen:
        # BUGFIX: this was an assert, which crashed on malformed input
        # (and vanishes under python -O).  Report it like the other
        # failure modes so the caller can log it gracefully.
        return None, ('Corrupt PNG predictor data: length %d is not a '
                      'multiple of the row length %d' % (len(data), rowlen))

    rows = xrange(0, len(data), rowlen)
    prior_row_data = [0 for i in xrange(columnbytes)]
    for row_index in rows:

        filter_type = data[row_index]

        if filter_type == 0:  # None filter
            pass

        elif filter_type == 1:  # Sub filter
            subfilter(data, prior_row_data, row_index + 1,
                      columnbytes, pixel_size)

        elif filter_type == 2:  # Up filter
            upfilter(data, prior_row_data, row_index + 1,
                     columnbytes, pixel_size)

        elif filter_type == 3:  # Average filter
            avgfilter(data, prior_row_data, row_index + 1,
                      columnbytes, pixel_size)

        elif filter_type == 4:  # Paeth filter
            paethfilter(data, prior_row_data, row_index + 1,
                        columnbytes, pixel_size)

        else:
            return None, 'Unsupported PNG filter %d' % filter_type

        # Reconstructed row (without filter_type) feeds the next row.
        prior_row_data = data[row_index + 1: row_index + 1 + columnbytes]

    # Strip the leading filter-type byte from every row in one O(n)
    # pass.  (Popping each byte from the array in place was O(n**2).)
    result = array.array('B')
    for row_index in rows:
        result.extend(data[row_index + 1: row_index + 1 + columnbytes])

    return result, None


def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' PNG prediction is used to make certain kinds of data
        more compressible.  Before the compression, each data
        byte is either left the same, or is set to be a delta
        from the previous byte, or is set to be a delta from
        the previous row.  This selection is done on a per-row
        basis, and is indicated by a compression type byte
        prepended to each row of data.

        Within more recent PDF files, it is normal to use
        this technique for Xref stream objects, which are
        quite regular.

        Returns (bytes, None) on success, or (None, error message).
    '''
    d, e = flate_png_impl(data, predictor, columns, colors, bpc)
    if d is not None:
        d = from_array(d)
    return d, e
#!/usr/bin/env python

"""Package installer for pdfrw, a pure-Python PDF reader/writer library."""

from setuptools import setup

from pdfrw import __version__ as version
from pdfrw.py23_diffs import convert_load


def _long_description():
    # Read README.rst as bytes and decode it with the project's own
    # Python 2/3 compatibility helper.  The context manager closes the
    # file handle promptly; the original `open(...).read()` leaked it.
    with open("README.rst", 'rb') as f:
        return convert_load(f.read())


setup(
    name='pdfrw',
    version=version,
    description='PDF file reader/writer library',
    long_description=_long_description(),
    author='Patrick Maupin',
    author_email='pmaupin@gmail.com',
    platforms='Independent',
    url='https://github.com/pmaupin/pdfrw',
    packages=['pdfrw', 'pdfrw.objects'],
    license='MIT',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Multimedia :: Graphics :: Graphics Conversion',
        'Topic :: Software Development :: Libraries',
        'Topic :: Text Processing',
        'Topic :: Printing',
        'Topic :: Utilities',
    ],
    keywords='pdf vector graphics PDF nup watermark split join merge',
    zip_safe=True,
)
2 | -------------------------------------------------------------------------------- /tests/basn0g08.png.log: -------------------------------------------------------------------------------- 1 | width = 32 2 | bit_depth = 8 3 | channels = 1 4 | color_type = 0 5 | pixel_depth = 8 6 | rowbytes = 32 7 | filter = 1 8 | data = [ 0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 9 | expected = [ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, ] 10 | width = 32 11 | bit_depth = 8 12 | channels = 1 13 | color_type = 0 14 | pixel_depth = 8 15 | rowbytes = 32 16 | filter = 1 17 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 18 | expected = [ 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, ] 19 | width = 32 20 | bit_depth = 8 21 | channels = 1 22 | color_type = 0 23 | pixel_depth = 8 24 | rowbytes = 32 25 | filter = 4 26 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 27 | expected = [ 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, ] 28 | width = 32 29 | bit_depth = 8 30 | channels = 1 31 | color_type = 0 32 | pixel_depth = 8 33 | rowbytes = 32 34 | filter = 4 35 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 36 | expected = [ 
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, ] 37 | width = 32 38 | bit_depth = 8 39 | channels = 1 40 | color_type = 0 41 | pixel_depth = 8 42 | rowbytes = 32 43 | filter = 4 44 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 45 | expected = [ 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, ] 46 | width = 32 47 | bit_depth = 8 48 | channels = 1 49 | color_type = 0 50 | pixel_depth = 8 51 | rowbytes = 32 52 | filter = 4 53 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 54 | expected = [ 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, ] 55 | width = 32 56 | bit_depth = 8 57 | channels = 1 58 | color_type = 0 59 | pixel_depth = 8 60 | rowbytes = 32 61 | filter = 4 62 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 63 | expected = [ 0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf, ] 64 | width = 32 65 | bit_depth = 8 66 | channels = 1 67 | color_type = 0 68 | pixel_depth = 8 69 | rowbytes = 32 70 | filter = 1 71 | data = [ 0xe0,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 72 | expected = [ 
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff, ] 73 | width = 32 74 | bit_depth = 8 75 | channels = 1 76 | color_type = 0 77 | pixel_depth = 8 78 | rowbytes = 32 79 | filter = 1 80 | data = [ 0xfe,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 81 | expected = [ 0xfe,0xfd,0xfc,0xfb,0xfa,0xf9,0xf8,0xf7,0xf6,0xf5,0xf4,0xf3,0xf2,0xf1,0xf0,0xef,0xee,0xed,0xec,0xeb,0xea,0xe9,0xe8,0xe7,0xe6,0xe5,0xe4,0xe3,0xe2,0xe1,0xe0,0xdf, ] 82 | width = 32 83 | bit_depth = 8 84 | channels = 1 85 | color_type = 0 86 | pixel_depth = 8 87 | rowbytes = 32 88 | filter = 4 89 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 90 | expected = [ 0xde,0xdd,0xdc,0xdb,0xda,0xd9,0xd8,0xd7,0xd6,0xd5,0xd4,0xd3,0xd2,0xd1,0xd0,0xcf,0xce,0xcd,0xcc,0xcb,0xca,0xc9,0xc8,0xc7,0xc6,0xc5,0xc4,0xc3,0xc2,0xc1,0xc0,0xbf, ] 91 | width = 32 92 | bit_depth = 8 93 | channels = 1 94 | color_type = 0 95 | pixel_depth = 8 96 | rowbytes = 32 97 | filter = 4 98 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 99 | expected = [ 0xbe,0xbd,0xbc,0xbb,0xba,0xb9,0xb8,0xb7,0xb6,0xb5,0xb4,0xb3,0xb2,0xb1,0xb0,0xaf,0xae,0xad,0xac,0xab,0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3,0xa2,0xa1,0xa0,0x9f, ] 100 | width = 32 101 | bit_depth = 8 102 | channels = 1 103 | color_type = 0 104 | pixel_depth = 8 105 | rowbytes = 32 106 | filter = 4 107 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 108 | expected = [ 
0x9e,0x9d,0x9c,0x9b,0x9a,0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92,0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8b,0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83,0x82,0x81,0x80,0x7f, ] 109 | width = 32 110 | bit_depth = 8 111 | channels = 1 112 | color_type = 0 113 | pixel_depth = 8 114 | rowbytes = 32 115 | filter = 4 116 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 117 | expected = [ 0x7e,0x7d,0x7c,0x7b,0x7a,0x79,0x78,0x77,0x76,0x75,0x74,0x73,0x72,0x71,0x70,0x6f,0x6e,0x6d,0x6c,0x6b,0x6a,0x69,0x68,0x67,0x66,0x65,0x64,0x63,0x62,0x61,0x60,0x5f, ] 118 | width = 32 119 | bit_depth = 8 120 | channels = 1 121 | color_type = 0 122 | pixel_depth = 8 123 | rowbytes = 32 124 | filter = 4 125 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 126 | expected = [ 0x5e,0x5d,0x5c,0x5b,0x5a,0x59,0x58,0x57,0x56,0x55,0x54,0x53,0x52,0x51,0x50,0x4f,0x4e,0x4d,0x4c,0x4b,0x4a,0x49,0x48,0x47,0x46,0x45,0x44,0x43,0x42,0x41,0x40,0x3f, ] 127 | width = 32 128 | bit_depth = 8 129 | channels = 1 130 | color_type = 0 131 | pixel_depth = 8 132 | rowbytes = 32 133 | filter = 4 134 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 135 | expected = [ 0x3e,0x3d,0x3c,0x3b,0x3a,0x39,0x38,0x37,0x36,0x35,0x34,0x33,0x32,0x31,0x30,0x2f,0x2e,0x2d,0x2c,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x23,0x22,0x21,0x20,0x1f, ] 136 | width = 32 137 | bit_depth = 8 138 | channels = 1 139 | color_type = 0 140 | pixel_depth = 8 141 | rowbytes = 32 142 | filter = 1 143 | data = [ 0x1e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01, ] 144 | expected = [ 
0x1e,0x1d,0x1c,0x1b,0x1a,0x19,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00,0x01, ] 145 | width = 32 146 | bit_depth = 8 147 | channels = 1 148 | color_type = 0 149 | pixel_depth = 8 150 | rowbytes = 32 151 | filter = 1 152 | data = [ 0x02,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 153 | expected = [ 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21, ] 154 | width = 32 155 | bit_depth = 8 156 | channels = 1 157 | color_type = 0 158 | pixel_depth = 8 159 | rowbytes = 32 160 | filter = 4 161 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 162 | expected = [ 0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,0x40,0x41, ] 163 | width = 32 164 | bit_depth = 8 165 | channels = 1 166 | color_type = 0 167 | pixel_depth = 8 168 | rowbytes = 32 169 | filter = 4 170 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 171 | expected = [ 0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,0x60,0x61, ] 172 | width = 32 173 | bit_depth = 8 174 | channels = 1 175 | color_type = 0 176 | pixel_depth = 8 177 | rowbytes = 32 178 | filter = 4 179 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 180 | expected = [ 
0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,0x80,0x81, ] 181 | width = 32 182 | bit_depth = 8 183 | channels = 1 184 | color_type = 0 185 | pixel_depth = 8 186 | rowbytes = 32 187 | filter = 4 188 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 189 | expected = [ 0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,0xa0,0xa1, ] 190 | width = 32 191 | bit_depth = 8 192 | channels = 1 193 | color_type = 0 194 | pixel_depth = 8 195 | rowbytes = 32 196 | filter = 4 197 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 198 | expected = [ 0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,0xc0,0xc1, ] 199 | width = 32 200 | bit_depth = 8 201 | channels = 1 202 | color_type = 0 203 | pixel_depth = 8 204 | rowbytes = 32 205 | filter = 4 206 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 207 | expected = [ 0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,0xe0,0xe1, ] 208 | width = 32 209 | bit_depth = 8 210 | channels = 1 211 | color_type = 0 212 | pixel_depth = 8 213 | rowbytes = 32 214 | filter = 1 215 | data = [ 0xe2,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0xff,0xff, ] 216 | expected = [ 
0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff,0xfe,0xfd, ] 217 | width = 32 218 | bit_depth = 8 219 | channels = 1 220 | color_type = 0 221 | pixel_depth = 8 222 | rowbytes = 32 223 | filter = 1 224 | data = [ 0xfc,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 225 | expected = [ 0xfc,0xfb,0xfa,0xf9,0xf8,0xf7,0xf6,0xf5,0xf4,0xf3,0xf2,0xf1,0xf0,0xef,0xee,0xed,0xec,0xeb,0xea,0xe9,0xe8,0xe7,0xe6,0xe5,0xe4,0xe3,0xe2,0xe1,0xe0,0xdf,0xde,0xdd, ] 226 | width = 32 227 | bit_depth = 8 228 | channels = 1 229 | color_type = 0 230 | pixel_depth = 8 231 | rowbytes = 32 232 | filter = 4 233 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 234 | expected = [ 0xdc,0xdb,0xda,0xd9,0xd8,0xd7,0xd6,0xd5,0xd4,0xd3,0xd2,0xd1,0xd0,0xcf,0xce,0xcd,0xcc,0xcb,0xca,0xc9,0xc8,0xc7,0xc6,0xc5,0xc4,0xc3,0xc2,0xc1,0xc0,0xbf,0xbe,0xbd, ] 235 | width = 32 236 | bit_depth = 8 237 | channels = 1 238 | color_type = 0 239 | pixel_depth = 8 240 | rowbytes = 32 241 | filter = 4 242 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 243 | expected = [ 0xbc,0xbb,0xba,0xb9,0xb8,0xb7,0xb6,0xb5,0xb4,0xb3,0xb2,0xb1,0xb0,0xaf,0xae,0xad,0xac,0xab,0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3,0xa2,0xa1,0xa0,0x9f,0x9e,0x9d, ] 244 | width = 32 245 | bit_depth = 8 246 | channels = 1 247 | color_type = 0 248 | pixel_depth = 8 249 | rowbytes = 32 250 | filter = 4 251 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 252 | expected = [ 
0x9c,0x9b,0x9a,0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92,0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8b,0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83,0x82,0x81,0x80,0x7f,0x7e,0x7d, ] 253 | width = 32 254 | bit_depth = 8 255 | channels = 1 256 | color_type = 0 257 | pixel_depth = 8 258 | rowbytes = 32 259 | filter = 4 260 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 261 | expected = [ 0x7c,0x7b,0x7a,0x79,0x78,0x77,0x76,0x75,0x74,0x73,0x72,0x71,0x70,0x6f,0x6e,0x6d,0x6c,0x6b,0x6a,0x69,0x68,0x67,0x66,0x65,0x64,0x63,0x62,0x61,0x60,0x5f,0x5e,0x5d, ] 262 | width = 32 263 | bit_depth = 8 264 | channels = 1 265 | color_type = 0 266 | pixel_depth = 8 267 | rowbytes = 32 268 | filter = 4 269 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 270 | expected = [ 0x5c,0x5b,0x5a,0x59,0x58,0x57,0x56,0x55,0x54,0x53,0x52,0x51,0x50,0x4f,0x4e,0x4d,0x4c,0x4b,0x4a,0x49,0x48,0x47,0x46,0x45,0x44,0x43,0x42,0x41,0x40,0x3f,0x3e,0x3d, ] 271 | width = 32 272 | bit_depth = 8 273 | channels = 1 274 | color_type = 0 275 | pixel_depth = 8 276 | rowbytes = 32 277 | filter = 4 278 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 279 | expected = [ 0x3c,0x3b,0x3a,0x39,0x38,0x37,0x36,0x35,0x34,0x33,0x32,0x31,0x30,0x2f,0x2e,0x2d,0x2c,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x23,0x22,0x21,0x20,0x1f,0x1e,0x1d, ] 280 | width = 32 281 | bit_depth = 8 282 | channels = 1 283 | color_type = 0 284 | pixel_depth = 8 285 | rowbytes = 32 286 | filter = 1 287 | data = [ 0x1c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x01,0x01, ] 288 | expected = [ 
#! /usr/bin/env python2

"""Interactive helper for vetting regression-test output PDFs.

Walks every PDF under the results directory, skips files already
marked in the expectations file (by hash or by skip/xfail/fail
markers), and prompts the operator to view or compare each remaining
file and record a verdict: good, bad, or skip.  Verdicts are appended
to the expectations file.
"""

import sys
import os
import subprocess
import hashlib

import expected
import static_pdfs

# Map bare file names to their full paths in the static PDF corpus.
source_pdfs = static_pdfs.pdffiles[0]
source_pdfs = dict((os.path.basename(x), x) for x in source_pdfs)

result_dir = expected.result_dir

for subdir in sorted(os.listdir(result_dir)):
    dstd = os.path.join(result_dir, subdir)
    if not os.path.isdir(dstd):
        continue
    for pdffile in sorted(os.listdir(dstd)):
        testname = '%s/%s' % (subdir, pdffile)
        srcf = source_pdfs.get(pdffile)
        dstf = os.path.join(dstd, pdffile)
        if pdffile not in source_pdfs:
            print('\n Skipping %s -- source not found' % testname)
            continue

        with open(dstf, 'rb') as f:
            data = f.read()
        # Renamed from `hash` to avoid shadowing the builtin.
        md5sum = hashlib.md5(data).hexdigest()
        # Any of these markers in the expectations file means the file
        # has already been reviewed (or deliberately skipped).
        skipset = set((md5sum, 'skip', 'xfail', 'fail', '!' + md5sum))
        if expected.results[testname] & skipset:
            print('\n Skipping %s -- marked done' % testname)
            continue
        if os.path.exists('foobar.pdf'):
            os.remove('foobar.pdf')
        builtdiff = False
        while True:
            sys.stdout.write('''
 Test case %s

 c = compare using imagemagick and okular
 f = display foobar.pdf (result from comparison)
 o = display results with okular
 a = display results with acrobat

 s = mark 'skip' and go to next PDF
 g = mark as good and go to next PDF
 b = mark as bad and go to next PDF
 n = next pdf without marking
 q = quit
 --> ''' % testname)
            sel = raw_input()
            if sel == 'q':
                raise SystemExit(0)
            if sel == 'n':
                break
            if sel == 'c':
                subprocess.call(('compare', '-verbose', srcf, dstf,
                                 'foobar.pdf'))
                builtdiff = True
                continue
            if sel == 'f':
                subprocess.call(('okular', 'foobar.pdf'))
                continue
            if sel == 'o':
                subprocess.call(('okular', srcf, dstf))
                continue
            if sel == 'a':
                if builtdiff:
                    subprocess.call(('acroread', srcf, dstf, 'foobar.pdf'))
                else:
                    subprocess.call(('acroread', srcf, dstf))
                continue

            # BUGFIX: the original tested `sel in 'sgb'`, a substring
            # check, so an empty input (bare Enter) -- and inputs like
            # 'sg' -- matched and silently recorded a bad-mark.  Use an
            # exact-membership test instead.
            if sel in ('s', 'g', 'b'):
                # 'g' records the hash as good, 'b' as known-bad
                # ('!' + hash), 's' as an unconditional skip.
                results = (md5sum if sel == 'g' else
                           ' skip' if sel == 's' else '!' + md5sum)
                with open(expected.expectedf, 'a') as f:
                    f.write('%s %s\n' % (testname, results))
                break
12 | ''' 13 | 14 | import os 15 | import collections 16 | from pdfrw.py23_diffs import convert_load 17 | 18 | root_dir = os.path.dirname(__file__) 19 | result_dir = 'tmp_results' 20 | if os.path.exists('ramdisk'): 21 | result_dir = os.path.join('ramdisk', result_dir) 22 | result_dir = os.path.join(root_dir, result_dir) 23 | 24 | for sourcef in ('mytests.txt', 'expected.txt'): 25 | expectedf = os.path.join(root_dir, sourcef) 26 | if os.path.exists(expectedf): 27 | break 28 | 29 | 30 | def results(): 31 | results = collections.defaultdict(set) 32 | with open(expectedf, 'rb') as f: 33 | for line in f: 34 | line = convert_load(line) 35 | line = line.split('#', 1)[0].split() 36 | if not line: 37 | continue 38 | key, value = line 39 | results[key].add(value) 40 | return results 41 | results = results() 42 | -------------------------------------------------------------------------------- /tests/expected.txt: -------------------------------------------------------------------------------- 1 | # Example programs 2 | 3 | examples/4up_b1c400de699af29ea3f1983bb26870ab 1b73c612c40b5082d955ed72f63644bd 4 | examples/alter_b1c400de699af29ea3f1983bb26870ab 3c3ee465f45a685ba7098691be05a5ab 5 | examples/booklet_b1c400de699af29ea3f1983bb26870ab d711b74110eefb4e9e6bf1a5bea16bfe 6 | examples/extract_1975ef8db7355b1d691bc79d0749574b b4f5ee36a288da970ed040a9a733c8b0 7 | examples/extract_c5c895deecf7a7565393587e0d61be2b 539aad09ef80907bb396c3260eb87d7b 8 | examples/extract_d711b74110eefb4e9e6bf1a5bea16bfe 26ddfd09c6e6002228f06782c8544ac4 9 | examples/print_two_b1c400de699af29ea3f1983bb26870ab 73c8a16aba44548c2c06dae6e2551961 10 | examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5 880a9578197130273ccb51265af08029 11 | examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe 780a9abe26a9de0b5b95ee22c4835e4b 12 | 13 | examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 62bb9b746ff5932d3f1b88942d36a81d 14 | 
examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 7633ba56641115050ba098ecbef8d331 15 | examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c fe2330d42b3bfc06212415f295752f0e 16 | examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612 17 | 18 | # All these are in the poster test 19 | examples/subset_1975ef8db7355b1d691bc79d0749574b_21 5057f345f1a1109a0e54276a68e8f8df 20 | examples/rotate_5057f345f1a1109a0e54276a68e8f8df_90_1 881f4dc8dcf069e707bf61af95492d86 21 | examples/poster_881f4dc8dcf069e707bf61af95492d86 a34be06d22105b6c02394a9f278fec0d 22 | 23 | examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab e21dfdd9ae56ddb261dc3d02bf6da198 24 | examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 410063b7fbae1c6d5af33758e2b43450 25 | examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 745f1ac31a18d86afb294a449b72cb98 26 | examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 88bd087c4dc039ced05faea3920cbec5 27 | 28 | # List things that need work here (typically cause exceptions) 29 | 30 | # Bad info dict -- works otherwise 31 | 32 | simple/b1c400de699af29ea3f1983bb26870ab.pdf ecf2e28de18a724b53670c0d5637ec28 33 | repaginate/b1c400de699af29ea3f1983bb26870ab.pdf 4d7d6c5f6e14c6eac1dfc055cebfa499 34 | 35 | # 07b0ba4 is missing an object. Best we can do is report it 36 | # (and we do) 37 | 38 | repaginate/07b0ba4cff1c6ff73fd468b04b013457.pdf 993c763e085bce7ecc941ba104f4c892 39 | simple/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 40 | 41 | #b107 has a single page, but with an empty contents dict. 
42 | 43 | repaginate/b107669d1dd69eabb89765fabb2cb321.pdf 0652d2da25b50cad75863d0e2bbaa878 44 | simple/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 45 | 46 | # Encrypted files 47 | 48 | repaginate/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip 49 | simple/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip 50 | compress/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip 51 | decompress/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip 52 | repaginate/6e122f618c27f3aa9a689423e3be6b8d.pdf skip 53 | simple/6e122f618c27f3aa9a689423e3be6b8d.pdf skip 54 | compress/6e122f618c27f3aa9a689423e3be6b8d.pdf skip 55 | decompress/6e122f618c27f3aa9a689423e3be6b8d.pdf skip 56 | repaginate/7dc787639aa6765214e9ff5494d231ed.pdf skip 57 | simple/7dc787639aa6765214e9ff5494d231ed.pdf skip 58 | compress/7dc787639aa6765214e9ff5494d231ed.pdf skip 59 | decompress/7dc787639aa6765214e9ff5494d231ed.pdf skip 60 | repaginate/b4b27aaa1f9c7c524298e98be279bebb.pdf skip 61 | simple/b4b27aaa1f9c7c524298e98be279bebb.pdf skip 62 | compress/b4b27aaa1f9c7c524298e98be279bebb.pdf skip 63 | decompress/b4b27aaa1f9c7c524298e98be279bebb.pdf skip 64 | repaginate/b5b6c6405d7b48418bccf97277957664.pdf skip 65 | simple/b5b6c6405d7b48418bccf97277957664.pdf skip 66 | compress/b5b6c6405d7b48418bccf97277957664.pdf skip 67 | decompress/b5b6c6405d7b48418bccf97277957664.pdf skip 68 | repaginate/bd0ef57aec16ded45bd89d61b54af0be.pdf skip 69 | simple/bd0ef57aec16ded45bd89d61b54af0be.pdf skip 70 | compress/bd0ef57aec16ded45bd89d61b54af0be.pdf skip 71 | decompress/bd0ef57aec16ded45bd89d61b54af0be.pdf skip 72 | repaginate/dbb807a878ac1da6b91ac15c9de4e209.pdf skip 73 | simple/dbb807a878ac1da6b91ac15c9de4e209.pdf skip 74 | compress/dbb807a878ac1da6b91ac15c9de4e209.pdf skip 75 | decompress/dbb807a878ac1da6b91ac15c9de4e209.pdf skip 76 | 77 | 78 | 79 | # List good hashes for round-trips here. 
80 | 81 | repaginate/06c86654f9a77e82f9adaa0086fc391c.pdf 848966fe40a1e3de842f82700dc6d67b 82 | repaginate/08f69084d72dabc5dfdcf5c1ff2a719f.pdf b8c60878b0e0ce81cb6e8777038166b1 83 | repaginate/09715ec1a7b0f3a7ae02b3046f627b9f.pdf daf7cff9c0a15bbb347489f9fbda25f8 84 | repaginate/0a61de50b5ee0ea4d5d69c95dab817a3.pdf c6cd38b1131c4b856f60ebfcf51da6f5 85 | repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 43433398ccb1edaaee734f4949a5cc3c 86 | repaginate/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 20dc3be2affe9082564c01b1146d7598 87 | repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 7130f1568526247895856806b3879db4 88 | repaginate/22628a7ed578b622520325673ab2a4f2.pdf e312c9c588a5ccdb1a11ac37149b178b 89 | repaginate/2ac7c68e26a8ef797aead15e4875cc6d.pdf e7344551183415d6257e2cab2aef4a61 90 | repaginate/295d26e61a85635433f8e4b768953f60.pdf a89a9fa39812ecd9fa5d6b9e785f389d 91 | repaginate/2d31f356c37dadd04b83ecc4e9a739a0.pdf bc04b61b41cb51f6a1c1da79fb387795 92 | repaginate/2fac0d9a189ca5fcef8626153d050be8.pdf 95fe3d9258ace5bdccb95a55c2c8cb22 93 | repaginate/319c998910453bc44d40c7748cd2cb79.pdf c0da6bf6db273bdb1385f408dcf063d0 94 | repaginate/35df0b8cff4afec0c08f08c6a5bc9857.pdf 3568e1c885a461b350c790ec5b729af3 95 | repaginate/365b9c95574ee8944370fe286905d0e8.pdf 84e5fc0d4f30ff8db05780fd244d9cf0 96 | repaginate/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e 97 | repaginate/49e31fd074eca6af981d78d42d0078ec.pdf 77fd3fa86c7c0166a373b66cfef357d2 98 | repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf afc90878b1306483dbde37c3a50b6a45 99 | repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 894bf526c0a73ab70ebfd9bf3d614315 100 | repaginate/5f0cff36d0ad74536a6513a98a755016.pdf 3298a3a13439764102395a34d571ff69 101 | repaginate/5f265db2736850782aeaba2571a3c749.pdf 2e3046813ce6e40a39bd759a3c8a3c8c 102 | repaginate/6a42c8c79b807bf164d31071749e07b0.pdf bf00d5e44869ae59eb859860d7d5373f 103 | repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 612cdd84eeac797a1c42fc91756b6d9e 104 | 
repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261 105 | repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf df4d756e2230c333f0c58ad354b5b51c 106 | repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 107 | repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233 108 | repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea 109 | repaginate/9d8626d18b1d8807d271e6ffc409446a.pdf cdfcf8add1af9e612ba1a2ee06a6a273 110 | repaginate/9f98322c243fe67726d56ccfa8e0885b.pdf 69503ac140a1e4f1322f9350646e3dae 111 | repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8cddb0f9741f7515107b1bce5dc90c83 112 | repaginate/c5c895deecf7a7565393587e0d61be2b.pdf 59e350c6f7d7b89fab36a4019bb526fd 113 | repaginate/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 3623b7f200818c63cb6838f9678a4840 114 | repaginate/d6fd9567078b48c86710e9c49173781f.pdf 874b532f61139261f71afb5987dd2a68 115 | repaginate/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 7d3c3ae13cc7d53e7fa6ef046e15dbaa 116 | repaginate/ec00d5825f47b9d0faa953b1709163c3.pdf 8e6a481476c2b3bdd64ce8e36f8fe273 117 | repaginate/ed81787b83cc317c9f049643b853bea3.pdf 4636b68f294302417b81aaaadde1c73d 118 | 119 | 120 | simple/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469 121 | simple/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 5a41601f6033356539e623091a3f79ef 122 | simple/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 123 | simple/09715ec1a7b0f3a7ae02b3046f627b9f.pdf c4e4b3b725bd5fc3b008f1ac6251ad1c 124 | simple/1975ef8db7355b1d691bc79d0749574b.pdf 475c28c9588f3a7f6110d30f391758c4 125 | simple/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 3f17f19fd92adf01998bb13a0ee52b92 126 | simple/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7 127 | simple/22628a7ed578b622520325673ab2a4f2.pdf 1163cec415728899e997a29be465d02d 128 | simple/295d26e61a85635433f8e4b768953f60.pdf fe3b8960c7f877db05c7cd12c9c6e097 129 | 
simple/2ac7c68e26a8ef797aead15e4875cc6d.pdf 2623eae06eada9587574f8ddd7fc80fa 130 | simple/2d31f356c37dadd04b83ecc4e9a739a0.pdf 9af4794d366fbd5840836e6612ceedd2 131 | simple/2fac0d9a189ca5fcef8626153d050be8.pdf 458501ecda909b00262b9654f0b09ebf 132 | simple/319c998910453bc44d40c7748cd2cb79.pdf 8c84e36ec1db8c1dbfaa312646e000b4 133 | simple/35df0b8cff4afec0c08f08c6a5bc9857.pdf 0a2926c23ad916c449d5dadcfa9d38ef 134 | simple/365b9c95574ee8944370fe286905d0e8.pdf cf3bfac41f410bf5bd657e3f906dfbc6 135 | simple/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e 136 | simple/49e31fd074eca6af981d78d42d0078ec.pdf 2c316537a5b0917634cbbdc5b91511df 137 | simple/536dfc6fbadd87c03eb59375d091eb53.pdf 319851765c70ba103c4191f7ec2148db 138 | simple/569f8094597bbe5b58efc3a7c6e14e87.pdf 025f1bf95cc537c36b8c3a044758b86c 139 | simple/5f0cff36d0ad74536a6513a98a755016.pdf 8476fd75e75394fcbbe02816d0640e7d 140 | simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9 141 | simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 142 | simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf 143 | simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05 144 | simple/707e3e2d17cbe9ec2273414b3b63f333.pdf fb6a8eb3cdc2fbef125babe8815f3b70 145 | simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 146 | simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e 147 | simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb 148 | simple/9d8626d18b1d8807d271e6ffc409446a.pdf 2358d654bf20d2b9d179ab009a615c4e 149 | simple/9f98322c243fe67726d56ccfa8e0885b.pdf 9290b4c32f005e1e4c7f431955246c4c 150 | simple/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6b406128e0ed1ac23dc5a0ee34d1f717 151 | simple/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c 152 | simple/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 2083f0e55cf06d88df02956a21bfef23 153 | 
simple/d6fd9567078b48c86710e9c49173781f.pdf 77464ec5cfdacb61a73b506bc4945631 154 | simple/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 5bc96989bc4f4b6438da953443336124 155 | simple/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318 156 | simple/ed81787b83cc317c9f049643b853bea3.pdf c227d627217dc6808c50e80063734d27 157 | 158 | 159 | decompress/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469 160 | decompress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 161 | decompress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf ccadb859eff77d525bf86f6d821ccf1b 162 | decompress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 2b9c8b26a92c7645cfefa1bfa8a8ab36 163 | decompress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 164 | decompress/1975ef8db7355b1d691bc79d0749574b.pdf a7d5eaf0a4259352898047f284e20b90 165 | decompress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 40d1cc7e26213510319b519032aff637 166 | decompress/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7 167 | decompress/22628a7ed578b622520325673ab2a4f2.pdf b68c7bf46ad4b70addc3369ba669dc7b 168 | decompress/295d26e61a85635433f8e4b768953f60.pdf 6f2ae8fb0ff853ed63537d8767ce13ad 169 | decompress/2ac7c68e26a8ef797aead15e4875cc6d.pdf d8d5589991ce15c834f35b340e7147a9 170 | decompress/2d31f356c37dadd04b83ecc4e9a739a0.pdf 5a6b732690c42f07ae6a41c37cf28ff3 171 | decompress/2fac0d9a189ca5fcef8626153d050be8.pdf 998366ad30becd31bed711ba78c59a7f 172 | decompress/319c998910453bc44d40c7748cd2cb79.pdf 7933a591caf3d49e45a42733bc48f99e 173 | decompress/35df0b8cff4afec0c08f08c6a5bc9857.pdf e339ae7747898d2faba270473171692a 174 | decompress/365b9c95574ee8944370fe286905d0e8.pdf 9da0100b5844c86e93093d0fbc78b3f6 175 | decompress/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e 176 | decompress/49e31fd074eca6af981d78d42d0078ec.pdf 4e9bf31753ff7232de4c612a31bd21fc 177 | decompress/536dfc6fbadd87c03eb59375d091eb53.pdf f755d2ef6052270121168d2341ad04b6 178 | 
decompress/569f8094597bbe5b58efc3a7c6e14e87.pdf aa782a7d553ec767ab61517996337f58 179 | decompress/5f0cff36d0ad74536a6513a98a755016.pdf 9caae4e3a21eba9e4aa76620e7508d56 180 | decompress/5f265db2736850782aeaba2571a3c749.pdf 836abcf6e6e1d39ad96481eb20e9b149 181 | decompress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 182 | decompress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 226773cac79e1a5fed1379a0501a5df0 183 | decompress/7037a992b80b60f0294016037baa9292.pdf c9a3602b26d82ae145d9f5822125a158 184 | decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf 3250a56e14a9855eccd67bb347808d24 185 | decompress/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 186 | decompress/72eb207b8f882618899aa7a65d3cecda.pdf a4366874fb6db1d9a0c998361ea32b8d 187 | decompress/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb 188 | decompress/9d8626d18b1d8807d271e6ffc409446a.pdf 6498bd354bb221516517a4c49bcb94f6 189 | decompress/9f98322c243fe67726d56ccfa8e0885b.pdf 0fa96e3669d14c64fff159d5aa457014 190 | decompress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 191 | decompress/b1c400de699af29ea3f1983bb26870ab.pdf 08a5de62129a96d8d9a8f27052bfb227 192 | decompress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8e0eb14c12fc89e7cbb4001861d7198f 193 | decompress/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c 194 | decompress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf aaed7215c60dbf19bb4fefe88602196a 195 | decompress/d6fd9567078b48c86710e9c49173781f.pdf 1fd1b4bc184e64ea6260c30261adf9c4 196 | decompress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 62b87ec47f1b93d75c32d0c78b6c2380 197 | decompress/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318 198 | decompress/ed81787b83cc317c9f049643b853bea3.pdf 5c0a3bc5b19d58d48767bff8f31daae0 199 | 200 | compress/06c86654f9a77e82f9adaa0086fc391c.pdf b6fb771b49971f2b63a197f3ef1531aa 201 | compress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 
202 | compress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 3e7e53a92f96d52bbffe3ffa03d7b11e 203 | compress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 563ffde527978517393d9166b02c17d3 204 | compress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 205 | compress/1975ef8db7355b1d691bc79d0749574b.pdf d505caa75f8becea1a1c810f4a143976 206 | compress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf b78f4e45aef4149a068a0225ea1be88c 207 | compress/1f5dd128c3757420a881a155f2f8ace3.pdf 22148c2a65129f936b8e8c67397e5bf6 208 | compress/22628a7ed578b622520325673ab2a4f2.pdf 54ec1fa64e64bfd146f13001444346f4 209 | compress/295d26e61a85635433f8e4b768953f60.pdf 2ed8eb04a8c66138883a43917cd9c0c5 210 | compress/2ac7c68e26a8ef797aead15e4875cc6d.pdf efe942d1e5b9f2f139c7e1f2e46ced24 211 | compress/2d31f356c37dadd04b83ecc4e9a739a0.pdf eedc938e6782e1d15755b5c54fffc17c 212 | compress/2fac0d9a189ca5fcef8626153d050be8.pdf 2d1b8e82cdc82c82bec3969acf026d30 213 | compress/319c998910453bc44d40c7748cd2cb79.pdf 5b9ca8444a17db8cb6fa427da7a89e44 214 | compress/35df0b8cff4afec0c08f08c6a5bc9857.pdf 07c064df0fc0fd0c80c4a196b4c38403 215 | compress/365b9c95574ee8944370fe286905d0e8.pdf 1b98e92f74c2f5324cce5fc8fbe46c15 216 | compress/4805fdcd7e142e8df3c04c6ba06025af.pdf 4aa2e922739ba865da30a9917ddffe8e 217 | compress/49e31fd074eca6af981d78d42d0078ec.pdf 7422b3d205650552ff81bc06c89c13ba 218 | compress/536dfc6fbadd87c03eb59375d091eb53.pdf c18b0f0f8e633fe15b17772c701a76a9 219 | compress/569f8094597bbe5b58efc3a7c6e14e87.pdf 3ee711f7fc678787346dca5d06ee5192 220 | compress/5f0cff36d0ad74536a6513a98a755016.pdf bd2a1edf6299d5dc2e1ad6b5fc8bcc20 221 | compress/5f265db2736850782aeaba2571a3c749.pdf bb4898beac50171de7502f13925af80c 222 | compress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 223 | compress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 1c3fbae41e7cad7deca13fab93514bc7 224 | compress/7037a992b80b60f0294016037baa9292.pdf 9182a9765544e4a91404db65a6f951d7 225 | 
compress/707e3e2d17cbe9ec2273414b3b63f333.pdf 0e75dda73bf18d9968499277ab1a367e 226 | compress/71a751ce2d93a6a5d6ff21735b701fb7.pdf faa7eb31789a3789f65de30a4e58e594 227 | compress/72eb207b8f882618899aa7a65d3cecda.pdf 0155549fc04357220cc6be541dda7bc1 228 | compress/97ba0a239cefa0dc727c2f1be050ec6c.pdf 067bfee3b2bd9c250e7c4157ff543a81 229 | compress/9d8626d18b1d8807d271e6ffc409446a.pdf 7c124d2d0b0c7b21cce91740dfb2a8fd 230 | compress/9f98322c243fe67726d56ccfa8e0885b.pdf f9d59774a75bb2dfc08ff7df65aa3048 231 | compress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 232 | compress/b1c400de699af29ea3f1983bb26870ab.pdf 6eaeef32b0e28959e7681c8b02d8814f 233 | compress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6ef82921011eb79a9d860214e213c868 234 | compress/c5c895deecf7a7565393587e0d61be2b.pdf 30d87ac6aa59d65169c389ee3badbca8 235 | compress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf e4c768be930e9980c970d51d5f447e24 236 | compress/d6fd9567078b48c86710e9c49173781f.pdf cbc8922b8bea08928463b287767ec229 237 | compress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf e893e407b3c2366d4ca822ce80b45c2c 238 | compress/ec00d5825f47b9d0faa953b1709163c3.pdf 9ba3db0dedec74c3d2a6f033f1b22a81 239 | compress/ed81787b83cc317c9f049643b853bea3.pdf 2ceda401f68a44a3fb1da4e0f9dfc578 240 | -------------------------------------------------------------------------------- /tests/myprofile.py: -------------------------------------------------------------------------------- 1 | import cProfile 2 | import unittest 3 | import test_roundtrip 4 | 5 | cProfile.run('unittest.main(test_roundtrip)') 6 | -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | #! 
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
Run from the directory above like so:

python -m tests.test_examples

A PDF that has been determined to be good or bad
should be added to expected.txt with either a good
checksum, or just the word "fail".

These tests are incomplete, but they allow us to try
out various PDFs.  There is a collection of difficult
PDFs available on github.

In order to use them:

1) Ensure that github.com/pmaupin/static_pdfs is on your path.

2) Use the imagemagick compare program to look at differences
   between the static_pdfs/global directory and the tmp_results
   directory after you run this.

'''
import sys
import os
import hashlib
import subprocess
import static_pdfs
import expected

from pdfrw.py23_diffs import convert_store
from pdfrw import PdfReader, PdfWriter

try:
    import unittest2 as unittest
except ImportError:
    import unittest


# Template for locating the example scripts relative to the test tree.
prog_dir = os.path.join(expected.root_dir, '..', 'examples', '%s.py')
prog_dir = os.path.abspath(prog_dir)
dstdir = os.path.join(expected.result_dir, 'examples')
hashfile = os.path.join(expected.result_dir, 'hashes.txt')

# Map the short (extension-less) names used in the tests to full PDF paths.
lookup = static_pdfs.pdffiles[0]
lookup = dict((os.path.basename(x)[:-4], x) for x in lookup)


class TestOnePdf(unittest.TestCase):

    def do_test(self, params, prev_results=None, scrub=False):
        """Run one example script against a source PDF and compare the
        MD5 of its output against the entry recorded in expected.txt.

        params       -- space-separated script name plus arguments; bare
                        PDF hashes are translated to full paths via lookup.
        prev_results -- optional single-element list; on success the
                        output file's hash is stored in prev_results[0]
                        so a follow-on test can chain from this output.
        scrub        -- when true, re-write the script's output through
                        pdfrw first, so the recorded hash is stable
                        across reportlab versions.

        Whatever happens, a one-line report (size, status, key, hash) is
        appended to hashes.txt.
        """
        # Avoid the shared-mutable-default pitfall: each call gets its
        # own scratch list unless the caller supplies one.
        if prev_results is None:
            prev_results = ['']
        params = params.split()
        hashkey = 'examples/%s' % '_'.join(params)
        params = [lookup.get(x, x) for x in params]
        progname = params[0]
        params[0] = prog_dir % progname
        srcf = params[1]
        params.insert(0, sys.executable)
        subdir, progname = os.path.split(progname)
        subdir = os.path.join(dstdir, subdir)
        if not os.path.exists(subdir):
            os.makedirs(subdir)
        os.chdir(subdir)
        dstf = '%s.%s' % (progname, os.path.basename(srcf))
        # When scrubbing, the script's raw output keeps the plain name
        # and the pdfrw-normalized copy becomes the file we hash.
        scrub = scrub and dstf
        dstf = dstf if not scrub else 'final.%s' % dstf
        # 'digest' doubles as the report field when no file was generated.
        digest = '------no-file-generated---------'
        expects = expected.results[hashkey]

        # If the test has been deliberately skipped,
        # we are done.  Otherwise, execute it even
        # if we don't know about it yet, so we have
        # results to compare.

        result = 'fail'
        size = 0
        try:
            if 'skip' in expects:
                result = 'skip requested'
                return self.skipTest(result)
            elif 'xfail' in expects:
                result = 'xfail requested'
                return self.fail(result)

            exists = os.path.exists(dstf)
            if expects or not exists:
                if exists:
                    os.remove(dstf)
                if scrub and os.path.exists(scrub):
                    os.remove(scrub)
                subprocess.call(params)
                if scrub:
                    # Normalize the scrubbed output through pdfrw so the
                    # hash does not depend on the generating library.
                    PdfWriter(dstf).addpages(PdfReader(scrub).pages).write()
            with open(dstf, 'rb') as f:
                data = f.read()
            size = len(data)
            if data:
                digest = hashlib.md5(data).hexdigest()
                lookup[digest] = dstf
                prev_results[0] = digest
            else:
                os.remove(dstf)
            if expects:
                if len(expects) == 1:
                    expects, = expects
                    self.assertEqual(digest, expects)
                else:
                    self.assertIn(digest, expects)
                result = 'pass'
            else:
                result = 'skip'
                self.skipTest('No hash available')
        finally:
            result = '%8d %-20s %s %s\n' % (size, result, hashkey, digest)
            with open(hashfile, 'ab') as f:
                f.write(convert_store(result))

    def test_4up(self):
        self.do_test('4up b1c400de699af29ea3f1983bb26870ab')

    def test_booklet_unspread(self):
        # Chain three scripts: each later one consumes the file whose
        # hash the earlier run left in prev[0].
        prev = [None]
        self.do_test('booklet b1c400de699af29ea3f1983bb26870ab', prev)
        if prev[0] is not None:
            self.do_test('unspread ' + prev[0])
            self.do_test('extract ' + prev[0])

    def test_print_two(self):
        self.do_test('print_two b1c400de699af29ea3f1983bb26870ab')

    def test_watermarks(self):
        self.do_test('watermark b1c400de699af29ea3f1983bb26870ab '
                     '06c86654f9a77e82f9adaa0086fc391c')
        self.do_test('watermark b1c400de699af29ea3f1983bb26870ab '
                     '06c86654f9a77e82f9adaa0086fc391c -u')

    def test_subset(self):
        self.do_test('subset b1c400de699af29ea3f1983bb26870ab 1-3 5')

    def test_alter(self):
        self.do_test('alter b1c400de699af29ea3f1983bb26870ab')

    def test_cat(self):
        self.do_test('cat b1c400de699af29ea3f1983bb26870ab '
                     '06c86654f9a77e82f9adaa0086fc391c')

    def test_rotate(self):
        self.do_test('rotate 707e3e2d17cbe9ec2273414b3b63f333 '
                     '270 1-4 7-8 10-50 52-56')

    def test_poster(self):
        prev = [None]
        self.do_test('subset 1975ef8db7355b1d691bc79d0749574b 21', prev)
        self.do_test('rotate %s 90 1' % prev[0], prev)
        self.do_test('poster %s' % prev[0], prev)

    def test_extract(self):
        self.do_test('extract 1975ef8db7355b1d691bc79d0749574b')
        self.do_test('extract c5c895deecf7a7565393587e0d61be2b')

    # The rl1 examples require reportlab, which needs Python >= 2.7.

    def test_rl1_4up(self):
        if sys.version_info < (2, 7):
            return
        self.do_test('rl1/4up b1c400de699af29ea3f1983bb26870ab',
                     scrub=True)

    def test_rl1_booklet(self):
        if sys.version_info < (2, 7):
            return
        self.do_test('rl1/booklet b1c400de699af29ea3f1983bb26870ab',
                     scrub=True)

    def test_rl1_subset(self):
        if sys.version_info < (2, 7):
            return
        self.do_test('rl1/subset b1c400de699af29ea3f1983bb26870ab 3 5',
                     scrub=True)

    def test_rl1_platypus(self):
        if sys.version_info < (2, 7):
            return
        self.do_test('rl1/platypus_pdf_template b1c400de699af29ea3f1983bb26870ab',
                     scrub=True)


def main():
    unittest.main()
#! /usr/bin/env python
# encoding: utf-8
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
#               2017 Henddher Pedroza, Illinois
# MIT license -- See LICENSE.txt for details

'''
Run from the directory above like so:
python -m tests.test_flate_png
'''


from pdfrw.uncompress import flate_png, flate_png_impl
from pdfrw.py23_diffs import zlib, xrange, from_array, convert_load, convert_store

import unittest
import base64
import array
import logging
import ast
import os

#
# Sample PNGs with filtered scanlines retrieved from
# http://www.schaik.com/pngsuite/pngsuite_fil_png.html
#


def filepath(filename):
    """Return *filename* resolved relative to this test module's directory."""
    pwd = os.path.dirname(__file__)
    return os.path.join(pwd, filename)


def create_data(nc=1, nr=1, bpc=8, ncolors=1, filter_type=0):
    """Build a synthetic filtered scanline buffer.

    nc/nr       -- columns (pixels) and rows
    bpc         -- bits per color component
    ncolors     -- components per pixel
    filter_type -- PNG filter byte used for every row after the first
                   (row 0 always uses filter 0 / None)

    Returns (data, nc, nr, bpc, ncolors) where data is an array('B')
    with one filter byte preceding each row.
    """
    pixel_size = (bpc * ncolors + 7) // 8
    data = []
    for r in xrange(nr):
        data.append(filter_type if r > 0 else 0)  # filter byte
        for c in xrange(nc * pixel_size):
            # Mask to a byte so array('B') cannot overflow for larger
            # shapes; identical values for every existing call site.
            data.append((r * nc * pixel_size + c * pixel_size) % 256)
    data = array.array('B', data)
    logging.debug("Data: %r" % (data))
    return data, nc, nr, bpc, ncolors


def prepend_data_with_filter(data, filter):
    """Return a new array('B') of *data* with *filter* inserted in front."""
    a = array.array('B', data)
    a.insert(0, filter)
    return a


def print_data(data1, data2):
    """Debug helper: dump two byte sequences side by side via logging."""
    if data1 is None:
        return
    for b1, b2 in zip(data1, data2):
        # Tolerate both byte-ints and single-character strings (py2/py3).
        b1 = b1 if type(b1) != str else ord(b1)
        b2 = b2 if type(b2) != str else ord(b2)
        logging.error("%4d %4d" % (b1, b2))
    if len(data1) != len(data2):
        logging.error("Mismatched lengths: %d %d" % (len(data1), len(data2)))
    return None


class TestFlatePNG(unittest.TestCase):

    def test_flate_png(self):
        """Decode a captured real-world predictor-12 stream without error."""
        b64 = 'AAAAAAD//wACAAA2AAAAAQAADwAAAgEAACcAAQL/AAAzAP8AAgAANgACAAEAAO8AAAABAAF1AAAAAgAANgADAAEAAfsAAAACAAA2AAQCAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAQECBXx8AAIAAAGHAAAAAgAANgAMAAEDCcMAAAACAAA2AA0CAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQABBxI2AAAEAfn5AAAWAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAEAAQ6fJgAAAAIAADYAHwIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQABESDsAAAAAgAANgAmAAAAAAD//wIAAAAAAAACARp0hgEBAgAA/eAAAA=='
        predictor, columns, colors, bpc = (12, 6, 1, 8)
        data = base64.b64decode(b64)
        d2, error2 = flate_png(data, predictor, columns, colors, bpc)
        assert d2 is not None
        assert error2 is None

    def test_flate_png_filter_0(self):
        # None filter
        data, nc, nr, bpc, ncolors = create_data(nc=5, nr=7, bpc=8, ncolors=4)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2 is not None
        assert error2 is None

    def test_flate_png_filter_1(self):
        # Sub filter
        data, nc, nr, bpc, ncolors = create_data(nc=2, nr=3, bpc=8, ncolors=4, filter_type=1)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2 is not None
        assert error2 is None

    def test_flate_png_filter_2(self):
        # Up filter
        data, nc, nr, bpc, ncolors = create_data(nc=5, nr=7, bpc=8, ncolors=4, filter_type=2)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2 is not None
        assert error2 is None

    def test_flate_png_filter_3(self):
        # Avg filter
        data, nc, nr, bpc, ncolors = create_data(nc=5, nr=7, bpc=8, ncolors=4, filter_type=3)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2
        assert error2 is None

    def test_flate_png_filter_4(self):
        # Paeth filter
        data, nc, nr, bpc, ncolors = create_data(nc=5, nr=7, bpc=8,
                                                 ncolors=4, filter_type=4)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2
        assert error2 is None

    def test_flate_png_alt_filter_1(self):
        """Sub filter against known libpng reference rows (gray and RGB)."""
        width = 32
        bit_depth = 8
        channels = 1
        color_type = 0
        pixel_depth = 8
        rowbytes = 32
        filter = 1
        data = [ 0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ]
        expected = [ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, ]

        dataf = prepend_data_with_filter(data, filter)
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        expected = array.array('B', expected)
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

        width = 32
        bit_depth = 8
        channels = 3
        color_type = 2
        pixel_depth = 24
        rowbytes = 96
        filter = 1
        data = [ 0xff,0x00,0x08,0x00,0x08,0x07,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08, ]
        expected = [ 0xff,0x00,0x08,0xff,0x08,0x0f,0xff,0x10,0x17,0xff,0x18,0x1f,0xff,0x20,0x27,0xff,0x29,0x2f,0xff,0x31,0x37,0xff,0x39,0x3f,0xff,0x41,0x47,0xff,0x4a,0x4f,0xff,0x52,0x57,0xff,0x5a,0x5f,0xff,0x62,0x67,0xff,0x6a,0x6f,0xff,0x73,0x77,0xff,0x7b,0x7f,0xff,0x83,0x87,0xff,0x8b,0x8f,0xff,0x94,0x97,0xff,0x9c,0x9f,0xff,0xa4,0xa7,0xff,0xac,0xaf,0xff,0xb4,0xb7,0xff,0xbd,0xbf,0xff,0xc5,0xc7,0xff,0xcd,0xcf,0xff,0xd5,0xd7,0xff,0xde,0xdf,0xff,0xe6,0xe7,0xff,0xee,0xef,0xff,0xf6,0xf7,0xff,0xff,0xff, ]

        dataf = prepend_data_with_filter(data, filter)
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        expected = array.array('B', expected)
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

    def test_flate_png_alt_filter_2(self):
        """Up filter: two rows fed together, second depends on the first."""
        width = 32
        bit_depth = 8
        channels = 3
        color_type = 2
        pixel_depth = 24
        rowbytes = 96
        filter = 2
        prev_row = [0xff] * rowbytes
        data = [0x00] * rowbytes
        expected = [0xff] * rowbytes

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

        width = 32
        bit_depth = 16
        channels = 1
        color_type = 0
        pixel_depth = 16
        rowbytes = 64
        filter = 2
        prev_row = [ 0x00,0x00,0x09,0x00,0x12,0x00,0x1b,0x00,0x24,0x00,0x2d,0x00,0x36,0x00,0x3f,0x00,0x48,0x00,0x51,0x00,0x5a,0x00,0x63,0x00,0x6c,0x00,0x75,0x00,0x7e,0x00,0x87,0x00,0x90,0x00,0x99,0x00,0xa2,0x00,0xab,0x00,0xb4,0x00,0xbd,0x00,0xc6,0x00,0xcf,0x00,0xd8,0x00,0xe1,0x00,0xea,0x00,0xf3,0x00,0xfc,0x00,0xf0,0xff,0xd5,0xff,0xba,0xff, ]
        data = [0x02, 0x00] * 29 + [0xfa, 0x00] * 3
        expected = [ 0x02,0x00,0x0b,0x00,0x14,0x00,0x1d,0x00,0x26,0x00,0x2f,0x00,0x38,0x00,0x41,0x00,0x4a,0x00,0x53,0x00,0x5c,0x00,0x65,0x00,0x6e,0x00,0x77,0x00,0x80,0x00,0x89,0x00,0x92,0x00,0x9b,0x00,0xa4,0x00,0xad,0x00,0xb6,0x00,0xbf,0x00,0xc8,0x00,0xd1,0x00,0xda,0x00,0xe3,0x00,0xec,0x00,0xf5,0x00,0xfe,0x00,0xea,0xff,0xcf,0xff,0xb4,0xff, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

    def test_flate_png_alt_filter_3(self):
        """Average filter, grayscale then truecolor rows."""
        width = 32
        bit_depth = 8
        channels = 1
        color_type = 0
        pixel_depth = 8
        rowbytes = 32
        filter = 3
        prev_row = [ 0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0xe3,0xc9,0xf1,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f, ]
        data = [ 0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x69,0x02,0xe4,0xb5,0xc3,0xa1,0xff,0x31,0x51,0xcf,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, ]
        expected = [ 0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0xe8,0xb5,0x7e,0x65,0x5a,0x46,0x61,0xa1,0xe1,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

        width = 32
        bit_depth = 8
        channels = 3
        color_type = 2
        pixel_depth = 24
        rowbytes = 96
        filter = 3
        prev_row = [0] * rowbytes
        data = [ 0xff,0x00,0x08,0x80,0x08,0x0b,0x80,0x0c,0x10,0x80,0x10,0x14,0x80,0x14,0x18,0x80,0x19,0x1c,0x80,0x1d,0x20,0x80,0x21,0x24,0x80,0x25,0x28,0x80,0x2a,0x2c,0x80,0x2d,0x30,0x80,0x31,0x34,0x80,0x35,0x38,0x80,0x39,0x3c,0x80,0x3e,0x40,0x80,0x42,0x44,0x80,0x46,0x48,0x80,0x4a,0x4c,0x80,0x4f,0x50,0x80,0x52,0x54,0x80,0x56,0x58,0x80,0x5a,0x5c,0x80,0x5e,0x60,0x80,0x63,0x64,0x80,0x67,0x68,0x80,0x6b,0x6c,0x80,0x6f,0x70,0x80,0x74,0x74,0x80,0x77,0x78,0x80,0x7b,0x7c,0x80,0x7f,0x80,0x80,0x84,0x84, ]
        expected = [ 0xff,0x00,0x08,0xff,0x08,0x0f,0xff,0x10,0x17,0xff,0x18,0x1f,0xff,0x20,0x27,0xff,0x29,0x2f,0xff,0x31,0x37,0xff,0x39,0x3f,0xff,0x41,0x47,0xff,0x4a,0x4f,0xff,0x52,0x57,0xff,0x5a,0x5f,0xff,0x62,0x67,0xff,0x6a,0x6f,0xff,0x73,0x77,0xff,0x7b,0x7f,0xff,0x83,0x87,0xff,0x8b,0x8f,0xff,0x94,0x97,0xff,0x9c,0x9f,0xff,0xa4,0xa7,0xff,0xac,0xaf,0xff,0xb4,0xb7,0xff,0xbd,0xbf,0xff,0xc5,0xc7,0xff,0xcd,0xcf,0xff,0xd5,0xd7,0xff,0xde,0xdf,0xff,0xe6,0xe7,0xff,0xee,0xef,0xff,0xf6,0xf7,0xff,0xff,0xff, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

    def test_flate_png_alt_filter_4(self):
        """Paeth filter, grayscale then truecolor rows."""
        width = 32
        bit_depth = 8
        channels = 1
        color_type = 0
        pixel_depth = 8
        rowbytes = 32
        filter = 4
        prev_row = [ 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, ]
        data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ]
        expected = [ 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

        width = 32
        bit_depth = 8
        channels = 3
        color_type = 2
        pixel_depth = 24
        rowbytes = 96
        filter = 4
        prev_row = [0] * rowbytes
        data = [ 0xff,0x00,0x08,0x00,0x08,0x07,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08, ]
        expected = [ 0xff,0x00,0x08,0xff,0x08,0x0f,0xff,0x10,0x17,0xff,0x18,0x1f,0xff,0x20,0x27,0xff,0x29,0x2f,0xff,0x31,0x37,0xff,0x39,0x3f,0xff,0x41,0x47,0xff,0x4a,0x4f,0xff,0x52,0x57,0xff,0x5a,0x5f,0xff,0x62,0x67,0xff,0x6a,0x6f,0xff,0x73,0x77,0xff,0x7b,0x7f,0xff,0x83,0x87,0xff,0x8b,0x8f,0xff,0x94,0x97,0xff,0x9c,0x9f,0xff,0xa4,0xa7,0xff,0xac,0xaf,0xff,0xb4,0xb7,0xff,0xbd,0xbf,0xff,0xc5,0xc7,0xff,0xcd,0xcf,0xff,0xd5,0xd7,0xff,0xde,0xdf,0xff,0xe6,0xe7,0xff,0xee,0xef,0xff,0xf6,0xf7,0xff,0xff,0xff, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

    def util_test_flate_png_alt_from_png_log_file(self, filename):
        """Replay a captured libpng decode log through flate_png_impl.

        The .log files contain 'var = value' lines describing the image
        (width, bit_depth, ...) plus per-row 'data'/'expected' byte lists;
        a line starting with "PASS:" terminates the useful portion.
        """
        with open(filepath(filename)) as f:
            data = array.array('B')
            expected = array.array('B')
            width = 0
            bit_depth = 0
            channels = 0
            color_type = 0
            pixel_depth = 0
            rowbytes = 0
            filter = 0
            nrows = 0

            for line in f.readlines():

                if line.startswith("PASS:"):
                    break

                parts = line.split(' = ')
                var = parts[0]
                val = parts[1]

                if var == 'width':
                    width = int(val)

                elif var == 'bit_depth':
                    bit_depth = int(val)

                elif var == 'channels':
                    channels = int(val)

                elif var == 'color_type':
                    color_type = int(val)

                elif var == 'pixel_depth':
                    pixel_depth = int(val)

                elif var == 'rowbytes':
                    rowbytes = int(val)

                elif var == 'filter':
                    filter = int(val)

                elif var == 'data':
                    d = ast.literal_eval(val)
                    # Each stored row is preceded by its filter byte.
                    data.append(filter)
                    data.extend(d)

                elif var == 'expected':
                    e = ast.literal_eval(val)
                    expected.extend(e)
                    nrows += 1

        bytes_per_pixel = pixel_depth // 8

        logging.error("width: %d" % width)
        logging.error("bit_depth: %d" % bit_depth)
        logging.error("channels: %d" % channels)
        logging.error("color_type: %d" % color_type)
        logging.error("pixel_depth: %d" % pixel_depth)
        logging.error("rowbytes: %d" % rowbytes)
        logging.error("filter: %d" % filter)
        logging.error("bytes_per_pixel: %d" % bytes_per_pixel)
        logging.error("expected: %r" % len(expected))
        logging.error("data: %r" % len(data))

        # Sanity-check that the log describes something we can decode.
        assert color_type in [
            0,  # Grayscale (Y)
            2,  # Truecolor (RGB)
            # 3 Indexed is not supported (Palette)
            4,  # Grayscale with alpha (YA)
            6,  # Truecolor with alpha (RGBA)
        ]
        assert filter in [0, 1, 2, 3, 4]
        assert channels * bit_depth == pixel_depth
        assert (pixel_depth // 8) * width == rowbytes
        assert 0 == pixel_depth % 8  # can't support pixels with bit_depth < 8
        assert 8 == bit_depth  # ideally, we should test bit_depth 16 also
        assert nrows * (1 + width * bytes_per_pixel) == len(data)  # 1 filter byte preceding each row
        assert nrows * width * bytes_per_pixel == len(expected)

        result, error = flate_png_impl(data, 12, width, channels, bit_depth)

        # NOTE(review): debug artifacts written unconditionally so a
        # failure can be inspected offline; harmless but noisy.
        import pickle
        with open(filepath('./result.pickle'), 'wb') as f:
            pickle.dump(result, f)
        with open(filepath('./expected.pickle'), 'wb') as f:
            pickle.dump(expected, f)

        assert error is None
        assert expected == result

    def test_flate_png_alt_file_f01n2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./f01n2c08.png.log")

    def test_flate_png_alt_file_f02n2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./f02n2c08.png.log")

    def test_flate_png_alt_file_f03n2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./f03n2c08.png.log")

    def test_flate_png_alt_file_f04n2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./f04n2c08.png.log")

    def test_flate_png_alt_file_basn2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./basn2c08.png.log")

    def test_flate_png_alt_file_basn0g08(self):
        self.util_test_flate_png_alt_from_png_log_file("./basn0g08.png.log")


def main():
    unittest.main()


if __name__ == '__main__':
    main()
__name__ == '__main__': 400 | main() 401 | -------------------------------------------------------------------------------- /tests/test_pdfdict.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # encoding: utf-8 3 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 4 | # Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas 5 | # 2016 James Laird-Wah, Sydney, Australia 6 | # MIT license -- See LICENSE.txt for details 7 | 8 | ''' 9 | Run from the directory above like so: 10 | python -m tests.test_pdfdict 11 | ''' 12 | 13 | 14 | from pdfrw import PdfDict, PdfName 15 | from pdfrw.objects import PdfIndirect 16 | 17 | import unittest 18 | 19 | 20 | class TestPdfDicts(unittest.TestCase): 21 | 22 | def test_indirect_set_get(self): 23 | io = PdfIndirect((1,2,3)) 24 | io.value = 42 25 | d = PdfDict() 26 | d.Name = io 27 | test, = (x for x in dict.values(d)) 28 | self.assertEqual(test, io) 29 | v = d['/Name'] 30 | self.assertEqual(v, io.value) 31 | test, = d 32 | self.assertEqual(type(test), type(PdfName.Name)) 33 | 34 | def main(): 35 | unittest.main() 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /tests/test_pdfreader_init.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | import static_pdfs 3 | 4 | from pdfrw import PdfReader 5 | 6 | try: 7 | import unittest2 as unittest 8 | except ImportError: 9 | import unittest 10 | 11 | 12 | class TestPdfReaderInit(unittest.TestCase): 13 | 14 | def test_fname_binary_filelike(self): 15 | with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file: 16 | PdfReader(pdf_file) 17 | 18 | def test_fdata_binary(self): 19 | with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file: 20 | pdf_bytes = pdf_file.read() 21 | PdfReader(fdata=pdf_bytes) 22 | 23 | 24 | def main(): 25 | unittest.main() 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /tests/test_pdfstring.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # encoding: utf-8 3 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 4 | # Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas 5 | # 2016 James Laird-Wah, Sydney, Australia 6 | # MIT license -- See LICENSE.txt for details 7 | 8 | ''' 9 | Run from the directory above like so: 10 | python -m tests.test_pdfstring 11 | ''' 12 | 13 | 14 | from pdfrw import PdfString 15 | from pdfrw.py23_diffs import convert_store 16 | 17 | import unittest 18 | 19 | 20 | class TestBaseEncoding(unittest.TestCase): 21 | 22 | def encode(self, value): 23 | x = PdfString.encode(value) 24 | if isinstance(value, type(u'')): 25 | y = PdfString.from_unicode(value) 26 | else: 27 | y = PdfString.from_bytes(value) 28 | self.assertEqual(x, y) 29 | return x 30 | 31 | def decode(self, value): 32 | s = PdfString(value) 33 | x = s.to_unicode() 34 | y = s.decode() 35 | self.assertEqual(x, y) 36 | return x 37 | 38 | def decode_bytes(self, decode_this, expected): 39 | """ Decode to bytes""" 40 | self.assertEqual(PdfString(decode_this).to_bytes(), 41 | convert_store(expected)) 42 | 43 | def roundtrip(self, value, expected=None): 44 | result = self.encode(value) 45 | 
self.assertEqual(value, self.decode(result)) 46 | if expected is not None: 47 | self.assertEqual(result, expected) 48 | return result 49 | 50 | def test_doubleslash(self): 51 | self.roundtrip('\\') 52 | self.roundtrip(r'\\') 53 | 54 | def test_unicode_encoding(self): 55 | # These chars are in PdfDocEncoding 56 | self.assertEqual(self.roundtrip(u'PDF™©®')[0], '(') 57 | # These chars are not in PdfDocEncoding 58 | self.assertEqual(self.roundtrip(u'δΩσ')[0], '<') 59 | # Check that we're doing a reasonable encoding 60 | # Might want to change this later if we change the definition of reasonable 61 | self.roundtrip(u'(\n\u00FF', '(\\(\n\xff)') 62 | self.roundtrip(u'(\n\u0101', '') 63 | 64 | 65 | def test_constructor(self): 66 | obj = PdfString('hello') 67 | 68 | def test_continuation(self): 69 | # See PDF 1.7 ref section 3.2 page 55 70 | s1 = PdfString('(These two strings are the same.)') 71 | self.assertEqual(s1.decode(), s1[1:-1]) 72 | s2 = PdfString('(These \\\ntwo strings \\\nare the same.)') 73 | self.assertEqual(s1.decode(), s2.decode()) 74 | s2 = PdfString(s2.replace('\n', '\r')) 75 | self.assertEqual(s1.decode(), s2.decode()) 76 | s2 = PdfString(s2.replace('\r', '\r\n')) 77 | self.assertEqual(s1.decode(), s2.decode()) 78 | 79 | def test_hex_whitespace(self): 80 | # See PDF 1.7 ref section 3.2 page 56 81 | self.assertEqual(self.decode('<41 \n\r\t\f\v42>'), 'AB') 82 | 83 | def test_unicode_escaped_decode(self): 84 | # Some PDF producers happily put unicode strings in PdfDocEncoding, 85 | # because the Unicode BOM and \0 are valid code points 86 | decoded = self.decode('(\xfe\xff\0h\0e\0l\0l\0o)') 87 | self.assertEqual(decoded, "hello") 88 | 89 | 90 | def test_unescaping(self): 91 | self.decode_bytes(r'( \( \) \\ \n \t \f \r \r\n \\n)', 92 | ' ( ) \\ \n \t \f \r \r\n \\n') 93 | 94 | self.decode_bytes(r'(\b\010\10)', '\b\b\b') 95 | self.decode_bytes('(\\n\n\\r\r\\t\t\\b\b\\f\f()\\1\\23\\0143)', 96 | '\n\n\r\r\t\t\b\b\f\f()\001\023\f3') 97 | 
self.decode_bytes(r'(\\\nabc)', '\\\nabc') 98 | self.decode_bytes(r'(\ )', ' ') 99 | 100 | def test_BOM_variants(self): 101 | self.roundtrip(u'\ufeff', '') 102 | self.roundtrip(u'\ufffe', '') 103 | self.roundtrip(u'\xfe\xff', '') 104 | self.roundtrip(u'\xff\xfe', '(\xff\xfe)') 105 | self.assertRaises(UnicodeError, PdfString.from_unicode, 106 | u'þÿ blah', text_encoding='pdfdocencoding') 107 | 108 | def test_byte_encode(self): 109 | self.assertEqual(self.encode(b'ABC'), '(ABC)') 110 | 111 | def test_nullstring(self): 112 | self.assertEqual(PdfString('<>').to_bytes(), b'') 113 | self.assertEqual(PdfString('()').to_bytes(), b'') 114 | 115 | def main(): 116 | unittest.main() 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /tests/test_roundtrip.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 4 | # Copyright (C) 2015 Patrick Maupin, Austin, Texas 5 | # MIT license -- See LICENSE.txt for details 6 | 7 | ''' 8 | Run from the directory above like so: 9 | 10 | python -m tests.test_roundtrip 11 | 12 | A PDF that has been determined to be good or bad 13 | should be added to expected.txt with either a good 14 | checksum, or just the word "fail". 15 | 16 | These tests are incomplete, but they allow us to try 17 | out various PDFs. There is a collection of difficult 18 | PDFs available on github. 19 | 20 | In order to use them: 21 | 22 | 1) Ensure that github.com/pmaupin/static_pdfs is on your path. 23 | 24 | 2) Use the imagemagick compare program to look at differences 25 | between the static_pdfs/global directory and the tmp_results 26 | directory after you run this.
27 | 28 | 29 | ''' 30 | import os 31 | import hashlib 32 | import pdfrw 33 | import static_pdfs 34 | import expected 35 | 36 | from pdfrw.py23_diffs import convert_store 37 | 38 | try: 39 | import unittest2 as unittest 40 | except ImportError: 41 | import unittest 42 | 43 | 44 | class TestOnePdf(unittest.TestCase): 45 | 46 | def roundtrip(self, testname, basename, srcf, decompress=False, 47 | compress=False, repaginate=False): 48 | dstd = os.path.join(expected.result_dir, testname) 49 | if not os.path.exists(dstd): 50 | os.makedirs(dstd) 51 | dstf = os.path.join(dstd, basename) 52 | hashfile = os.path.join(expected.result_dir, 'hashes.txt') 53 | hashkey = '%s/%s' % (testname, basename) 54 | hash = '------no-file-generated---------' 55 | expects = expected.results[hashkey] 56 | 57 | # If the test has been deliberately skipped, 58 | # we are done. Otherwise, execute it even 59 | # if we don't know about it yet, so we have 60 | # results to compare. 61 | 62 | result = 'fail' 63 | size = 0 64 | try: 65 | if 'skip' in expects: 66 | result = 'skip requested' 67 | return self.skipTest(result) 68 | elif 'xfail' in expects: 69 | result = 'xfail requested' 70 | return self.fail(result) 71 | 72 | exists = os.path.exists(dstf) 73 | if expects or not exists: 74 | if exists: 75 | os.remove(dstf) 76 | trailer = pdfrw.PdfReader(srcf, decompress=decompress, 77 | verbose=False) 78 | writer = pdfrw.PdfWriter(dstf, compress=compress) 79 | if repaginate: 80 | writer.addpages(trailer.pages) 81 | else: 82 | writer.trailer = trailer 83 | writer.write() 84 | with open(dstf, 'rb') as f: 85 | data = f.read() 86 | size = len(data) 87 | if data: 88 | hash = hashlib.md5(data).hexdigest() 89 | else: 90 | os.remove(dstf) 91 | if expects: 92 | if len(expects) == 1: 93 | expects, = expects 94 | self.assertEqual(hash, expects) 95 | else: 96 | self.assertIn(hash, expects) 97 | result = 'pass' 98 | else: 99 | result = 'skip' 100 | self.skipTest('No hash available') 101 | finally: 102 | result = '%8d 
%-20s %s %s\n' % (size, result, hashkey, hash) 103 | with open(hashfile, 'ab') as f: 104 | f.write(convert_store(result)) 105 | 106 | 107 | def build_tests(): 108 | def test_closure(*args, **kw): 109 | def test(self): 110 | self.roundtrip(*args, **kw) 111 | return test 112 | for mytest, repaginate, decompress, compress in ( 113 | ('simple', False, False, False), 114 | ('repaginate', True, False, False), 115 | ('decompress', False, True, False), 116 | ('compress', False, True, True), 117 | ): 118 | for srcf in static_pdfs.pdffiles[0]: 119 | basename = os.path.basename(srcf) 120 | test_name = 'test_%s_%s' % (mytest, basename) 121 | test = test_closure(mytest, basename, srcf, 122 | repaginate=repaginate, 123 | decompress=decompress, 124 | compress=compress, 125 | ) 126 | setattr(TestOnePdf, test_name, test) 127 | build_tests() 128 | 129 | 130 | def main(): 131 | unittest.main() 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /tests/update_expected.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python2 2 | """ 3 | Put old (good) results in ramdisk/reference, 4 | then generate new (unknown) test results in ramdisk/tmp_results, 5 | THEN SWITCH BACK TO KNOWN GOOD SYSTEM, and finally: 6 | 7 | run this to update any checksums in expected.txt where both versions 8 | parse to same PDFs. 9 | """ 10 | 11 | import os 12 | import hashlib 13 | from pdfrw import PdfReader, PdfWriter, PdfArray, PdfDict, PdfObject 14 | 15 | 16 | def make_canonical(trailer): 17 | ''' Canonicalizes a PDF. Assumes everything 18 | is a Pdf object already. 
19 | ''' 20 | visited = set() 21 | workitems = list(trailer.values()) 22 | while workitems: 23 | obj = workitems.pop() 24 | objid = id(obj) 25 | if objid in visited: 26 | continue 27 | visited.add(objid) 28 | obj.indirect = True  # force the writer to emit every object as indirect, for stable output 29 | if isinstance(obj, (PdfArray, PdfDict)): 30 | if isinstance(obj, PdfArray): 31 | workitems += obj 32 | else: 33 | workitems += obj.values() 34 | return trailer 35 | 36 | with open('expected.txt', 'rb') as f: 37 | expected = f.read() 38 | 39 | def get_digest(fname):  # md5 hex digest of the file's bytes; returns None when the file is empty 40 | with open(fname, 'rb') as f: 41 | data = f.read() 42 | if data: 43 | return hashlib.md5(data).hexdigest() 44 | 45 | tmp = '_temp.pdf'  # scratch file reused for each canonicalized rewrite 46 | count = 0 47 | goodcount = 0 48 | 49 | changes = [] 50 | for (srcpath, _, filenames) in os.walk('ramdisk/reference'): 51 | for name in filenames: 52 | if not name.endswith('.pdf'): 53 | continue 54 | src = os.path.join(srcpath, name) 55 | dst = src.replace('/reference/', '/tmp_results/') 56 | if not os.path.exists(dst): 57 | continue 58 | src_digest = get_digest(src) 59 | if not src_digest or src_digest not in expected: 60 | continue 61 | print src 62 | count += 1 63 | trailer = make_canonical(PdfReader(src)) 64 | out = PdfWriter(tmp) 65 | out.write(trailer=trailer) 66 | match_digest = get_digest(tmp) 67 | if not match_digest: 68 | continue 69 | trailer = make_canonical(PdfReader(dst)) 70 | out = PdfWriter(tmp) 71 | out.write(trailer=trailer) 72 | if get_digest(tmp) != match_digest: 73 | continue 74 | goodcount += 1 75 | print "OK" 76 | changes.append((src_digest, get_digest(dst))) 77 | 78 | print count, goodcount 79 | 80 | for stuff in changes: 81 | expected = expected.replace(*stuff) 82 | 83 | with open('expected.txt', 'wb') as f: 84 | f.write(expected) 85 | --------------------------------------------------------------------------------