├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── examples ├── 4up.py ├── README.txt ├── alter.py ├── booklet.py ├── cat.py ├── extract.py ├── fancy_watermark.py ├── poster.py ├── print_two.py ├── rl1 │ ├── 4up.py │ ├── README.txt │ ├── booklet.py │ ├── platypus_pdf_template.py │ └── subset.py ├── rl2 │ ├── README.txt │ ├── copy.py │ └── decodegraphics.py ├── rotate.py ├── subset.py ├── subset_booklets.py ├── unspread.py └── watermark.py ├── pdfrw ├── __init__.py ├── buildxobj.py ├── compress.py ├── crypt.py ├── errors.py ├── findobjs.py ├── objects │ ├── __init__.py │ ├── pdfarray.py │ ├── pdfdict.py │ ├── pdfindirect.py │ ├── pdfname.py │ ├── pdfobject.py │ └── pdfstring.py ├── pagemerge.py ├── pdfreader.py ├── pdfwriter.py ├── py23_diffs.py ├── tokens.py ├── toreportlab.py └── uncompress.py ├── releasing.txt ├── setup.cfg ├── setup.py └── tests ├── Render Bitmap.ipynb ├── __init__.py ├── basn0g08.png.log ├── basn2c08.png.log ├── checkdiffs.py ├── expected.py ├── expected.txt ├── f01n2c08.png.log ├── f02n2c08.png.log ├── f03n2c08.png.log ├── f04n2c08.png.log ├── myprofile.py ├── test_examples.py ├── test_flate_png.py ├── test_pdfdict.py ├── test_pdfreader_init.py ├── test_pdfstring.py ├── test_roundtrip.py └── update_expected.py /.gitignore: -------------------------------------------------------------------------------- 1 | # OSX 2 | .DS_Store 3 | .AppleDouble 4 | .LSOverride 5 | Icon 6 | 7 | # Thumbnails 8 | ._* 9 | 10 | # Files that might appear on external disk 11 | .Spotlight-V100 12 | .Trashes 13 | 14 | 15 | # Development artifacts 16 | diffs.txt 17 | examples/*.pdf 18 | examples/rl*/*.pdf 19 | tests/*.pdf 20 | examples/pdfrw 21 | examples/rl*/pdfrw 22 | tests/pdfrw 23 | tests/static_pdfs 24 | tests/ramdisk 25 | tests/saved_results 26 | tests/tmp_results 27 | wiki/ 28 | 29 | 30 | # Byte-compiled / optimized / DLL files 31 | __pycache__/ 32 | *.py[cod] 33 | 34 | # Distribution / packaging 35 | .Python 36 | env/ 37 | bin/ 38 | 
build/ 39 | develop-eggs/ 40 | dist/ 41 | eggs/ 42 | lib/ 43 | lib64/ 44 | lib64 45 | parts/ 46 | sdist/ 47 | var/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | pyvenv.cfg 52 | pip-selfcheck.json 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .coverage 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.3" 6 | - "3.4" 7 | - "3.5" 8 | - "3.6" 9 | - "nightly" 10 | # command to install dependencies 11 | before_install: 12 | - "git clone https://github.com/pmaupin/static_pdfs tests/static_pdfs" 13 | install: 14 | - "pip install ." 15 | - "pip install reportlab || true" 16 | - "pip install PyCrypto || true" 17 | - "pip install zlib || true" 18 | - "pip install unittest2 || true" 19 | # command to run tests 20 | script: "cd tests; /usr/bin/env PYTHONPATH=. py.test" 21 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | pdfrw (github.com/pmaupin/pdfrw) 2 | 3 | The majority of pdfrw was written by Patrick Maupin and is licensed 4 | under the MIT license (reproduced below). Other contributors include 5 | Attila Tajti and Nerijus Mika. It appears that some of the decompression 6 | code was based on the decompressor from PyPDF2, which was written by 7 | Mathieu Fenniak and licensed under the BSD license (also reproduced below). 8 | 9 | Please add any missing authors here: 10 | 11 | Copyright (c) 2006-2017 Patrick Maupin. All rights reserved. 12 | Copyright (c) 2006 Mathieu Fenniak. All rights reserved. 13 | Copyright (c) 2010 Attila Tajti. 
All rights reserved. 14 | Copyright (c) 2012 Nerijus Mika. All rights reserved. 15 | Copyright (c) 2015 Bastien Gandouet. All rights reserved. 16 | Copyright (c) 2015 Tzerjen Wei. All rights reserved. 17 | Copyright (c) 2015 Jorj X. McKie. All rights reserved. 18 | Copyright (c) 2015 Nicholas Devenish. All rights reserved. 19 | Copyright (c) 2015-2016 Jonatan Dellagostin. All rights reserved. 20 | Copyright (c) 2016-2017 Thomas Kluyver. All rights reserved. 21 | Copyright (c) 2016 James Laird-Wah. All rights reserved. 22 | Copyright (c) 2016 Marcus Brinkmann. All rights reserved. 23 | Copyright (c) 2016 Edward Betts. All rights reserved. 24 | Copyright (c) 2016 Patrick Mazulo. All rights reserved. 25 | Copyright (c) 2017 Haochen Wu. All rights reserved. 26 | Copyright (c) 2017 Jon Lund Steffensen. All rights reserved. 27 | Copyright (c) 2017 Henddher Pedroza. All rights reserved. 28 | 29 | 30 | MIT License: 31 | 32 | Permission is hereby granted, free of charge, to any person obtaining a copy 33 | of this software and associated documentation files (the "Software"), to deal 34 | in the Software without restriction, including without limitation the rights 35 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 36 | copies of the Software, and to permit persons to whom the Software is 37 | furnished to do so, subject to the following conditions: 38 | 39 | The above copyright notice and this permission notice shall be included in 40 | all copies or substantial portions of the Software. 41 | 42 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 43 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 44 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 45 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 46 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 47 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 48 | THE SOFTWARE. 49 | 50 | 51 | BSD License: 52 | 53 | Redistribution and use in source and binary forms, with or without 54 | modification, are permitted provided that the following conditions are 55 | met: 56 | 57 | * Redistributions of source code must retain the above copyright notice, 58 | this list of conditions and the following disclaimer. 59 | * Redistributions in binary form must reproduce the above copyright notice, 60 | this list of conditions and the following disclaimer in the documentation 61 | and/or other materials provided with the distribution. 62 | * The name of the author may not be used to endorse or promote products 63 | derived from this software without specific prior written permission. 64 | 65 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 69 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 75 | POSSIBILITY OF SUCH DAMAGE. 
#!/usr/bin/env python

'''
usage: 4up.py my.pdf

Creates 4up.my.pdf with a single output page for every
4 input pages.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge


def get4(srcpages):
    """Merge up to four source pages onto a single page at 50% scale.

    Pages are laid out in reading order: index 0 top-left, 1 top-right,
    2 bottom-left, 3 bottom-right.
    """
    half = 0.5
    merged = PageMerge() + srcpages
    dx, dy = (half * extent for extent in merged.xobj_box[2:])
    for idx, page in enumerate(merged):
        col, row = idx % 2, idx // 2
        page.scale(half)
        page.x = col * dx
        page.y = (1 - row) * dy
    return merged.render()


inpfn, = sys.argv[1:]
outfn = '4up.' + os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
writer = PdfWriter(outfn)
# Consume the input four pages at a time.
for start in range(0, len(pages), 4):
    writer.addpage(get4(pages[start:start + 4]))
writer.write()
#!/usr/bin/env python

'''
usage: alter.py my.pdf

Creates alter.my.pdf

Demonstrates making a small change to the metadata of a
preexisting PDF file and writing the result back out.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

(source_name,) = sys.argv[1:]
output_name = 'alter.' + os.path.basename(source_name)

document = PdfReader(source_name)
document.Info.Title = 'My New Title Goes Here'
PdfWriter(output_name, trailer=document).write()
#!/usr/bin/env python

'''
usage: booklet.py [-p] my.pdf

Creates booklet.my.pdf

Pages organized in a form suitable for booklet printing, e.g.
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).

With -p, the input is padded so every output sheet is the same
type (up to 3 blank sides possible).  Without it, the two middle
sides keep the original page size and at most one side is blank.
'''

import os
import argparse

from pdfrw import PdfReader, PdfWriter, PageMerge


def fixpage(*pages):
    """Merge the given pages side by side; None entries are skipped."""
    merged = PageMerge() + (p for p in pages if p is not None)
    merged[-1].x += merged[0].w
    return merged.render()


parser = argparse.ArgumentParser()
parser.add_argument("input", help="Input pdf file name")
parser.add_argument("-p", "--padding", action="store_true",
                    help="Padding the document so that all pages use the same type of sheet")
args = parser.parse_args()

inpfn = args.input
outfn = 'booklet.' + os.path.basename(inpfn)
ipages = PdfReader(inpfn).pages

# Pad with blanks so the side count divides evenly.
pad_to = 4 if args.padding else 2
ipages += [None] * (-len(ipages) % pad_to)

# Pair outermost pages with innermost pages, working inward.
opages = []
while len(ipages) > 2:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
    opages.append(fixpage(ipages.pop(0), ipages.pop()))
opages += ipages

PdfWriter(outfn).addpages(opages).write()
#!/usr/bin/env python

'''
usage: extract.py <source.pdf>

Locates Form XObjects and Image XObjects within the PDF,
and creates a new PDF containing these -- one per page.

Resulting file will be named extract.<source.pdf>
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter
from pdfrw.findobjs import page_per_xobj


inpfn, = sys.argv[1:]
outfn = 'extract.' + os.path.basename(inpfn)

# Half-inch margin around each extracted object (72 points per inch).
xobj_pages = list(page_per_xobj(PdfReader(inpfn).pages, margin=0.5 * 72))
if not xobj_pages:
    raise IndexError("No XObjects found")
PdfWriter(outfn).addpages(xobj_pages).write()
#!/usr/bin/env python

'''
Enhanced example of watermarking using form xobjects (pdfrw).

usage: fancy_watermark.py [-u] my.pdf single_page.pdf

Creates watermark.my.pdf, with every page overlaid with the
first page from single_page.pdf.  With -u, the watermark is
painted underneath the page content instead of on top.

Unlike the stock watermark.py, this example copes with documents
whose pages differ in size: the watermark is anchored to the top
right corner on odd pages and the top left corner on even pages,
and is scaled down if it is wider than the page.  A cache keyed
on (media box, parity) avoids building more than one intermediate
watermark object per distinct page size.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge

# Command-line handling: optional -u flag, then input and watermark files.
args = sys.argv[1:]
underneath = '-u' in args
if underneath:
    args.remove('-u')
inpfn, wmarkfn = args
outfn = 'watermark.' + os.path.basename(inpfn)

# Open both source documents.
wmark_trailer = PdfReader(wmarkfn)
trailer = PdfReader(inpfn)

# Memoization cache: one positioned watermark per (media box, parity).
wmark_page = wmark_trailer.pages[0]
wmark_cache = {}

for pagenum, page in enumerate(trailer.pages, 1):
    mbox = tuple(float(coord) for coord in page.MediaBox)
    odd = pagenum & 1
    key = mbox, odd
    wmark = wmark_cache.get(key)
    if wmark is None:
        # Build and cache a freshly positioned watermark object.
        wmark = wmark_cache[key] = PageMerge().add(wmark_page)[0]

        # The origin of most pages is (0, 0), but spell out the full
        # math anyway for illustration.
        page_x, page_y, page_x1, page_y1 = mbox
        page_w = page_x1 - page_x
        page_h = page_y1 - page_y  # For illustration, not used

        # Shrink the watermark if it is wider than the page.
        if wmark.w > page_w:
            wmark.scale(1.0 * page_w / wmark.w)

        # Pin the watermark to the top edge...
        wmark.y += page_y1 - wmark.h

        # ...at the left for odd pages, the right for even pages.
        if odd:
            wmark.x = page_x
        else:
            wmark.x += page_x1 - wmark.w

        # Full-width watermarks land identically for both parities,
        # so seed the cache entry for the other parity too.
        if page_w == wmark.w:
            wmark_cache[mbox, not odd] = wmark

    # Composite the watermark onto (or under) the page.
    PageMerge(page).add(wmark, prepend=underneath).render()

PdfWriter(outfn, trailer=trailer).write()
#!/usr/bin/env python

'''
usage: poster.py my.pdf

Shows how to change the size on a PDF.

Scales up the useful area of a letter-size page (0.5" margin all
around, so 7.5x10" of content) by 4.8 to produce a poster-size
page.  Also copies the Info dict to the new PDF.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict


def adjust(page, margin=36, scale=4.8):
    """Trim *margin* points from every edge of *page*, then scale it."""
    info = PageMerge().add(page)
    x1, y1, x2, y2 = info.xobj_box
    # viewrect -- presumably (x, y, width, height); matches how the
    # values are computed here.
    viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin)
    trimmed = PageMerge().add(page, viewrect=viewrect)
    trimmed[0].scale(scale)
    return trimmed.render()


inpfn, = sys.argv[1:]
outfn = 'poster.' + os.path.basename(inpfn)
reader = PdfReader(inpfn)
writer = PdfWriter(outfn)
writer.addpage(adjust(reader.pages[0]))
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write()
#!/usr/bin/env python

'''
usage: 4up.py my.pdf


Uses Form XObjects and reportlab to create 4up.my.pdf.

Demonstrates use of pdfrw with reportlab.

'''

import sys
import os

from reportlab.pdfgen.canvas import Canvas

from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl


def addpage(canvas, allpages):
    """Consume up to four xobjects from *allpages* and draw them 4-up."""
    batch = allpages[:4]
    del allpages[:4]

    x_max = max(xobj.BBox[2] for xobj in batch)
    y_max = max(xobj.BBox[3] for xobj in batch)
    canvas.setPageSize((x_max, y_max))

    for slot, xobj in enumerate(batch):
        # Slots 0/1 fill the top half, slots 2/3 the bottom half.
        x = x_max * (slot % 2) / 2.0
        y = y_max * (slot <= 1) / 2.0
        canvas.saveState()
        canvas.translate(x, y)
        canvas.scale(0.5, 0.5)
        canvas.doForm(makerl(canvas, xobj))
        canvas.restoreState()
    canvas.showPage()


def go(argv):
    """Convert the single PDF named in *argv* to a 4-up layout."""
    inpfn, = argv
    outfn = '4up.' + os.path.basename(inpfn)

    xobjs = [pagexobj(p) for p in PdfReader(inpfn).pages]
    canvas = Canvas(outfn)
    while xobjs:
        addpage(canvas, xobjs)
    canvas.save()


if __name__ == '__main__':
    go(sys.argv[1:])
#!/usr/bin/env python

'''
usage: booklet.py my.pdf


Uses Form XObjects and reportlab to create booklet.my.pdf.

Demonstrates use of pdfrw with reportlab.

'''

import sys
import os

from reportlab.pdfgen.canvas import Canvas

from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl


def read_and_double(inpfn):
    """Read the PDF and pair pages outermost-with-innermost."""
    xpages = [pagexobj(p) for p in PdfReader(inpfn).pages]
    if len(xpages) & 1:
        # Sentinel -- get same size for back as front
        xpages.append(xpages[0])

    pairs = []
    while len(xpages) > 2:
        pairs.append((xpages.pop(), xpages.pop(0)))
        pairs.append((xpages.pop(0), xpages.pop()))
    pairs.extend((x,) for x in xpages)
    return pairs


def make_pdf(outfn, xobjpairs):
    """Render each pair of xobjects side by side onto one page."""
    canvas = Canvas(outfn)
    for xobjlist in xobjpairs:
        # Page is wide enough for all members, tall enough for the
        # tallest one.
        x = y = 0
        for xobj in xobjlist:
            x += xobj.BBox[2]
            y = max(y, xobj.BBox[3])
        canvas.setPageSize((x, y))

        # Handle blank back page (the duplicated sentinel): draw only
        # one copy, shifted into the second slot.
        if len(xobjlist) > 1 and xobjlist[0] == xobjlist[-1]:
            xobjlist = xobjlist[:1]
            x = xobjlist[0].BBox[2]
        else:
            x = 0
        y = 0

        for xobj in xobjlist:
            canvas.saveState()
            canvas.translate(x, y)
            canvas.doForm(makerl(canvas, xobj))
            canvas.restoreState()
            x += xobj.BBox[2]
        canvas.showPage()
    canvas.save()


inpfn, = sys.argv[1:]
outfn = 'booklet.' + os.path.basename(inpfn)

make_pdf(outfn, read_and_double(inpfn))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
usage: platypus_pdf_template.py source.pdf

Creates platypus.source.pdf

Example of using pdfrw to use page 1 of a source PDF as the background
for other pages programmatically generated with Platypus.

Contributed by user asannes

"""
import sys
import os

from reportlab.platypus import PageTemplate, BaseDocTemplate, Frame
from reportlab.platypus import NextPageTemplate, Paragraph, PageBreak
from reportlab.platypus.tableofcontents import TableOfContents
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.rl_config import defaultPageSize
from reportlab.lib.units import inch
from reportlab.graphics import renderPDF

from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl

PAGE_WIDTH = defaultPageSize[0]
PAGE_HEIGHT = defaultPageSize[1]


class MyTemplate(PageTemplate):
    """The kernel of this example, where we use pdfrw to fill in the
    background of a page before writing to it.  This could be used to fill
    in a water mark or similar."""

    def __init__(self, pdf_template_filename, name=None):
        frames = [Frame(
            0.85 * inch,
            0.5 * inch,
            PAGE_WIDTH - 1.15 * inch,
            PAGE_HEIGHT - (1.5 * inch)
        )]
        PageTemplate.__init__(self, name, frames)
        # use first page as template
        page = PdfReader(pdf_template_filename).pages[0]
        self.page_template = pagexobj(page)
        # Scale it to fill the complete page
        self.page_xscale = PAGE_WIDTH / self.page_template.BBox[2]
        self.page_yscale = PAGE_HEIGHT / self.page_template.BBox[3]

    def beforeDrawPage(self, canvas, doc):
        """Draws the background before anything else"""
        canvas.saveState()
        rl_obj = makerl(canvas, self.page_template)
        canvas.scale(self.page_xscale, self.page_yscale)
        canvas.doForm(rl_obj)
        canvas.restoreState()


class MyDocTemplate(BaseDocTemplate):
    """Used to apply heading to table of contents."""

    def afterFlowable(self, flowable):
        """Adds Heading1 to table of contents"""
        if flowable.__class__.__name__ == 'Paragraph':
            style = flowable.style.name
            text = flowable.getPlainText()
            key = '%s' % self.seq.nextf('toc')
            if style == 'Heading1':
                self.canv.bookmarkPage(key)
                self.notify('TOCEntry', [1, text, self.page, key])


def create_toc():
    """Creates the table of contents"""
    table_of_contents = TableOfContents()
    table_of_contents.dotsMinLevel = 0
    header1 = ParagraphStyle(name='Heading1', fontSize=16, leading=16)
    header2 = ParagraphStyle(name='Heading2', fontSize=14, leading=14)
    table_of_contents.levelStyles = [header1, header2]
    return [table_of_contents, PageBreak()]


def create_pdf(filename, pdf_template_filename):
    """Create the pdf, with all the contents"""
    # Context manager guarantees the output file is closed even if
    # document building raises (the original leaked the handle on error).
    with open(filename, "wb") as pdf_report:
        document = MyDocTemplate(pdf_report)
        templates = [MyTemplate(pdf_template_filename, name='background')]
        document.addPageTemplates(templates)

        styles = getSampleStyleSheet()
        elements = [NextPageTemplate('background')]
        elements.extend(create_toc())

        # Dummy content (hello world x 200)
        for i in range(200):
            elements.append(Paragraph("Hello World" + str(i),
                                      styles['Heading1']))

        document.multiBuild(elements)


if __name__ == '__main__':
    template, = sys.argv[1:]
    output = 'platypus_pdf_template.' + os.path.basename(template)
    create_pdf(output, template)
#!/usr/bin/env python

'''
usage: subset.py my.pdf firstpage lastpage

Creates subset.my.pdf containing pages firstpage..lastpage
(1-based, inclusive) of the input.


Uses Form XObjects and reportlab to create output file.

Demonstrates use of pdfrw with reportlab.

'''

import sys
import os

from reportlab.pdfgen.canvas import Canvas

from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl


def go(inpfn, firstpage, lastpage):
    """Copy the requested 1-based page range of *inpfn* to a new PDF."""
    first, last = int(firstpage), int(lastpage)
    outfn = 'subset.' + os.path.basename(inpfn)

    xobjs = [pagexobj(p) for p in PdfReader(inpfn).pages[first - 1:last]]
    canvas = Canvas(outfn)
    for xobj in xobjs:
        canvas.setPageSize((xobj.BBox[2], xobj.BBox[3]))
        canvas.doForm(makerl(canvas, xobj))
        canvas.showPage()
    canvas.save()


if __name__ == '__main__':
    inpfn, firstpage, lastpage = sys.argv[1:]
    go(inpfn, firstpage, lastpage)
+ os.path.basename(inpfn) 23 | pages = PdfReader(inpfn, decompress=True).pages 24 | canvas = Canvas(outfn, pageCompression=0) 25 | 26 | for page in pages: 27 | box = [float(x) for x in page.MediaBox] 28 | assert box[0] == box[1] == 0, "demo won't work on this PDF" 29 | canvas.setPageSize(box[2:]) 30 | parsepage(page, canvas) 31 | canvas.showPage() 32 | canvas.save() 33 | -------------------------------------------------------------------------------- /examples/rl2/decodegraphics.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | ''' 6 | This file is an example parser that will parse a graphics stream 7 | into a reportlab canvas. 8 | 9 | Needs work on fonts and unicode, but works on a few PDFs. 10 | 11 | Better to use Form XObjects for most things (see the example in rl1). 12 | 13 | ''' 14 | from inspect import getargspec 15 | 16 | from pdfrw import PdfTokens 17 | from pdfrw.objects import PdfString 18 | 19 | ############################################################################# 20 | # Graphics parsing 21 | 22 | 23 | def parse_array(self, token='[', params=None): 24 | mylist = [] 25 | for token in self.tokens: 26 | if token == ']': 27 | break 28 | mylist.append(token) 29 | self.params.append(mylist) 30 | 31 | 32 | def parse_savestate(self, token='q', params=''): 33 | self.canv.saveState() 34 | 35 | 36 | def parse_restorestate(self, token='Q', params=''): 37 | self.canv.restoreState() 38 | 39 | 40 | def parse_transform(self, token='cm', params='ffffff'): 41 | self.canv.transform(*params) 42 | 43 | 44 | def parse_linewidth(self, token='w', params='f'): 45 | self.canv.setLineWidth(*params) 46 | 47 | 48 | def parse_linecap(self, token='J', params='i'): 49 | self.canv.setLineCap(*params) 50 | 51 | 52 | def parse_linejoin(self, token='j', params='i'): 53 | 
def parse_miterlimit(self, token='M', params='f'):
    ''' M -- set the miter limit. '''
    self.canv.setMiterLimit(*params)


def parse_dash(self, token='d', params='as'):  # Array, string
    ''' d -- set the dash pattern. '''
    self.canv.setDash(*params)


def parse_intent(self, token='ri', params='n'):
    ''' ri -- rendering intent (currently ignored). '''
    # TODO: add logging
    pass


def parse_flatness(self, token='i', params='i'):
    ''' i -- flatness tolerance (currently ignored). '''
    # TODO: add logging
    pass


def parse_gstate(self, token='gs', params='n'):
    ''' gs -- apply an ExtGState dictionary (currently ignored). '''
    # TODO: add logging
    # Could parse stuff we care about from here later
    pass


def parse_move(self, token='m', params='ff'):
    ''' m -- begin a new subpath at (x, y); lazily opens a path. '''
    if self.gpath is None:
        self.gpath = self.canv.beginPath()
    self.gpath.moveTo(*params)
    self.current_point = params


def parse_line(self, token='l', params='ff'):
    ''' l -- straight line segment to (x, y). '''
    self.gpath.lineTo(*params)
    self.current_point = params


def parse_curve(self, token='c', params='ffffff'):
    ''' c -- cubic Bezier with two explicit control points. '''
    self.gpath.curveTo(*params)
    self.current_point = params[-2:]


def parse_curve1(self, token='v', params='ffff'):
    ''' v -- Bezier whose first control point is the current point. '''
    parse_curve(self, token, tuple(self.current_point) + tuple(params))


def parse_curve2(self, token='y', params='ffff'):
    ''' y -- Bezier whose second control point equals the end point. '''
    parse_curve(self, token, tuple(params) + tuple(params[-2:]))


def parse_close(self, token='h', params=''):
    ''' h -- close the current subpath. '''
    self.gpath.close()


def parse_rect(self, token='re', params='ffff'):
    ''' re -- rectangle subpath (x, y, width, height); lazily opens
        a path if none is pending.
    '''
    if self.gpath is None:
        self.gpath = self.canv.beginPath()
    self.gpath.rect(*params)
    self.current_point = params[-2:]


def parse_stroke(self, token='S', params=''):
    ''' S -- stroke the pending path. '''
    finish_path(self, 1, 0, 0)


def parse_close_stroke(self, token='s', params=''):
    ''' s -- close then stroke the pending path. '''
    self.gpath.close()
    finish_path(self, 1, 0, 0)


def parse_fill(self, token='f', params=''):
    ''' f -- fill the pending path (nonzero winding rule). '''
    finish_path(self, 0, 1, 1)
def parse_fill_even_odd(self, token='f*', params=''):
    ''' f* -- fill the pending path using the even-odd rule. '''
    finish_path(self, 0, 1, 0)


def parse_fill_stroke_even_odd(self, token='B*', params=''):
    ''' B* -- fill (even-odd) then stroke. '''
    finish_path(self, 1, 1, 0)


def parse_fill_stroke(self, token='B', params=''):
    ''' B -- fill (nonzero) then stroke. '''
    finish_path(self, 1, 1, 1)


def parse_close_fill_stroke_even_odd(self, token='b*', params=''):
    ''' b* -- close, fill (even-odd), then stroke. '''
    self.gpath.close()
    finish_path(self, 1, 1, 0)


def parse_close_fill_stroke(self, token='b', params=''):
    ''' b -- close, fill (nonzero), then stroke. '''
    self.gpath.close()
    finish_path(self, 1, 1, 1)


def parse_nop(self, token='n', params=''):
    ''' n -- end the path without painting anything. '''
    finish_path(self, 0, 0, 0)


def finish_path(self, stroke, fill, fillmode):
    ''' Paint and discard the pending path, temporarily switching
        the canvas fill rule (fillmode 1 = nonzero, 0 = even-odd).
        No-op when no path is pending.
    '''
    if self.gpath is not None:
        canv = self.canv
        # NOTE(review): relies on reportlab's private _fillMode
        # attribute -- confirm it still exists on upgrade.
        canv._fillMode, oldmode = fillmode, canv._fillMode
        canv.drawPath(self.gpath, stroke, fill)
        canv._fillMode = oldmode
        self.gpath = None


def parse_clip_path(self, token='W', params=''):
    ''' W -- set the clip path (not implemented). '''
    # TODO: add logging
    pass


def parse_clip_path_even_odd(self, token='W*', params=''):
    ''' W* -- set the clip path, even-odd rule (not implemented). '''
    # TODO: add logging
    pass


def parse_stroke_gray(self, token='G', params='f'):
    ''' G -- stroke color in DeviceGray. '''
    self.canv.setStrokeGray(*params)


def parse_fill_gray(self, token='g', params='f'):
    ''' g -- fill color in DeviceGray. '''
    self.canv.setFillGray(*params)


def parse_stroke_rgb(self, token='RG', params='fff'):
    ''' RG -- stroke color in DeviceRGB. '''
    self.canv.setStrokeColorRGB(*params)


def parse_fill_rgb(self, token='rg', params='fff'):
    ''' rg -- fill color in DeviceRGB. '''
    self.canv.setFillColorRGB(*params)


def parse_stroke_cmyk(self, token='K', params='ffff'):
    ''' K -- stroke color in DeviceCMYK. '''
    self.canv.setStrokeColorCMYK(*params)


def parse_fill_cmyk(self, token='k', params='ffff'):
    ''' k -- fill color in DeviceCMYK. '''
    self.canv.setFillColorCMYK(*params)
#############################################################################
# Text parsing


def parse_begin_text(self, token='BT', params=''):
    ''' BT -- begin a text object (nesting is not allowed). '''
    assert self.tpath is None
    self.tpath = self.canv.beginText()


def parse_text_transform(self, token='Tm', params='ffffff'):
    ''' Tm -- set the text matrix. '''
    path = self.tpath

    # Stoopid optimization to remove nop
    try:
        code = path._code
    except AttributeError:
        pass
    else:
        if code[-1] == '1 0 0 1 0 0 Tm':
            code.pop()

    path.setTextTransform(*params)


def parse_setfont(self, token='Tf', params='nf'):
    ''' Tf -- select a font (by resource name) and size. '''
    fontinfo = self.fontdict[params[0]]
    self.tpath._setFont(fontinfo.name, params[1])
    self.curfont = fontinfo


def parse_text_out(self, token='Tj', params='t'):
    ''' Tj -- show a text string, decoded through the current font. '''
    text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
    self.tpath.textOut(text)

def parse_lf_text_out(self, token="'", params='t'):
    ''' ' -- move to the next line, then show text. '''
    self.tpath.textLine()
    text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
    self.tpath.textOut(text)


def parse_lf_text_out_with_spacing(self, token='"', params='fft'):
    ''' " -- set word and character spacing, next line, show text. '''
    self.tpath.setWordSpace(params[0])
    self.tpath.setCharSpace(params[1])
    self.tpath.textLine()
    text = params[2].decode(self.curfont.remap, self.curfont.twobyte)
    self.tpath.textOut(text)


def parse_TJ(self, token='TJ', params='a'):
    ''' TJ -- show strings from an array; numeric entries are kerning
        adjustments, which are currently validated but ignored.
    '''
    remap = self.curfont.remap
    twobyte = self.curfont.twobyte
    result = []
    for x in params[0]:
        if isinstance(x, PdfString):
            result.append(x.decode(remap, twobyte))
        else:
            # TODO: Adjust spacing between characters here
            int(x)
    text = ''.join(result)
    self.tpath.textOut(text)


def parse_end_text(self, token='ET', params=''):
    ''' ET -- end the text object and draw it onto the canvas. '''
    assert self.tpath is not None
    self.canv.drawText(self.tpath)
    self.tpath = None
def parse_move_cursor(self, token='Td', params='ff'):
    ''' Td -- move the text cursor; the y offset is negated to map
        PDF's upward-positive y onto reportlab's moveCursor.
    '''
    self.tpath.moveCursor(params[0], -params[1])


def parse_set_leading(self, token='TL', params='f'):
    ''' TL -- set text leading (line spacing). '''
    self.tpath.setLeading(*params)


def parse_text_line(self, token='T*', params=''):
    ''' T* -- move to the start of the next text line. '''
    self.tpath.textLine()


def parse_set_char_space(self, token='Tc', params='f'):
    ''' Tc -- set character spacing. '''
    self.tpath.setCharSpace(*params)


def parse_set_word_space(self, token='Tw', params='f'):
    ''' Tw -- set word spacing. '''
    self.tpath.setWordSpace(*params)


def parse_set_hscale(self, token='Tz', params='f'):
    ''' Tz -- horizontal scaling; PDF passes a percentage while
        reportlab wants an offset from 100, hence the subtraction.
    '''
    self.tpath.setHorizScale(params[0] - 100)


def parse_set_rise(self, token='Ts', params='f'):
    ''' Ts -- set text rise (superscript/subscript offset). '''
    self.tpath.setRise(*params)


def parse_xobject(self, token='Do', params='n'):
    ''' Do -- paint an external (X)Object (not implemented). '''
    # TODO: Need to do this
    pass


class FontInfo(object):
    ''' Pretty basic -- needs a lot of work to work right for all fonts
    '''
    # Map embedded font names to fonts reportlab knows about.
    lookup = {
        # WRONG -- have to learn about font stuff...
        'BitstreamVeraSans': 'Helvetica',
    }

    def __init__(self, source):
        # source is a PDF font dictionary; BaseFont is a /Name, so
        # strip the leading slash.
        name = source.BaseFont[1:]
        self.name = self.lookup.get(name, name)
        self.remap = chr
        self.twobyte = False
        info = source.ToUnicode
        if not info:
            return
        # Parse the beginbfchar section of the ToUnicode CMap into a
        # character-code -> unicode-character mapping.
        info = info.stream.split('beginbfchar')[1].split('endbfchar')[0]
        info = list(PdfTokens(info))
        assert not len(info) & 1
        info2 = []
        for x in info:
            # Each token is a hex string such as <41> or <0041>.
            assert x[0] == '<' and x[-1] == '>' and len(x) in (4, 6), x
            i = int(x[1:-1], 16)
            info2.append(i)
        self.remap = dict((x, chr(y)) for (x, y) in
                          zip(info2[::2], info2[1::2])).get
        # Two-byte codes are written as 4 hex digits (len 6 with <>).
        self.twobyte = len(info[0]) > 4
def findparsefuncs():
    ''' Build the operator dispatch table for the content-stream parser.

        Scans module globals for functions named parse_* and registers
        each under the PDF operator named by its ``token`` default.
        The ``params`` default is a string of one-letter operand type
        codes ('f' float, 'i' int, 'n' name, 'a' array, 's' string,
        't' text), or None for operators (like '[') that consume their
        own operands.

        Returns:
            dict mapping operator string -> (function, converter tuple
            or None).

        Raises:
            AssertionError if a parse_* function has an unexpected
            signature or two functions claim the same operator.
    '''
    # inspect.getargspec was removed in Python 3.11; prefer the modern
    # replacement (its first four fields are compatible) and fall back
    # only on interpreters too old to have it.
    try:
        from inspect import getfullargspec as _getargs
    except ImportError:  # pragma: no cover -- Python 2 only
        from inspect import getargspec as _getargs

    def checkname(n):
        assert n.startswith('/')
        return n

    def checkarray(a):
        assert isinstance(a, list), a
        return a

    def checktext(t):
        assert isinstance(t, PdfString)
        return t

    fixparam = dict(f=float, i=int, n=checkname, a=checkarray,
                    s=str, t=checktext)
    fixcache = {}

    def fixlist(params):
        # Convert a params type-code string into a tuple of converter
        # callables; cached because many operators share codes.
        try:
            result = fixcache[params]
        except KeyError:
            result = tuple(fixparam[x] for x in params)
            fixcache[params] = result
        return result

    dispatch = {}
    expected_args = 'self token params'.split()
    for key, func in globals().items():
        if key.startswith('parse_'):
            spec = _getargs(func)
            args, varargs, keywords, defaults = spec[:4]
            assert (args == expected_args and varargs is None and
                    keywords is None and len(defaults) == 2), (
                    key, args, varargs, keywords, defaults)
            token, params = defaults
            if params is not None:
                params = fixlist(params)
            value = func, params
            # A duplicate operator registration is a programming error.
            assert dispatch.setdefault(token, value) is value, repr(token)
    return dispatch
def debugparser(undisturbed=set('parse_array'.split())):
    ''' Return a parsepage function that prints every dispatched
        operator, for debugging content streams.

        Operators named in ``undisturbed`` keep their real
        implementation; parse_array must actually run because it
        consumes its own operands from the token stream.
    '''
    def debugdispatch():
        def getvalue(oldval):
            name = oldval[0].__name__

            def myfunc(self, token, params):
                print ('%s called %s(%s)' % (token, name,
                       ', '.join(str(x) for x in params)))
            # Keep the real implementation for whitelisted operators.
            if name in undisturbed:
                myfunc = oldval[0]
            return myfunc, oldval[1]
        return dict((x, getvalue(y))
                    for (x, y) in _ParseClass.dispatch.items())

    class _DebugParse(_ParseClass):
        dispatch = debugdispatch()

    return _DebugParse.parsepage

# Public entry point: replay a page's content stream onto a canvas.
parsepage = _ParseClass.parsepage
#!/usr/bin/env python

'''
usage: rotate.py my.pdf rotation [page[range] ...]
       eg. rotate.py my.pdf 270 1-3 5 7-9

Rotation must be a multiple of 90 degrees, clockwise.

Creates rotate.my.pdf with the selected pages rotated
(all pages by default).
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

input_name = sys.argv[1]
turn = int(sys.argv[2])
assert turn % 90 == 0

page_ranges = [[int(num) for num in chunk.split('-')]
               for chunk in sys.argv[3:]]
output_name = 'rotate.%s' % os.path.basename(input_name)
trailer = PdfReader(input_name)
pages = trailer.pages

# With no explicit ranges, rotate every page.
if not page_ranges:
    page_ranges = [[1, len(pages)]]

for bounds in page_ranges:
    # A bare page number acts as a one-page range.
    first, last = bounds[0], bounds[-1]
    for index in range(first - 1, last):
        page = pages[index]
        current = int(page.inheritable.Rotate or 0)
        page.Rotate = (current + turn) % 360

writer = PdfWriter(output_name)
writer.trailer = trailer
writer.write()
#!/usr/bin/env python

'''
usage: subset.py my.pdf page[range] [page[range]] ...
       eg. subset.py my.pdf 1-3 5 7-9

Creates subset.my.pdf containing only the selected pages.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

input_name = sys.argv[1]
range_args = sys.argv[2:]
assert range_args, "Expected at least one range"

output_name = 'subset.%s' % os.path.basename(input_name)
pages = PdfReader(input_name).pages
writer = PdfWriter(output_name)

for chunk in range_args:
    bounds = [int(num) for num in chunk.split('-')]
    # A bare page number acts as a one-page range.
    first, last = bounds[0], bounds[-1]
    for pagenum in range(first, last + 1):
        writer.addpage(pages[pagenum - 1])

writer.write()
#!/usr/bin/env python

'''
usage: subset_booklets.py my.pdf

Creates subset_booklets.my.pdf

Pages organized in a form suitable for booklet printing, e.g.
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
Instead of a large booklet, the pdf is divided into several mini
booklets. The reason is: professional printing works this way:
 - Print all of several mini booklets(subsets of booklet);
 - Sew each mini booklet individually;
 - glue them all together;
 - Insert the cover.

Take a look at http://www.wikihow.com/Bind-a-Book
'''

import sys
import os
import time

from pdfrw import PdfReader, PdfWriter, PageMerge

BOOKLET_SIZE = 20
START = time.time()


def fixpage(*pages):
    ''' Merge up to two source pages side by side onto one output
        sheet (None placeholders are skipped).
    '''
    result = PageMerge() + (x for x in pages if x is not None)
    result[-1].x += result[0].w
    return result.render()

INPFN, = sys.argv[1:]
OUTFN = 'booklet.' + os.path.basename(INPFN)
ALL_IPAGES = PdfReader(INPFN).pages
# Bug fix: the original used Python 2 `print` statements, which are a
# SyntaxError on Python 3; converted to print() calls (same output).
print('The pdf file %s has %s pages.' % (INPFN, len(ALL_IPAGES)))

# Make sure we have an even number of pages.
if len(ALL_IPAGES) & 1:
    ALL_IPAGES.append(None)
    print('Inserting one more blank page to make pages number even.')
NUM_OF_ITER, ITERS_LEFT = divmod(len(ALL_IPAGES), BOOKLET_SIZE)

print('Making %s subbooklets of %s pages each.' % (NUM_OF_ITER, BOOKLET_SIZE))
opages = []
for iteration in range(0, NUM_OF_ITER):
    # Fold each BOOKLET_SIZE chunk: pair last-with-first from the
    # outside in, alternating sheet sides.
    ipages = ALL_IPAGES[iteration * BOOKLET_SIZE:(iteration + 1) * BOOKLET_SIZE]
    while len(ipages) > 2:
        opages.append(fixpage(ipages.pop(), ipages.pop(0)))
        opages.append(fixpage(ipages.pop(0), ipages.pop()))

# Making one more subbooklet with the left-over pages.
ipages = ALL_IPAGES[len(ALL_IPAGES) - ITERS_LEFT:len(ALL_IPAGES)]
while len(ipages) > 2:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
    opages.append(fixpage(ipages.pop(0), ipages.pop()))
if len(ipages) >= 1:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))

PdfWriter(OUTFN).addpages(opages).write()
print('It took %s seconds to make the pdf subbooklets changes.'
      % round(time.time() - START, 2))
#!/usr/bin/env python

'''
usage: unspread.py my.pdf

Creates unspread.my.pdf

Chops each page in half, e.g. if a source were created in booklet
form, you could extract the individual pages.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge


def splitpage(src):
    ''' Yield the left half, then the right half, of a source page. '''
    for offset in (0, 0.5):
        yield PageMerge().add(src, viewrect=(offset, 0, 0.5, 1)).render()


input_name, = sys.argv[1:]
output_name = 'unspread.' + os.path.basename(input_name)
writer = PdfWriter(output_name)
for sheet in PdfReader(input_name).pages:
    writer.addpages(splitpage(sheet))
writer.write()
#!/usr/bin/env python

'''
Simple example of watermarking using form xobjects (pdfrw).

usage: watermark.py [-u] my.pdf single_page.pdf

Creates watermark.my.pdf, overlaying every page with the first page
of single_page.pdf.  With -u, the watermark is placed underneath the
page content (painted first).

NOTE 1: Assumes all pages (including the watermark page) are the
        same size; see fancy_watermark.py for other cases.

NOTE 2: Deliberately kept simple to show the basic principles of
        the library and to match the other examples.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge

argv = sys.argv[1:]
underneath = '-u' in argv
if underneath:
    argv.remove('-u')
input_name, watermark_name = argv
output_name = 'watermark.' + os.path.basename(input_name)

# Turn the first watermark page into a reusable Form XObject.
stamp = PageMerge().add(PdfReader(watermark_name).pages[0])[0]
trailer = PdfReader(input_name)
for page in trailer.pages:
    PageMerge(page).add(stamp, prepend=underneath).render()
PdfWriter(output_name, trailer=trailer).write()
class ViewInfo(object):
    ''' Parse an Adobe-style uri ("file.pdf#page=2&viewrect=...")
        into docname / page / viewrect / rotate attributes.

        viewrect follows the Adobe convention: an array of 4 numbers
        (distance from left, distance from TOP, width, height), in
        points.  For convenience, if every number is within [0, 1]
        inclusive, the rectangle is later interpreted as fractions of
        the mediabox instead.

        Set cacheable False when the resulting XObject will be
        modified after creation.
    '''
    doc = None
    docname = None
    page = None
    viewrect = None
    rotate = None
    cacheable = True

    def __init__(self, pageinfo='', **kw):
        parts = pageinfo.split('#', 1)
        if len(parts) == 2:
            # Adobe allows either '#' or '&' between parameters.
            parts[1:] = parts[1].replace('&', '#').split('#')
        # The leading piece is the filename unless it is itself a
        # page=/viewrect= option.
        starts_with_option = any(parts[0].startswith(name + '=')
                                 for name in ('page', 'viewrect'))
        if not starts_with_option:
            self.docname = parts.pop(0)
        for item in parts:
            key, value = item.split('=')
            key = key.strip()
            fields = value.replace(',', ' ').split()
            if key in ('page', 'rotate'):
                assert len(fields) == 1
                setattr(self, key, int(fields[0]))
            elif key == 'viewrect':
                assert len(fields) == 4
                setattr(self, key, [float(num) for num in fields])
            else:
                log.error('Unknown option: %s', key)
        # Explicit keyword arguments override anything parsed above.
        for key, value in iteritems(kw):
            assert hasattr(self, key), key
            setattr(self, key, value)
def rotate_rect(rect, rotation):
    ''' Rotate both points within the rectangle, then normalize
        the rectangle by returning the new lower left, then new
        upper right.
    '''
    rect = rotate_point(rect[:2], rotation) + rotate_point(rect[2:], rotation)
    return (min(rect[0], rect[2]), min(rect[1], rect[3]),
            max(rect[0], rect[2]), max(rect[1], rect[3]))


def getrects(inheritable, pageinfo, rotation):
    ''' Given the inheritable attributes of a page and
        the desired pageinfo rectangle, return the page's
        media box and the calculated boundary (clip) box.

        inheritable -- page.inheritable attributes (MediaBox etc.)
        pageinfo    -- a ViewInfo (viewrect may be None)
        rotation    -- rotation code (multiples of 90 degrees)
    '''
    mbox = tuple([float(x) for x in inheritable.MediaBox])
    # Clip against the CropBox when present, else the MediaBox.
    cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
    vrect = pageinfo.viewrect
    if vrect is not None:
        # Rotate the media box to match what the user sees,
        # figure out the clipping box, then rotate back
        mleft, mbot, mright, mtop = rotate_rect(cbox, rotation)
        x, y, w, h = vrect

        # Support operations in fractions of a page
        if 0 <= min(vrect) < max(vrect) <= 1:
            mw = mright - mleft
            mh = mtop - mbot
            x *= mw
            w *= mw
            y *= mh
            h *= mh

        # viewrect y is measured from the TOP of the page (Adobe
        # convention), so the clip top is mtop - y.
        cleft = mleft + x
        ctop = mtop - y
        cright = cleft + w
        cbot = ctop - h
        # Clamp the view rectangle to the page boundaries.
        cbox = (max(mleft, cleft), max(mbot, cbot),
                min(mright, cright), min(mtop, ctop))
        cbox = rotate_rect(cbox, -rotation)
    return mbox, cbox
def _build_cache(contents, allow_compressed):
    ''' Build a new dictionary holding the stream,
        and save it along with private cache info.
        Assumes validity has been pre-checked if
        we have a non-None xobj_copy.

        Also, the spec says nothing about nested arrays,
        so we assume those don't exist until we see one
        in the wild.
    '''
    try:
        xobj_copy = contents.xobj_copy
    except AttributeError:
        # Should have a PdfArray here...
        array = contents
        private = contents
    else:
        # Should have a PdfDict here -- might or might not have cache copy
        if xobj_copy is not None:
            return xobj_copy
        array = [contents]
        private = contents.private

    # If we don't allow compressed objects, OR if we have multiple compressed
    # objects, we try to decompress them, and fail if we cannot do that.

    if not allow_compressed or len(array) > 1:
        # Collect every key across the stream dicts; presumably more
        # than one distinct key means something beyond /Length is
        # present (i.e. a /Filter) -- TODO confirm.
        keys = set(x[0] for cdict in array for x in iteritems(cdict))
        was_compressed = len(keys) > 1
        if was_compressed:
            # Make copies of the objects before we uncompress them.
            array = [PdfDict(x) for x in array]
            if not uncompress(array):
                raise PdfNotImplementedError(
                    'Xobjects with these compression parameters not supported: %s' %
                    keys)

    xobj_copy = PdfDict(array[0])
    xobj_copy.private.xobj_cachedict = {}
    private.xobj_copy = xobj_copy

    # Multiple content streams: join them with newlines (the extra
    # len(array) - 1 accounts for the inserted separators).
    if len(array) > 1:
        newstream = '\n'.join(x.stream for x in array)
        newlength = sum(int(x.Length) for x in array) + len(array) - 1
        assert newlength == len(newstream)
        xobj_copy.stream = newstream
        # was_compressed is always bound here: len(array) > 1 implies
        # the decompression branch above ran.
        if was_compressed and allow_compressed:
            compress(xobj_copy)

    return xobj_copy
def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True):
    ''' Return a cached Form XObject, or create a new one and cache it.
        Adds private members x, y, w, h
    '''
    cachedict = contents.xobj_cachedict
    cachekey = mbox, bbox, rotation
    result = cachedict.get(cachekey) if cacheable else None
    if result is None:
        # If we are not getting a full page, or if we are going to
        # modify the results, first retrieve an underlying Form XObject
        # that represents the entire page, so that we are not copying
        # the full page data into the new file multiple times
        func = (_get_fullpage, _get_subpage)[mbox != bbox or not cacheable]
        result = PdfDict(
            func(contents, resources, mbox),
            Type=PdfName.XObject,
            Subtype=PdfName.Form,
            FormType=1,
            BBox=PdfArray(bbox),
        )
        rect = bbox
        if rotation:
            # Rotation is expressed through the XObject's Matrix; the
            # bounding rect must be rotated to match for x/y/w/h.
            matrix = (rotate_point((1, 0), rotation) +
                      rotate_point((0, 1), rotation))
            result.Matrix = PdfArray(matrix + (0, 0))
            rect = rotate_rect(rect, rotation)

        # Private placement info for PageMerge and friends.
        private = result.private
        private.x = rect[0]
        private.y = rect[1]
        private.w = rect[2] - rect[0]
        private.h = rect[3] - rect[1]
        if cacheable:
            cachedict[cachekey] = result
    return result


def _get_fullpage(contents, resources, mbox):
    ''' fullpage is easy.  Just copy the contents,
        set up the resources, and let _cache_xobj handle the
        rest.
    '''
    return PdfDict(contents, Resources=resources)
def _get_subpage(contents, resources, mbox):
    ''' subpages *could* be as easy as full pages, but we
        choose to complicate life by creating a Form XObject
        for the page, and then one that references it for
        the subpage, on the off-chance that we want multiple
        items from the page.
    '''
    return PdfDict(
        stream='/FullPage Do\n',
        Resources=PdfDict(
            XObject=PdfDict(
                # The full-page XObject itself is cached, so several
                # subpages of one page share a single copy of its data.
                FullPage=_cache_xobj(contents, resources, mbox, mbox, 0)
            )
        )
    )


def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' pagexobj creates and returns a Form XObject for
        a given view within a page (Defaults to entire page.)

        pagexobj is passed a page and a viewrect.

        NOTE(review): the default viewinfo=ViewInfo() is a single
        shared instance (evaluated once at def time); this is safe as
        long as it is only read, never mutated -- which is the case in
        the code below.
    '''
    inheritable = page.inheritable
    resources = inheritable.Resources
    rotation = get_rotation(inheritable.Rotate)
    mbox, bbox = getrects(inheritable, viewinfo, rotation)
    # Any user-requested rotation is applied on top of the page's own.
    rotation += get_rotation(viewinfo.rotate)
    contents = _build_cache(page.Contents, allow_compressed)
    return _cache_xobj(contents, resources, mbox, bbox, rotation,
                       viewinfo.cacheable)
class CacheXObj(object):
    ''' Caches parsed PDF documents so that repeated Form XObject
        loads neither reparse the same file nor bloat the output
        with duplicate object copies.

        This is a convenience class for things like rst2pdf that
        want to pass in textual filename/location descriptors and
        don't want to know about using PdfReader.
    '''
    def __init__(self, decompress=False):
        ''' Set decompress true if you need the Form XObjects to be
            decompressed.  Will decompress what it can and scream
            about the rest.
        '''
        self.cached_pdfs = {}
        self.decompress = decompress

    def load(self, sourcename):
        ''' Load a Form XObject from a uri '''
        info = ViewInfo(sourcename)
        cache = self.cached_pdfs
        filename = info.docname
        if filename not in cache:
            cache[filename] = PdfReader(filename,
                                        decompress=self.decompress)
        return docxobj(info, cache[filename],
                       allow_compressed=not self.decompress)
Maybe more later, but it's 8 | not a priority for me... 9 | ''' 10 | 11 | from .objects import PdfName 12 | from .uncompress import streamobjects 13 | from .py23_diffs import zlib, convert_load, convert_store 14 | 15 | 16 | def compress(mylist): 17 | flate = PdfName.FlateDecode 18 | for obj in streamobjects(mylist): 19 | ftype = obj.Filter 20 | if ftype is not None: 21 | continue 22 | oldstr = obj.stream 23 | newstr = convert_load(zlib.compress(convert_store(oldstr))) 24 | if len(newstr) < len(oldstr) + 30: 25 | obj.stream = newstr 26 | obj.Filter = flate 27 | obj.DecodeParms = None 28 | -------------------------------------------------------------------------------- /pdfrw/crypt.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2017 Jon Lund Steffensen 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | from __future__ import division 6 | 7 | import hashlib 8 | import struct 9 | 10 | try: 11 | from Crypto.Cipher import ARC4, AES 12 | HAS_CRYPTO = True 13 | except ImportError: 14 | HAS_CRYPTO = False 15 | 16 | from .objects import PdfDict, PdfName 17 | 18 | _PASSWORD_PAD = ( 19 | '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08' 20 | '..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz') 21 | 22 | 23 | def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict): 24 | for obj in mylist: 25 | if isinstance(obj, PdfDict) and obj.stream is not None: 26 | yield obj 27 | 28 | 29 | def create_key(password, doc): 30 | """Create an encryption key (Algorithm 2 in PDF spec).""" 31 | key_size = int(doc.Encrypt.Length or 40) // 8 32 | padded_pass = (password + _PASSWORD_PAD)[:32] 33 | hasher = hashlib.md5() 34 | hasher.update(padded_pass) 35 | hasher.update(doc.Encrypt.O.to_bytes()) 36 | hasher.update(struct.pack('= 3: 41 | for _ in range(50): 42 | temp_hash = hashlib.md5(temp_hash[:key_size]).digest() 43 | 44 | return temp_hash[:key_size] 45 | 46 | 47 | def create_user_hash(key, 
doc): 48 | """Create the user password hash (Algorithm 4/5).""" 49 | revision = int(doc.Encrypt.R or 0) 50 | if revision < 3: 51 | cipher = ARC4.new(key) 52 | return cipher.encrypt(_PASSWORD_PAD) 53 | else: 54 | hasher = hashlib.md5() 55 | hasher.update(_PASSWORD_PAD) 56 | hasher.update(doc.ID[0].to_bytes()) 57 | temp_hash = hasher.digest() 58 | 59 | for i in range(20): 60 | temp_key = ''.join(chr(i ^ ord(x)) for x in key) 61 | cipher = ARC4.new(temp_key) 62 | temp_hash = cipher.encrypt(temp_hash) 63 | 64 | return temp_hash 65 | 66 | 67 | def check_user_password(key, doc): 68 | """Check that the user password is correct (Algorithm 6).""" 69 | expect_user_hash = create_user_hash(key, doc) 70 | revision = int(doc.Encrypt.R or 0) 71 | if revision < 3: 72 | return doc.Encrypt.U.to_bytes() == expect_user_hash 73 | else: 74 | return doc.Encrypt.U.to_bytes()[:16] == expect_user_hash 75 | 76 | 77 | class AESCryptFilter(object): 78 | """Crypt filter corresponding to /AESV2.""" 79 | def __init__(self, key): 80 | self._key = key 81 | 82 | def decrypt_data(self, num, gen, data): 83 | """Decrypt data (string/stream) using key (Algorithm 1).""" 84 | key_extension = struct.pack('= 1 and ftype[0] == PdfName.Crypt: 143 | ftype = ftype[1:] 144 | parms = obj.DecodeParms or obj.DP 145 | filter = filters[parms.Name] 146 | 147 | num, gen = obj.indirect 148 | obj.stream = filter.decrypt_data(num, gen, obj.stream) 149 | obj.private.decrypted = True 150 | obj.Filter = ftype or None 151 | -------------------------------------------------------------------------------- /pdfrw/errors.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | ''' 6 | PDF Exceptions and error handling 7 | ''' 8 | 9 | import logging 10 | 11 | 12 | fmt = logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d 
%(message)s') 13 | 14 | handler = logging.StreamHandler() 15 | handler.setFormatter(fmt) 16 | 17 | log = logging.getLogger('pdfrw') 18 | log.setLevel(logging.WARNING) 19 | log.addHandler(handler) 20 | 21 | 22 | class PdfError(Exception): 23 | "Abstract base class of exceptions thrown by this module" 24 | 25 | def __init__(self, msg): 26 | self.msg = msg 27 | 28 | def __str__(self): 29 | return self.msg 30 | 31 | 32 | class PdfParseError(PdfError): 33 | "Error thrown by parser/tokenizer" 34 | 35 | 36 | class PdfOutputError(PdfError): 37 | "Error thrown by PDF writer" 38 | 39 | 40 | class PdfNotImplementedError(PdfError): 41 | "Error thrown on missing features" 42 | -------------------------------------------------------------------------------- /pdfrw/findobjs.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | ''' This module contains a function to find all the XObjects 6 | in a document, and another function that will wrap them 7 | in page objects. 8 | ''' 9 | 10 | from .objects import PdfDict, PdfArray, PdfName 11 | 12 | 13 | def find_objects(source, valid_types=(PdfName.XObject, None), 14 | valid_subtypes=(PdfName.Form, PdfName.Image), 15 | no_follow=(PdfName.Parent,), 16 | isinstance=isinstance, id=id, sorted=sorted, 17 | reversed=reversed, PdfDict=PdfDict): 18 | ''' 19 | Find all the objects of a particular kind in a document 20 | or array. Defaults to looking for Form and Image XObjects. 21 | 22 | This could be done recursively, but some PDFs 23 | are quite deeply nested, so we do it without 24 | recursion. 25 | 26 | Note that we don't know exactly where things appear on pages, 27 | but we aim for a sort order that is (a) mostly in document order, 28 | and (b) reproducible. 
For arrays, objects are processed in 29 | array order, and for dicts, they are processed in key order. 30 | ''' 31 | container = (PdfDict, PdfArray) 32 | 33 | # Allow passing a list of pages, or a dict 34 | if isinstance(source, PdfDict): 35 | source = [source] 36 | else: 37 | source = list(source) 38 | 39 | visited = set() 40 | source.reverse() 41 | while source: 42 | obj = source.pop() 43 | if not isinstance(obj, container): 44 | continue 45 | myid = id(obj) 46 | if myid in visited: 47 | continue 48 | visited.add(myid) 49 | if isinstance(obj, PdfDict): 50 | if obj.Type in valid_types and obj.Subtype in valid_subtypes: 51 | yield obj 52 | obj = [y for (x, y) in sorted(obj.iteritems()) 53 | if x not in no_follow] 54 | else: 55 | # TODO: This forces resolution of any indirect objects in 56 | # the array. It may not be necessary. Don't know if 57 | # reversed() does any voodoo underneath the hood. 58 | # It's cheap enough for now, but might be removeable. 59 | obj and obj[0] 60 | source.extend(reversed(obj)) 61 | 62 | 63 | def wrap_object(obj, width, margin): 64 | ''' Wrap an xobj in its own page object. 
65 | ''' 66 | fmt = 'q %s 0 0 %s %s %s cm /MyImage Do Q' 67 | contents = PdfDict(indirect=True) 68 | subtype = obj.Subtype 69 | if subtype == PdfName.Form: 70 | contents._stream = obj.stream 71 | contents.Length = obj.Length 72 | contents.Filter = obj.Filter 73 | contents.DecodeParms = obj.DecodeParms 74 | resources = obj.Resources 75 | mbox = obj.BBox 76 | elif subtype == PdfName.Image: # Image 77 | xoffset = margin[0] 78 | yoffset = margin[1] 79 | cw = width - margin[0] - margin[2] 80 | iw, ih = float(obj.Width), float(obj.Height) 81 | ch = 1.0 * cw / iw * ih 82 | height = ch + margin[1] + margin[3] 83 | p = tuple(('%.9f' % x).rstrip('0').rstrip('.') for x in (cw, ch, xoffset, yoffset)) 84 | contents.stream = fmt % p 85 | resources = PdfDict(XObject=PdfDict(MyImage=obj)) 86 | mbox = PdfArray((0, 0, width, height)) 87 | else: 88 | raise TypeError("Expected Form or Image XObject") 89 | 90 | return PdfDict( 91 | indirect=True, 92 | Type=PdfName.Page, 93 | MediaBox=mbox, 94 | Resources=resources, 95 | Contents=contents, 96 | ) 97 | 98 | 99 | def trivial_xobjs(maxignore=300): 100 | ''' Ignore XObjects that trivially contain other XObjects. 101 | ''' 102 | ignore = set('q Q cm Do'.split()) 103 | Image = PdfName.Image 104 | 105 | def check(obj): 106 | if obj.Subtype == Image: 107 | return False 108 | s = obj.stream 109 | if len(s) < maxignore: 110 | s = (x for x in s.split() if not x.startswith('/') and 111 | x not in ignore) 112 | s = (x.replace('.', '').replace('-', '') for x in s) 113 | if not [x for x in s if not x.isdigit()]: 114 | return True 115 | return check 116 | 117 | 118 | def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72, 119 | image_only=False, ignore=trivial_xobjs(), 120 | wrap_object=wrap_object): 121 | ''' page_per_xobj wraps every XObj found 122 | in its own page object. 123 | width and margin are used to set image sizes. 
124 | ''' 125 | try: 126 | iter(margin) 127 | except: 128 | margin = [margin] 129 | while len(margin) < 4: 130 | margin *= 2 131 | 132 | if isinstance(xobj_iter, (list, dict)): 133 | xobj_iter = find_objects(xobj_iter) 134 | for obj in xobj_iter: 135 | if not ignore(obj): 136 | if not image_only or obj.Subtype == PdfName.IMage: 137 | yield wrap_object(obj, width, margin) 138 | -------------------------------------------------------------------------------- /pdfrw/objects/__init__.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | ''' 6 | Objects that can occur in PDF files. The most important 7 | objects are arrays and dicts. Either of these can be 8 | indirect or not, and dicts could have an associated 9 | stream. 10 | ''' 11 | from .pdfname import PdfName 12 | from .pdfdict import PdfDict, IndirectPdfDict 13 | from .pdfarray import PdfArray 14 | from .pdfobject import PdfObject 15 | from .pdfstring import PdfString 16 | from .pdfindirect import PdfIndirect 17 | 18 | __all__ = """PdfName PdfDict IndirectPdfDict PdfArray 19 | PdfObject PdfString PdfIndirect""".split() 20 | -------------------------------------------------------------------------------- /pdfrw/objects/pdfarray.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | from .pdfindirect import PdfIndirect 6 | from .pdfobject import PdfObject 7 | 8 | 9 | def _resolved(): 10 | pass 11 | 12 | 13 | class PdfArray(list): 14 | ''' A PdfArray maps the PDF file array object into a Python list. 15 | It has an indirect attribute which defaults to False. 
16 | ''' 17 | indirect = False 18 | 19 | def __init__(self, source=[]): 20 | self._resolve = self._resolver 21 | self.extend(source) 22 | 23 | def _resolver(self, isinstance=isinstance, enumerate=enumerate, 24 | listiter=list.__iter__, PdfIndirect=PdfIndirect, 25 | resolved=_resolved, PdfNull=PdfObject('null')): 26 | for index, value in enumerate(list.__iter__(self)): 27 | if isinstance(value, PdfIndirect): 28 | value = value.real_value() 29 | if value is None: 30 | value = PdfNull 31 | self[index] = value 32 | self._resolve = resolved 33 | 34 | def __getitem__(self, index, listget=list.__getitem__): 35 | self._resolve() 36 | return listget(self, index) 37 | 38 | try: 39 | def __getslice__(self, i, j, listget=list.__getslice__): 40 | self._resolve() 41 | return listget(self, i, j) 42 | except AttributeError: 43 | pass 44 | 45 | def __iter__(self, listiter=list.__iter__): 46 | self._resolve() 47 | return listiter(self) 48 | 49 | def count(self, item): 50 | self._resolve() 51 | return list.count(self, item) 52 | 53 | def index(self, item): 54 | self._resolve() 55 | return list.index(self, item) 56 | 57 | def remove(self, item): 58 | self._resolve() 59 | return list.remove(self, item) 60 | 61 | def sort(self, *args, **kw): 62 | self._resolve() 63 | return list.sort(self, *args, **kw) 64 | 65 | def pop(self, *args): 66 | self._resolve() 67 | return list.pop(self, *args) 68 | 69 | def __reversed__(self): 70 | self._resolve() 71 | return list.__reversed__(self) 72 | -------------------------------------------------------------------------------- /pdfrw/objects/pdfdict.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | from .pdfname import PdfName, BasePdfName 6 | from .pdfindirect import PdfIndirect 7 | from .pdfobject import PdfObject 8 | from ..py23_diffs import 
iteritems 9 | from ..errors import PdfParseError 10 | 11 | 12 | class _DictSearch(object): 13 | ''' Used to search for inheritable attributes. 14 | ''' 15 | 16 | def __init__(self, basedict): 17 | self.basedict = basedict 18 | 19 | def __getattr__(self, name, PdfName=PdfName): 20 | return self[PdfName(name)] 21 | 22 | def __getitem__(self, name, set=set, getattr=getattr, id=id): 23 | visited = set() 24 | mydict = self.basedict 25 | while 1: 26 | value = mydict[name] 27 | if value is not None: 28 | return value 29 | myid = id(mydict) 30 | assert myid not in visited 31 | visited.add(myid) 32 | mydict = mydict.Parent 33 | if mydict is None: 34 | return 35 | 36 | 37 | class _Private(object): 38 | ''' Used to store private attributes (not output to PDF files) 39 | on PdfDict classes 40 | ''' 41 | 42 | def __init__(self, pdfdict): 43 | vars(self)['pdfdict'] = pdfdict 44 | 45 | def __setattr__(self, name, value): 46 | vars(self.pdfdict)[name] = value 47 | 48 | 49 | class PdfDict(dict): 50 | ''' PdfDict objects are subclassed dictionaries 51 | with the following features: 52 | 53 | - Every key in the dictionary starts with "/" 54 | 55 | - A dictionary item can be deleted by assigning it to None 56 | 57 | - Keys that (after the initial "/") conform to Python 58 | naming conventions can also be accessed (set and retrieved) 59 | as attributes of the dictionary. E.g. mydict.Page is the 60 | same thing as mydict['/Page'] 61 | 62 | - Private attributes (not in the PDF space) can be set 63 | on the dictionary object attribute dictionary by using 64 | the private attribute: 65 | 66 | mydict.private.foo = 3 67 | mydict.foo = 5 68 | x = mydict.foo # x will now contain 3 69 | y = mydict['/foo'] # y will now contain 5 70 | 71 | Most standard adobe dictionary keys start with an upper case letter, 72 | so to avoid conflicts, it is best to start private attributes with 73 | lower case letters. 
74 | 75 | - PdfDicts have the following read-only properties: 76 | 77 | - private -- as discussed above, provides write access to 78 | dictionary's attributes 79 | - inheritable -- this creates and returns a "view" attribute 80 | that will search through the object hierarchy for 81 | any desired attribute, such as /Rotate or /MediaBox 82 | 83 | - PdfDicts also have the following special attributes: 84 | - indirect is not stored in the PDF dictionary, but in the object's 85 | attribute dictionary 86 | - stream is also stored in the object's attribute dictionary 87 | and will also update the stream length. 88 | - _stream will store in the object's attribute dictionary without 89 | updating the stream length. 90 | 91 | It is possible, for example, to have a PDF name such as "/indirect" 92 | or "/stream", but you cannot access such a name as an attribute: 93 | 94 | mydict.indirect -- accesses object's attribute dictionary 95 | mydict["/indirect"] -- accesses actual PDF dictionary 96 | ''' 97 | indirect = False 98 | stream = None 99 | 100 | _special = dict(indirect=('indirect', False), 101 | stream=('stream', True), 102 | _stream=('stream', False), 103 | ) 104 | 105 | def __setitem__(self, name, value, setter=dict.__setitem__, 106 | BasePdfName=BasePdfName, isinstance=isinstance): 107 | if not isinstance(name, BasePdfName): 108 | raise PdfParseError('Dict key %s is not a PdfName' % repr(name)) 109 | if value is not None: 110 | setter(self, name, value) 111 | elif name in self: 112 | del self[name] 113 | 114 | def __init__(self, *args, **kw): 115 | if args: 116 | if len(args) == 1: 117 | args = args[0] 118 | self.update(args) 119 | if isinstance(args, PdfDict): 120 | self.indirect = args.indirect 121 | self._stream = args.stream 122 | for key, value in iteritems(kw): 123 | setattr(self, key, value) 124 | 125 | def __getattr__(self, name, PdfName=PdfName): 126 | ''' If the attribute doesn't exist on the dictionary object, 127 | try to slap a '/' in front of it and get it 
out 128 | of the actual dictionary itself. 129 | ''' 130 | return self.get(PdfName(name)) 131 | 132 | def get(self, key, dictget=dict.get, isinstance=isinstance, 133 | PdfIndirect=PdfIndirect): 134 | ''' Get a value out of the dictionary, 135 | after resolving any indirect objects. 136 | ''' 137 | value = dictget(self, key) 138 | if isinstance(value, PdfIndirect): 139 | # We used to use self[key] here, but that does an 140 | # unwanted check on the type of the key (github issue #98). 141 | # Python will keep the old key object in the dictionary, 142 | # so that check is not necessary. 143 | value = value.real_value() 144 | if value is not None: 145 | dict.__setitem__(self, key, value) 146 | else: 147 | del self[key] 148 | return value 149 | 150 | def __getitem__(self, key): 151 | return self.get(key) 152 | 153 | def __setattr__(self, name, value, special=_special.get, 154 | PdfName=PdfName, vars=vars): 155 | ''' Set an attribute on the dictionary. Handle the keywords 156 | indirect, stream, and _stream specially (for content objects) 157 | ''' 158 | info = special(name) 159 | if info is None: 160 | self[PdfName(name)] = value 161 | else: 162 | name, setlen = info 163 | vars(self)[name] = value 164 | if setlen: 165 | notnone = value is not None 166 | self.Length = notnone and PdfObject(len(value)) or None 167 | 168 | def iteritems(self, dictiter=iteritems, 169 | isinstance=isinstance, PdfIndirect=PdfIndirect, 170 | BasePdfName=BasePdfName): 171 | ''' Iterate over the dictionary, resolving any unresolved objects 172 | ''' 173 | for key, value in list(dictiter(self)): 174 | if isinstance(value, PdfIndirect): 175 | self[key] = value = value.real_value() 176 | if value is not None: 177 | if not isinstance(key, BasePdfName): 178 | raise PdfParseError('Dict key %s is not a PdfName' % 179 | repr(key)) 180 | yield key, value 181 | 182 | def items(self): 183 | return list(self.iteritems()) 184 | 185 | def itervalues(self): 186 | for key, value in self.iteritems(): 187 | 
yield value 188 | 189 | def values(self): 190 | return list((value for key, value in self.iteritems())) 191 | 192 | def keys(self): 193 | return list((key for key, value in self.iteritems())) 194 | 195 | def __iter__(self): 196 | for key, value in self.iteritems(): 197 | yield key 198 | 199 | def iterkeys(self): 200 | return iter(self) 201 | 202 | def copy(self): 203 | return type(self)(self) 204 | 205 | def pop(self, key): 206 | value = self.get(key) 207 | del self[key] 208 | return value 209 | 210 | def popitem(self): 211 | key, value = dict.pop(self) 212 | if isinstance(value, PdfIndirect): 213 | value = value.real_value() 214 | return value 215 | 216 | def inheritable(self): 217 | ''' Search through ancestors as needed for inheritable 218 | dictionary items. 219 | NOTE: You might think it would be a good idea 220 | to cache this class, but then you'd have to worry 221 | about it pointing to the wrong dictionary if you 222 | made a copy of the object... 223 | ''' 224 | return _DictSearch(self) 225 | inheritable = property(inheritable) 226 | 227 | def private(self): 228 | ''' Allows setting private metadata for use in 229 | processing (not sent to PDF file). 230 | See note on inheritable 231 | ''' 232 | return _Private(self) 233 | private = property(private) 234 | 235 | 236 | class IndirectPdfDict(PdfDict): 237 | ''' IndirectPdfDict is a convenience class. You could 238 | create a direct PdfDict and then set indirect = True on it, 239 | or you could just create an IndirectPdfDict. 
240 | ''' 241 | indirect = True 242 | -------------------------------------------------------------------------------- /pdfrw/objects/pdfindirect.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | 6 | class _NotLoaded(object): 7 | pass 8 | 9 | 10 | class PdfIndirect(tuple): 11 | ''' A placeholder for an object that hasn't been read in yet. 12 | The object itself is the (object number, generation number) tuple. 13 | The attributes include information about where the object is 14 | referenced from and the file object to retrieve the real object from. 15 | ''' 16 | value = _NotLoaded 17 | 18 | def real_value(self, NotLoaded=_NotLoaded): 19 | value = self.value 20 | if value is NotLoaded: 21 | value = self.value = self._loader(self) 22 | return value 23 | -------------------------------------------------------------------------------- /pdfrw/objects/pdfname.py: -------------------------------------------------------------------------------- 1 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 2 | # Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 3 | # MIT license -- See LICENSE.txt for details 4 | 5 | import re 6 | 7 | from ..errors import log 8 | 9 | warn = log.warning 10 | 11 | 12 | class BasePdfName(str): 13 | ''' A PdfName is an identifier that starts with 14 | a slash. 15 | 16 | If a PdfName has illegal space or delimiter characters, 17 | then it will be decorated with an "encoded" attribute that 18 | has those characters properly escaped as # 19 | 20 | The "encoded" attribute is what is sent out to a PDF file, 21 | the non-encoded main object is what is compared for equality 22 | in a PDF dictionary. 
23 | ''' 24 | 25 | indirect = False 26 | encoded = None 27 | 28 | whitespace = '\x00 \t\f\r\n' 29 | delimiters = '()<>{}[]/%' 30 | forbidden = list(whitespace) + list('\\' + x for x in delimiters) 31 | remap = dict((x, '#%02X' % ord(x)) for x in (whitespace + delimiters)) 32 | split_to_encode = re.compile('(%s)' % '|'.join(forbidden)).split 33 | split_to_decode = re.compile(r'\#([0-9A-Fa-f]{2})').split 34 | 35 | def __new__(cls, name, pre_encoded=True, remap=remap, 36 | join=''.join, new=str.__new__, chr=chr, int=int, 37 | split_to_encode=split_to_encode, 38 | split_to_decode=split_to_decode, 39 | ): 40 | ''' We can build a PdfName from scratch, or from 41 | a pre-encoded name (e.g. coming in from a file). 42 | ''' 43 | # Optimization for normal case 44 | if name[1:].isalnum(): 45 | return new(cls, name) 46 | encoded = name 47 | if pre_encoded: 48 | if '#' in name: 49 | substrs = split_to_decode(name) 50 | substrs[1::2] = (chr(int(x, 16)) for x in substrs[1::2]) 51 | name = join(substrs) 52 | else: 53 | encoded = split_to_encode(encoded) 54 | encoded[3::2] = (remap[x] for x in encoded[3::2]) 55 | encoded = join(encoded) 56 | self = new(cls, name) 57 | if encoded != name: 58 | self.encoded = encoded 59 | return self 60 | 61 | 62 | # We could have used a metaclass, but this matches what 63 | # we were doing historically. 

class PdfName(object):
    ''' Two simple ways to get a PDF name from a string:

            x = PdfName.FooBar
            x = PdfName('FooBar')

        Either technique will return "/FooBar"
    '''

    def __getattr__(self, name, BasePdfName=BasePdfName):
        # Attribute access: PdfName.Foo -> '/Foo'.  pre_encoded=False so
        # any illegal characters in the name get #-escaped.
        return BasePdfName('/' + name, False)

    def __call__(self, name, BasePdfName=BasePdfName):
        # Call syntax: PdfName('Foo') -> '/Foo', same escaping as above.
        return BasePdfName('/' + name, False)

# Replace the class with a singleton instance so both PdfName.Foo and
# PdfName('Foo') dispatch through the instance above.
PdfName = PdfName()
--------------------------------------------------------------------------------
/pdfrw/objects/pdfobject.py:
--------------------------------------------------------------------------------
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details


class PdfObject(str):
    ''' A PdfObject is a textual representation of any PDF file object
        other than an array, dict or string. It has an indirect attribute
        which defaults to False.
    '''
    indirect = False
--------------------------------------------------------------------------------
/pdfrw/pagemerge.py:
--------------------------------------------------------------------------------
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
This module contains code to edit pages. Sort of a canvas, I
suppose, but I wouldn't want to call it that and get people all
excited or anything.

No, this is just for doing basic things like merging/splitting
apart pages, watermarking, etc. All it does is allow converting
pages (or parts of pages) into Form XObject rectangles, and then
plopping those down on new or pre-existing pages.
14 | ''' 15 | 16 | from .objects import PdfDict, PdfArray, PdfName 17 | from .buildxobj import pagexobj, ViewInfo 18 | 19 | NullInfo = ViewInfo() 20 | 21 | 22 | class RectXObj(PdfDict): 23 | ''' This class facilitates doing positioning (moving and scaling) 24 | of Form XObjects within their containing page, by modifying 25 | the Form XObject's transformation matrix. 26 | 27 | By default, this class keeps the aspect ratio locked. For 28 | example, if your object is foo, you can write 'foo.w = 200', 29 | and it will scale in both the x and y directions. 30 | 31 | To unlock the aspect ration, you have to do a tiny bit of math 32 | and call the scale function. 33 | ''' 34 | def __init__(self, page, viewinfo=NullInfo, **kw): 35 | ''' The page is a page returned by PdfReader. It will be 36 | turned into a cached Form XObject (so that multiple 37 | rectangles can be extracted from it if desired), and then 38 | another Form XObject will be built using it and the viewinfo 39 | (which should be a ViewInfo class). The viewinfo includes 40 | source coordinates (from the top/left) and rotation information. 41 | 42 | Once the object has been built, its destination coordinates 43 | may be examined and manipulated by using x, y, w, h, and 44 | scale. The destination coordinates are in the normal 45 | PDF programmatic system (starting at bottom left). 
46 | ''' 47 | if kw: 48 | if viewinfo is not NullInfo: 49 | raise ValueError("Cannot modify preexisting ViewInfo") 50 | viewinfo = ViewInfo(**kw) 51 | viewinfo.cacheable = False 52 | base = pagexobj(page, viewinfo) 53 | self.update(base) 54 | self.indirect = True 55 | self.stream = base.stream 56 | private = self.private 57 | private._rect = [base.x, base.y, base.w, base.h] 58 | matrix = self.Matrix 59 | if matrix is None: 60 | matrix = self.Matrix = PdfArray((1, 0, 0, 1, 0, 0)) 61 | private._matrix = matrix # Lookup optimization 62 | # Default to lower-left corner 63 | self.x = 0 64 | self.y = 0 65 | 66 | @property 67 | def x(self): 68 | ''' X location (from left) of object in points 69 | ''' 70 | return self._rect[0] 71 | 72 | @property 73 | def y(self): 74 | ''' Y location (from bottom) of object in points 75 | ''' 76 | return self._rect[1] 77 | 78 | @property 79 | def w(self): 80 | ''' Width of object in points 81 | ''' 82 | return self._rect[2] 83 | 84 | @property 85 | def h(self): 86 | ''' Height of object in points 87 | ''' 88 | return self._rect[3] 89 | 90 | def __setattr__(self, name, value, next=PdfDict.__setattr__, 91 | mine=set('x y w h'.split())): 92 | ''' The underlying __setitem__ won't let us use a property 93 | setter, so we have to fake one. 94 | ''' 95 | if name not in mine: 96 | return next(self, name, value) 97 | if name in 'xy': 98 | r_index, m_index = (0, 4) if name == 'x' else (1, 5) 99 | self._rect[r_index], old = value, self._rect[r_index] 100 | self._matrix[m_index] += value - old 101 | else: 102 | index = 2 + (value == 'h') 103 | self.scale(value / self._rect[index]) 104 | 105 | def scale(self, x_scale, y_scale=None): 106 | ''' Current scaling deals properly with things that 107 | have been rotated in 90 degree increments 108 | (via the ViewMerge object given when instantiating). 
class PageMerge(list):
    ''' A PageMerge object can have 0 or 1 underlying pages
        (that get edited with the results of the merge)
        and 0-n RectXObjs that can be applied before or
        after the underlying page.

        The list itself holds the objects to merge; a single None
        entry acts as a placeholder marking where the original page
        contents belong relative to the added XObjects.
    '''
    page = None        # Underlying page dict (or None for a new page)
    mbox = None        # Inherited /MediaBox
    cbox = None        # Inherited /CropBox
    resources = None   # Inherited /Resources
    rotate = None      # Inherited /Rotate
    contents = None    # Original page /Contents

    def __init__(self, page=None):
        if page is not None:
            self.setpage(page)

    def setpage(self, page):
        ''' Attach an underlying page; caches its inheritable
            attributes and inserts the None placeholder.
        '''
        if page.Type != PdfName.Page:
            raise TypeError("Expected page")
        self.append(None)  # Placeholder
        self.page = page
        inheritable = page.inheritable
        self.mbox = inheritable.MediaBox
        self.cbox = inheritable.CropBox
        self.resources = inheritable.Resources
        self.rotate = inheritable.Rotate
        self.contents = page.Contents

    def __add__(self, other):
        # Accept either a single dict or an iterable of objects.
        if isinstance(other, dict):
            other = [other]
        for other in other:
            self.add(other)
        return self

    def add(self, obj, prepend=False, **kw):
        ''' Add one object; pages (or anything with placement kwargs)
            are wrapped in a RectXObj first.  prepend=True places the
            object under the existing content.
        '''
        if kw:
            obj = RectXObj(obj, **kw)
        elif obj.Type == PdfName.Page:
            obj = RectXObj(obj)
        if prepend:
            self.insert(0, obj)
        else:
            self.append(obj)
        return self

    def render(self):
        ''' Merge everything into a single page dict and return it.
            Builds /XObject resource entries named /pdfrw_<n> and
            content streams that invoke them with the Do operator.
        '''
        def do_xobjs(xobj_list, restore_first=False):
            # Build one content stream that draws each xobj in turn.
            # 'Q' first restores the graphics state pushed before the
            # original page contents (see below).
            content = ['Q'] if restore_first else []
            for obj in xobj_list:
                index = PdfName('pdfrw_%d' % (key_offset + len(xobjs)))
                if xobjs.setdefault(index, obj) is not obj:
                    raise KeyError("XObj key %s already in use" % index)
                content.append('%s Do' % index)
            return PdfDict(indirect=True, stream='\n'.join(content))

        mbox = self.mbox
        cbox = self.cbox
        page = self.page
        old_contents = self.contents
        resources = self.resources or PdfDict()

        key_offset = 0
        xobjs = resources.XObject
        if xobjs is None:
            xobjs = resources.XObject = PdfDict()
        else:
            # Continue numbering after any existing /pdfrw_<n> keys so
            # new names never collide.  key_offset compensates for the
            # pre-existing entries counted by len(xobjs) in do_xobjs.
            allkeys = xobjs.keys()
            if allkeys:
                keys = (x for x in allkeys if x.startswith('/pdfrw_'))
                keys = (x for x in keys if x[7:].isdigit())
                keys = sorted(keys, key=lambda x: int(x[7:]))
                key_offset = (int(keys[-1][7:]) + 1) if keys else 0
                key_offset -= len(allkeys)

        if old_contents is None:
            new_contents = do_xobjs(self)
        else:
            isdict = isinstance(old_contents, PdfDict)
            old_contents = [old_contents] if isdict else old_contents
            new_contents = PdfArray()
            # The None placeholder marks where the original page goes.
            index = self.index(None)
            if index:
                new_contents.append(do_xobjs(self[:index]))

            index += 1
            if index < len(self):
                # There are elements to add after the original page contents,
                # so push the graphics state to the stack. Restored below.
                new_contents.append(PdfDict(indirect=True, stream='q'))

            new_contents.extend(old_contents)

            if index < len(self):
                # Restore graphics state and add other elements.
                new_contents.append(do_xobjs(self[index:], restore_first=True))

        if mbox is None:
            # No underlying page: size the new page to hold everything,
            # anchored at (0, 0) or below/left if objects extend there.
            cbox = None
            mbox = self.xobj_box
            mbox[0] = min(0, mbox[0])
            mbox[1] = min(0, mbox[1])

        page = PdfDict(indirect=True) if page is None else page
        page.Type = PdfName.Page
        page.Resources = resources
        page.MediaBox = mbox
        page.CropBox = cbox
        page.Rotate = self.rotate
        page.Contents = new_contents
        return page

    @property
    def xobj_box(self):
        ''' Return the smallest box that encloses every object
            in the list.
        '''
        a, b, c, d = zip(*(xobj.box for xobj in self))
        return PdfArray((min(a), min(b), max(c), max(d)))
# Sentinel substituted for "killed" objects that are still referenced
# somewhere in the output tree.
NullObject = PdfObject('null')
NullObject.indirect = True
NullObject.Type = 'Null object'


def user_fmt(obj, isinstance=isinstance, float=float, str=str,
             basestring=(type(u''), type(b'')), encode=PdfString.encode):
    ''' Format a leaf (non-container) object for output.
        This function may be replaced by the user for
        specialized formatting requirements.
    '''

    if isinstance(obj, basestring):
        return encode(obj)

    # PDFs don't handle exponent notation
    if isinstance(obj, float):
        return ('%.9f' % obj).rstrip('0').rstrip('.')

    return str(obj)


def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
                  user_fmt=user_fmt, do_compress=do_compress,
                  convert_store=convert_store, iteritems=iteritems,
                  id=id, isinstance=isinstance, getattr=getattr, len=len,
                  sum=sum, set=set, str=str, hasattr=hasattr, repr=repr,
                  enumerate=enumerate, list=list, dict=dict, tuple=tuple,
                  PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject):
    ''' FormatObjects performs the actual formatting and disk write.
        Should be a class, was a class, turned into nested functions
        for performance (to reduce attribute lookups).

        The long default-argument list intentionally binds globals and
        builtins as fast locals; callers normally pass only the first
        five parameters.
    '''

    def f_write(s):
        # All output funnels through here so 2/3 str-vs-bytes
        # conversion happens in exactly one place.
        f.write(convert_store(s))

    def add(obj):
        ''' Add an object to our list, if it's an indirect
            object. Just format it if not.

            Returns either the formatted object text, or an
            "<objnum> 0 R" reference for indirect objects.
        '''
        # Can't hash dicts, so just hash the object ID
        objid = id(obj)

        # Automatically set stream objects to indirect
        if isinstance(obj, PdfDict):
            indirect = obj.indirect or (obj.stream is not None)
        else:
            indirect = getattr(obj, 'indirect', False)

        if not indirect:
            if objid in visited:
                # A direct object appearing twice would otherwise
                # recurse forever; copy it to break the cycle.
                log.warning('Replicating direct %s object, '
                            'should be indirect for optimal file size' %
                            type(obj))
                obj = type(obj)(obj)
                objid = id(obj)
            visiting(objid)
            result = format_obj(obj)
            leaving(objid)
            return result

        objnum = indirect_dict_get(objid)

        # If we haven't seen the object yet, we need to
        # add it to the indirect object list.
        if objnum is None:
            # Killed objects (old catalog/pages) are swapped for their
            # replacements before being assigned a number.
            swapped = swapobj(objid)
            if swapped is not None:
                old_id = objid
                obj = swapped
                objid = id(obj)
                objnum = indirect_dict_get(objid)
                if objnum is not None:
                    indirect_dict[old_id] = objnum
                    return '%s 0 R' % objnum
            objnum = len(objlist) + 1
            objlist_append(None)  # Reserve the slot; filled by format_deferred
            indirect_dict[objid] = objnum
            deferred.append((objnum - 1, obj))
        return '%s 0 R' % objnum

    def format_array(myarray, formatter):
        # Format array data into semi-readable ASCII
        if sum([len(x) for x in myarray]) <= 70:
            return formatter % space_join(myarray)
        return format_big(myarray, formatter)

    def format_big(myarray, formatter):
        # Wrap tokens so output lines stay around 71 characters.
        bigarray = []
        count = 1000000  # Force creation of the first subarray
        for x in myarray:
            lenx = len(x) + 1
            count += lenx
            if count > 71:
                subarray = []
                bigarray.append(subarray)
                count = lenx
            subarray.append(x)
        return formatter % lf_join([space_join(x) for x in bigarray])

    def format_obj(obj):
        ''' format PDF object data into semi-readable ASCII.
            May mutually recurse with add() -- add() will
            return references for indirect objects, and add
            the indirect object to the list.
        '''
        while 1:
            if isinstance(obj, (list, dict, tuple)):
                if isinstance(obj, PdfArray):
                    myarray = [add(x) for x in obj]
                    return format_array(myarray, '[%s]')
                elif isinstance(obj, PdfDict):
                    if compress and obj.stream:
                        do_compress([obj])
                    # Sort keys for deterministic output; prefer the
                    # original encoded form of each key when present.
                    pairs = sorted((getattr(x, 'encoded', None) or x, y)
                                   for (x, y) in obj.iteritems())
                    myarray = []
                    for key, value in pairs:
                        myarray.append(key)
                        myarray.append(add(value))
                    result = format_array(myarray, '<<%s>>')
                    stream = obj.stream
                    if stream is not None:
                        result = ('%s\nstream\n%s\nendstream' %
                                  (result, stream))
                    return result
                # Plain list/tuple/dict: promote to the matching Pdf
                # container type and loop to format it.
                obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
                continue

            # We assume that an object with an indirect
            # attribute knows how to represent itself to us.
            if hasattr(obj, 'indirect'):
                return str(getattr(obj, 'encoded', None) or obj)
            return user_fmt(obj)

    def format_deferred():
        # Fill in reserved objlist slots breadth-wise; doing it here
        # instead of recursing inside format_obj avoids hitting the
        # recursion limit on deep files.
        while deferred:
            index, obj = deferred.pop()
            objlist[index] = format_obj(obj)

    indirect_dict = {}
    indirect_dict_get = indirect_dict.get
    objlist = []
    objlist_append = objlist.append
    visited = set()
    visiting = visited.add
    leaving = visited.remove
    space_join = ' '.join
    lf_join = '\n '.join

    deferred = []

    # Don't reference old catalog or pages objects --
    # swap references to new ones.
    type_remap = {PdfName.Catalog: trailer.Root,
                  PdfName.Pages: trailer.Root.Pages, None: trailer}.get
    swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj)
               for objid, (obj, new_obj) in iteritems(killobj)]
    swapobj = dict((objid, obj is None and NullObject or obj)
                   for objid, obj in swapobj).get

    for objid in killobj:
        assert swapobj(objid) is not None

    # The first format of trailer gets all the information,
    # but we throw away the actual trailer formatting.
    format_obj(trailer)
    # Keep formatting until we're done.
    # (Used to recurse inside format_obj for this, but
    # hit system limit.)
    format_deferred()
    # Now we know the size, so we update the trailer dict
    # and get the formatted data.
    trailer.Size = PdfObject(len(objlist) + 1)
    trailer = format_obj(trailer)

    # Now we have all the pieces to write out to the file.
    # Keep careful track of the counts while we do it so
    # we can correctly build the cross-reference.

    # The second comment line becomes high-bit bytes after the
    # Latin-1 encode in f_write, marking the file as binary.
    header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
    f_write(header)
    offset = len(header)
    offsets = [(0, 65535, 'f')]  # xref entry 0 is always free
    offsets_append = offsets.append

    for i, x in enumerate(objlist):
        objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
        offsets_append((offset, 0, 'n'))
        offset += len(objstr)
        f_write(objstr)

    # Cross-reference table entries are fixed-width per the PDF spec.
    f_write('xref\n0 %s\n' % len(offsets))
    for x in offsets:
        f_write('%010d %05d %s\r\n' % x)
    f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
    def addpage(self, page):
        ''' Append one page (a /Type /Page PdfDict) to the output.
            Returns self so calls can be chained.
        '''
        self._trailer = None  # Invalidate any previously built trailer
        if page.Type != PdfName.Page:
            raise PdfOutputError('Bad /Type: Expected %s, found %s'
                                 % (PdfName.Page, page.Type))
        inheritable = page.inheritable  # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources=inheritable.Resources,
                MediaBox=inheritable.MediaBox,
                CropBox=inheritable.CropBox,
                Rotate=inheritable.Rotate,
            )
        )

        # Add parents in the hierarchy to objects we
        # don't want to output
        killobj = self.killobj
        obj, new_obj = page, self.pagearray[-1]
        while obj is not None:
            objid = id(obj)
            if objid in killobj:
                break
            killobj[objid] = obj, new_obj
            obj = obj.Parent
            new_obj = None  # Only the page itself has a replacement
        return self

    addPage = addpage  # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Append every page in pagelist; returns self. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        ''' Lazily build (and cache) the trailer dict with a fresh
            /Root catalog and /Pages tree over self.pagearray.
        '''
        trailer = self._trailer
        if trailer is not None:
            return trailer

        if self.canonicalize:
            self.make_canonical()

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root=IndirectPdfDict(
                Type=PdfName.Catalog,
                Pages=IndirectPdfDict(
                    Type=PdfName.Pages,
                    Count=PdfObject(len(self.pagearray)),
                    Kids=self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary and
        # ensure they are indirect references
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
            page.indirect = True
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname=None, trailer=None, user_fmt=user_fmt,
              disable_gc=True):
        ''' Write the PDF out.  fname may be a path or a binary
            file-like object, and must be given either here or at
            construction time -- exactly once.
        '''
        trailer = trailer or self.trailer

        # Support fname for legacy applications
        if (fname is not None) == (self.fname is not None):
            raise PdfOutputError(
                "PdfWriter fname must be specified exactly once")

        fname = fname or self.fname

        # Dump the data. We either have a filename or a preexisting
        # file object.
        preexisting = hasattr(fname, 'write')
        f = preexisting and fname or open(fname, 'wb')
        if disable_gc:
            gc.disable()

        try:
            FormatObjects(f, trailer, self.version, self.compress,
                          self.killobj, user_fmt=user_fmt)
        finally:
            if not preexisting:
                f.close()
            if disable_gc:
                # NOTE(review): unconditionally re-enables gc even if the
                # caller had it disabled before entry -- confirm intended.
                gc.enable()

    def make_canonical(self):
        ''' Canonicalizes a PDF. Assumes everything
            is a Pdf object already.

            Walks the page tree and marks every container (PdfArray /
            PdfDict) indirect and everything else direct.
        '''
        visited = set()
        workitems = list(self.pagearray)
        while workitems:
            obj = workitems.pop()
            objid = id(obj)
            if objid in visited:
                continue
            visited.add(objid)
            obj.indirect = False
            if isinstance(obj, (PdfArray, PdfDict)):
                obj.indirect = True
                if isinstance(obj, PdfArray):
                    workitems += obj
                else:
                    workitems += obj.values()

    # Names defined in the class body so far == attributes that may be
    # overridden via PdfWriter(**kwargs).
    replaceable = set(vars())
class PdfTokens(object):
    ''' Tokenizer for PDF data streams.

        An instance is an iterator over the tokens found in self.fdata
        starting at startloc; tokens are PdfObject, PdfString,
        BasePdfName, or interned plain strings (for delimiters).
    '''

    # Table 3.1, page 50 of reference, defines whitespace
    eol = '\n\r'
    whitespace = '\x00 \t\f' + eol

    # Text on page 50 defines delimiter characters
    # Escape the ]
    delimiters = r'()<>{}[\]/%'

    # "normal" stuff is all but delimiters or whitespace.

    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters,
                                             whitespace)

    p_comment = r'\%%[^%s]*' % eol

    # This will get the bulk of literal strings.
    p_literal_string = r'\((?:[^\\()]+|\\.)*[()]?'

    # This will get more pieces of literal strings
    # (Don't ask me why, but it hangs without the trailing ?.)
    p_literal_string_extend = r'(?:[^\\()]+|\\.)*[()]?'

    # A hex string. This one's easy.
    p_hex_string = r'\<[%s0-9A-Fa-f]*\>' % whitespace

    p_dictdelim = r'\<\<|\>\>'
    p_name = r'/[^%s%s]*' % (delimiters, whitespace)

    p_catchall = '[^%s]' % whitespace

    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim,
                        p_literal_string, p_comment, p_catchall])
    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace),
                         re.DOTALL).finditer
    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
                                          whitespace), re.DOTALL).finditer

    def _gettoks(self, startloc, intern=intern,
                 delimiters=delimiters, findtok=findtok,
                 findparen=findparen, PdfString=PdfString,
                 PdfObject=PdfObject, BasePdfName=BasePdfName):
        ''' Given a source data string and a location inside it,
            gettoks generates tokens.  Each token is a tuple of the form:
             <starting file loc>, <ending file loc>, <token string>
            The ending file loc is past any trailing whitespace.

            The main complication here is the literal strings, which
            can contain nested parentheses.  In order to cope with these
            we can discard the current iterator and loop back to the
            top to get a fresh one.

            We could use re.search instead of re.finditer, but that's slower.
        '''
        fdata = self.fdata
        current = self.current = [(startloc, startloc)]
        cache = {}
        get_cache = cache.get
        while 1:
            for match in findtok(fdata, current[0][1]):
                current[0] = tokspan = match.span()
                token = match.group(1)
                firstch = token[0]
                toktype = intern
                if firstch not in delimiters:
                    toktype = PdfObject
                elif firstch in '/<(%':
                    if firstch == '/':
                        # PDF Name
                        toktype = BasePdfName
                    elif firstch == '<':
                        # << dict delim, or < hex string >
                        if token[1:2] != '<':
                            toktype = PdfString
                    elif firstch == '(':
                        # Literal string
                        # It's probably simple, but maybe not
                        # Nested parentheses are a bear, and if
                        # they are present, we exit the for loop
                        # and get back in with a new starting location.
                        ends = None  # For broken strings
                        if fdata[match.end(1) - 1] != ')':
                            nest = 2
                            m_start, loc = tokspan
                            for match in findparen(fdata, loc):
                                loc = match.end(1)
                                ending = fdata[loc - 1] == ')'
                                nest += 1 - ending * 2
                                if not nest:
                                    break
                                if ending and ends is None:
                                    ends = loc, match.end(), nest
                            token = fdata[m_start:loc]
                            current[0] = m_start, match.end()
                            if nest:
                                # There is one possible recoverable error
                                # seen in the wild -- some stupid generators
                                # don't escape (.  If this happens, just
                                # terminate on first unescaped ).  The string
                                # won't be quite right, but that's a science
                                # fair project for another time.
                                (self.error, self.exception)[not ends](
                                    'Unterminated literal string')
                                loc, ends, nest = ends
                                token = fdata[m_start:loc] + ')' * nest
                                current[0] = m_start, ends
                        toktype = PdfString
                    elif firstch == '%':
                        # Comment
                        if self.strip_comments:
                            continue
                else:
                    self.exception(('Tokenizer logic incorrect -- '
                                    'should never get here'))

                newtok = get_cache(token)
                if newtok is None:
                    newtok = cache[token] = toktype(token)
                yield newtok
                if current[0] is not tokspan:
                    break
            else:
                if self.strip_comments:
                    break
                # BUGFIX: this was "raise StopIteration".  Under PEP 479
                # (mandatory since Python 3.7) raising StopIteration
                # inside a generator is turned into a RuntimeError
                # ("generator raised StopIteration"), crashing the
                # tokenizer.  A bare return ends the generator with
                # identical semantics on every Python version.
                return

    def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True):
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.iterator = iterator = self._gettoks(startloc)
        # msgs_dumped is a dedup set when not verbose, else None
        self.msgs_dumped = None if verbose else set()
        self.next = getattr(iterator, nextattr)
        self.current = [(startloc, startloc)]

    def setstart(self, startloc):
        ''' Change the starting location.
        '''
        current = self.current
        if startloc != current[0][1]:
            current[0] = startloc, startloc

    def floc(self):
        ''' Return the current file position
            (where the next token will be retrieved)
        '''
        return self.current[0][1]
    floc = property(floc, setstart)

    def tokstart(self):
        ''' Return the file position of the most
            recently retrieved token.
        '''
        return self.current[0][0]
    tokstart = property(tokstart, setstart)

    def __iter__(self):
        return self.iterator

    def multiple(self, count, islice=itertools.islice, list=list):
        ''' Retrieve multiple tokens
        '''
        return list(islice(self, count))

    def next_default(self, default='nope'):
        ''' Like next(), but returns default at end of input
            instead of raising StopIteration.
        '''
        for result in self:
            return result
        return default

    def msg(self, msg, *arg):
        ''' Format a diagnostic with line/column/token context.
            Returns None when the (non-verbose) dedup set has already
            seen this message.
        '''
        dumped = self.msgs_dumped
        if dumped is not None:
            if msg in dumped:
                return
            dumped.add(msg)
        if arg:
            msg %= arg
        fdata = self.fdata
        begin, end = self.current[0]
        if begin >= len(fdata):
            return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata))
        line, col = linepos(fdata, begin)
        if end > begin:
            tok = fdata[begin:end].rstrip()
            if len(tok) > 30:
                tok = tok[:26] + ' ...'
            return ('%s (line=%d, col=%d, token=%s)' %
                    (msg, line, col, repr(tok)))
        return '%s (line=%d, col=%d)' % (msg, line, col)

    def warning(self, *arg):
        s = self.msg(*arg)
        if s:
            log.warning(s)

    def error(self, *arg):
        s = self.msg(*arg)
        if s:
            log.error(s)

    def exception(self, *arg):
        raise PdfParseError(self.msg(*arg))
14 | 15 | Parameters: 16 | canv - a reportlab "canvas" (also accepts a "document") 17 | pdfobj - a pdfrw PDF object 18 | 19 | Returns: 20 | A corresponding reportlab object, or if the 21 | object is a PDF Form XObject, the name to 22 | use with reportlab for the object. 23 | 24 | Will recursively convert all necessary objects. 25 | Be careful when converting a page -- if /Parent is set, 26 | will recursively convert all pages! 27 | 28 | Notes: 29 | 1) Original objects are annotated with a 30 | derived_rl_obj attribute which points to the 31 | reportlab object. This keeps multiple reportlab 32 | objects from being generated for the same pdfobj 33 | via repeated calls to makerl. This is great for 34 | not putting too many objects into the 35 | new PDF, but not so good if you are modifying 36 | objects for different pages. Then you 37 | need to do your own deep copying (of circular 38 | structures). You're on your own. 39 | 40 | 2) ReportLab seems weird about FormXObjects. 41 | They pass around a partial name instead of the 42 | object or a reference to it. So we have to 43 | reach into reportlab and get a number for 44 | a unique name. I guess this is to make it 45 | where you can combine page streams with 46 | impunity, but that's just a guess. 47 | 48 | 3) Updated 1/23/2010 to handle multipass documents 49 | (e.g. with a table of contents). These have 50 | a different doc object on every pass. 
# Short aliases for the reportlab object classes we emit.
RLStream = rldocmodule.PDFStream
RLDict = rldocmodule.PDFDictionary
RLArray = rldocmodule.PDFArray


def _makedict(rldoc, pdfobj):
    ''' Convert a (non-stream) PdfDict to a reportlab dictionary,
        registering it in pdfobj.derived_rl_obj BEFORE recursing so
        circular references terminate.
    '''
    rlobj = rldict = RLDict()
    if pdfobj.indirect:
        rlobj.__RefOnly__ = 1
        rlobj = rldoc.Reference(rlobj)
    pdfobj.derived_rl_obj[rldoc] = rlobj, None

    for key, value in pdfobj.iteritems():
        # Strip the leading '/' from pdfrw names for reportlab keys.
        rldict[key[1:]] = makerl_recurse(rldoc, value)

    return rlobj


def _makestream(rldoc, pdfobj, xobjtype=PdfName.XObject):
    ''' Convert a stream-bearing PdfDict to a reportlab stream.
        Form XObjects additionally get a unique reportlab name
        (see module notes -- reportlab passes names around, not
        object references).
    '''
    rldict = RLDict()
    rlobj = RLStream(rldict, convert_store(pdfobj.stream))

    if pdfobj.Type == xobjtype:
        shortname = 'pdfrw_%s' % (rldoc.objectcounter + 1)
        fullname = rldoc.getXObjectName(shortname)
    else:
        shortname = fullname = None
    result = rldoc.Reference(rlobj, fullname)
    # Cache before recursing to break circular references.
    pdfobj.derived_rl_obj[rldoc] = result, shortname

    for key, value in pdfobj.iteritems():
        rldict[key[1:]] = makerl_recurse(rldoc, value)

    return result


def _makearray(rldoc, pdfobj):
    ''' Convert a PdfArray to a reportlab array; caches before
        recursing, like _makedict.
    '''
    rlobj = rlarray = RLArray([])
    if pdfobj.indirect:
        rlobj.__RefOnly__ = 1
        rlobj = rldoc.Reference(rlobj)
    pdfobj.derived_rl_obj[rldoc] = rlobj, None

    mylist = rlarray.sequence
    for value in pdfobj:
        mylist.append(makerl_recurse(rldoc, value))

    return rlobj


def _makestr(rldoc, pdfobj):
    ''' Convert a leaf (string/number) object to its string form. '''
    assert isinstance(pdfobj, (float, int, str)), repr(pdfobj)
    # TODO: Add fix for float like in pdfwriter
    return str(getattr(pdfobj, 'encoded', None) or pdfobj)


def makerl_recurse(rldoc, pdfobj):
    ''' Return the reportlab object derived from pdfobj for rldoc,
        creating and caching it (keyed by rldoc, to support multipass
        documents) on first sight.
    '''
    docdict = getattr(pdfobj, 'derived_rl_obj', None)
    if docdict is not None:
        value = docdict.get(rldoc)
        if value is not None:
            return value[0]
    if isinstance(pdfobj, PdfDict):
        if pdfobj.stream is not None:
            func = _makestream
        else:
            func = _makedict
        if docdict is None:
            pdfobj.private.derived_rl_obj = {}
    elif isinstance(pdfobj, PdfArray):
        func = _makearray
        if docdict is None:
            pdfobj.derived_rl_obj = {}
    else:
        func = _makestr
    return func(rldoc, pdfobj)


def makerl(canv, pdfobj):
    ''' Public entry point: convert pdfobj for a reportlab canvas
        (or document).  For Form XObjects, returns the reportlab
        name to use; otherwise returns the converted object.
    '''
    try:
        rldoc = canv._doc
    except AttributeError:
        # Caller passed a document rather than a canvas.
        rldoc = canv
    rlobj = makerl_recurse(rldoc, pdfobj)
    try:
        name = pdfobj.derived_rl_obj[rldoc][1]
    except AttributeError:
        # Leaf objects have no derived_rl_obj cache.
        name = None
    return name or rlobj
def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
    ''' Yield the members of mylist that are stream-bearing PdfDicts. '''
    for obj in mylist:
        if isinstance(obj, PdfDict) and obj.stream is not None:
            yield obj

# Hack so we can import if zlib not available
decompressobj = zlib if zlib is None else zlib.decompressobj


def uncompress(mylist, leave_raw=False, warnings=set(),
               flate=PdfName.FlateDecode, decompress=decompressobj,
               isinstance=isinstance, list=list, len=len):
    ''' Uncompress (in place) the streams of the PDF objects in mylist.

        Only a single /FlateDecode filter (optionally with PNG
        predictors) is handled; anything else is logged and skipped.
        Returns True iff every stream encountered was either already
        uncompressed or was successfully uncompressed.

        NOTE: the mutable default "warnings" is deliberate -- it is a
        cross-call cache so each distinct warning is logged only once.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms or obj.DP
        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            try:
                data = dco.decompress(convert_store(obj.stream))
            except Exception as s:
                error = str(s)
            else:
                error = None
            # DecodeParms may be an array of parameter dicts; merge them.
            if isinstance(parms, PdfArray):
                oldparms = parms
                parms = PdfDict()
                for x in oldparms:
                    parms.update(x)
            # BUGFIX: only attempt predictor post-processing when the
            # inflate step succeeded.  Previously a failed decompress
            # left "data" unbound, so reaching flate_png() here raised
            # UnboundLocalError and masked the real error message.
            if error is None and parms:
                predictor = int(parms.Predictor or 1)
                columns = int(parms.Columns or 1)
                colors = int(parms.Colors or 1)
                bpc = int(parms.BitsPerComponent or 8)
                if 10 <= predictor <= 15:
                    data, error = flate_png(data, predictor,
                                            columns, colors, bpc)
                elif predictor != 1:
                    error = ('Unsupported flatedecode predictor %s' %
                             repr(predictor))
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data if leave_raw else convert_load(data)
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False
    return ok
def flate_png_impl(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' Reverse the PNG prediction filters on a decompressed stream.

        Returns (data, error): on success an array.array('B') of the
        reconstructed bytes (filter-type bytes removed) and None; on
        failure (None, message).

        References:
        http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
        https://www.w3.org/TR/2003/REC-PNG-20031110/#9Filters

        Reconstruction nomenclature:
        x: the byte being filtered;
        a: the byte corresponding to x in the pixel immediately before
           the pixel containing x (or the byte immediately before x,
           when the bit depth is less than 8);
        b: the byte corresponding to x in the previous scanline;
        c: the byte corresponding to b in the pixel immediately before
           the pixel containing b (or the byte immediately before b,
           when the bit depth is less than 8).
    '''

    def subfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 1: Sub
        # Recon(x) = Filt(x) + Recon(a)
        for i in xrange(pixel_size, length):
            left = data[start + i - pixel_size]
            data[start + i] = (data[start + i] + left) % 256

    def upfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 2: Up
        # Recon(x) = Filt(x) + Recon(b)
        for i in xrange(length):
            up = prior_row_data[i]
            data[start + i] = (data[start + i] + up) % 256

    def avgfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 3: Avg
        # Recon(x) = Filt(x) + floor((Recon(a) + Recon(b)) / 2)
        for i in xrange(length):
            left = data[start + i - pixel_size] if i >= pixel_size else 0
            up = prior_row_data[i]
            floor = math.floor((left + up) / 2)
            data[start + i] = (data[start + i] + int(floor)) % 256

    def paethfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 4: Paeth
        # Recon(x) = Filt(x) + PaethPredictor(Recon(a), Recon(b), Recon(c))
        def paeth_predictor(a, b, c):
            p = a + b - c
            pa = abs(p - a)
            pb = abs(p - b)
            pc = abs(p - c)
            if pa <= pb and pa <= pc:
                return a
            elif pb <= pc:
                return b
            else:
                return c
        for i in xrange(length):
            left = data[start + i - pixel_size] if i >= pixel_size else 0
            up = prior_row_data[i]
            up_left = prior_row_data[i - pixel_size] if i >= pixel_size else 0
            data[start + i] = (data[start + i] +
                               paeth_predictor(left, up, up_left)) % 256

    columnbytes = ((columns * colors * bpc) + 7) // 8
    pixel_size = (colors * bpc + 7) // 8
    data = array.array('B', data)
    rowlen = columnbytes + 1  # each row is prefixed by a filter-type byte
    if predictor == 15:
        # Predictor 15 ("optimum") streams seen in the wild are
        # sometimes truncated; pad to a whole number of rows.
        padding = (rowlen - len(data)) % rowlen
        data.extend([0] * padding)
    if len(data) % rowlen:
        # BUGFIX: this was an assert, which crashed on malformed input
        # (and vanishes under python -O).  Report it like the other
        # failure modes so the caller can log it gracefully.
        return None, ('Corrupt PNG predictor data: length %d is not a '
                      'multiple of the row length %d' % (len(data), rowlen))

    rows = xrange(0, len(data), rowlen)
    prior_row_data = [0 for i in xrange(columnbytes)]
    for row_index in rows:

        filter_type = data[row_index]

        if filter_type == 0:  # None filter
            pass

        elif filter_type == 1:  # Sub filter
            subfilter(data, prior_row_data, row_index + 1,
                      columnbytes, pixel_size)

        elif filter_type == 2:  # Up filter
            upfilter(data, prior_row_data, row_index + 1,
                     columnbytes, pixel_size)

        elif filter_type == 3:  # Average filter
            avgfilter(data, prior_row_data, row_index + 1,
                      columnbytes, pixel_size)

        elif filter_type == 4:  # Paeth filter
            paethfilter(data, prior_row_data, row_index + 1,
                        columnbytes, pixel_size)

        else:
            return None, 'Unsupported PNG filter %d' % filter_type

        # Reconstructed row (without filter_type) feeds the next row.
        prior_row_data = data[row_index + 1: row_index + 1 + columnbytes]

    # Strip the leading filter-type byte from every row in one O(n)
    # pass.  (Popping each byte from the array in place was O(n**2).)
    result = array.array('B')
    for row_index in rows:
        result.extend(data[row_index + 1: row_index + 1 + columnbytes])

    return result, None


def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' PNG prediction is used to make certain kinds of data
        more compressible.  Before the compression, each data
        byte is either left the same, or is set to be a delta
        from the previous byte, or is set to be a delta from
        the previous row.  This selection is done on a per-row
        basis, and is indicated by a compression type byte
        prepended to each row of data.

        Within more recent PDF files, it is normal to use
        this technique for Xref stream objects, which are
        quite regular.

        Returns (bytes, None) on success, or (None, error message).
    '''
    d, e = flate_png_impl(data, predictor, columns, colors, bpc)
    if d is not None:
        d = from_array(d)
    return d, e
#!/usr/bin/env python

"""Package installer for pdfrw, a pure-Python PDF reader/writer library."""

from setuptools import setup

from pdfrw import __version__ as version
from pdfrw.py23_diffs import convert_load


def _long_description():
    # Read README.rst as bytes and decode it with the project's own
    # Python 2/3 compatibility helper.  The context manager closes the
    # file handle promptly; the original `open(...).read()` leaked it.
    with open("README.rst", 'rb') as f:
        return convert_load(f.read())


setup(
    name='pdfrw',
    version=version,
    description='PDF file reader/writer library',
    long_description=_long_description(),
    author='Patrick Maupin',
    author_email='pmaupin@gmail.com',
    platforms='Independent',
    url='https://github.com/pmaupin/pdfrw',
    packages=['pdfrw', 'pdfrw.objects'],
    license='MIT',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Multimedia :: Graphics :: Graphics Conversion',
        'Topic :: Software Development :: Libraries',
        'Topic :: Text Processing',
        'Topic :: Printing',
        'Topic :: Utilities',
    ],
    keywords='pdf vector graphics PDF nup watermark split join merge',
    zip_safe=True,
)
2 | -------------------------------------------------------------------------------- /tests/basn0g08.png.log: -------------------------------------------------------------------------------- 1 | width = 32 2 | bit_depth = 8 3 | channels = 1 4 | color_type = 0 5 | pixel_depth = 8 6 | rowbytes = 32 7 | filter = 1 8 | data = [ 0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 9 | expected = [ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, ] 10 | width = 32 11 | bit_depth = 8 12 | channels = 1 13 | color_type = 0 14 | pixel_depth = 8 15 | rowbytes = 32 16 | filter = 1 17 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 18 | expected = [ 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, ] 19 | width = 32 20 | bit_depth = 8 21 | channels = 1 22 | color_type = 0 23 | pixel_depth = 8 24 | rowbytes = 32 25 | filter = 4 26 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 27 | expected = [ 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, ] 28 | width = 32 29 | bit_depth = 8 30 | channels = 1 31 | color_type = 0 32 | pixel_depth = 8 33 | rowbytes = 32 34 | filter = 4 35 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 36 | expected = [ 
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, ] 37 | width = 32 38 | bit_depth = 8 39 | channels = 1 40 | color_type = 0 41 | pixel_depth = 8 42 | rowbytes = 32 43 | filter = 4 44 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 45 | expected = [ 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, ] 46 | width = 32 47 | bit_depth = 8 48 | channels = 1 49 | color_type = 0 50 | pixel_depth = 8 51 | rowbytes = 32 52 | filter = 4 53 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 54 | expected = [ 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, ] 55 | width = 32 56 | bit_depth = 8 57 | channels = 1 58 | color_type = 0 59 | pixel_depth = 8 60 | rowbytes = 32 61 | filter = 4 62 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 63 | expected = [ 0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf, ] 64 | width = 32 65 | bit_depth = 8 66 | channels = 1 67 | color_type = 0 68 | pixel_depth = 8 69 | rowbytes = 32 70 | filter = 1 71 | data = [ 0xe0,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 72 | expected = [ 
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff, ] 73 | width = 32 74 | bit_depth = 8 75 | channels = 1 76 | color_type = 0 77 | pixel_depth = 8 78 | rowbytes = 32 79 | filter = 1 80 | data = [ 0xfe,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 81 | expected = [ 0xfe,0xfd,0xfc,0xfb,0xfa,0xf9,0xf8,0xf7,0xf6,0xf5,0xf4,0xf3,0xf2,0xf1,0xf0,0xef,0xee,0xed,0xec,0xeb,0xea,0xe9,0xe8,0xe7,0xe6,0xe5,0xe4,0xe3,0xe2,0xe1,0xe0,0xdf, ] 82 | width = 32 83 | bit_depth = 8 84 | channels = 1 85 | color_type = 0 86 | pixel_depth = 8 87 | rowbytes = 32 88 | filter = 4 89 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 90 | expected = [ 0xde,0xdd,0xdc,0xdb,0xda,0xd9,0xd8,0xd7,0xd6,0xd5,0xd4,0xd3,0xd2,0xd1,0xd0,0xcf,0xce,0xcd,0xcc,0xcb,0xca,0xc9,0xc8,0xc7,0xc6,0xc5,0xc4,0xc3,0xc2,0xc1,0xc0,0xbf, ] 91 | width = 32 92 | bit_depth = 8 93 | channels = 1 94 | color_type = 0 95 | pixel_depth = 8 96 | rowbytes = 32 97 | filter = 4 98 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 99 | expected = [ 0xbe,0xbd,0xbc,0xbb,0xba,0xb9,0xb8,0xb7,0xb6,0xb5,0xb4,0xb3,0xb2,0xb1,0xb0,0xaf,0xae,0xad,0xac,0xab,0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3,0xa2,0xa1,0xa0,0x9f, ] 100 | width = 32 101 | bit_depth = 8 102 | channels = 1 103 | color_type = 0 104 | pixel_depth = 8 105 | rowbytes = 32 106 | filter = 4 107 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 108 | expected = [ 
0x9e,0x9d,0x9c,0x9b,0x9a,0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92,0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8b,0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83,0x82,0x81,0x80,0x7f, ] 109 | width = 32 110 | bit_depth = 8 111 | channels = 1 112 | color_type = 0 113 | pixel_depth = 8 114 | rowbytes = 32 115 | filter = 4 116 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 117 | expected = [ 0x7e,0x7d,0x7c,0x7b,0x7a,0x79,0x78,0x77,0x76,0x75,0x74,0x73,0x72,0x71,0x70,0x6f,0x6e,0x6d,0x6c,0x6b,0x6a,0x69,0x68,0x67,0x66,0x65,0x64,0x63,0x62,0x61,0x60,0x5f, ] 118 | width = 32 119 | bit_depth = 8 120 | channels = 1 121 | color_type = 0 122 | pixel_depth = 8 123 | rowbytes = 32 124 | filter = 4 125 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 126 | expected = [ 0x5e,0x5d,0x5c,0x5b,0x5a,0x59,0x58,0x57,0x56,0x55,0x54,0x53,0x52,0x51,0x50,0x4f,0x4e,0x4d,0x4c,0x4b,0x4a,0x49,0x48,0x47,0x46,0x45,0x44,0x43,0x42,0x41,0x40,0x3f, ] 127 | width = 32 128 | bit_depth = 8 129 | channels = 1 130 | color_type = 0 131 | pixel_depth = 8 132 | rowbytes = 32 133 | filter = 4 134 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 135 | expected = [ 0x3e,0x3d,0x3c,0x3b,0x3a,0x39,0x38,0x37,0x36,0x35,0x34,0x33,0x32,0x31,0x30,0x2f,0x2e,0x2d,0x2c,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x23,0x22,0x21,0x20,0x1f, ] 136 | width = 32 137 | bit_depth = 8 138 | channels = 1 139 | color_type = 0 140 | pixel_depth = 8 141 | rowbytes = 32 142 | filter = 1 143 | data = [ 0x1e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01, ] 144 | expected = [ 
0x1e,0x1d,0x1c,0x1b,0x1a,0x19,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00,0x01, ] 145 | width = 32 146 | bit_depth = 8 147 | channels = 1 148 | color_type = 0 149 | pixel_depth = 8 150 | rowbytes = 32 151 | filter = 1 152 | data = [ 0x02,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 153 | expected = [ 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21, ] 154 | width = 32 155 | bit_depth = 8 156 | channels = 1 157 | color_type = 0 158 | pixel_depth = 8 159 | rowbytes = 32 160 | filter = 4 161 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 162 | expected = [ 0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,0x40,0x41, ] 163 | width = 32 164 | bit_depth = 8 165 | channels = 1 166 | color_type = 0 167 | pixel_depth = 8 168 | rowbytes = 32 169 | filter = 4 170 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 171 | expected = [ 0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,0x60,0x61, ] 172 | width = 32 173 | bit_depth = 8 174 | channels = 1 175 | color_type = 0 176 | pixel_depth = 8 177 | rowbytes = 32 178 | filter = 4 179 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 180 | expected = [ 
0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,0x80,0x81, ] 181 | width = 32 182 | bit_depth = 8 183 | channels = 1 184 | color_type = 0 185 | pixel_depth = 8 186 | rowbytes = 32 187 | filter = 4 188 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 189 | expected = [ 0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,0xa0,0xa1, ] 190 | width = 32 191 | bit_depth = 8 192 | channels = 1 193 | color_type = 0 194 | pixel_depth = 8 195 | rowbytes = 32 196 | filter = 4 197 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 198 | expected = [ 0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,0xc0,0xc1, ] 199 | width = 32 200 | bit_depth = 8 201 | channels = 1 202 | color_type = 0 203 | pixel_depth = 8 204 | rowbytes = 32 205 | filter = 4 206 | data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ] 207 | expected = [ 0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,0xe0,0xe1, ] 208 | width = 32 209 | bit_depth = 8 210 | channels = 1 211 | color_type = 0 212 | pixel_depth = 8 213 | rowbytes = 32 214 | filter = 1 215 | data = [ 0xe2,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0xff,0xff, ] 216 | expected = [ 
0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff,0xfe,0xfd, ] 217 | width = 32 218 | bit_depth = 8 219 | channels = 1 220 | color_type = 0 221 | pixel_depth = 8 222 | rowbytes = 32 223 | filter = 1 224 | data = [ 0xfc,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 225 | expected = [ 0xfc,0xfb,0xfa,0xf9,0xf8,0xf7,0xf6,0xf5,0xf4,0xf3,0xf2,0xf1,0xf0,0xef,0xee,0xed,0xec,0xeb,0xea,0xe9,0xe8,0xe7,0xe6,0xe5,0xe4,0xe3,0xe2,0xe1,0xe0,0xdf,0xde,0xdd, ] 226 | width = 32 227 | bit_depth = 8 228 | channels = 1 229 | color_type = 0 230 | pixel_depth = 8 231 | rowbytes = 32 232 | filter = 4 233 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 234 | expected = [ 0xdc,0xdb,0xda,0xd9,0xd8,0xd7,0xd6,0xd5,0xd4,0xd3,0xd2,0xd1,0xd0,0xcf,0xce,0xcd,0xcc,0xcb,0xca,0xc9,0xc8,0xc7,0xc6,0xc5,0xc4,0xc3,0xc2,0xc1,0xc0,0xbf,0xbe,0xbd, ] 235 | width = 32 236 | bit_depth = 8 237 | channels = 1 238 | color_type = 0 239 | pixel_depth = 8 240 | rowbytes = 32 241 | filter = 4 242 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 243 | expected = [ 0xbc,0xbb,0xba,0xb9,0xb8,0xb7,0xb6,0xb5,0xb4,0xb3,0xb2,0xb1,0xb0,0xaf,0xae,0xad,0xac,0xab,0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3,0xa2,0xa1,0xa0,0x9f,0x9e,0x9d, ] 244 | width = 32 245 | bit_depth = 8 246 | channels = 1 247 | color_type = 0 248 | pixel_depth = 8 249 | rowbytes = 32 250 | filter = 4 251 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 252 | expected = [ 
0x9c,0x9b,0x9a,0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92,0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8b,0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83,0x82,0x81,0x80,0x7f,0x7e,0x7d, ] 253 | width = 32 254 | bit_depth = 8 255 | channels = 1 256 | color_type = 0 257 | pixel_depth = 8 258 | rowbytes = 32 259 | filter = 4 260 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 261 | expected = [ 0x7c,0x7b,0x7a,0x79,0x78,0x77,0x76,0x75,0x74,0x73,0x72,0x71,0x70,0x6f,0x6e,0x6d,0x6c,0x6b,0x6a,0x69,0x68,0x67,0x66,0x65,0x64,0x63,0x62,0x61,0x60,0x5f,0x5e,0x5d, ] 262 | width = 32 263 | bit_depth = 8 264 | channels = 1 265 | color_type = 0 266 | pixel_depth = 8 267 | rowbytes = 32 268 | filter = 4 269 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 270 | expected = [ 0x5c,0x5b,0x5a,0x59,0x58,0x57,0x56,0x55,0x54,0x53,0x52,0x51,0x50,0x4f,0x4e,0x4d,0x4c,0x4b,0x4a,0x49,0x48,0x47,0x46,0x45,0x44,0x43,0x42,0x41,0x40,0x3f,0x3e,0x3d, ] 271 | width = 32 272 | bit_depth = 8 273 | channels = 1 274 | color_type = 0 275 | pixel_depth = 8 276 | rowbytes = 32 277 | filter = 4 278 | data = [ 0xe0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, ] 279 | expected = [ 0x3c,0x3b,0x3a,0x39,0x38,0x37,0x36,0x35,0x34,0x33,0x32,0x31,0x30,0x2f,0x2e,0x2d,0x2c,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x23,0x22,0x21,0x20,0x1f,0x1e,0x1d, ] 280 | width = 32 281 | bit_depth = 8 282 | channels = 1 283 | color_type = 0 284 | pixel_depth = 8 285 | rowbytes = 32 286 | filter = 1 287 | data = [ 0x1c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x01,0x01, ] 288 | expected = [ 
#! /usr/bin/env python2

"""Interactive helper for vetting regression-test output PDFs.

Walks every PDF under the results directory, skips files already
marked in the expectations file (by hash or by skip/xfail/fail
markers), and prompts the operator to view or compare each remaining
file and record a verdict: good, bad, or skip.  Verdicts are appended
to the expectations file.
"""

import sys
import os
import subprocess
import hashlib

import expected
import static_pdfs

# Map bare file names to their full paths in the static PDF corpus.
source_pdfs = static_pdfs.pdffiles[0]
source_pdfs = dict((os.path.basename(x), x) for x in source_pdfs)

result_dir = expected.result_dir

for subdir in sorted(os.listdir(result_dir)):
    dstd = os.path.join(result_dir, subdir)
    if not os.path.isdir(dstd):
        continue
    for pdffile in sorted(os.listdir(dstd)):
        testname = '%s/%s' % (subdir, pdffile)
        srcf = source_pdfs.get(pdffile)
        dstf = os.path.join(dstd, pdffile)
        if pdffile not in source_pdfs:
            print('\n Skipping %s -- source not found' % testname)
            continue

        with open(dstf, 'rb') as f:
            data = f.read()
        # Renamed from `hash` to avoid shadowing the builtin.
        md5sum = hashlib.md5(data).hexdigest()
        # Any of these markers in the expectations file means the file
        # has already been reviewed (or deliberately skipped).
        skipset = set((md5sum, 'skip', 'xfail', 'fail', '!' + md5sum))
        if expected.results[testname] & skipset:
            print('\n Skipping %s -- marked done' % testname)
            continue
        if os.path.exists('foobar.pdf'):
            os.remove('foobar.pdf')
        builtdiff = False
        while True:
            sys.stdout.write('''
 Test case %s

 c = compare using imagemagick and okular
 f = display foobar.pdf (result from comparison)
 o = display results with okular
 a = display results with acrobat

 s = mark 'skip' and go to next PDF
 g = mark as good and go to next PDF
 b = mark as bad and go to next PDF
 n = next pdf without marking
 q = quit
 --> ''' % testname)
            sel = raw_input()
            if sel == 'q':
                raise SystemExit(0)
            if sel == 'n':
                break
            if sel == 'c':
                subprocess.call(('compare', '-verbose', srcf, dstf,
                                 'foobar.pdf'))
                builtdiff = True
                continue
            if sel == 'f':
                subprocess.call(('okular', 'foobar.pdf'))
                continue
            if sel == 'o':
                subprocess.call(('okular', srcf, dstf))
                continue
            if sel == 'a':
                if builtdiff:
                    subprocess.call(('acroread', srcf, dstf, 'foobar.pdf'))
                else:
                    subprocess.call(('acroread', srcf, dstf))
                continue

            # BUGFIX: the original tested `sel in 'sgb'`, a substring
            # check, so an empty input (bare Enter) -- and inputs like
            # 'sg' -- matched and silently recorded a bad-mark.  Use an
            # exact-membership test instead.
            if sel in ('s', 'g', 'b'):
                # 'g' records the hash as good, 'b' as known-bad
                # ('!' + hash), 's' as an unconditional skip.
                results = (md5sum if sel == 'g' else
                           ' skip' if sel == 's' else '!' + md5sum)
                with open(expected.expectedf, 'a') as f:
                    f.write('%s %s\n' % (testname, results))
                break
12 | ''' 13 | 14 | import os 15 | import collections 16 | from pdfrw.py23_diffs import convert_load 17 | 18 | root_dir = os.path.dirname(__file__) 19 | result_dir = 'tmp_results' 20 | if os.path.exists('ramdisk'): 21 | result_dir = os.path.join('ramdisk', result_dir) 22 | result_dir = os.path.join(root_dir, result_dir) 23 | 24 | for sourcef in ('mytests.txt', 'expected.txt'): 25 | expectedf = os.path.join(root_dir, sourcef) 26 | if os.path.exists(expectedf): 27 | break 28 | 29 | 30 | def results(): 31 | results = collections.defaultdict(set) 32 | with open(expectedf, 'rb') as f: 33 | for line in f: 34 | line = convert_load(line) 35 | line = line.split('#', 1)[0].split() 36 | if not line: 37 | continue 38 | key, value = line 39 | results[key].add(value) 40 | return results 41 | results = results() 42 | -------------------------------------------------------------------------------- /tests/expected.txt: -------------------------------------------------------------------------------- 1 | # Example programs 2 | 3 | examples/4up_b1c400de699af29ea3f1983bb26870ab 1b73c612c40b5082d955ed72f63644bd 4 | examples/alter_b1c400de699af29ea3f1983bb26870ab 3c3ee465f45a685ba7098691be05a5ab 5 | examples/booklet_b1c400de699af29ea3f1983bb26870ab d711b74110eefb4e9e6bf1a5bea16bfe 6 | examples/extract_1975ef8db7355b1d691bc79d0749574b b4f5ee36a288da970ed040a9a733c8b0 7 | examples/extract_c5c895deecf7a7565393587e0d61be2b 539aad09ef80907bb396c3260eb87d7b 8 | examples/extract_d711b74110eefb4e9e6bf1a5bea16bfe 26ddfd09c6e6002228f06782c8544ac4 9 | examples/print_two_b1c400de699af29ea3f1983bb26870ab 73c8a16aba44548c2c06dae6e2551961 10 | examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5 880a9578197130273ccb51265af08029 11 | examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe 780a9abe26a9de0b5b95ee22c4835e4b 12 | 13 | examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 62bb9b746ff5932d3f1b88942d36a81d 14 | 
examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 7633ba56641115050ba098ecbef8d331 15 | examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c fe2330d42b3bfc06212415f295752f0e 16 | examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612 17 | 18 | # All these are in the poster test 19 | examples/subset_1975ef8db7355b1d691bc79d0749574b_21 5057f345f1a1109a0e54276a68e8f8df 20 | examples/rotate_5057f345f1a1109a0e54276a68e8f8df_90_1 881f4dc8dcf069e707bf61af95492d86 21 | examples/poster_881f4dc8dcf069e707bf61af95492d86 a34be06d22105b6c02394a9f278fec0d 22 | 23 | examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab e21dfdd9ae56ddb261dc3d02bf6da198 24 | examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 410063b7fbae1c6d5af33758e2b43450 25 | examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 745f1ac31a18d86afb294a449b72cb98 26 | examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 88bd087c4dc039ced05faea3920cbec5 27 | 28 | # List things that need work here (typically cause exceptions) 29 | 30 | # Bad info dict -- works otherwise 31 | 32 | simple/b1c400de699af29ea3f1983bb26870ab.pdf ecf2e28de18a724b53670c0d5637ec28 33 | repaginate/b1c400de699af29ea3f1983bb26870ab.pdf 4d7d6c5f6e14c6eac1dfc055cebfa499 34 | 35 | # 07b0ba4 is missing an object. Best we can do is report it 36 | # (and we do) 37 | 38 | repaginate/07b0ba4cff1c6ff73fd468b04b013457.pdf 993c763e085bce7ecc941ba104f4c892 39 | simple/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 40 | 41 | #b107 has a single page, but with an empty contents dict. 
42 | 43 | repaginate/b107669d1dd69eabb89765fabb2cb321.pdf 0652d2da25b50cad75863d0e2bbaa878 44 | simple/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 45 | 46 | # Encrypted files 47 | 48 | repaginate/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip 49 | simple/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip 50 | compress/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip 51 | decompress/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip 52 | repaginate/6e122f618c27f3aa9a689423e3be6b8d.pdf skip 53 | simple/6e122f618c27f3aa9a689423e3be6b8d.pdf skip 54 | compress/6e122f618c27f3aa9a689423e3be6b8d.pdf skip 55 | decompress/6e122f618c27f3aa9a689423e3be6b8d.pdf skip 56 | repaginate/7dc787639aa6765214e9ff5494d231ed.pdf skip 57 | simple/7dc787639aa6765214e9ff5494d231ed.pdf skip 58 | compress/7dc787639aa6765214e9ff5494d231ed.pdf skip 59 | decompress/7dc787639aa6765214e9ff5494d231ed.pdf skip 60 | repaginate/b4b27aaa1f9c7c524298e98be279bebb.pdf skip 61 | simple/b4b27aaa1f9c7c524298e98be279bebb.pdf skip 62 | compress/b4b27aaa1f9c7c524298e98be279bebb.pdf skip 63 | decompress/b4b27aaa1f9c7c524298e98be279bebb.pdf skip 64 | repaginate/b5b6c6405d7b48418bccf97277957664.pdf skip 65 | simple/b5b6c6405d7b48418bccf97277957664.pdf skip 66 | compress/b5b6c6405d7b48418bccf97277957664.pdf skip 67 | decompress/b5b6c6405d7b48418bccf97277957664.pdf skip 68 | repaginate/bd0ef57aec16ded45bd89d61b54af0be.pdf skip 69 | simple/bd0ef57aec16ded45bd89d61b54af0be.pdf skip 70 | compress/bd0ef57aec16ded45bd89d61b54af0be.pdf skip 71 | decompress/bd0ef57aec16ded45bd89d61b54af0be.pdf skip 72 | repaginate/dbb807a878ac1da6b91ac15c9de4e209.pdf skip 73 | simple/dbb807a878ac1da6b91ac15c9de4e209.pdf skip 74 | compress/dbb807a878ac1da6b91ac15c9de4e209.pdf skip 75 | decompress/dbb807a878ac1da6b91ac15c9de4e209.pdf skip 76 | 77 | 78 | 79 | # List good hashes for round-trips here. 
80 | 81 | repaginate/06c86654f9a77e82f9adaa0086fc391c.pdf 848966fe40a1e3de842f82700dc6d67b 82 | repaginate/08f69084d72dabc5dfdcf5c1ff2a719f.pdf b8c60878b0e0ce81cb6e8777038166b1 83 | repaginate/09715ec1a7b0f3a7ae02b3046f627b9f.pdf daf7cff9c0a15bbb347489f9fbda25f8 84 | repaginate/0a61de50b5ee0ea4d5d69c95dab817a3.pdf c6cd38b1131c4b856f60ebfcf51da6f5 85 | repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 43433398ccb1edaaee734f4949a5cc3c 86 | repaginate/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 20dc3be2affe9082564c01b1146d7598 87 | repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 7130f1568526247895856806b3879db4 88 | repaginate/22628a7ed578b622520325673ab2a4f2.pdf e312c9c588a5ccdb1a11ac37149b178b 89 | repaginate/2ac7c68e26a8ef797aead15e4875cc6d.pdf e7344551183415d6257e2cab2aef4a61 90 | repaginate/295d26e61a85635433f8e4b768953f60.pdf a89a9fa39812ecd9fa5d6b9e785f389d 91 | repaginate/2d31f356c37dadd04b83ecc4e9a739a0.pdf bc04b61b41cb51f6a1c1da79fb387795 92 | repaginate/2fac0d9a189ca5fcef8626153d050be8.pdf 95fe3d9258ace5bdccb95a55c2c8cb22 93 | repaginate/319c998910453bc44d40c7748cd2cb79.pdf c0da6bf6db273bdb1385f408dcf063d0 94 | repaginate/35df0b8cff4afec0c08f08c6a5bc9857.pdf 3568e1c885a461b350c790ec5b729af3 95 | repaginate/365b9c95574ee8944370fe286905d0e8.pdf 84e5fc0d4f30ff8db05780fd244d9cf0 96 | repaginate/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e 97 | repaginate/49e31fd074eca6af981d78d42d0078ec.pdf 77fd3fa86c7c0166a373b66cfef357d2 98 | repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf afc90878b1306483dbde37c3a50b6a45 99 | repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 894bf526c0a73ab70ebfd9bf3d614315 100 | repaginate/5f0cff36d0ad74536a6513a98a755016.pdf 3298a3a13439764102395a34d571ff69 101 | repaginate/5f265db2736850782aeaba2571a3c749.pdf 2e3046813ce6e40a39bd759a3c8a3c8c 102 | repaginate/6a42c8c79b807bf164d31071749e07b0.pdf bf00d5e44869ae59eb859860d7d5373f 103 | repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 612cdd84eeac797a1c42fc91756b6d9e 104 | 
repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261 105 | repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf df4d756e2230c333f0c58ad354b5b51c 106 | repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 107 | repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233 108 | repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea 109 | repaginate/9d8626d18b1d8807d271e6ffc409446a.pdf cdfcf8add1af9e612ba1a2ee06a6a273 110 | repaginate/9f98322c243fe67726d56ccfa8e0885b.pdf 69503ac140a1e4f1322f9350646e3dae 111 | repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8cddb0f9741f7515107b1bce5dc90c83 112 | repaginate/c5c895deecf7a7565393587e0d61be2b.pdf 59e350c6f7d7b89fab36a4019bb526fd 113 | repaginate/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 3623b7f200818c63cb6838f9678a4840 114 | repaginate/d6fd9567078b48c86710e9c49173781f.pdf 874b532f61139261f71afb5987dd2a68 115 | repaginate/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 7d3c3ae13cc7d53e7fa6ef046e15dbaa 116 | repaginate/ec00d5825f47b9d0faa953b1709163c3.pdf 8e6a481476c2b3bdd64ce8e36f8fe273 117 | repaginate/ed81787b83cc317c9f049643b853bea3.pdf 4636b68f294302417b81aaaadde1c73d 118 | 119 | 120 | simple/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469 121 | simple/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 5a41601f6033356539e623091a3f79ef 122 | simple/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 123 | simple/09715ec1a7b0f3a7ae02b3046f627b9f.pdf c4e4b3b725bd5fc3b008f1ac6251ad1c 124 | simple/1975ef8db7355b1d691bc79d0749574b.pdf 475c28c9588f3a7f6110d30f391758c4 125 | simple/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 3f17f19fd92adf01998bb13a0ee52b92 126 | simple/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7 127 | simple/22628a7ed578b622520325673ab2a4f2.pdf 1163cec415728899e997a29be465d02d 128 | simple/295d26e61a85635433f8e4b768953f60.pdf fe3b8960c7f877db05c7cd12c9c6e097 129 | 
simple/2ac7c68e26a8ef797aead15e4875cc6d.pdf 2623eae06eada9587574f8ddd7fc80fa 130 | simple/2d31f356c37dadd04b83ecc4e9a739a0.pdf 9af4794d366fbd5840836e6612ceedd2 131 | simple/2fac0d9a189ca5fcef8626153d050be8.pdf 458501ecda909b00262b9654f0b09ebf 132 | simple/319c998910453bc44d40c7748cd2cb79.pdf 8c84e36ec1db8c1dbfaa312646e000b4 133 | simple/35df0b8cff4afec0c08f08c6a5bc9857.pdf 0a2926c23ad916c449d5dadcfa9d38ef 134 | simple/365b9c95574ee8944370fe286905d0e8.pdf cf3bfac41f410bf5bd657e3f906dfbc6 135 | simple/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e 136 | simple/49e31fd074eca6af981d78d42d0078ec.pdf 2c316537a5b0917634cbbdc5b91511df 137 | simple/536dfc6fbadd87c03eb59375d091eb53.pdf 319851765c70ba103c4191f7ec2148db 138 | simple/569f8094597bbe5b58efc3a7c6e14e87.pdf 025f1bf95cc537c36b8c3a044758b86c 139 | simple/5f0cff36d0ad74536a6513a98a755016.pdf 8476fd75e75394fcbbe02816d0640e7d 140 | simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9 141 | simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 142 | simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf 143 | simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05 144 | simple/707e3e2d17cbe9ec2273414b3b63f333.pdf fb6a8eb3cdc2fbef125babe8815f3b70 145 | simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 146 | simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e 147 | simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb 148 | simple/9d8626d18b1d8807d271e6ffc409446a.pdf 2358d654bf20d2b9d179ab009a615c4e 149 | simple/9f98322c243fe67726d56ccfa8e0885b.pdf 9290b4c32f005e1e4c7f431955246c4c 150 | simple/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6b406128e0ed1ac23dc5a0ee34d1f717 151 | simple/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c 152 | simple/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 2083f0e55cf06d88df02956a21bfef23 153 | 
simple/d6fd9567078b48c86710e9c49173781f.pdf 77464ec5cfdacb61a73b506bc4945631 154 | simple/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 5bc96989bc4f4b6438da953443336124 155 | simple/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318 156 | simple/ed81787b83cc317c9f049643b853bea3.pdf c227d627217dc6808c50e80063734d27 157 | 158 | 159 | decompress/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469 160 | decompress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 161 | decompress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf ccadb859eff77d525bf86f6d821ccf1b 162 | decompress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 2b9c8b26a92c7645cfefa1bfa8a8ab36 163 | decompress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 164 | decompress/1975ef8db7355b1d691bc79d0749574b.pdf a7d5eaf0a4259352898047f284e20b90 165 | decompress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 40d1cc7e26213510319b519032aff637 166 | decompress/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7 167 | decompress/22628a7ed578b622520325673ab2a4f2.pdf b68c7bf46ad4b70addc3369ba669dc7b 168 | decompress/295d26e61a85635433f8e4b768953f60.pdf 6f2ae8fb0ff853ed63537d8767ce13ad 169 | decompress/2ac7c68e26a8ef797aead15e4875cc6d.pdf d8d5589991ce15c834f35b340e7147a9 170 | decompress/2d31f356c37dadd04b83ecc4e9a739a0.pdf 5a6b732690c42f07ae6a41c37cf28ff3 171 | decompress/2fac0d9a189ca5fcef8626153d050be8.pdf 998366ad30becd31bed711ba78c59a7f 172 | decompress/319c998910453bc44d40c7748cd2cb79.pdf 7933a591caf3d49e45a42733bc48f99e 173 | decompress/35df0b8cff4afec0c08f08c6a5bc9857.pdf e339ae7747898d2faba270473171692a 174 | decompress/365b9c95574ee8944370fe286905d0e8.pdf 9da0100b5844c86e93093d0fbc78b3f6 175 | decompress/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e 176 | decompress/49e31fd074eca6af981d78d42d0078ec.pdf 4e9bf31753ff7232de4c612a31bd21fc 177 | decompress/536dfc6fbadd87c03eb59375d091eb53.pdf f755d2ef6052270121168d2341ad04b6 178 | 
decompress/569f8094597bbe5b58efc3a7c6e14e87.pdf aa782a7d553ec767ab61517996337f58 179 | decompress/5f0cff36d0ad74536a6513a98a755016.pdf 9caae4e3a21eba9e4aa76620e7508d56 180 | decompress/5f265db2736850782aeaba2571a3c749.pdf 836abcf6e6e1d39ad96481eb20e9b149 181 | decompress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 182 | decompress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 226773cac79e1a5fed1379a0501a5df0 183 | decompress/7037a992b80b60f0294016037baa9292.pdf c9a3602b26d82ae145d9f5822125a158 184 | decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf 3250a56e14a9855eccd67bb347808d24 185 | decompress/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 186 | decompress/72eb207b8f882618899aa7a65d3cecda.pdf a4366874fb6db1d9a0c998361ea32b8d 187 | decompress/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb 188 | decompress/9d8626d18b1d8807d271e6ffc409446a.pdf 6498bd354bb221516517a4c49bcb94f6 189 | decompress/9f98322c243fe67726d56ccfa8e0885b.pdf 0fa96e3669d14c64fff159d5aa457014 190 | decompress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 191 | decompress/b1c400de699af29ea3f1983bb26870ab.pdf 08a5de62129a96d8d9a8f27052bfb227 192 | decompress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8e0eb14c12fc89e7cbb4001861d7198f 193 | decompress/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c 194 | decompress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf aaed7215c60dbf19bb4fefe88602196a 195 | decompress/d6fd9567078b48c86710e9c49173781f.pdf 1fd1b4bc184e64ea6260c30261adf9c4 196 | decompress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 62b87ec47f1b93d75c32d0c78b6c2380 197 | decompress/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318 198 | decompress/ed81787b83cc317c9f049643b853bea3.pdf 5c0a3bc5b19d58d48767bff8f31daae0 199 | 200 | compress/06c86654f9a77e82f9adaa0086fc391c.pdf b6fb771b49971f2b63a197f3ef1531aa 201 | compress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 
202 | compress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 3e7e53a92f96d52bbffe3ffa03d7b11e 203 | compress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 563ffde527978517393d9166b02c17d3 204 | compress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 205 | compress/1975ef8db7355b1d691bc79d0749574b.pdf d505caa75f8becea1a1c810f4a143976 206 | compress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf b78f4e45aef4149a068a0225ea1be88c 207 | compress/1f5dd128c3757420a881a155f2f8ace3.pdf 22148c2a65129f936b8e8c67397e5bf6 208 | compress/22628a7ed578b622520325673ab2a4f2.pdf 54ec1fa64e64bfd146f13001444346f4 209 | compress/295d26e61a85635433f8e4b768953f60.pdf 2ed8eb04a8c66138883a43917cd9c0c5 210 | compress/2ac7c68e26a8ef797aead15e4875cc6d.pdf efe942d1e5b9f2f139c7e1f2e46ced24 211 | compress/2d31f356c37dadd04b83ecc4e9a739a0.pdf eedc938e6782e1d15755b5c54fffc17c 212 | compress/2fac0d9a189ca5fcef8626153d050be8.pdf 2d1b8e82cdc82c82bec3969acf026d30 213 | compress/319c998910453bc44d40c7748cd2cb79.pdf 5b9ca8444a17db8cb6fa427da7a89e44 214 | compress/35df0b8cff4afec0c08f08c6a5bc9857.pdf 07c064df0fc0fd0c80c4a196b4c38403 215 | compress/365b9c95574ee8944370fe286905d0e8.pdf 1b98e92f74c2f5324cce5fc8fbe46c15 216 | compress/4805fdcd7e142e8df3c04c6ba06025af.pdf 4aa2e922739ba865da30a9917ddffe8e 217 | compress/49e31fd074eca6af981d78d42d0078ec.pdf 7422b3d205650552ff81bc06c89c13ba 218 | compress/536dfc6fbadd87c03eb59375d091eb53.pdf c18b0f0f8e633fe15b17772c701a76a9 219 | compress/569f8094597bbe5b58efc3a7c6e14e87.pdf 3ee711f7fc678787346dca5d06ee5192 220 | compress/5f0cff36d0ad74536a6513a98a755016.pdf bd2a1edf6299d5dc2e1ad6b5fc8bcc20 221 | compress/5f265db2736850782aeaba2571a3c749.pdf bb4898beac50171de7502f13925af80c 222 | compress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 223 | compress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 1c3fbae41e7cad7deca13fab93514bc7 224 | compress/7037a992b80b60f0294016037baa9292.pdf 9182a9765544e4a91404db65a6f951d7 225 | 
compress/707e3e2d17cbe9ec2273414b3b63f333.pdf 0e75dda73bf18d9968499277ab1a367e 226 | compress/71a751ce2d93a6a5d6ff21735b701fb7.pdf faa7eb31789a3789f65de30a4e58e594 227 | compress/72eb207b8f882618899aa7a65d3cecda.pdf 0155549fc04357220cc6be541dda7bc1 228 | compress/97ba0a239cefa0dc727c2f1be050ec6c.pdf 067bfee3b2bd9c250e7c4157ff543a81 229 | compress/9d8626d18b1d8807d271e6ffc409446a.pdf 7c124d2d0b0c7b21cce91740dfb2a8fd 230 | compress/9f98322c243fe67726d56ccfa8e0885b.pdf f9d59774a75bb2dfc08ff7df65aa3048 231 | compress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 232 | compress/b1c400de699af29ea3f1983bb26870ab.pdf 6eaeef32b0e28959e7681c8b02d8814f 233 | compress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6ef82921011eb79a9d860214e213c868 234 | compress/c5c895deecf7a7565393587e0d61be2b.pdf 30d87ac6aa59d65169c389ee3badbca8 235 | compress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf e4c768be930e9980c970d51d5f447e24 236 | compress/d6fd9567078b48c86710e9c49173781f.pdf cbc8922b8bea08928463b287767ec229 237 | compress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf e893e407b3c2366d4ca822ce80b45c2c 238 | compress/ec00d5825f47b9d0faa953b1709163c3.pdf 9ba3db0dedec74c3d2a6f033f1b22a81 239 | compress/ed81787b83cc317c9f049643b853bea3.pdf 2ceda401f68a44a3fb1da4e0f9dfc578 240 | -------------------------------------------------------------------------------- /tests/myprofile.py: -------------------------------------------------------------------------------- 1 | import cProfile 2 | import unittest 3 | import test_roundtrip 4 | 5 | cProfile.run('unittest.main(test_roundtrip)') 6 | -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | #! 
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
Run from the directory above like so:

python -m tests.test_examples

A PDF that has been determined to be good or bad
should be added to expected.txt with either a good
checksum, or just the word "fail".

These tests are incomplete, but they allow us to try
out various PDFs.  There is a collection of difficult
PDFs available on github.

In order to use them:

1) Ensure that github.com/pmaupin/static_pdfs is on your path.

2) Use the imagemagick compare program to look at differences
   between the static_pdfs/global directory and the tmp_results
   directory after you run this.

'''
import sys
import os
import hashlib
import subprocess
import static_pdfs
import expected

from pdfrw.py23_diffs import convert_store
from pdfrw import PdfReader, PdfWriter

try:
    import unittest2 as unittest
except ImportError:
    import unittest


# Template for locating the example scripts relative to the test tree.
prog_dir = os.path.join(expected.root_dir, '..', 'examples', '%s.py')
prog_dir = os.path.abspath(prog_dir)
dstdir = os.path.join(expected.result_dir, 'examples')
hashfile = os.path.join(expected.result_dir, 'hashes.txt')

# Map the short (extension-less) names used in the tests to full PDF paths.
lookup = static_pdfs.pdffiles[0]
lookup = dict((os.path.basename(x)[:-4], x) for x in lookup)


class TestOnePdf(unittest.TestCase):

    def do_test(self, params, prev_results=None, scrub=False):
        """Run one example script against a source PDF and compare the
        MD5 of its output against the entry recorded in expected.txt.

        params       -- space-separated script name plus arguments; bare
                        PDF hashes are translated to full paths via lookup.
        prev_results -- optional single-element list; on success the
                        output file's hash is stored in prev_results[0]
                        so a follow-on test can chain from this output.
        scrub        -- when true, re-write the script's output through
                        pdfrw first, so the recorded hash is stable
                        across reportlab versions.

        Whatever happens, a one-line report (size, status, key, hash) is
        appended to hashes.txt.
        """
        # Avoid the shared-mutable-default pitfall: each call gets its
        # own scratch list unless the caller supplies one.
        if prev_results is None:
            prev_results = ['']
        params = params.split()
        hashkey = 'examples/%s' % '_'.join(params)
        params = [lookup.get(x, x) for x in params]
        progname = params[0]
        params[0] = prog_dir % progname
        srcf = params[1]
        params.insert(0, sys.executable)
        subdir, progname = os.path.split(progname)
        subdir = os.path.join(dstdir, subdir)
        if not os.path.exists(subdir):
            os.makedirs(subdir)
        os.chdir(subdir)
        dstf = '%s.%s' % (progname, os.path.basename(srcf))
        # When scrubbing, the script's raw output keeps the plain name
        # and the pdfrw-normalized copy becomes the file we hash.
        scrub = scrub and dstf
        dstf = dstf if not scrub else 'final.%s' % dstf
        # 'digest' doubles as the report field when no file was generated.
        digest = '------no-file-generated---------'
        expects = expected.results[hashkey]

        # If the test has been deliberately skipped,
        # we are done.  Otherwise, execute it even
        # if we don't know about it yet, so we have
        # results to compare.

        result = 'fail'
        size = 0
        try:
            if 'skip' in expects:
                result = 'skip requested'
                return self.skipTest(result)
            elif 'xfail' in expects:
                result = 'xfail requested'
                return self.fail(result)

            exists = os.path.exists(dstf)
            if expects or not exists:
                if exists:
                    os.remove(dstf)
                if scrub and os.path.exists(scrub):
                    os.remove(scrub)
                subprocess.call(params)
                if scrub:
                    # Normalize the scrubbed output through pdfrw so the
                    # hash does not depend on the generating library.
                    PdfWriter(dstf).addpages(PdfReader(scrub).pages).write()
            with open(dstf, 'rb') as f:
                data = f.read()
            size = len(data)
            if data:
                digest = hashlib.md5(data).hexdigest()
                lookup[digest] = dstf
                prev_results[0] = digest
            else:
                os.remove(dstf)
            if expects:
                if len(expects) == 1:
                    expects, = expects
                    self.assertEqual(digest, expects)
                else:
                    self.assertIn(digest, expects)
                result = 'pass'
            else:
                result = 'skip'
                self.skipTest('No hash available')
        finally:
            result = '%8d %-20s %s %s\n' % (size, result, hashkey, digest)
            with open(hashfile, 'ab') as f:
                f.write(convert_store(result))

    def test_4up(self):
        self.do_test('4up b1c400de699af29ea3f1983bb26870ab')

    def test_booklet_unspread(self):
        # Chain three scripts: each later one consumes the file whose
        # hash the earlier run left in prev[0].
        prev = [None]
        self.do_test('booklet b1c400de699af29ea3f1983bb26870ab', prev)
        if prev[0] is not None:
            self.do_test('unspread ' + prev[0])
            self.do_test('extract ' + prev[0])

    def test_print_two(self):
        self.do_test('print_two b1c400de699af29ea3f1983bb26870ab')

    def test_watermarks(self):
        self.do_test('watermark b1c400de699af29ea3f1983bb26870ab '
                     '06c86654f9a77e82f9adaa0086fc391c')
        self.do_test('watermark b1c400de699af29ea3f1983bb26870ab '
                     '06c86654f9a77e82f9adaa0086fc391c -u')

    def test_subset(self):
        self.do_test('subset b1c400de699af29ea3f1983bb26870ab 1-3 5')

    def test_alter(self):
        self.do_test('alter b1c400de699af29ea3f1983bb26870ab')

    def test_cat(self):
        self.do_test('cat b1c400de699af29ea3f1983bb26870ab '
                     '06c86654f9a77e82f9adaa0086fc391c')

    def test_rotate(self):
        self.do_test('rotate 707e3e2d17cbe9ec2273414b3b63f333 '
                     '270 1-4 7-8 10-50 52-56')

    def test_poster(self):
        prev = [None]
        self.do_test('subset 1975ef8db7355b1d691bc79d0749574b 21', prev)
        self.do_test('rotate %s 90 1' % prev[0], prev)
        self.do_test('poster %s' % prev[0], prev)

    def test_extract(self):
        self.do_test('extract 1975ef8db7355b1d691bc79d0749574b')
        self.do_test('extract c5c895deecf7a7565393587e0d61be2b')

    # The rl1 examples require reportlab, which needs Python >= 2.7.

    def test_rl1_4up(self):
        if sys.version_info < (2, 7):
            return
        self.do_test('rl1/4up b1c400de699af29ea3f1983bb26870ab',
                     scrub=True)

    def test_rl1_booklet(self):
        if sys.version_info < (2, 7):
            return
        self.do_test('rl1/booklet b1c400de699af29ea3f1983bb26870ab',
                     scrub=True)

    def test_rl1_subset(self):
        if sys.version_info < (2, 7):
            return
        self.do_test('rl1/subset b1c400de699af29ea3f1983bb26870ab 3 5',
                     scrub=True)

    def test_rl1_platypus(self):
        if sys.version_info < (2, 7):
            return
        self.do_test('rl1/platypus_pdf_template b1c400de699af29ea3f1983bb26870ab',
                     scrub=True)


def main():
    unittest.main()
#! /usr/bin/env python
# encoding: utf-8
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
#               2017 Henddher Pedroza, Illinois
# MIT license -- See LICENSE.txt for details

'''
Run from the directory above like so:
python -m tests.test_flate_png
'''


from pdfrw.uncompress import flate_png, flate_png_impl
from pdfrw.py23_diffs import zlib, xrange, from_array, convert_load, convert_store

import unittest
import base64
import array
import logging
import ast
import os

#
# Sample PNGs with filtered scanlines retrieved from
# http://www.schaik.com/pngsuite/pngsuite_fil_png.html
#


def filepath(filename):
    """Return *filename* resolved relative to this test module's directory."""
    pwd = os.path.dirname(__file__)
    return os.path.join(pwd, filename)


def create_data(nc=1, nr=1, bpc=8, ncolors=1, filter_type=0):
    """Build a synthetic filtered scanline buffer.

    nc/nr       -- columns (pixels) and rows
    bpc         -- bits per color component
    ncolors     -- components per pixel
    filter_type -- PNG filter byte used for every row after the first
                   (row 0 always uses filter 0 / None)

    Returns (data, nc, nr, bpc, ncolors) where data is an array('B')
    with one filter byte preceding each row.
    """
    pixel_size = (bpc * ncolors + 7) // 8
    data = []
    for r in xrange(nr):
        data.append(filter_type if r > 0 else 0)  # filter byte
        for c in xrange(nc * pixel_size):
            # Mask to a byte so array('B') cannot overflow for larger
            # shapes; identical values for every existing call site.
            data.append((r * nc * pixel_size + c * pixel_size) % 256)
    data = array.array('B', data)
    logging.debug("Data: %r" % (data))
    return data, nc, nr, bpc, ncolors


def prepend_data_with_filter(data, filter):
    """Return a new array('B') of *data* with *filter* inserted in front."""
    a = array.array('B', data)
    a.insert(0, filter)
    return a


def print_data(data1, data2):
    """Debug helper: dump two byte sequences side by side via logging."""
    if data1 is None:
        return
    for b1, b2 in zip(data1, data2):
        # Tolerate both byte-ints and single-character strings (py2/py3).
        b1 = b1 if type(b1) != str else ord(b1)
        b2 = b2 if type(b2) != str else ord(b2)
        logging.error("%4d %4d" % (b1, b2))
    if len(data1) != len(data2):
        logging.error("Mismatched lengths: %d %d" % (len(data1), len(data2)))
    return None


class TestFlatePNG(unittest.TestCase):

    def test_flate_png(self):
        """Decode a captured real-world predictor-12 stream without error."""
        b64 = 'AAAAAAD//wACAAA2AAAAAQAADwAAAgEAACcAAQL/AAAzAP8AAgAANgACAAEAAO8AAAABAAF1AAAAAgAANgADAAEAAfsAAAACAAA2AAQCAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAQECBXx8AAIAAAGHAAAAAgAANgAMAAEDCcMAAAACAAA2AA0CAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQABBxI2AAAEAfn5AAAWAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAEAAQ6fJgAAAAIAADYAHwIAAAAAAAECAAAAAAABAgAAAAAAAQIAAAAAAAECAAAAAAABAgAAAAAAAQABESDsAAAAAgAANgAmAAAAAAD//wIAAAAAAAACARp0hgEBAgAA/eAAAA=='
        predictor, columns, colors, bpc = (12, 6, 1, 8)
        data = base64.b64decode(b64)
        d2, error2 = flate_png(data, predictor, columns, colors, bpc)
        assert d2 is not None
        assert error2 is None

    def test_flate_png_filter_0(self):
        # None filter
        data, nc, nr, bpc, ncolors = create_data(nc=5, nr=7, bpc=8, ncolors=4)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2 is not None
        assert error2 is None

    def test_flate_png_filter_1(self):
        # Sub filter
        data, nc, nr, bpc, ncolors = create_data(nc=2, nr=3, bpc=8, ncolors=4, filter_type=1)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2 is not None
        assert error2 is None

    def test_flate_png_filter_2(self):
        # Up filter
        data, nc, nr, bpc, ncolors = create_data(nc=5, nr=7, bpc=8, ncolors=4, filter_type=2)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2 is not None
        assert error2 is None

    def test_flate_png_filter_3(self):
        # Avg filter
        data, nc, nr, bpc, ncolors = create_data(nc=5, nr=7, bpc=8, ncolors=4, filter_type=3)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2
        assert error2 is None

    def test_flate_png_filter_4(self):
        # Paeth filter
        data, nc, nr, bpc, ncolors = create_data(nc=5, nr=7, bpc=8,
                                                 ncolors=4, filter_type=4)
        d2, error2 = flate_png(data, 12, nc, ncolors, bpc)
        assert d2
        assert error2 is None

    def test_flate_png_alt_filter_1(self):
        """Sub filter against known libpng reference rows (gray and RGB)."""
        width = 32
        bit_depth = 8
        channels = 1
        color_type = 0
        pixel_depth = 8
        rowbytes = 32
        filter = 1
        data = [ 0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ]
        expected = [ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, ]

        dataf = prepend_data_with_filter(data, filter)
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        expected = array.array('B', expected)
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

        width = 32
        bit_depth = 8
        channels = 3
        color_type = 2
        pixel_depth = 24
        rowbytes = 96
        filter = 1
        data = [ 0xff,0x00,0x08,0x00,0x08,0x07,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08, ]
        expected = [ 0xff,0x00,0x08,0xff,0x08,0x0f,0xff,0x10,0x17,0xff,0x18,0x1f,0xff,0x20,0x27,0xff,0x29,0x2f,0xff,0x31,0x37,0xff,0x39,0x3f,0xff,0x41,0x47,0xff,0x4a,0x4f,0xff,0x52,0x57,0xff,0x5a,0x5f,0xff,0x62,0x67,0xff,0x6a,0x6f,0xff,0x73,0x77,0xff,0x7b,0x7f,0xff,0x83,0x87,0xff,0x8b,0x8f,0xff,0x94,0x97,0xff,0x9c,0x9f,0xff,0xa4,0xa7,0xff,0xac,0xaf,0xff,0xb4,0xb7,0xff,0xbd,0xbf,0xff,0xc5,0xc7,0xff,0xcd,0xcf,0xff,0xd5,0xd7,0xff,0xde,0xdf,0xff,0xe6,0xe7,0xff,0xee,0xef,0xff,0xf6,0xf7,0xff,0xff,0xff, ]

        dataf = prepend_data_with_filter(data, filter)
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        expected = array.array('B', expected)
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

    def test_flate_png_alt_filter_2(self):
        """Up filter: two rows fed together, second depends on the first."""
        width = 32
        bit_depth = 8
        channels = 3
        color_type = 2
        pixel_depth = 24
        rowbytes = 96
        filter = 2
        prev_row = [0xff] * rowbytes
        data = [0x00] * rowbytes
        expected = [0xff] * rowbytes

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

        width = 32
        bit_depth = 16
        channels = 1
        color_type = 0
        pixel_depth = 16
        rowbytes = 64
        filter = 2
        prev_row = [ 0x00,0x00,0x09,0x00,0x12,0x00,0x1b,0x00,0x24,0x00,0x2d,0x00,0x36,0x00,0x3f,0x00,0x48,0x00,0x51,0x00,0x5a,0x00,0x63,0x00,0x6c,0x00,0x75,0x00,0x7e,0x00,0x87,0x00,0x90,0x00,0x99,0x00,0xa2,0x00,0xab,0x00,0xb4,0x00,0xbd,0x00,0xc6,0x00,0xcf,0x00,0xd8,0x00,0xe1,0x00,0xea,0x00,0xf3,0x00,0xfc,0x00,0xf0,0xff,0xd5,0xff,0xba,0xff, ]
        data = [0x02, 0x00] * 29 + [0xfa, 0x00] * 3
        expected = [ 0x02,0x00,0x0b,0x00,0x14,0x00,0x1d,0x00,0x26,0x00,0x2f,0x00,0x38,0x00,0x41,0x00,0x4a,0x00,0x53,0x00,0x5c,0x00,0x65,0x00,0x6e,0x00,0x77,0x00,0x80,0x00,0x89,0x00,0x92,0x00,0x9b,0x00,0xa4,0x00,0xad,0x00,0xb6,0x00,0xbf,0x00,0xc8,0x00,0xd1,0x00,0xda,0x00,0xe3,0x00,0xec,0x00,0xf5,0x00,0xfe,0x00,0xea,0xff,0xcf,0xff,0xb4,0xff, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

    def test_flate_png_alt_filter_3(self):
        """Average filter, grayscale then truecolor rows."""
        width = 32
        bit_depth = 8
        channels = 1
        color_type = 0
        pixel_depth = 8
        rowbytes = 32
        filter = 3
        prev_row = [ 0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0xe3,0xc9,0xf1,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f, ]
        data = [ 0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x69,0x02,0xe4,0xb5,0xc3,0xa1,0xff,0x31,0x51,0xcf,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, ]
        expected = [ 0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0xe8,0xb5,0x7e,0x65,0x5a,0x46,0x61,0xa1,0xe1,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f,0x7f, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

        width = 32
        bit_depth = 8
        channels = 3
        color_type = 2
        pixel_depth = 24
        rowbytes = 96
        filter = 3
        prev_row = [0] * rowbytes
        data = [ 0xff,0x00,0x08,0x80,0x08,0x0b,0x80,0x0c,0x10,0x80,0x10,0x14,0x80,0x14,0x18,0x80,0x19,0x1c,0x80,0x1d,0x20,0x80,0x21,0x24,0x80,0x25,0x28,0x80,0x2a,0x2c,0x80,0x2d,0x30,0x80,0x31,0x34,0x80,0x35,0x38,0x80,0x39,0x3c,0x80,0x3e,0x40,0x80,0x42,0x44,0x80,0x46,0x48,0x80,0x4a,0x4c,0x80,0x4f,0x50,0x80,0x52,0x54,0x80,0x56,0x58,0x80,0x5a,0x5c,0x80,0x5e,0x60,0x80,0x63,0x64,0x80,0x67,0x68,0x80,0x6b,0x6c,0x80,0x6f,0x70,0x80,0x74,0x74,0x80,0x77,0x78,0x80,0x7b,0x7c,0x80,0x7f,0x80,0x80,0x84,0x84, ]
        expected = [ 0xff,0x00,0x08,0xff,0x08,0x0f,0xff,0x10,0x17,0xff,0x18,0x1f,0xff,0x20,0x27,0xff,0x29,0x2f,0xff,0x31,0x37,0xff,0x39,0x3f,0xff,0x41,0x47,0xff,0x4a,0x4f,0xff,0x52,0x57,0xff,0x5a,0x5f,0xff,0x62,0x67,0xff,0x6a,0x6f,0xff,0x73,0x77,0xff,0x7b,0x7f,0xff,0x83,0x87,0xff,0x8b,0x8f,0xff,0x94,0x97,0xff,0x9c,0x9f,0xff,0xa4,0xa7,0xff,0xac,0xaf,0xff,0xb4,0xb7,0xff,0xbd,0xbf,0xff,0xc5,0xc7,0xff,0xcd,0xcf,0xff,0xd5,0xd7,0xff,0xde,0xdf,0xff,0xe6,0xe7,0xff,0xee,0xef,0xff,0xf6,0xf7,0xff,0xff,0xff, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

    def test_flate_png_alt_filter_4(self):
        """Paeth filter, grayscale then truecolor rows."""
        width = 32
        bit_depth = 8
        channels = 1
        color_type = 0
        pixel_depth = 8
        rowbytes = 32
        filter = 4
        prev_row = [ 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, ]
        data = [ 0x20,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, ]
        expected = [ 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

        width = 32
        bit_depth = 8
        channels = 3
        color_type = 2
        pixel_depth = 24
        rowbytes = 96
        filter = 4
        prev_row = [0] * rowbytes
        data = [ 0xff,0x00,0x08,0x00,0x08,0x07,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x08,0x08,0x00,0x09,0x08, ]
        expected = [ 0xff,0x00,0x08,0xff,0x08,0x0f,0xff,0x10,0x17,0xff,0x18,0x1f,0xff,0x20,0x27,0xff,0x29,0x2f,0xff,0x31,0x37,0xff,0x39,0x3f,0xff,0x41,0x47,0xff,0x4a,0x4f,0xff,0x52,0x57,0xff,0x5a,0x5f,0xff,0x62,0x67,0xff,0x6a,0x6f,0xff,0x73,0x77,0xff,0x7b,0x7f,0xff,0x83,0x87,0xff,0x8b,0x8f,0xff,0x94,0x97,0xff,0x9c,0x9f,0xff,0xa4,0xa7,0xff,0xac,0xaf,0xff,0xb4,0xb7,0xff,0xbd,0xbf,0xff,0xc5,0xc7,0xff,0xcd,0xcf,0xff,0xd5,0xd7,0xff,0xde,0xdf,0xff,0xe6,0xe7,0xff,0xee,0xef,0xff,0xf6,0xf7,0xff,0xff,0xff, ]

        prev_rowf = prepend_data_with_filter(prev_row, 0)
        dataf = prepend_data_with_filter(data, filter)
        prev_rowf.extend(dataf)
        dataf = prev_rowf
        result, error = flate_png_impl(dataf, 12, width, channels, bit_depth)

        assert error is None
        prev_rowa = array.array('B', prev_row)
        prev_rowa.extend(expected)
        expected = prev_rowa
        assert expected == result, "\ne: %r\nr: %r" % (expected, result)

    def util_test_flate_png_alt_from_png_log_file(self, filename):
        """Replay a captured libpng decode log through flate_png_impl.

        The .log files contain 'var = value' lines describing the image
        (width, bit_depth, ...) plus per-row 'data'/'expected' byte lists;
        a line starting with "PASS:" terminates the useful portion.
        """
        with open(filepath(filename)) as f:
            data = array.array('B')
            expected = array.array('B')
            width = 0
            bit_depth = 0
            channels = 0
            color_type = 0
            pixel_depth = 0
            rowbytes = 0
            filter = 0
            nrows = 0

            for line in f.readlines():

                if line.startswith("PASS:"):
                    break

                parts = line.split(' = ')
                var = parts[0]
                val = parts[1]

                if var == 'width':
                    width = int(val)

                elif var == 'bit_depth':
                    bit_depth = int(val)

                elif var == 'channels':
                    channels = int(val)

                elif var == 'color_type':
                    color_type = int(val)

                elif var == 'pixel_depth':
                    pixel_depth = int(val)

                elif var == 'rowbytes':
                    rowbytes = int(val)

                elif var == 'filter':
                    filter = int(val)

                elif var == 'data':
                    d = ast.literal_eval(val)
                    # Each stored row is preceded by its filter byte.
                    data.append(filter)
                    data.extend(d)

                elif var == 'expected':
                    e = ast.literal_eval(val)
                    expected.extend(e)
                    nrows += 1

        bytes_per_pixel = pixel_depth // 8

        logging.error("width: %d" % width)
        logging.error("bit_depth: %d" % bit_depth)
        logging.error("channels: %d" % channels)
        logging.error("color_type: %d" % color_type)
        logging.error("pixel_depth: %d" % pixel_depth)
        logging.error("rowbytes: %d" % rowbytes)
        logging.error("filter: %d" % filter)
        logging.error("bytes_per_pixel: %d" % bytes_per_pixel)
        logging.error("expected: %r" % len(expected))
        logging.error("data: %r" % len(data))

        # Sanity-check that the log describes something we can decode.
        assert color_type in [
            0,  # Grayscale (Y)
            2,  # Truecolor (RGB)
            # 3 Indexed is not supported (Palette)
            4,  # Grayscale with alpha (YA)
            6,  # Truecolor with alpha (RGBA)
        ]
        assert filter in [0, 1, 2, 3, 4]
        assert channels * bit_depth == pixel_depth
        assert (pixel_depth // 8) * width == rowbytes
        assert 0 == pixel_depth % 8  # can't support pixels with bit_depth < 8
        assert 8 == bit_depth  # ideally, we should test bit_depth 16 also
        assert nrows * (1 + width * bytes_per_pixel) == len(data)  # 1 filter byte preceding each row
        assert nrows * width * bytes_per_pixel == len(expected)

        result, error = flate_png_impl(data, 12, width, channels, bit_depth)

        # NOTE(review): debug artifacts written unconditionally so a
        # failure can be inspected offline; harmless but noisy.
        import pickle
        with open(filepath('./result.pickle'), 'wb') as f:
            pickle.dump(result, f)
        with open(filepath('./expected.pickle'), 'wb') as f:
            pickle.dump(expected, f)

        assert error is None
        assert expected == result

    def test_flate_png_alt_file_f01n2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./f01n2c08.png.log")

    def test_flate_png_alt_file_f02n2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./f02n2c08.png.log")

    def test_flate_png_alt_file_f03n2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./f03n2c08.png.log")

    def test_flate_png_alt_file_f04n2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./f04n2c08.png.log")

    def test_flate_png_alt_file_basn2c08(self):
        self.util_test_flate_png_alt_from_png_log_file("./basn2c08.png.log")

    def test_flate_png_alt_file_basn0g08(self):
        self.util_test_flate_png_alt_from_png_log_file("./basn0g08.png.log")


def main():
    unittest.main()


if __name__ == '__main__':
    main()
__name__ == '__main__': 400 | main() 401 | -------------------------------------------------------------------------------- /tests/test_pdfdict.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # encoding: utf-8 3 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 4 | # Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas 5 | # 2016 James Laird-Wah, Sydney, Australia 6 | # MIT license -- See LICENSE.txt for details 7 | 8 | ''' 9 | Run from the directory above like so: 10 | python -m tests.test_pdfdict 11 | ''' 12 | 13 | 14 | from pdfrw import PdfDict, PdfName 15 | from pdfrw.objects import PdfIndirect 16 | 17 | import unittest 18 | 19 | 20 | class TestPdfDicts(unittest.TestCase): 21 | 22 | def test_indirect_set_get(self): 23 | io = PdfIndirect((1,2,3)) 24 | io.value = 42 25 | d = PdfDict() 26 | d.Name = io 27 | test, = (x for x in dict.values(d)) 28 | self.assertEqual(test, io) 29 | v = d['/Name'] 30 | self.assertEqual(v, io.value) 31 | test, = d 32 | self.assertEqual(type(test), type(PdfName.Name)) 33 | 34 | def main(): 35 | unittest.main() 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /tests/test_pdfreader_init.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | import static_pdfs 3 | 4 | from pdfrw import PdfReader 5 | 6 | try: 7 | import unittest2 as unittest 8 | except ImportError: 9 | import unittest 10 | 11 | 12 | class TestPdfReaderInit(unittest.TestCase): 13 | 14 | def test_fname_binary_filelike(self): 15 | with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file: 16 | PdfReader(pdf_file) 17 | 18 | def test_fdata_binary(self): 19 | with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file: 20 | pdf_bytes = pdf_file.read() 21 | PdfReader(fdata=pdf_bytes) 22 | 23 | 24 | def main(): 25 | unittest.main() 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /tests/test_pdfstring.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # encoding: utf-8 3 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 4 | # Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas 5 | # 2016 James Laird-Wah, Sydney, Australia 6 | # MIT license -- See LICENSE.txt for details 7 | 8 | ''' 9 | Run from the directory above like so: 10 | python -m tests.test_pdfstring 11 | ''' 12 | 13 | 14 | from pdfrw import PdfString 15 | from pdfrw.py23_diffs import convert_store 16 | 17 | import unittest 18 | 19 | 20 | class TestBaseEncoding(unittest.TestCase): 21 | 22 | def encode(self, value): 23 | x = PdfString.encode(value) 24 | if isinstance(value, type(u'')): 25 | y = PdfString.from_unicode(value) 26 | else: 27 | y = PdfString.from_bytes(value) 28 | self.assertEqual(x, y) 29 | return x 30 | 31 | def decode(self, value): 32 | s = PdfString(value) 33 | x = s.to_unicode() 34 | y = s.decode() 35 | self.assertEqual(x, y) 36 | return x 37 | 38 | def decode_bytes(self, decode_this, expected): 39 | """ Decode to bytes""" 40 | self.assertEqual(PdfString(decode_this).to_bytes(), 41 | convert_store(expected)) 42 | 43 | def roundtrip(self, value, expected=None): 44 | result = self.encode(value) 45 | 
self.assertEqual(value, self.decode(result)) 46 | if expected is not None: 47 | self.assertEqual(result, expected) 48 | return result 49 | 50 | def test_doubleslash(self): 51 | self.roundtrip('\\') 52 | self.roundtrip(r'\\') 53 | 54 | def test_unicode_encoding(self): 55 | # These chars are in PdfDocEncoding 56 | self.assertEqual(self.roundtrip(u'PDF™©®')[0], '(') 57 | # These chars are not in PdfDocEncoding 58 | self.assertEqual(self.roundtrip(u'δΩσ')[0], '<') 59 | # Check that we're doing a reasonable encoding 60 | # Might want to change this later if we change the definition of reasonable 61 | self.roundtrip(u'(\n\u00FF', '(\\(\n\xff)') 62 | self.roundtrip(u'(\n\u0101', '') 63 | 64 | 65 | def test_constructor(self): 66 | obj = PdfString('hello') 67 | 68 | def test_continuation(self): 69 | # See PDF 1.7 ref section 3.2 page 55 70 | s1 = PdfString('(These two strings are the same.)') 71 | self.assertEqual(s1.decode(), s1[1:-1]) 72 | s2 = PdfString('(These \\\ntwo strings \\\nare the same.)') 73 | self.assertEqual(s1.decode(), s2.decode()) 74 | s2 = PdfString(s2.replace('\n', '\r')) 75 | self.assertEqual(s1.decode(), s2.decode()) 76 | s2 = PdfString(s2.replace('\r', '\r\n')) 77 | self.assertEqual(s1.decode(), s2.decode()) 78 | 79 | def test_hex_whitespace(self): 80 | # See PDF 1.7 ref section 3.2 page 56 81 | self.assertEqual(self.decode('<41 \n\r\t\f\v42>'), 'AB') 82 | 83 | def test_unicode_escaped_decode(self): 84 | # Some PDF producers happily put unicode strings in PdfDocEncoding, 85 | # because the Unicode BOM and \0 are valid code points 86 | decoded = self.decode('(\xfe\xff\0h\0e\0l\0l\0o)') 87 | self.assertEqual(decoded, "hello") 88 | 89 | 90 | def test_unescaping(self): 91 | self.decode_bytes(r'( \( \) \\ \n \t \f \r \r\n \\n)', 92 | ' ( ) \\ \n \t \f \r \r\n \\n') 93 | 94 | self.decode_bytes(r'(\b\010\10)', '\b\b\b') 95 | self.decode_bytes('(\\n\n\\r\r\\t\t\\b\b\\f\f()\\1\\23\\0143)', 96 | '\n\n\r\r\t\t\b\b\f\f()\001\023\f3') 97 | 
self.decode_bytes(r'(\\\nabc)', '\\\nabc') 98 | self.decode_bytes(r'(\ )', ' ') 99 | 100 | def test_BOM_variants(self): 101 | self.roundtrip(u'\ufeff', '') 102 | self.roundtrip(u'\ufffe', '') 103 | self.roundtrip(u'\xfe\xff', '') 104 | self.roundtrip(u'\xff\xfe', '(\xff\xfe)') 105 | self.assertRaises(UnicodeError, PdfString.from_unicode, 106 | u'þÿ blah', text_encoding='pdfdocencoding') 107 | 108 | def test_byte_encode(self): 109 | self.assertEqual(self.encode(b'ABC'), '(ABC)') 110 | 111 | def test_nullstring(self): 112 | self.assertEqual(PdfString('<>').to_bytes(), b'') 113 | self.assertEqual(PdfString('()').to_bytes(), b'') 114 | 115 | def main(): 116 | unittest.main() 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /tests/test_roundtrip.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # A part of pdfrw (https://github.com/pmaupin/pdfrw) 4 | # Copyright (C) 2015 Patrick Maupin, Austin, Texas 5 | # MIT license -- See LICENSE.txt for details 6 | 7 | ''' 8 | Run from the directory above like so: 9 | 10 | python -m tests.test_roundtrip 11 | 12 | A PDF that has been determined to be good or bad 13 | should be added to expected.txt with either a good 14 | checksum, or just the word "fail". 15 | 16 | These tests are incomplete, but they allow us to try 17 | out various PDFs. There is a collection of difficult 18 | PDFs available on github. 19 | 20 | In order to use them: 21 | 22 | 1) Ensure that github.com/pmaupin/static_pdfs is on your path. 23 | 24 | 2) Use the imagemagick compare program to look at differences 25 | between the static_pdfs/global directory and the tmp_results 26 | directory after you run this.
27 | 28 | 29 | ''' 30 | import os 31 | import hashlib 32 | import pdfrw 33 | import static_pdfs 34 | import expected 35 | 36 | from pdfrw.py23_diffs import convert_store 37 | 38 | try: 39 | import unittest2 as unittest 40 | except ImportError: 41 | import unittest 42 | 43 | 44 | class TestOnePdf(unittest.TestCase): 45 | 46 | def roundtrip(self, testname, basename, srcf, decompress=False, 47 | compress=False, repaginate=False): 48 | dstd = os.path.join(expected.result_dir, testname) 49 | if not os.path.exists(dstd): 50 | os.makedirs(dstd) 51 | dstf = os.path.join(dstd, basename) 52 | hashfile = os.path.join(expected.result_dir, 'hashes.txt') 53 | hashkey = '%s/%s' % (testname, basename) 54 | hash = '------no-file-generated---------' 55 | expects = expected.results[hashkey] 56 | 57 | # If the test has been deliberately skipped, 58 | # we are done. Otherwise, execute it even 59 | # if we don't know about it yet, so we have 60 | # results to compare. 61 | 62 | result = 'fail' 63 | size = 0 64 | try: 65 | if 'skip' in expects: 66 | result = 'skip requested' 67 | return self.skipTest(result) 68 | elif 'xfail' in expects: 69 | result = 'xfail requested' 70 | return self.fail(result) 71 | 72 | exists = os.path.exists(dstf) 73 | if expects or not exists: 74 | if exists: 75 | os.remove(dstf) 76 | trailer = pdfrw.PdfReader(srcf, decompress=decompress, 77 | verbose=False) 78 | writer = pdfrw.PdfWriter(dstf, compress=compress) 79 | if repaginate: 80 | writer.addpages(trailer.pages) 81 | else: 82 | writer.trailer = trailer 83 | writer.write() 84 | with open(dstf, 'rb') as f: 85 | data = f.read() 86 | size = len(data) 87 | if data: 88 | hash = hashlib.md5(data).hexdigest() 89 | else: 90 | os.remove(dstf) 91 | if expects: 92 | if len(expects) == 1: 93 | expects, = expects 94 | self.assertEqual(hash, expects) 95 | else: 96 | self.assertIn(hash, expects) 97 | result = 'pass' 98 | else: 99 | result = 'skip' 100 | self.skipTest('No hash available') 101 | finally: 102 | result = '%8d 
%-20s %s %s\n' % (size, result, hashkey, hash) 103 | with open(hashfile, 'ab') as f: 104 | f.write(convert_store(result)) 105 | 106 | 107 | def build_tests(): 108 | def test_closure(*args, **kw): 109 | def test(self): 110 | self.roundtrip(*args, **kw) 111 | return test 112 | for mytest, repaginate, decompress, compress in ( 113 | ('simple', False, False, False), 114 | ('repaginate', True, False, False), 115 | ('decompress', False, True, False), 116 | ('compress', False, True, True), 117 | ): 118 | for srcf in static_pdfs.pdffiles[0]: 119 | basename = os.path.basename(srcf) 120 | test_name = 'test_%s_%s' % (mytest, basename) 121 | test = test_closure(mytest, basename, srcf, 122 | repaginate=repaginate, 123 | decompress=decompress, 124 | compress=compress, 125 | ) 126 | setattr(TestOnePdf, test_name, test) 127 | build_tests() 128 | 129 | 130 | def main(): 131 | unittest.main() 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /tests/update_expected.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python2 2 | """ 3 | Put old (good) results in ramdisk/reference, 4 | then generate new (unknown) test results in ramdisk/tmp_results, 5 | THEN SWITCH BACK TO KNOWN GOOD SYSTEM, and finally: 6 | 7 | run this to update any checksums in expected.txt where both versions 8 | parse to same PDFs. 9 | """ 10 | 11 | import os 12 | import hashlib 13 | from pdfrw import PdfReader, PdfWriter, PdfArray, PdfDict, PdfObject 14 | 15 | 16 | def make_canonical(trailer): 17 | ''' Canonicalizes a PDF. Assumes everything 18 | is a Pdf object already. 
19 | ''' 20 | visited = set() 21 | workitems = list(trailer.values()) 22 | while workitems: 23 | obj = workitems.pop() 24 | objid = id(obj) 25 | if objid in visited: 26 | continue 27 | visited.add(objid) 28 | obj.indirect = True  # force the writer to emit every object as indirect, for stable output 29 | if isinstance(obj, (PdfArray, PdfDict)): 30 | if isinstance(obj, PdfArray): 31 | workitems += obj 32 | else: 33 | workitems += obj.values() 34 | return trailer 35 | 36 | with open('expected.txt', 'rb') as f: 37 | expected = f.read() 38 | 39 | def get_digest(fname):  # md5 hex digest of the file's bytes; returns None when the file is empty 40 | with open(fname, 'rb') as f: 41 | data = f.read() 42 | if data: 43 | return hashlib.md5(data).hexdigest() 44 | 45 | tmp = '_temp.pdf'  # scratch file reused for each canonicalized rewrite 46 | count = 0 47 | goodcount = 0 48 | 49 | changes = [] 50 | for (srcpath, _, filenames) in os.walk('ramdisk/reference'): 51 | for name in filenames: 52 | if not name.endswith('.pdf'): 53 | continue 54 | src = os.path.join(srcpath, name) 55 | dst = src.replace('/reference/', '/tmp_results/') 56 | if not os.path.exists(dst): 57 | continue 58 | src_digest = get_digest(src) 59 | if not src_digest or src_digest not in expected: 60 | continue 61 | print src 62 | count += 1 63 | trailer = make_canonical(PdfReader(src)) 64 | out = PdfWriter(tmp) 65 | out.write(trailer=trailer) 66 | match_digest = get_digest(tmp) 67 | if not match_digest: 68 | continue 69 | trailer = make_canonical(PdfReader(dst)) 70 | out = PdfWriter(tmp) 71 | out.write(trailer=trailer) 72 | if get_digest(tmp) != match_digest: 73 | continue 74 | goodcount += 1 75 | print "OK" 76 | changes.append((src_digest, get_digest(dst))) 77 | 78 | print count, goodcount 79 | 80 | for stuff in changes: 81 | expected = expected.replace(*stuff) 82 | 83 | with open('expected.txt', 'wb') as f: 84 | f.write(expected) 85 | --------------------------------------------------------------------------------