├── .gitignore ├── CONTRIBUTORS ├── LICENSE ├── README.md ├── TODO ├── requirements.txt ├── staplelib ├── __init__.py ├── commands.py ├── iohelper.py ├── stapler.py ├── testfiles │ ├── 1page.pdf │ └── 5page.pdf └── tests.py └── stapler /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | pip-log.txt 3 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Philip Stark 2 | Fred Wenzel 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2009, Philip Stark, Fred Wenzel 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | # * The name of the author may not be used to endorse or promote products 14 | # derived from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | # POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Stapler 2 | ======= 3 | 4 | Stapler is a pure Python alternative to [PDFtk][pdftk], a tool for manipulating 5 | PDF documents from the command line. 6 | 7 | [pdftk]: http://www.pdfhacks.com/pdftk/ 8 | 9 | History 10 | ------- 11 | PDFtk was written in Java and C++, and is natively compiled with gcj. Sadly, 12 | it has been discontinued a few years ago and bitrot is setting in (e.g., 13 | it does not compile easily on a number of platforms). 14 | 15 | Philip Stark decided to look for an alternative and found pypdf, a PDF library 16 | written in pure Python. He couldn't find a tool which actually used the 17 | library, so he started writing his own. 18 | 19 | This version of stapler is Fred Wenzel's fork of the project, with a completely 20 | refactored source code, tests, and added functionality. 21 | 22 | Like pdftk, stapler is a command-line tool. If you would like to add a GUI, 23 | compile it into a binary for your favorite platform, or contribute anything else, 24 | feel free to fork and send me a pull request. 25 | 26 | License 27 | ------- 28 | Stapler version 0.2 was written in 2009 by Philip Stark. 29 | Stapler version 0.3 and later were written from 2010--today by Fred Wenzel. 30 | 31 | For a list of contributors, check the ``CONTRIBUTORS`` file. 32 | 33 | Stapler is distributed under a BSD license. A copy of the BSD Style 34 | License used can be found in the file ``LICENSE``. 35 | 36 | Usage 37 | ----- 38 | There are the following modes in Stapler: 39 | 40 | ### select/delete (called with ``sel`` and ``del``, respectively) 41 | With select, you can cherry-pick pages from pdfs and concatenate them into 42 | a new pdf file. 43 | 44 | Syntax: 45 | 46 | stapler sel input1 page_or_range [page_or_range ...] [input2 p_o_r ...] 47 | 48 | Examples: 49 | 50 | # concatenate a and b into output.pdf 51 | stapler sel a.pdf b.pdf output.pdf 52 | 53 | # generate a pdf file called output.pdf with the following pages: 54 | # 1, 4-8, 20-40 from a.pdf, 1-5 from b.pdf in this order 55 | stapler sel a.pdf 1 4-8 20-40 b.pdf 1-5 output.pdf 56 | 57 | # reverse some of the pages in a.pdf by specifying a negative range 58 | stapler sel a.pdf 1-3 9-6 10 output.pdf 59 | 60 | The delete command works almost exactly the same as select, but inverse. 61 | It uses the pages and ranges which you _didn't_ specify. 62 | 63 | ### split/burst: 64 | Splits the specified pdf files into their single pages and writes each page 65 | into it's own pdf file with this naming scheme: 66 | 67 | ${origname}_${zero-padded page no}.pdf 68 | 69 | Syntax: 70 | 71 | stapler split input1 [input2 input3 ...] 72 | 73 | Example for a file foobar.pdf with 20 pages: 74 | 75 | $ stapler split foobar.pdf 76 | $ ls 77 | foobar_01.pdf foobar_02.pdf ... foobar_19.pdf foobar_20.pdf 78 | 79 | Multiple files can be specified, they will be processed as if you called 80 | single instances of stapler. 81 | 82 | ### info: 83 | Shows information on the metadata stored inside a PDF file. 84 | 85 | Syntax: 86 | 87 | stapler info foo.pdf 88 | 89 | Example output: 90 | *** Metadata for foo.pdf 91 | 92 | /ModDate: D:20100313082451+01'00' 93 | /CreationDate: D:20100313082451+01'00' 94 | /Producer: GPL Ghostscript 8.70 95 | /Title: foo.pdf 96 | /Creator: PDFCreator Version 0.9.9 97 | /Keywords: 98 | /Author: John Doe 99 | /Subject: 100 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | Comparison with pdftk features: 2 | [x] = done, [?] = maybe, [ ] = to do 3 | 4 | [x] Merge PDF Documents 5 | [x] Split PDF Pages into a New Document 6 | [x] Rotate PDF Pages or Documents 7 | [x] Decrypt Input as Necessary (Password Required) 8 | [x] Encrypt Output as Desired 9 | [?] Fill PDF Forms with FDF Data or XFDF Data and/or Flatten Forms 10 | [?] Apply a Background Watermark or a Foreground Stamp 11 | [ ] Report on PDF Metrics such as Metadata, Bookmarks, and Page Labels 12 | [?] Update PDF Metadata 13 | [?] Attach Files to PDF Pages or the PDF Document 14 | [?] Unpack PDF Attachments 15 | [x] Burst a PDF Document into Single Pages 16 | [ ] Uncompress and Re-Compress Page Streams 17 | [x] Repair Corrupted PDF (Where Possible) 18 | 19 | Known issues: 20 | - The same page of the same input file cannot be added to the output twice 21 | with two different rotations (e.g., ``cat foo.pdf 1R 1D`` won't do what 22 | you'd hope). 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyPDF2==1.20 2 | -------------------------------------------------------------------------------- /staplelib/__init__.py: -------------------------------------------------------------------------------- 1 | class CommandError(Exception): 2 | """ 3 | Exception class indicating a problem while executing a stapler command. 4 | """ 5 | pass 6 | 7 | OPTIONS = None # optparse options 8 | -------------------------------------------------------------------------------- /staplelib/commands.py: -------------------------------------------------------------------------------- 1 | """Module containing the actual commands stapler understands.""" 2 | 3 | import math 4 | import os.path 5 | 6 | from PyPDF2 import PdfFileWriter, PdfFileReader 7 | 8 | from . import CommandError, iohelper 9 | import staplelib 10 | 11 | 12 | def select(args, inverse=False): 13 | """ 14 | Concatenate files / select pages from files. 15 | 16 | inverse=True excludes rather than includes the selected pages from 17 | the file. 18 | """ 19 | 20 | filesandranges = iohelper.parse_ranges(args[:-1]) 21 | outputfilename = args[-1] 22 | verbose = staplelib.OPTIONS.verbose 23 | 24 | if not filesandranges or not outputfilename: 25 | raise CommandError("Both input and output filenames are required.") 26 | iohelper.check_output_file(outputfilename) 27 | 28 | output = PdfFileWriter() 29 | try: 30 | for input in filesandranges: 31 | pdf = input['pdf'] 32 | if verbose: 33 | print input['name'] 34 | 35 | # empty range means "include all pages" 36 | if not inverse: 37 | pagerange = input['pages'] or [ 38 | (p, iohelper.ROTATION_NONE) for p in 39 | range(1, pdf.getNumPages()+1)] 40 | else: 41 | excluded = [p for p, r in input['pages']] 42 | pagerange = [(p, iohelper.ROTATION_NONE) for p in 43 | range(1, pdf.getNumPages()+1) if 44 | p not in excluded] 45 | 46 | for pageno, rotate in pagerange: 47 | if 1 <= pageno <= pdf.getNumPages(): 48 | if verbose: 49 | print "Using page: %d (rotation: %d deg.)" % (pageno, rotate) 50 | output.addPage(pdf.getPage(pageno-1).rotateClockwise(rotate)) 51 | else: 52 | raise CommandError( 53 | "Page %d not found in %s." % (pageno, input['name'])) 54 | 55 | except Exception, e: 56 | raise CommandError(e) 57 | 58 | iohelper.write_pdf(output, outputfilename) 59 | 60 | 61 | def delete(args): 62 | """Concatenate files and remove pages from files.""" 63 | 64 | return select(args, inverse=True) 65 | 66 | 67 | def split(args): 68 | """Burst an input file into one file per page.""" 69 | 70 | files = iohelper.expand_input_files(args) 71 | verbose = staplelib.OPTIONS.verbose 72 | 73 | if not files: 74 | raise CommandError("No input files specified.") 75 | 76 | inputs = [] 77 | try: 78 | for f in files: 79 | inputs.append(iohelper.read_pdf(f)) 80 | except Exception, e: 81 | raise CommandError(e) 82 | 83 | filecount = 0 84 | pagecount = 0 85 | for input in inputs: 86 | # zero-padded output file name 87 | (base, ext) = os.path.splitext(os.path.basename(files[filecount])) 88 | output_template = ''.join([ 89 | base, '_', 90 | '%0', str(math.ceil(math.log10(input.getNumPages()))), 'd', 91 | ext 92 | ]) 93 | 94 | for pageno in range(input.getNumPages()): 95 | output = PdfFileWriter() 96 | output.addPage(input.getPage(pageno)) 97 | 98 | outputname = output_template % (pageno+1) 99 | if verbose: 100 | print outputname 101 | iohelper.write_pdf(output, outputname) 102 | 103 | pagecount += 1 104 | filecount += 1 105 | 106 | if verbose: 107 | print 108 | print "%d page(s) in %d file(s) processed." % (pagecount, filecount) 109 | 110 | 111 | def info(args): 112 | """Display Metadata content for all input files.""" 113 | files = iohelper.expand_input_files(args) 114 | 115 | if not files: 116 | raise CommandError("No input files specified.") 117 | 118 | for f in files: 119 | pdf = iohelper.read_pdf(f) 120 | print "*** Metadata for %s" % f 121 | print 122 | info = pdf.documentInfo 123 | if info: 124 | for name, value in info.items(): 125 | print " %s: %s" % (name, value) 126 | else: 127 | print " (No metadata found.)" 128 | print 129 | -------------------------------------------------------------------------------- /staplelib/iohelper.py: -------------------------------------------------------------------------------- 1 | """Helper functions for user-supplied arguments and file I/O.""" 2 | 3 | import getpass 4 | import glob 5 | import os.path 6 | import re 7 | import sys 8 | 9 | from PyPDF2 import PdfFileWriter, PdfFileReader 10 | 11 | from . import CommandError 12 | import staplelib 13 | 14 | 15 | ROTATION_NONE = 0 16 | ROTATION_RIGHT = 90 17 | ROTATION_TURN = 180 18 | ROTATION_LEFT = 270 19 | ROTATIONS = {'u': ROTATION_NONE, 20 | 'r': ROTATION_RIGHT, 21 | 'd': ROTATION_TURN, 22 | 'l': ROTATION_LEFT} 23 | 24 | 25 | def read_pdf(filename): 26 | """Open a PDF file with pyPdf.""" 27 | if not os.path.exists(filename): 28 | raise CommandError("%s does not exist" % filename) 29 | pdf = PdfFileReader(file(filename, "rb")) 30 | if pdf.isEncrypted: 31 | while True: 32 | pw = prompt_for_pw(filename) 33 | matched = pdf.decrypt(pw) 34 | if matched: 35 | break 36 | else: 37 | print "The password did not match." 38 | return pdf 39 | 40 | 41 | def write_pdf(pdf, filename): 42 | """Write the content of a PdfFileWriter object to a file.""" 43 | if os.path.exists(filename): 44 | raise CommandError("File already exists: %s" % filename) 45 | 46 | opt = staplelib.OPTIONS 47 | if opt: 48 | if opt.ownerpw or opt.userpw: 49 | pdf.encrypt(opt.userpw or '', opt.ownerpw) 50 | 51 | outputStream = file(filename, "wb") 52 | pdf.write(outputStream) 53 | outputStream.close() 54 | 55 | 56 | def prompt_for_pw(filename): 57 | """Prompt the user for the password to access an input file.""" 58 | print 'Please enter a password to decrypt %s.' % filename 59 | print '(The password will not be shown. Press ^C to cancel).' 60 | 61 | try: 62 | return getpass.getpass('--> ') 63 | except KeyboardInterrupt: 64 | sys.stderr.write('Aborted by user.\n') 65 | sys.exit(2) 66 | 67 | 68 | def expand_input_files(arglist): 69 | """Expand (glob) input files if necessary, and ensure they all exist.""" 70 | # Expand all files in input list, don't retain empty results 71 | files = [] 72 | for fs in filter(None, (glob.glob(arg) for arg in arglist)): 73 | for f in fs: 74 | files.append(f) 75 | 76 | for filename in files: 77 | if not os.path.exists(filename): 78 | raise CommandError("%s does not exist" % filename) 79 | 80 | return files 81 | 82 | 83 | def check_output_file(filename): 84 | """Make sure the output file does not exist.""" 85 | 86 | if os.path.exists(filename): 87 | raise CommandError("File already exists: %s" % filename) 88 | 89 | 90 | def parse_ranges(files_and_ranges): 91 | """Parse a list of filenames followed by ranges.""" 92 | 93 | operations = [] 94 | for inputname in files_and_ranges: 95 | if inputname.lower().endswith('.pdf'): 96 | filenames = expand_input_files((inputname,)) # Expand expects arg tuple. 97 | for filename in filenames: 98 | operations.append({"name": filename, 99 | "pdf": read_pdf(filename), 100 | "pages": []}) 101 | else: 102 | match = re.match('([0-9]+|end)(?:-([0-9]+|end))?([LRD]?)', inputname) 103 | if not match: 104 | raise CommandError('Invalid range: %s' % inputname) 105 | 106 | current = operations[-1] 107 | max_page = current['pdf'].getNumPages() 108 | # allow "end" as alias for the last page 109 | replace_end = lambda page: ( 110 | max_page if page.lower() == 'end' else int(page)) 111 | begin = replace_end(match.group(1)) 112 | end = replace_end(match.group(2)) if match.group(2) else begin 113 | 114 | rotate = ROTATIONS.get((match.group(3) or 'u').lower()) 115 | 116 | if begin > max_page or end > max_page: 117 | raise CommandError( 118 | 'Range %s-%s exceeds maximum page number %s of file %s' % ( 119 | begin, end, max_page, current['name'])) 120 | 121 | # negative ranges sort pages backwards 122 | if begin < end: 123 | pagerange = range(begin, end+1) 124 | else: 125 | pagerange = range(end, begin+1)[::-1] 126 | 127 | for p in pagerange: 128 | current['pages'].append((p, rotate)) 129 | 130 | return operations 131 | -------------------------------------------------------------------------------- /staplelib/stapler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Main stapler dispatcher.""" 4 | 5 | from optparse import OptionParser 6 | import sys 7 | 8 | from . import commands, CommandError 9 | import staplelib 10 | 11 | 12 | USAGE = """ 13 | usage: %prog [options] mode input.pdf ... [output.pdf] 14 | 15 | Modes: 16 | cat/sel: [] ... (output needed) 17 | Select the given pages/ranges from input files. 18 | No range means all pages. 19 | del: [[]] ... (output needed) 20 | Select all but the given pages/ranges from input files. 21 | burst/split: ... (no output needed) 22 | Create one file per page in input pdf files (no output needed) 23 | info: ... (no output needed) 24 | Display PDF metadata 25 | 26 | Page ranges: 27 | n - single numbers mean single pages (e.g., 15) 28 | n-m - page ranges include the entire specified range (e.g. 1-6) 29 | m-n - negative ranges sort pages backwards (e.g., 6-3) 30 | 31 | Extended page range options: 32 | ...-end will be replaced with the last page in the file 33 | R, L, or D will rotate the respective range +90, -90, or 180 degrees, 34 | respectively. (e.g., 1-15R) 35 | """.strip() 36 | 37 | 38 | # command line option parser 39 | parser = OptionParser(usage=USAGE) 40 | parser.add_option('-o', '--ownerpw', action='store', dest='ownerpw', 41 | help='Set owner password to encrypt output file with', 42 | default=None) 43 | parser.add_option('-u', '--userpw', action='store', dest='userpw', 44 | help='Set user password to encrypt output file with', 45 | default=None) 46 | parser.add_option('-v', '--verbose', action='store_true', dest='verbose', 47 | default=False) 48 | 49 | 50 | def main(): 51 | """ 52 | Handle all command line arguments and pass them on to the respective 53 | commands. 54 | """ 55 | (staplelib.OPTIONS, args) = parser.parse_args() 56 | 57 | if (len(args) < 2): 58 | print_error("Not enough arguments", show_usage=True) 59 | 60 | modes = { 61 | "cat": commands.select, 62 | "sel": commands.select, 63 | "split": commands.split, 64 | "burst": commands.split, 65 | "del": commands.delete, 66 | "info": commands.info, 67 | } 68 | 69 | mode = args[0] 70 | args = args[1:] 71 | if not mode in modes: 72 | print_error('Please enter a valid mode', show_usage=True) 73 | 74 | if staplelib.OPTIONS.verbose: 75 | print "Mode: %s" % mode 76 | 77 | # dispatch call to known subcommand 78 | try: 79 | modes[mode](args) 80 | except CommandError, e: 81 | print_error(e) 82 | 83 | 84 | def print_error(msg, code=1, show_usage=False): 85 | """Pretty-print an error to the user.""" 86 | sys.stderr.write(str('Error: %s\n' % msg)) 87 | 88 | if show_usage: 89 | sys.stderr.write("\n%s\n" % parser.get_usage()) 90 | 91 | sys.exit(code) 92 | -------------------------------------------------------------------------------- /staplelib/testfiles/1page.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fwenzel/stapler/02d8086e9ce92fd7eddb23725c8c652f82466b55/staplelib/testfiles/1page.pdf -------------------------------------------------------------------------------- /staplelib/testfiles/5page.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fwenzel/stapler/02d8086e9ce92fd7eddb23725c8c652f82466b55/staplelib/testfiles/5page.pdf -------------------------------------------------------------------------------- /staplelib/tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os.path 4 | import shutil 5 | from subprocess import check_call 6 | import tempfile 7 | import unittest 8 | 9 | from PyPDF2 import PdfFileReader 10 | 11 | 12 | HERE = os.path.abspath(os.path.dirname(__file__)) 13 | TESTFILE_DIR = os.path.join(HERE, 'testfiles') 14 | STAPLER = os.path.join(HERE, '..', 'stapler') 15 | ONEPAGE_PDF = os.path.join(TESTFILE_DIR, '1page.pdf') 16 | FIVEPAGE_PDF = os.path.join(TESTFILE_DIR, '5page.pdf') 17 | 18 | 19 | class TestStapler(unittest.TestCase): 20 | """Some unit tests for the stapler tool.""" 21 | 22 | def setUp(self): 23 | self.tmpdir = tempfile.mkdtemp() 24 | self.outputfile = os.path.join(self.tmpdir, 'output.pdf') 25 | os.chdir(self.tmpdir) 26 | 27 | def tearDown(self): 28 | shutil.rmtree(self.tmpdir) 29 | os.chdir(HERE) 30 | 31 | def test_cat(self): 32 | """Make sure files are properly concatenated.""" 33 | check_call([STAPLER, 'cat', ONEPAGE_PDF, FIVEPAGE_PDF, 34 | self.outputfile]) 35 | self.assert_(os.path.isfile(self.outputfile)) 36 | pdf = PdfFileReader(file(self.outputfile, 'rb')) 37 | self.assertEqual(pdf.getNumPages(), 6) 38 | 39 | def test_cat_glob(self): 40 | """Make sure wildcard inputs work.""" 41 | check_call([STAPLER, 'cat', os.path.join(TESTFILE_DIR, '*.pdf'), 42 | self.outputfile]) 43 | pdf = PdfFileReader(file(self.outputfile, 'rb')) 44 | self.assertEqual(pdf.getNumPages(), 6) 45 | 46 | def test_split(self): 47 | """Make sure a file is properly split into pages.""" 48 | check_call([STAPLER, 'split', FIVEPAGE_PDF]) 49 | 50 | filelist = os.listdir(self.tmpdir) 51 | self.assertEqual(len(filelist), 5) 52 | for f in os.listdir(self.tmpdir): 53 | pdf = PdfFileReader(file(os.path.join(self.tmpdir, f), 'rb')) 54 | self.assertEqual(pdf.getNumPages(), 1) 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /stapler: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Stapler is a pure-python tool for modifying PDF files.""" 4 | 5 | from staplelib import stapler 6 | 7 | 8 | __author__ = 'Philip Stark, Fred Wenzel' 9 | __license__ = 'BSD' 10 | VERSION = (0, 4, 0) 11 | __version__ = '.'.join(map(str, VERSION)) 12 | 13 | 14 | if __name__ == '__main__': 15 | stapler.main() 16 | --------------------------------------------------------------------------------