├── .coveragerc
├── .gitignore
├── .isort.cfg
├── .travis.yml
├── CHANGELOG
├── LICENSE.md
├── MANIFEST.in
├── README.md
├── pylintrc
├── pypdf
    ├── __init__.py
    ├── _version.py
    ├── filters.py
    ├── generic.py
    ├── merger.py
    ├── pagerange.py
    ├── pdf.py
    ├── utils.py
    └── xmp.py
├── samplecode
    ├── MergingComments.py
    ├── PDFComments2XL.py
    ├── README.md
    ├── __init__.py
    ├── basic_features.py
    ├── basic_merging.py
    └── pdfsamples
    │   ├── AutoCad_Diagram.pdf
    │   ├── AutoCad_Simple.pdf
    │   ├── GeoBase_NHNC1_Data_Model_UML_EN.pdf
    │   ├── README.md
    │   ├── SF424_page2.pdf
    │   ├── Seige_of_Vicksburg_Sample_OCR.pdf
    │   └── jpeg.pdf
├── scripts
    ├── 2-up.py
    ├── codecs.py
    ├── pdf-image-extractor.py
    └── pdfcat
├── setup.py
├── tests
    ├── __init__.py
    ├── fixture_data
    │   ├── GeoBase_NHNC1_Data_Model_UML_EN.pdf
    │   ├── Hamlet.txt
    │   ├── SF424_page2.pdf
    │   ├── Seige_of_Vicksburg_Sample_OCR.pdf
    │   ├── TheHappyPrince.txt
    │   ├── attachment_small.png
    │   ├── crazyones.pdf
    │   ├── jpeg.pdf
    │   ├── testDecodeStreamData
    │   │   ├── ASCII85Decode.pdf
    │   │   ├── CCITTFaxDecode.pdf
    │   │   ├── DCTDecode.pdf
    │   │   ├── FlateDecode.pdf
    │   │   └── LZWDecode.pdf
    │   ├── testFileLoad
    │   │   └── crazyones.txt
    │   ├── testIsObjectFree
    │   │   ├── GeoBase_NHNC1_Data_Model_UML_EN.pdf
    │   │   ├── SF424_page2.pdf
    │   │   ├── Seige_of_Vicksburg_Sample_OCR.pdf
    │   │   └── jpeg.pdf
    │   ├── testJpegImage
    │   │   └── jpeg.txt
    │   ├── testReadXRefStreamCompressedObjects
    │   │   └── crazyones.pdf
    │   ├── testXRefStreamObjects
    │   │   └── crazyones.pdf
    │   ├── testXRefTableObjects
    │   │   ├── SF424_page2.pdf
    │   │   ├── Seige_of_Vicksburg_Sample_OCR.pdf
    │   │   └── jpeg.pdf
    │   └── testXTableAgainstXStream
    │   │   └── GeoBase_NHNC1_Data_Model_UML_EN.pdf
    ├── test_filters.py
    ├── test_generic.py
    ├── test_pdf.py
    ├── test_utils.py
    └── utils.py
└── tox.ini


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = .tox/*
3 | 
4 | # TODO:  still need to arrange coverage for samplecode/*.py and scripts/*.py.
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | dist/
 2 | *.pyc
 3 | *.sw[op]
 4 | .DS_Store
 5 | .tox
 6 | build
 7 | .idea/*
 8 | htmlcov/
 9 | .coverage
10 | MANIFEST
11 | 


--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
 1 | [settings]
 2 | # CL hasn't yet identified a perfect match for Black.  This
 3 | # comes close.
 4 | force_grid_wrap=0
 5 | force_sort_within_sections=True
 6 | include_trailing_comma=True
 7 | # Note 100 is the default for Pylint.  Maybe I'll configure Black
 8 | # to enforce it, also.
 9 | # line_length=100
10 | multi_line_output=3
11 | use_parentheses=True
12 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python: "2.7"
 3 | sudo: false
 4 | 
 5 | env:
 6 |   - TOX_ENV=py27
 7 |   - TOX_ENV=py33
 8 |   - TOX_ENV=py34
 9 |   - TOX_ENV=py35
10 | 
11 | install:
12 |   - pip install tox --use-mirrors
13 | 
14 | script:
15 |   - tox -e $TOX_ENV
16 | 
17 | matrix:
18 |   # Python 3.5 not yet available on travis, watch this to see when it is.
19 |   fast_finish: true
20 |   allow_failures:
21 |     - env: TOX_ENV=py35
22 | 


--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
  1 | Version 1.27.0, 2018-08-07
  2 | --------------------------
  3 | - NOTE:  Active maintenance shifts to PyPDF4
  4 | 
  5 | - No functional changes:  just migration to PyPDF4
  6 | 
  7 | 
  8 | Version 1.26.0, 2016-05-18
  9 | --------------------------
 10 | 
 11 |  - NOTE: Active maintenance on PyPDF2 is resuming after a hiatus
 12 | 
 13 |  - Fixed a bug where image resources were incorrectly
 14 |    overwritten when merging pages
 15 | 
 16 |  - Added dictionary for JavaScript actions to the root (louib)
 17 | 
 18 |  - Added unit tests for the JS functionality (louib)
 19 | 
 20 |  - Add more Python 3 compatibility when reading inline images (im2703
 21 |    and VyacheslavHashov)
 22 | 
 23 |  - Return NullObject instead of raising error when failing to resolve
 24 |    object (ctate)
 25 | 
 26 |  - Don't output warning for non-zeroed xref table when strict=False
 27 |    (BenRussert)
 28 | 
 29 |  - Remove extraneous zeroes from output formatting (speedplane)
 30 | 
 31 |  - Fix bug where reading an inline image would cut off prematurely
 32 |    in certain cases (speedplane)
 33 | 
 34 | 
 35 | Patch 1.25.1, 2015-07-20
 36 | 
 37 |  - Fix bug when parsing inline images. Occurred when merging
 38 |    certain pages with inline images
 39 | 
 40 |  - Fixed type error when creating outlines by utilizing the
 41 |    isString() test
 42 | 
 43 | Version 1.25, 2015-07-07
 44 | ------------------------
 45 | 
 46 | BUGFIXES:
 47 | 
 48 |  - Added Python 3 algorithm for ASCII85Decode. Fixes issue when
 49 |    reading reportlab-generated files with Py 3 (jerickbixly)
 50 | 
 51 |  - Recognize more escape sequences which would otherwise throw an
 52 |    exception (manuelzs, robertsoakes)
 53 | 
 54 |  - Fixed overflow error in generic.py. Occurred
 55 |    when reading a too-large int in Python 2 (by Raja Jamwal)
 56 | 
 57 |  - Allow access to files which were encrypted with an empty
 58 |    password. Previously threw a "File has not been decrypted"
 59 |    exception (Elena Williams)
 60 | 
 61 |  - Do not attempt to decode an empty data stream. Previously
 62 |    would cause an error in decode algorithms (vladir)
 63 | 
 64 |  - Fixed some type issues specific to Py 2 or Py 3
 65 | 
 66 |  - Fix issue when stream data begins with whitespace (soloma83)
 67 | 
 68 |  - Recognize abbreviated filter names (AlmightyOatmeal and
 69 |    Matthew Weiss)
 70 | 
 71 |  - Copy decryption key from PdfFileReader to PdfFileMerger.
 72 |    Allows usage of PdfFileMerger with encrypted files (twolfson)
 73 | 
 74 |  - Fixed bug which occurred when a NameObject is present at end
 75 |    of a file stream. Threw a "Stream has ended unexpectedly"
 76 |    exception (speedplane)
 77 | 
 78 | FEATURES:
 79 | 
 80 |  - Initial work on a test suite; to be expanded in future.
 81 |    Tests and Resources directory added, README updated (robertsoakes)
 82 | 
 83 |  - Added document cloning methods to PdfFileWriter:
 84 |    appendPagesFromReader, cloneReaderDocumentRoot, and
 85 |    cloneDocumentFromReader. See official documentation (robertsoakes)
 86 | 
 87 |  - Added method for writing to form fields: updatePageFormFieldValues.
 88 |    This will be enhanced in the future. See official documentation
 89 |    (robertsoakes)
 90 | 
 91 |  - New addAttachment method. See documentation. Support for adding
 92 |    and extracting embedded files to be enhanced in the future
 93 |    (moshekaplan)
 94 | 
 95 |  - Added methods to get page number of given PageObject or
 96 |    Destination: getPageNumber and getDestinationPageNumber.
 97 |    See documentation (mozbugbox)
 98 | 
 99 | OTHER ENHANCEMENTS:
100 | 
101 |  - Enhanced type handling (Brent Amrhein)
102 | 
103 |  - Enhanced exception handling in NameObject (sbywater)
104 | 
105 |  - Enhanced extractText method output (peircej)
106 | 
107 |  - Better exception handling
108 | 
109 |  - Enhanced regex usage in NameObject class (speedplane)
110 | 
111 | 
112 | Version 1.24, 2014-12-31
113 | ------------------------
114 | 
115 |  - Bugfixes for reading files in Python 3 (by Anthony Tuininga and
116 |    pqqp)
117 | 
118 |  - Appropriate errors are now raised instead of infinite loops (by
119 |    naure and Cyrus Vafadari)
120 | 
121 |  - Bugfix for parsing number tokens with leading spaces (by Maxim
122 |    Kamenkov)
123 | 
124 |  - Don't crash on bad /Outlines reference (by eshellman)
125 | 
126 |  - Conform tabs/spaces and blank lines to PEP 8 standards
127 | 
128 |  - Utilize the readUntilRegex method when reading Number Objects
129 |    (by Brendan Jurd)
130 | 
131 |  - More bugfixes for Python 3 and clearer exception handling
132 | 
133 |  - Fixed encoding issue in merger (with eshellman)
134 | 
135 |  - Created separate folder for scripts
136 | 
137 | 
138 | Version 1.23, 2014-08-11
139 | ------------------------
140 | 
141 |  - Documentation now available at http://pythonhosted.org//PyPDF2
142 | 
143 |  - Bugfix in pagerange.py for when __init__.__doc__ has no value (by
144 |    Vladir Cruz)
145 | 
146 |  - Fix typos in OutlinesObject().add() (by shilluc)
147 | 
148 |  - Re-added a missing return statement in a utils.py method
149 | 
150 |  - Corrected viewing mode names (by Jason Scheirer)
151 | 
152 |  - New PdfFileWriter method: addJS() (by vfigueiro)
153 | 
154 |  - New bookmark features: color, boldness, italics, and page fit
155 |    (by Joshua Arnott)
156 | 
157 |  - New PdfFileReader method: getFields(). Used to extract field
158 |    information from PDFs with interactive forms. See documentation
159 |    for details
160 | 
161 |  - Converted README file to markdown format (by Stephen Bussard)
162 | 
163 |  - Several improvements to overall performance and efficiency
164 |    (by mozbugbox)
165 | 
166 |  - Fixed a bug where geospatial information was not scaling along with
167 |    its page
168 | 
169 |  - Fixed a type issue and a Python 3 issue in the decryption algorithms
170 |    (with Francisco Vieira and koba-ninkigumi)
171 | 
172 |  - Fixed a bug causing an infinite loop in the ASCII 85 decoding
173 |    algorithm (by madmaardigan)
174 | 
175 |  - Annotations (links, comment windows, etc.) are now preserved when
176 |    pages are merged together
177 | 
178 |  - Used the Destination class in addLink() and addBookmark() so that 
179 |    the page fit option could be properly customized
180 | 
181 | 
182 | Version 1.22, 2014-05-29
183 | ------------------------
184 | 
185 |  - Added .DS_Store to .gitignore (for Mac users) (by Steve Witham)
186 | 
187 |  - Removed __init__() implementation in NameObject (by Steve Witham)
188 | 
189 |  - Fixed bug (inf. loop) when merging pages in Python 3 (by commx)
190 | 
191 |  - Corrected error when calculating height in scaleTo()
192 | 
193 |  - Removed unnecessary code from DictionaryObject (by Georges Dubus)
194 | 
195 |  - Fixed bug where an exception was thrown upon reading a NULL string
196 |    (by speedplane)
197 | 
198 |  - Allow string literals (non-unicode strings in Python 2) to be passed 
199 |    to PdfFileReader
200 | 
201 |  - Allow ConvertFunctionsToVirtualList to be indexed with slices and
202 |    longs (in Python 2) (by Matt Gilson)
203 | 
204 |  - Major improvements and bugfixes to addLink() method (see documentation
205 |    in source code) (by Henry Keiter)
206 | 
207 |  - General code clean-up and improvements (with Steve Witham and Henry Keiter)
208 | 
209 |  - Fixed bug that caused crash when comments are present at end of 
210 |    dictionary
211 | 
212 | 
213 | Version 1.21, 2014-04-21
214 | ------------------------
215 | 
216 |  - Fix for when /Type isn't present in the Pages dictionary (by Rob1080)
217 | 
218 |  - More tolerance for extra whitespace in Indirect Objects
219 | 
220 |  - Improved Exception handling
221 | 
222 |  - Fixed error in getHeight() method (by Simon Kaempflein)
223 | 
224 |  - implement use of utils.string_type to resolve Py2-3 compatibility issues
225 | 
226 |  - Prevent exception for multiple definitions in a dictionary (with carlosfunk)
227 |    (only when strict = False)
228 | 
229 |  - Fixed errors when parsing a slice using pdfcat on command line (by
230 |    Steve Witham)
231 | 
232 |  - Tolerance for EOF markers within 1024 bytes of the actual end of the
233 |    file (with David Wolever)
234 | 
235 |  - Added overwriteWarnings parameter to PdfFileReader constructor, if False
236 |    PyPDF2 will NOT overwrite methods from Python's warnings.py module with
237 |    a custom implementation.
238 | 
239 |  - Fix NumberObject and NameObject constructors for compatibility with PyPy
240 |    (Rüdiger Jungbeck, Xavier Dupré, shezadkhan137, Steven Witham)
241 | 
242 |  - Utilize  utils.Str in pdf.py and pagerange.py to resolve type issues (by
243 |    egbutter)
244 | 
245 |  - Improvements in implementing StringIO for Python 2 and BytesIO for
246 |    Python 3 (by Xavier Dupré)
247 | 
248 |  - Added /x00 to Whitespaces, defined utils.WHITESPACES to clarify code (by
249 |    Maxim Kamenkov)
250 | 
251 |  - Bugfix for merging 3 or more resources with the same name (by lucky-user)
252 | 
253 |  - Improvements to Xref parsing algorithm (by speedplane)
254 | 
255 | 
256 | Version 1.20, 2014-01-27
257 | ------------------------
258 | 
259 |  - Official Python 3+ support (with contributions from TWAC and cgammans)
260 |    Support for Python versions 2.6 and 2.7 will be maintained
261 | 
262 |  - Command line concatenation (see pdfcat in sample code) (by Steve Witham)
263 | 
264 |  - New FAQ; link included in README
265 | 
266 |  - Allow more (although unnecessary) escape sequences
267 | 
268 |  - Prevent exception when reading a null object in decoding parameters
269 | 
270 |  - Corrected error in reading destination types (added a slash since they
271 |    are name objects)
272 | 
273 |  - Corrected TypeError in scaleTo() method
274 | 
275 |  - addBookmark() method in PdfFileMerger now returns bookmark (so nested
276 |    bookmarks can be created)
277 | 
278 |  - Additions to Sample Code and Sample PDFs
279 | 
280 |  - changes to allow 2up script to work (see sample code) (by Dylan McNamee)
281 | 
282 |  - changes to metadata encoding (by Chris Hiestand)
283 | 
284 |  - New methods for links: addLink() (by Enrico Lambertini) and removeLinks()
285 | 
286 |  - Bugfix to handle nested bookmarks correctly (by Jamie Lentin)
287 | 
288 |  - New methods removeImages() and removeText() available for PdfFileWriter
289 |    (by Tien Haï)
290 | 
291 |  - Exception handling for illegal characters in Name Objects
292 | 
293 | 
294 | Version 1.19, 2013-10-08
295 | ------------------------
296 | 
297 | BUGFIXES:
298 |  - Removed pop in sweepIndirectReferences to prevent infinite loop
299 |    (provided by ian-su-sirca)
300 | 
301 |  - Fixed bug caused by whitespace when parsing PDFs generated by AutoCad
302 | 
303 |  - Fixed a bug caused by reading a 'null' ASCII value in a dictionary
304 |    object (primarily in PDFs generated by AutoCad).
305 | 
306 | FEATURES:
307 |  - Added new folders for PyPDF2 sample code and example PDFs; see README
308 |    for each folder
309 | 
310 |  - Added a method for debugging purposes to show current location while
311 |    parsing
312 | 
313 |  - Ability to create custom metadata (by jamma313)
314 | 
315 |  - Ability to access and customize document layout and view mode
316 |    (by Joshua Arnott)
317 | 
318 | OTHER:
319 |  - Added and corrected some documentation
320 | 
321 |  - Added some more warnings and exception messages
322 | 
323 |  - Removed old test/debugging code
324 | 
325 | UPCOMING:
326 |  - More bugfixes (We have received many problematic PDFs via email, we
327 |    will work with them)
328 |  
329 |  - Documentation - It's time for PyPDF2 to get its own documentation
330 |    since it has grown much since the original pyPdf
331 | 
332 |  - A FAQ to answer common questions
333 | 
334 | 
335 | Version 1.18, 2013-08-19
336 | ------------------------
337 | 
338 |  - Fixed a bug where older versions of objects were incorrectly added to the
339 |    cache, resulting in outdated or missing pages, images, and other objects
340 |    (from speedplane)
341 | 
342 |  - Fixed a bug in parsing the xref table where new xref values were 
343 |    overwritten; also cleaned up code (from speedplane)
344 | 
345 |  - New method mergeRotatedAroundPointPage which merges a page while rotating
346 |    it around a point (from speedplane)
347 | 
348 |  - Updated Destination syntax to respect PDF 1.6 specifications (from
349 |    jamma313)
350 | 
351 |  - Prevented infinite loop when a PdfFileReader object was instantiated
352 |    with an empty file (from Jerome Nexedi)
353 | 
354 | Other Changes:
355 | 
356 |  - Downloads now available via PyPI
357 |    https://pypi.python.org/pypi?:action=display&name=PyPDF2
358 | 
359 |  - Installation through pip library is fixed
360 | 
361 | 
362 | Version 1.17, 2013-07-25
363 | ------------------------
364 | 
365 |  - Removed one (from pdf.py) of the two Destination classes. Both 
366 |    classes had the same name, but were slightly different in content, 
367 |    causing some errors. (from Janne Vanhala)
368 | 
369 |  - Corrected and Expanded README file to demonstrate PdfFileMerger
370 | 
371 |  - Added filter for LZW encoded streams (from Michal Horejsek)
372 | 
373 |  - PyPDF2 issue tracker enabled on Github to allow community
374 |    discussion and collaboration
375 | 
376 | 
377 | Versions -1.16, -2013-06-30
378 | ---------------------------
379 | 
380 |  - Note: This ChangeLog has not been kept up-to-date for a while.
381 |    Hopefully we can keep better track of it from now on. Some of the
382 |    changes listed here come from previous versions 1.14 and 1.15; they
383 |    were only vaguely defined. With the new _version.py file we should 
384 |    have more structured and better documented versioning from now on.
385 |  
386 |  - Defined PyPDF2.__version__
387 | 
388 |  - Fixed encrypt() method (from Martijn The)
389 | 
390 |  - Improved error handling on PDFs with truncated streams (from cecilkorik)
391 | 
392 |  - Python 3 support (from kushal-kumaran)
393 | 
394 |  - Fixed example code in README (from Jeremy Bethmont)
395 | 
396 |  - Fixed a bug caused by DecimalError Exception (from Adam Morris)
397 | 
398 |  - Many other bug fixes and features by: 
399 | 	
400 | 	jeansch
401 | 	Anton Vlasenko
402 | 	Joseph Walton
403 | 	Jan Oliver Oelerich
404 | 	Fabian Henze
405 | 	And any others I missed. 
406 | 	Thanks for contributing!
407 | 
408 | 
409 | Version 1.13, 2010-12-04
410 | ------------------------
411 | 
412 |  - Fixed a typo in code for reading a "\b" escape character in strings.
413 | 
414 |  - Improved __repr__ in FloatObject.
415 | 
416 |  - Fixed a bug in reading octal escape sequences in strings.
417 | 
418 |  - Added getWidth and getHeight methods to the RectangleObject class.
419 | 
420 |  - Fixed compatibility warnings with Python 2.4 and 2.5.
421 | 
422 |  - Added addBlankPage and insertBlankPage methods on PdfFileWriter class.
423 | 
424 |  - Fixed a bug with circular references in page's object trees (typically
425 |    annotations) that prevented correctly writing out a copy of those pages.
426 | 
427 |  - New merge page functions allow application of a transformation matrix.
428 | 
429 |  - To all patch contributors: I did a poor job of keeping this ChangeLog
430 |    up-to-date for this release, so I am missing attributions here for any
431 |    changes you submitted.  Sorry!  I'll do better in the future.
432 | 
433 | 
434 | Version 1.12, 2008-09-02
435 | ------------------------
436 | 
437 |  - Added support for XMP metadata.
438 | 
439 |  - Fix reading files with xref streams with multiple /Index values.
440 | 
441 |  - Fix extracting content streams that use graphics operators longer than 2
442 |    characters.  Affects merging PDF files.
443 | 
444 | 
445 | Version 1.11, 2008-05-09
446 | ------------------------
447 | 
448 |  - Patch from Hartmut Goebel to permit RectangleObjects to accept NumberObject
449 |    or FloatObject values.
450 | 
451 |  - PDF compatibility fixes.
452 | 
453 |  - Fix to read object xref stream in correct order.
454 | 
455 |  - Fix for comments inside content streams.
456 | 
457 | 
458 | Version 1.10, 2007-10-04
459 | ------------------------
460 | 
461 |  - Text strings from PDF files are returned as Unicode string objects when
462 |  pyPdf determines that they can be decoded (as UTF-16 strings, or as
463 |  PDFDocEncoding strings).  Unicode objects are also written out when
464 |  necessary.  This means that string objects in pyPdf can be either
465 |  generic.ByteStringObject instances, or generic.TextStringObject instances.
466 | 
467 |  - The extractText method now returns a unicode string object.
468 | 
469 |  - All document information properties now return unicode string objects.  In
470 |  the event that a document provides docinfo properties that are not decoded by
471 |  pyPdf, the raw byte strings can be accessed with an "_raw" property (ie.
472 |  title_raw rather than title)
473 | 
474 |  - generic.DictionaryObject instances have been enhanced to be easier to use.
475 |  Values coming out of dictionary objects will automatically be de-referenced
476 |  (.getObject will be called on them), unless accessed by the new "raw_get"
477 |  method.  DictionaryObjects can now only contain PdfObject instances (as keys
478 |  and values), making it easier to debug where non-PdfObject values (which
479 |  cannot be written out) are entering dictionaries.
480 | 
481 |  - Support for reading named destinations and outlines in PDF files.  Original
482 |  patch by Ashish Kulkarni.
483 | 
484 |  - Stream compatibility reading enhancements for malformed PDF files.
485 | 
486 |  - Cross reference table reading enhancements for malformed PDF files.
487 | 
488 |  - Encryption documentation.
489 | 
490 |  - Replace some "assert" statements with error raising.
491 | 
492 |  - Minor optimizations to FlateDecode algorithm increase speed when using PNG
493 |  predictors.
494 | 
495 | Version 1.9, 2006-12-15
496 | -----------------------
497 | 
498 |  - Fix several serious bugs introduced in version 1.8, caused by a failure to
499 |    run through our PDF test suite before releasing that version.
500 | 
501 |  - Fix bug in NullObject reading and writing.
502 | 
503 | Version 1.8, 2006-12-14
504 | -----------------------
505 | 
506 |  - Add support for decryption with the standard PDF security handler.  This
507 |    allows for decrypting PDF files given the proper user or owner password.
508 | 
509 |  - Add support for encryption with the standard PDF security handler.
510 | 
511 |  - Add new pythondoc documentation.
512 | 
513 |  - Fix bug in ASCII85 decode that occurs when whitespace exists inside the
514 |    two terminating characters of the stream.
515 | 
516 | Version 1.7, 2006-12-10
517 | -----------------------
518 | 
519 |  - Fix a bug when using a single page object in two PdfFileWriter objects.
520 | 
521 |  - Adjust PyPDF to be tolerant of whitespace characters that don't belong
522 |    during a stream object.
523 | 
524 |  - Add documentInfo property to PdfFileReader.
525 | 
526 |  - Add numPages property to PdfFileReader.
527 | 
528 |  - Add pages property to PdfFileReader.
529 | 
530 |  - Add extractText function to PdfFileReader.
531 | 
532 | 
533 | Version 1.6, 2006-06-06
534 | -----------------------
535 | 
536 |  - Add basic support for comments in PDF files.  This allows us to read some
537 |    ReportLab PDFs that could not be read before.
538 | 
539 |  - Add "auto-repair" for finding xref table at slightly bad locations.
540 | 
541 |  - New StreamObject backend, cleaner and more powerful.  Allows the use of
542 |    stream filters more easily, including compressed streams.
543 | 
544 |  - Add a graphics state push/pop around page merges.  Improves quality of
545 |    page merges when one page's content stream leaves the graphics 
546 |    in an abnormal state.
547 | 
548 |  - Add PageObject.compressContentStreams function, which filters all content
549 |    streams and compresses them.  This will reduce the size of PDF pages,
550 |    especially after they could have been decompressed in a mergePage
551 |    operation.
552 | 
553 |  - Support inline images in PDF content streams.
554 | 
555 |  - Add support for using .NET framework compression when zlib is not
556 |    available.  This does not make pyPdf compatible with IronPython, but it
557 |    is a first step.
558 | 
559 |  - Add support for reading the document information dictionary, and extracting
560 |    title, author, subject, producer and creator tags.
561 | 
562 |  - Add patch to support NullObject and multiple xref streams, from Bradley
563 |    Lawrence.
564 | 
565 | 
566 | Version 1.5, 2006-01-28
567 | -----------------------
568 | 
569 | - Fix a bug where merging pages did not work in "no-rename" cases when the
570 |   second page has an array of content streams.
571 | 
572 | - Remove some debugging output that should not have been present.
573 | 
574 | 
575 | Version 1.4, 2006-01-27
576 | -----------------------
577 | 
578 | - Add capability to merge pages from multiple PDF files into a single page
579 |   using the PageObject.mergePage function.  See example code (README or web
580 |   site) for more information.
581 | 
582 | - Add ability to modify a page's MediaBox, CropBox, BleedBox, TrimBox, and
583 |   ArtBox properties through PageObject.  See example code (README or web site)
584 |   for more information.
585 | 
586 | - Refactor pdf.py into multiple files: generic.py (contains objects like
587 |   NameObject, DictionaryObject), filters.py (contains filter code),
588 |   utils.py (various).  This does not affect importing PdfFileReader
589 |   or PdfFileWriter.
590 | 
591 | - Add new decoding functions for standard PDF filters ASCIIHexDecode and
592 |   ASCII85Decode.
593 | 
594 | - Change url and download_url to refer to new pybrary.net web site.
595 | 
596 | 
597 | Version 1.3, 2006-01-23
598 | -----------------------
599 | 
600 | - Fix new bug introduced in 1.2 where PDF files with \r line endings did not
601 |   work properly anymore.  A new test suite developed with various PDF files
602 |   should prevent regression bugs from now on.
603 | 
604 | - Fix a bug where inheriting attributes from page nodes did not work.
605 | 
606 | 
607 | Version 1.2, 2006-01-23
608 | -----------------------
609 | 
610 | - Improved support for files with CRLF-based line endings, fixing a common
611 |   reported problem stating "assertion error: assert line == "%%EOF"".
612 | 
613 | - Software author/maintainer is now officially a proud married person, which
614 |   is sure to result in better software... somehow.
615 | 
616 | 
617 | Version 1.1, 2006-01-18
618 | -----------------------
619 | 
620 | - Add capability to rotate pages.
621 | 
622 | - Improved PDF reading support to properly manage inherited attributes from
623 |   /Type=/Pages nodes.  This means that page groups that are rotated or have
624 |   different media boxes or whatever will now work properly.
625 | 
626 | - Added PDF 1.5 support.  Namely cross-reference streams and object streams.
627 |   This release can mangle Adobe's PDFReference16.pdf successfully.
628 | 
629 | 
630 | Version 1.0, 2006-01-17
631 | -----------------------
632 | 
633 | - First distutils-capable true public release.  Supports a wide variety of PDF
634 |   files that I found sitting around on my system.
635 | 
636 | - Does not support some PDF 1.5 features, such as object streams,
637 |   cross-reference streams.
638 | 
639 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2006-2008, Mathieu Fenniak
 2 | Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
 3 | Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>
 4 | 
 5 | All rights reserved.
 6 | 
 7 | Redistribution and use in source and binary forms, with or without
 8 | modification, are permitted provided that the following conditions are
 9 | met:
10 | 
11 | * Redistributions of source code must retain the above copyright notice,
12 | this list of conditions and the following disclaimer.
13 | * Redistributions in binary form must reproduce the above copyright notice,
14 | this list of conditions and the following disclaimer in the documentation
15 | and/or other materials provided with the distribution.
16 | * The name of the author may not be used to endorse or promote products
17 | derived from this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 | POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include CHANGELOG
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PyPDF4
 2 | PyPDF4 is a pure-python PDF library capable of splitting, merging together,
 3 | cropping, and transforming the pages of PDF files. It can also add custom data,
 4 | viewing options, and passwords to PDF files.  It can retrieve text and metadata
 5 | from PDFs as well as merge entire files together.
 6 | 
 7 | What happened to PyPDF2?  Nothing; it's still available at
 8 | https://github.com/mstamy2/PyPDF2.  For various reasons @claird will eventually
 9 | explain, I've simply decided to mark a new "business model" with a
10 | slightly-renamed project name.
11 | While PyPDF4 will continue to be available at no charge, I have strong plans
12 | for better ongoing support to start in August 2018.
13 | 
14 | Homepage (available soon): http://claird.github.io/PyPDF4/.
15 | 
16 | ## Examples
17 | Please see the `samplecode/` folder.
18 | 
19 | ## Documentation
20 | Documentation soon will be available, although probably not at
21 | https://pythonhosted.org/PyPDF4/.
22 | 
23 | ## FAQ
24 | Please see http://claird.github.io/PyPDF4/FAQ.html (available in early August).
25 | 
26 | ## Tests
27 | PyPDF4 includes a modest (but growing!) test suite built on the unittest
28 | framework. All tests are located in the `tests/` folder and are distributed
29 | among dedicated modules. Tox makes running all tests over all versions of Python
30 | quick work:
31 | 
32 | ```
33 | python -m pip install tox
34 | python -m tox
35 | ```
36 | 
37 | Individual tests are accessible as conventional **Pytest** sources:
38 | 
39 | ```
40 | pytest -v tests/test_pdf.py
41 | ```
42 | 
43 | is an example which assumes the `pytest` executable is activated.
44 | 
45 | ## Contributing
46 | For an exhaustive overview of what rules you are expected to maintain, please
47 | visit [Contributing](https://github.com/claird/PyPDF4/wiki/Contributing) in the
48 | project Wiki. A quick outline of these is:
49 | 
50 | * **Provide test cases** for individual units of development of your own.
51 | Proper testing is highly encouraged: *Code without tests is broken by design*
52 | \- Jacob Kaplan-Moss, Django's original development team member.
53 | * Follow the [PEP 8](https://www.python.org/dev/peps/pep-0008/) style conventions, such as:
54 |     * lower_case_with_underscores nomenclature (e.g., `file_name` rather than `fileName`,
55 |       and `write_file()` rather than `writeFile()`).
56 |     * Line lengths of `79` characters or fewer.
57 |     * Correct spacing between global-scoped classes and functions (two newlines
58 |       in between etc.) and within internal code blocks.
59 | * Target your code for Python 3 but maintain retrocompatibility with Python 2
60 | (do we retain Py2?  Still under active consideration).
61 | * Provide [docstring documentation](https://www.python.org/dev/peps/pep-0257/)
62 | for public classes and functions. 
63 | * Use `# TO-DO` comments, or `TO-DO` markings within
64 | [docstrings](https://www.python.org/dev/peps/pep-0257/), to indicate a
65 | feature that is yet to be implemented or discussed. Some IDEs provide a
66 | console that detects and lists such TO-DO markers.
67 | 


--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
 1 | # Override Pylint's default configuration.
 2 | #
 3 | # Reference: <URL: https://pylint.readthedocs.io/en/latest/technical_reference/features.html >.
 4 | #
 5 | # Generate a default config file with comments by commanding
 6 | #     pylint --generate-rcfile
 7 | 
 8 | [MESSAGES CONTROL]
 9 | # Construct the configuration as exceptions from the well-defined
10 | # "all-enabled" starting point.
11 | enable=all
12 | 
13 | disable=
14 |   suppressed-message,
15 |   locally-disabled
16 | 
17 | [BASIC]
18 | # PyPDF4 frequently binds exceptions as "e".  Use of "i" as a loop variable is so pervasive
19 | # that we account for it.
20 | good-names=
21 |   e,
22 |   i
23 | 


--------------------------------------------------------------------------------
/pypdf/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._version import __version__
 2 | from .generic import *
 3 | from .merger import PdfFileMerger
 4 | from .pagerange import PageRange
 5 | from .pdf import PdfFileReader, PdfFileWriter
 6 | 
 7 | __all__ = [
 8 |     # Reader, writer and merger classes, plus the page-range helper
 9 |     "PdfFileReader",
10 |     "PdfFileWriter",
11 |     "PdfFileMerger",
12 |     "PageRange",
13 |     # Most used elements from generic (made available by its star import)
14 |     "BooleanObject",
15 |     "ArrayObject",
16 |     "IndirectObject",
17 |     "FloatObject",
18 |     "NumberObject",
19 |     "createStringObject",
20 |     "TextStringObject",
21 |     "NameObject",
22 |     "DictionaryObject",
23 |     "TreeObject",
24 |     "Destination",
25 |     "PageLabel",
26 |     "Bookmark",
27 |     # Names of the PyPDF submodules
28 |     "pdf",
29 |     "generic",
30 |     "utils",
31 |     "filters",
32 |     "merger",
33 |     "pagerange",
34 |     "xmp",
35 | ]
36 | 


--------------------------------------------------------------------------------
/pypdf/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.27.0"  # re-exported by pypdf/__init__.py
2 | 


--------------------------------------------------------------------------------
/pypdf/pagerange.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Representation and utils for ranges of PDF file pages.
  4 | 
  5 | Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
  6 | All rights reserved. This software is available under a BSD license;
  7 | see https://github.com/claird/PyPDF4/blob/master/LICENSE.md
  8 | """
  9 | 
 10 | import re
 11 | 
 12 | from .utils import isString
 13 | 
 14 | _INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int without leading zeros; "-0" is rejected.
 15 | PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
 16 | # groups:         12     34     5 6     7 8
 17 | 
 18 | 
 19 | class ParseError(Exception):
 20 |     pass
 21 | 
 22 | 
 23 | PAGE_RANGE_HELP = """Remember, page indices start with zero.
 24 |         Page range expression examples:
 25 |             :     all pages.                   -1    last page.
 26 |             22    just the 23rd page.          :-1   all but the last page.
 27 |             0:3   the first three pages.       -2    second-to-last page.
 28 |             :3    the first three pages.       -2:   last two pages.
 29 |             5:    from the sixth page onward.  -3:-1 third & second to last.
 30 |         The third, "stride" or "step" number is also recognized.
 31 |             ::2       0 2 4 ... to the end.    3:0:-1    3 2 1 but not 0.
 32 |             1:10:2    1 3 5 7 9                2::-1     2 1 0.
 33 |             ::-1      all pages in reverse order.
 34 | """  # Interpolated into the docstring of PageRange.__init__.
 35 | 
 36 | 
 37 | class PageRange(object):
 38 |     """
 39 |     A slice-like representation of a range of page indices,
 40 |         i.e. page numbers, only starting at zero.
 41 |     The syntax is like what you would put between brackets [ ].
 42 |     The slice is one of the few Python types that can't be subclassed,
 43 |     but this class converts to and from slices, and allows similar use.
 44 | 
 45 |     *  PageRange(str) parses a string representing a page range.
 46 |     *  PageRange(slice) directly "imports" a slice.
 47 |     *  _to_slice() gives the equivalent slice.
 48 |     *  str() and repr() allow printing.
 49 |     *  indices(n) is like slice.indices(n).
 50 |     """
 51 | 
 52 |     def __init__(self, arg):
 53 |         """
 54 |         Initialize with either a slice -- giving the equivalent page range,
 55 |         or a PageRange object -- making a copy,
 56 |         or a string like
 57 |             "int", "[int]:[int]" or "[int]:[int]:[int]",
 58 |             where the brackets indicate optional ints.
 59 |         {page_range_help}
 60 |         Note the difference between this notation and arguments to slice():
 61 |             slice(3) means the first three pages;
 62 |             PageRange("3") means the range of only the fourth page.
 63 |             However PageRange(slice(3)) means the first three pages.
 64 |         """
 65 |         if isinstance(arg, slice):
 66 |             self._slice = arg
 67 |             return
 68 | 
 69 |         if isinstance(arg, PageRange):
 70 |             self._slice = arg._to_slice()
 71 |             return
 72 | 
 73 |         match = isString(arg) and re.match(PAGE_RANGE_RE, arg)
 74 | 
 75 |         if not match:
 76 |             raise ParseError(arg)
 77 |         if match.group(2):
 78 |             # Special case: just an int means a range of one page.
 79 |             start = int(match.group(2))
 80 |             stop = start + 1 if start != -1 else None
 81 |             self._slice = slice(start, stop)
 82 |         else:
 83 |             self._slice = slice(*[int(g) if g else None for g in match.group(4, 6, 8)])
 84 | 
 85 |     # Just formatting this when there is __doc__ for __init__
 86 |     if __init__.__doc__:
 87 |         __init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP)
 88 | 
 89 |     @staticmethod
 90 |     def valid(this_input):
 91 |         """ True if input is a valid initializer for a PageRange. """
 92 |         return isinstance(this_input, (slice, PageRange)) or (
 93 |             isString(this_input) and bool(re.match(PAGE_RANGE_RE, this_input))
 94 |         )
 95 | 
 96 |     def _to_slice(self):
 97 |         """ Return the slice equivalent of this page range. """
 98 |         return self._slice
 99 | 
100 |     def __str__(self):
101 |         """A string like "1:2:3"."""
102 |         s__ = self._slice
103 |         if s__.step is None:
104 |             if s__.start is not None and s__.stop == s__.start + 1:
105 |                 return str(s__.start)
106 | 
107 |             indices = s__.start, s__.stop
108 |         else:
109 |             indices = s__.start, s__.stop, s__.step
110 |         return ":".join("" if i is None else str(i) for i in indices)
111 | 
112 |     def __repr__(self):
113 |         """A string like "PageRange('1:2:3')"."""
114 |         return "PageRange(" + repr(str(self)) + ")"
115 | 
116 |     def indices(self, this_n):
117 |         """
118 |         ``this_n`` is the length of the list of pages to choose from.
119 |         Returns arguments for ``range()``.  See ``help(slice.indices)``.
120 |         """
121 |         return self._slice.indices(this_n)
122 | 
123 | 
124 | PAGE_RANGE_ALL = PageRange(":")  # Every page; used when a filename has no range after it.
125 | 
126 | 
127 | def parseFilenamePageRanges(args):
128 |     """
129 |     Given a list of filenames and page ranges, return a list of
130 |     (filename, page_range) pairs.
131 |     First arg must be a filename; other args are filenames, page-range
132 |     expressions, slice objects, or PageRange objects.
133 |     A filename not followed by a page range indicates all pages of the file.
134 |     """
135 |     pairs = []
136 |     pdfFilename = None
137 |     didPageRange = False
138 | 
139 |     for arg in args + [None]:
140 |         if PageRange.valid(arg):
141 |             if not pdfFilename:
142 |                 raise ValueError(
143 |                     "The first argument must be a filename, not a page range."
144 |                 )
145 | 
146 |             pairs.append((pdfFilename, PageRange(arg)))
147 |             didPageRange = True
148 |         else:
149 |             # New filename or end of list--do all of the previous file?
150 |             if pdfFilename and not didPageRange:
151 |                 pairs.append((pdfFilename, PAGE_RANGE_ALL))
152 | 
153 |             pdfFilename = arg
154 |             didPageRange = False
155 | 
156 |     return pairs
157 | 


--------------------------------------------------------------------------------
/pypdf/utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2006, Mathieu Fenniak
  2 | # All rights reserved.
  3 | #
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met:
  7 | #
  8 | # * Redistributions of source code must retain the above copyright notice,
  9 | # this list of conditions and the following disclaimer.
 10 | # * Redistributions in binary form must reproduce the above copyright notice,
 11 | # this list of conditions and the following disclaimer in the documentation
 12 | # and/or other materials provided with the distribution.
 13 | # * The name of the author may not be used to endorse or promote products
 14 | # derived from this software without specific prior written permission.
 15 | #
 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 19 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 20 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 21 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 22 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 23 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 24 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 25 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 26 | # POSSIBILITY OF SUCH DAMAGE.
 27 | """
 28 | Utility functions for PDF library.
 29 | """
 30 | from binascii import hexlify
 31 | import sys
 32 | 
 33 | try:
 34 |     import __builtin__ as builtins
 35 | except ImportError:  # Py3
 36 |     import builtins
 37 | 
 38 | __author__ = "Mathieu Fenniak"
 39 | __author_email__ = "biziqe@mathieu.fenniak.net"
 40 | 
 41 | 
 42 | xrange_fn = getattr(builtins, "xrange", range)  # xrange on Py2, range on Py3
 43 | _basestring = getattr(builtins, "basestring", str)  # basestring on Py2, str on Py3
 44 | 
 45 | bytes_type = type(bytes())  # Works the same in Python 2.X and 3.X
 46 | string_type = getattr(builtins, "unicode", str)  # unicode on Py2, str on Py3
 47 | int_types = (int, long) if sys.version_info[0] < 3 else (int,)  # long exists only on Py2
 48 | 
 49 | 
 50 | # Make basic type tests more consistent
 51 | def isString(s):
 52 |     """Return True if ``s`` is an instance of ``basestring`` (Python 2) or ``str`` (Python 3)."""
 53 |     return isinstance(s, _basestring)
 54 | 
 55 | 
 56 | def isInt(n):
 57 |     """Return True if ``n`` is an ``int`` (or a ``long`` on Python 2); a ``bool`` passes too."""
 58 |     return isinstance(n, int_types)
 59 | 
 60 | 
 61 | def isBytes(b):
 62 |     """Return True if ``b`` is a ``bytes`` instance (which is ``str`` on Python 2)."""
 63 |     return isinstance(b, bytes_type)
 64 | 
 65 | 
 66 | # custom implementation of warnings.formatwarning
 67 | def formatWarning(message, category, filename, lineno, line=None):
 68 |     file = filename.replace("/", "\\").rsplit("\\", 1)[-1]  # find the file name
 69 |     return "%s: %s [%s:%s]\n" % (category.__name__, message, file, lineno)
 70 | 
 71 | 
 72 | def readUntilWhitespace(stream, maxchars=None):
 73 |     """
 74 |     Reads non-whitespace characters and returns them.
 75 |     Stops upon encountering whitespace or when maxchars is reached.
 76 |     """
 77 |     txt = pypdfBytes("")
 78 | 
 79 |     while True:
 80 |         tok = stream.read(1)
 81 | 
 82 |         if tok.isspace() or not tok:
 83 |             break
 84 | 
 85 |         txt += tok
 86 |         if len(txt) == maxchars:
 87 |             break
 88 | 
 89 |     return txt
 90 | 
 91 | 
 92 | def readNonWhitespace(stream):
 93 |     """
 94 |     Finds and reads the next non-whitespace character (ignores whitespace).
 95 | 
 96 |     :param stream: a file-like object.
 97 |     """
 98 |     tok = WHITESPACES[0]
 99 | 
100 |     while tok in WHITESPACES:
101 |         tok = stream.read(1)
102 | 
103 |     return tok
104 | 
105 | 
106 | def skipOverWhitespace(stream):
107 |     """
108 |     Similar to ``readNonWhitespace()``, but returns a Boolean if more than
109 |     one whitespace character was read.
110 | 
111 |     :param stream: a file-like object.
112 |     """
113 |     tok = WHITESPACES[0]
114 |     cnt = 0
115 | 
116 |     while tok in WHITESPACES:
117 |         tok = stream.read(1)
118 |         cnt += 1
119 | 
120 |     return cnt > 1
121 | 
122 | 
123 | def skipOverComment(stream):
124 |     tok = stream.read(1)
125 |     stream.seek(-1, 1)
126 | 
127 |     if tok == pypdfBytes("%"):
128 |         while tok not in (pypdfBytes("\n"), pypdfBytes("\r")):
129 |             tok = stream.read(1)
130 | 
131 | 
132 | def readUntilRegex(stream, regex, ignore_eof=False):
133 |     """
134 |     Reads until the regular expression pattern matched (ignore the match)
135 |     Raise PdfStreamError on premature end-of-file.
136 |     :param bool ignore_eof: If true, ignore end-of-line and return immediately
137 |     """
138 |     name = pypdfBytes("")
139 | 
140 |     while True:
141 |         tok = stream.read(16)
142 | 
143 |         if not tok:
144 |             # stream has truncated prematurely
145 |             if ignore_eof:
146 |                 return name
147 |             raise PdfStreamError("Stream has ended unexpectedly")
148 |         m = regex.search(tok)
149 |         if m is not None:
150 |             name += tok[: m.start()]
151 |             stream.seek(m.start() - len(tok), 1)
152 |             break
153 |         name += tok
154 | 
155 |     return name
156 | 
157 | 
158 | class ConvertFunctionsToVirtualList(object):
159 |     def __init__(self, lengthFunction, getFunction):
160 |         self.lengthFunction = lengthFunction
161 |         self.getFunction = getFunction
162 | 
163 |     def __len__(self):
164 |         return self.lengthFunction()
165 | 
166 |     def __getitem__(self, index):
167 |         if isinstance(index, slice):
168 |             indices = xrange_fn(*index.indices(len(self)))
169 |             cls = type(self)
170 |             return cls(indices.__len__, lambda idx: self[indices[idx]])
171 |         if not isInt(index):
172 |             raise TypeError("sequence indices must be integers")
173 | 
174 |         len_self = len(self)
175 | 
176 |         if index < 0:
177 |             # support negative indexes
178 |             index = len_self + index
179 |         if index < 0 or index >= len_self:
180 |             raise IndexError("sequence index out of range")
181 | 
182 |         return self.getFunction(index)
183 | 
184 | 
185 | def RC4Encrypt(key, plaintext):
186 |     S = list(range(256))
187 |     j = 0
188 | 
189 |     for i in range(256):
190 |         j = (j + S[i] + pypdfOrd(key[i % len(key)])) % 256
191 |         S[i], S[j] = S[j], S[i]
192 | 
193 |     i, j = 0, 0
194 |     retval = []
195 | 
196 |     for x in range(len(plaintext)):
197 |         i = (i + 1) % 256
198 |         j = (j + S[i]) % 256
199 |         S[i], S[j] = S[j], S[i]
200 |         t = S[(S[i] + S[j]) % 256]
201 |         retval.append(pypdfBytes(chr(pypdfOrd(plaintext[x]) ^ t)))
202 | 
203 |     return pypdfBytes("").join(retval)
204 | 
205 | 
206 | def matrixMultiply(a, b):
207 |     return [
208 |         [sum([float(i) * float(j) for i, j in zip(row, col)]) for col in zip(*b)]
209 |         for row in a
210 |     ]
211 | 
212 | 
213 | class PyPdfError(Exception):
214 |     pass
215 | 
216 | 
217 | class PdfReadError(PyPdfError):
218 |     pass
219 | 
220 | 
221 | class PageSizeNotDefinedError(PyPdfError):
222 |     pass
223 | 
224 | 
225 | class PdfReadWarning(UserWarning):
226 |     pass
227 | 
228 | 
229 | class PdfStreamError(PdfReadError):
230 |     pass
231 | 
232 | 
233 | def pypdfBytes(s):
234 |     """
235 |     :type s: Union[bytes, str, int, unicode]
236 |     :rtype: bytes
237 |     """
238 |     if sys.version_info[0] < 3:
239 |         if isinstance(s, int):
240 |             return chr(s)
241 |         if isinstance(s, bytes):
242 |             return s
243 |         return s.encode("latin-1")
244 |     if isinstance(s, int):
245 |         return bytes([s])
246 |     if isinstance(s, bytes):
247 |         return s
248 |     return s.encode("latin-1")
249 | 
250 | 
251 | def pypdfUnicode(s):
252 |     """
253 |     :type s: Union[bytes, str, unicode]
254 |     :returns: ``unicode`` for Python 2, ``str`` for Python 3.
255 |     :rtype: Union[str, unicode]
256 |     """
257 |     if sys.version_info[0] < 3:
258 |         if isinstance(s, unicode):
259 |             return s
260 |         return unicode(s, "unicode_escape")
261 |     if isinstance(s, str):
262 |         return s
263 |     return s.decode("unicode_escape")
264 | 
265 | 
266 | def pypdfStr(b):
267 |     """
268 |     :type b: Union[bytes, str, unicode]
269 |     :rtype: str
270 |     """
271 |     if sys.version_info[0] < 3:
272 |         if isinstance(b, unicode):
273 |             return b.encode("latin-1")
274 |         return b
275 |     if isinstance(b, bytes):
276 |         return b.decode("latin-1")
277 |     return b
278 | 
279 | 
280 | def pypdfOrd(b):
281 |     """
282 |     :type b: Union[int, bytes, str, unicode]
283 |     :rtype: int
284 |     """
285 |     if isinstance(b, int):
286 |         return b
287 |     return ord(b)
288 | 
289 | 
290 | def pypdfChr(c):
291 |     """
292 |     :type c: Union[int, bytes, str, unicode]
293 |     :rtype: str
294 |     """
295 |     if isinstance(c, int):
296 |         return chr(c)
297 |     return chr(ord(c))
298 | 
299 | 
300 | def pypdfBytearray(b):
301 |     """
302 |     Abstracts the conversion from a ``bytes`` variable to a ``bytearray`` value
303 |     over versions 2.7.x and 3 of Python.
304 |     """
305 |     if sys.version_info[0] < 3:
306 |         return b
307 |     return bytearray(b)
308 | 
309 | 
310 | def hexEncode(s):
311 |     """
312 |     Abstracts the conversion from a LATIN 1 string to an hex-valued string
313 |     representation of the former over versions 2.7.x and 3 of Python.
314 | 
315 |     :param str s: a ``str`` to convert from LATIN 1 to an hexadecimal string
316 |         representation.
317 |     :return: a hex-valued string, e.g. ``hexEncode("$A'") == "244127"``.
318 |     :rtype: str
319 |     """
320 |     if sys.version_info < (3, 0):
321 |         return s.encode("hex")
322 |     if isinstance(s, str):
323 |         s = s.encode("LATIN1")
324 | 
325 |     # The output is in the set of "0123456789ABCDEF" characters. Using the
326 |     # ASCII decoder is a safeguard against anomalies, albeit unlikely
327 |     return hexlify(s).decode("ASCII")
328 | 
329 | 
330 | def hexStr(num):
331 |     return hex(num).replace("L", "")
332 | 
333 | 
334 | WHITESPACES = [pypdfBytes(x) for x in [" ", "\n", "\r", "\t", "\x00"]]  # NOTE(review): PDF also lists form feed (\x0c) as whitespace -- confirm the omission is intended
335 | 
336 | 
337 | def paethPredictor(left, up, up_left):
338 |     p = left + up - up_left
339 |     dist_left = abs(p - left)
340 |     dist_up = abs(p - up)
341 |     dist_up_left = abs(p - up_left)
342 | 
343 |     if dist_left <= dist_up and dist_left <= dist_up_left:
344 |         return left
345 |     if dist_up <= dist_up_left:
346 |         return up
347 |     return up_left
348 | 
349 | 
350 | def pairs(sequence):
351 |     """
352 |     :param sequence: an indexable sequence value with ``__len__()``.
353 |     :return: an iterable of paired values from ``sequence``.
354 |     """
355 |     if (len(sequence) % 2) != 0:
356 |         raise ValueError("sequence must contain an even number of elements")
357 | 
358 |     for i in range(0, len(sequence) - 1, 2):
359 |         yield (sequence[i], sequence[i + 1])
360 | 


--------------------------------------------------------------------------------
/pypdf/xmp.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import decimal
  3 | import re
  4 | from xml.dom.minidom import parseString
  5 | 
  6 | from .generic import PdfObject
  7 | from .utils import pypdfUnicode
  8 | 
  9 | RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"  # RDF syntax
 10 | DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"  # Dublin Core elements
 11 | XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"  # XMP basic schema
 12 | PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"  # Adobe PDF schema
 13 | XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"  # XMP Media Management schema
 14 | 
 15 | # What is the PDFX namespace, you might ask?  I might ask that too.  It's
 16 | # a completely undocumented namespace used to place "custom metadata"
 17 | # properties, which are arbitrary metadata properties with no semantic or
 18 | # documented meaning.  Elements in the namespace are key/value-style storage,
 19 | # where the element name is the key and the content is the value.  The keys
 20 | # are transformed into valid XML identifiers by substituting an invalid
 21 | # identifier character with \u2182 followed by the unicode hex ID of the
 22 | # original character.  A key like "my car" is therefore "my\u21820020car".
 23 | #
 24 | # \u2182, in case you're wondering, is the unicode character
 25 | # \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
 26 | # escaping characters.
 27 | #
 28 | # Intentional users of the pdfx namespace should be shot on sight.  A
 29 | # custom data schema and sensical XML elements could be used instead, as is
 30 | # suggested by Adobe's own documentation on XMP (under "Extensibility of
 31 | # Schemas").
 32 | #
 33 | # Information presented here on the /pdfx/ schema is a result of limited
 34 | # reverse engineering, and does not constitute a full specification.
 35 | PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
 36 | 
 37 | iso8601 = re.compile(
 38 |     """
 39 |         (?P<year>[0-9]{4})
 40 |         (-
 41 |             (?P<month>[0-9]{2})
 42 |             (-
 43 |                 (?P<day>[0-9]+)
 44 |                 (T
 45 |                     (?P<hour>[0-9]{2}):
 46 |                     (?P<minute>[0-9]{2})
 47 |                     (:(?P<second>[0-9]{2}(.[0-9]+)?))?
 48 |                     (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
 49 |                 )?
 50 |             )?
 51 |         )?
 52 |         """,
 53 |     re.VERBOSE,
 54 | )
 55 | 
 56 | 
 57 | class XmpInformation(PdfObject):
 58 |     """
 59 |     An object that represents Adobe XMP metadata. Usually accessed by
 60 |     :meth:`xmpMetadata()<pypdf.PdfFileReader.xmpMetadata>`
 61 |     """
 62 | 
 63 |     def __init__(self, stream):
 64 |         self.stream = stream
 65 |         docRoot = parseString(self.stream.getData())
 66 |         self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
 67 |         self.cache = {}
 68 | 
 69 |     def writeToStream(self, stream, encryption_key):
 70 |         self.stream.writeToStream(stream, encryption_key)  # delegate to the wrapped stream object
 71 | 
 72 |     def getElement(self, aboutUri, namespace, name):
 73 |         for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
 74 |             if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
 75 |                 attr = desc.getAttributeNodeNS(namespace, name)
 76 | 
 77 |                 if attr is not None:
 78 |                     yield attr
 79 |                 for element in desc.getElementsByTagNameNS(namespace, name):
 80 |                     yield element
 81 | 
 82 |     def getNodesInNamespace(self, aboutUri, namespace):
 83 |         for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
 84 |             if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
 85 |                 for i in range(desc.attributes.length):
 86 |                     attr = desc.attributes.item(i)
 87 | 
 88 |                     if attr.namespaceURI == namespace:
 89 |                         yield attr
 90 |                 for child in desc.childNodes:
 91 |                     if child.namespaceURI == namespace:
 92 |                         yield child
 93 | 
 94 |     def _getText(self, element):
 95 |         text = ""
 96 | 
 97 |         for child in element.childNodes:
 98 |             if child.nodeType == child.TEXT_NODE:
 99 |                 text += child.data
100 | 
101 |         return text
102 | 
103 |     def _converterString(value):
104 |         return value  # identity converter: the text is used as-is
105 | 
106 |     def _converterDate(value):
107 |         m = iso8601.match(value)
108 |         year = int(m.group("year"))
109 |         month = int(m.group("month") or "1")
110 |         day = int(m.group("day") or "1")
111 |         hour = int(m.group("hour") or "0")
112 |         minute = int(m.group("minute") or "0")
113 |         second = decimal.Decimal(m.group("second") or "0")
114 |         seconds = second.to_integral(decimal.ROUND_FLOOR)
115 |         milliseconds = (second - seconds) * 1000000
116 |         tzd = m.group("tzd") or "Z"
117 |         dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds)
118 | 
119 |         if tzd != "Z":
120 |             tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")]
121 |             tzd_hours *= -1
122 |             if tzd_hours < 0:
123 |                 tzd_minutes *= -1
124 |             dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
125 | 
126 |         return dt
127 | 
128 |     _test_converter_date = staticmethod(_converterDate)
129 | 
130 |     def _getterBag(namespace, name, converter):
131 |         def get(self):
132 |             cached = self.cache.get(namespace, {}).get(name)
133 |             retval = []
134 | 
135 |             if cached:
136 |                 return cached
137 | 
138 |             for element in self.getElement("", namespace, name):
139 |                 bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
140 | 
141 |                 if len(bags):
142 |                     for bag in bags:
143 |                         for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
144 |                             value = self._getText(item)
145 |                             value = converter(value)
146 |                             retval.append(value)
147 | 
148 |             ns_cache = self.cache.setdefault(namespace, {})
149 |             ns_cache[name] = retval
150 | 
151 |             return retval
152 | 
153 |         return get
154 | 
155 |     def _getterSeq(namespace, name, converter):
156 |         def get(self):
157 |             cached = self.cache.get(namespace, {}).get(name)
158 |             retval = []
159 | 
160 |             if cached:
161 |                 return cached
162 | 
163 |             for element in self.getElement("", namespace, name):
164 |                 seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
165 | 
166 |                 if len(seqs):
167 |                     for seq in seqs:
168 |                         for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
169 |                             value = self._getText(item)
170 |                             value = converter(value)
171 |                             retval.append(value)
172 |                 else:
173 |                     value = converter(self._getText(element))
174 |                     retval.append(value)
175 | 
176 |             ns_cache = self.cache.setdefault(namespace, {})
177 |             ns_cache[name] = retval
178 | 
179 |             return retval
180 | 
181 |         return get
182 | 
183 |     def _getterLangalt(namespace, name, converter):
184 |         def get(self):
185 |             cached = self.cache.get(namespace, {}).get(name)
186 |             retval = {}
187 | 
188 |             if cached:
189 |                 return cached
190 |             for element in self.getElement("", namespace, name):
191 |                 alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
192 |                 if len(alts):
193 |                     for alt in alts:
194 |                         for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
195 |                             value = self._getText(item)
196 |                             value = converter(value)
197 |                             retval[item.getAttribute("xml:lang")] = value
198 |                 else:
199 |                     retval["x-default"] = converter(self._getText(element))
200 | 
201 |             ns_cache = self.cache.setdefault(namespace, {})
202 |             ns_cache[name] = retval
203 | 
204 |             return retval
205 | 
206 |         return get
207 | 
208 |     def _getterSingle(namespace, name, converter):
209 |         def get(self):
210 |             cached = self.cache.get(namespace, {}).get(name)
211 | 
212 |             if cached:
213 |                 return cached
214 | 
215 |             value = None
216 | 
217 |             for element in self.getElement("", namespace, name):
218 |                 if element.nodeType == element.ATTRIBUTE_NODE:
219 |                     value = element.nodeValue
220 |                 else:
221 |                     value = self._getText(element)
222 |                 break
223 | 
224 |             if value is not None:
225 |                 value = converter(value)
226 | 
227 |             ns_cache = self.cache.setdefault(namespace, {})
228 |             ns_cache[name] = value
229 | 
230 |             return value
231 | 
232 |         return get
233 | 
234 |     dc_contributor = property(_getterBag(DC_NAMESPACE, "contributor", _converterString))  # list from rdf:Bag
235 |     """
236 |     Contributors to the resource (other than the authors). An unsorted array of
237 |     names.
238 |     """
239 | 
240 |     dc_coverage = property(_getterSingle(DC_NAMESPACE, "coverage", _converterString))  # single value or None
241 |     """
242 |     Text describing the extent or scope of the resource.
243 |     """
244 | 
245 |     dc_creator = property(_getterSeq(DC_NAMESPACE, "creator", _converterString))  # list from rdf:Seq
246 |     """
247 |     A sorted array of names of the authors of the resource, listed in order of
248 |     precedence.
249 |     """
250 | 
251 |     dc_date = property(_getterSeq(DC_NAMESPACE, "date", _converterDate))  # list of datetimes from rdf:Seq
252 |     """
253 |     A sorted array of dates (``datetime.datetime`` instances) of significance
254 |     to the resource.  The dates and times are in UTC.
255 |     """
256 | 
257 |     dc_description = property(
258 |         _getterLangalt(DC_NAMESPACE, "description", _converterString)
259 |     )  # dict keyed by xml:lang
260 |     """
261 |     A language-keyed dictionary of textual descriptions of the content of the
262 |     resource.
263 |     """
264 | 
265 |     dc_format = property(_getterSingle(DC_NAMESPACE, "format", _converterString))  # single value or None
266 |     """
267 |     The mime-type of the resource.
268 |     """
269 | 
270 |     dc_identifier = property(
271 |         _getterSingle(DC_NAMESPACE, "identifier", _converterString)
272 |     )  # single value or None
273 |     """
274 |     Unique identifier of the resource.
275 |     """
276 | 
277 |     dc_language = property(_getterBag(DC_NAMESPACE, "language", _converterString))  # list from rdf:Bag
278 |     """
279 |     An unordered array specifying the languages used in the resource.
280 |     """
281 | 
282 |     dc_publisher = property(_getterBag(DC_NAMESPACE, "publisher", _converterString))  # list from rdf:Bag
283 |     """
284 |     An unordered array of publisher names.
285 |     """
286 | 
287 |     dc_relation = property(_getterBag(DC_NAMESPACE, "relation", _converterString))  # list from rdf:Bag
288 |     """
289 |     An unordered array of text descriptions of relationships to other
290 |     documents.
291 |     """
292 | 
293 |     dc_rights = property(_getterLangalt(DC_NAMESPACE, "rights", _converterString))  # dict keyed by xml:lang
294 |     """
295 |     A language-keyed dictionary of textual descriptions of the rights the user
296 |     has to this resource.
297 |     """
298 | 
299 |     dc_source = property(_getterSingle(DC_NAMESPACE, "source", _converterString))  # single value or None
300 |     """
301 |     Unique identifier of the work from which this resource was derived.
302 |     """
303 | 
304 |     dc_subject = property(_getterBag(DC_NAMESPACE, "subject", _converterString))
305 |     """
306 |     An unordered array of descriptive phrases or keywords that specify the
307 |     topic of the content of the resource.
308 |     """
309 | 
310 |     dc_title = property(_getterLangalt(DC_NAMESPACE, "title", _converterString))
311 |     """
312 |     A language-keyed dictionary of the title of the resource.
313 |     """
314 | 
315 |     dc_type = property(_getterBag(DC_NAMESPACE, "type", _converterString))
316 |     """
317 |     An unordered array of textual descriptions of the document type.
318 |     """
319 | 
320 |     pdf_keywords = property(_getterSingle(PDF_NAMESPACE, "Keywords", _converterString))
321 |     """
322 |     An unformatted text string representing document keywords.
323 |     """
324 | 
325 |     pdf_pdfversion = property(
326 |         _getterSingle(PDF_NAMESPACE, "PDFVersion", _converterString)
327 |     )
328 |     """
329 |     The PDF file version, for example ``1.0``, ``1.3``.
330 |     """
331 | 
332 |     pdf_producer = property(_getterSingle(PDF_NAMESPACE, "Producer", _converterString))
333 |     """
334 |     The name of the tool that created the PDF document.
335 |     """
336 | 
337 |     xmp_createDate = property(
338 |         _getterSingle(XMP_NAMESPACE, "CreateDate", _converterDate)
339 |     )
340 |     """
341 |     The date and time the resource was originally created.  The date and time
342 |     are returned as a UTC ``datetime.datetime`` object.
343 |     """
344 | 
345 |     xmp_modifyDate = property(
346 |         _getterSingle(XMP_NAMESPACE, "ModifyDate", _converterDate)
347 |     )
348 |     """
349 |     The date and time the resource was last modified.  The date and time are
350 |     returned as a UTC ``datetime.datetime`` object.
351 |     """
352 | 
353 |     xmp_metadataDate = property(
354 |         _getterSingle(XMP_NAMESPACE, "MetadataDate", _converterDate)
355 |     )
356 |     """
357 |     The date and time that any metadata for this resource was last changed. The
358 |     date and time are returned as a UTC ``datetime.datetime`` object.
359 |     """
360 | 
361 |     xmp_creatorTool = property(
362 |         _getterSingle(XMP_NAMESPACE, "CreatorTool", _converterString)
363 |     )
364 |     """
365 |     The name of the first known tool used to create the resource.
366 |     """
367 | 
368 |     xmpmm_documentId = property(
369 |         _getterSingle(XMPMM_NAMESPACE, "DocumentID", _converterString)
370 |     )
371 |     """
372 |     The common identifier for all versions and renditions of this resource.
373 |     """
374 | 
375 |     xmpmm_instanceId = property(
376 |         _getterSingle(XMPMM_NAMESPACE, "InstanceID", _converterString)
377 |     )
378 |     """
379 |     An identifier for a specific incarnation of a document, updated each time a
380 |     file is saved.
381 |     """
382 | 
383 |     @property
384 |     def custom_properties(self):
385 |         """
386 |         Retrieves custom metadata properties defined in the undocumented pdfx
387 |         metadata schema. The result is computed on first access and cached.
388 | 
389 |         :return: a dictionary of key/value items for custom metadata
390 |             properties.
391 |         :rtype: dict
392 |         """
393 |         if not hasattr(self, "_custom_properties"):
394 |             self._custom_properties = {}
395 | 
396 |             for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
397 |                 key = node.localName
398 | 
399 |                 while True:
400 |                     # Decode each U+2182 + 4 hex digits escape (see PDFX_NAMESPACE notes earlier in file)
401 |                     idx = key.find(pypdfUnicode("\u2182"))
402 | 
403 |                     if idx == -1:
404 |                         break
405 | 
406 |                     key = (
407 |                         key[:idx]
408 |                         + chr(int(key[idx + 1 : idx + 5], base=16))
409 |                         + key[idx + 5 :]
410 |                     )
411 |                 if node.nodeType == node.ATTRIBUTE_NODE:
412 |                     value = node.nodeValue
413 |                 else:
414 |                     value = self._getText(node)
415 |                 self._custom_properties[key] = value
416 | 
417 |         return self._custom_properties
418 | 


--------------------------------------------------------------------------------
/samplecode/MergingComments.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | """
 3 |    test/demo program that copy alll comments from multiples pdf into one command line:
 4 |    PDFCommentsMerge [-d] [-o output.pdf] [input1.pdf] ... [inputN.pdf]
 5 |    -d: open Excel output at the end of extraction
 6 |    -o: prode the output Excel name/path ; if not present the file is created
 7 |        in temp folder named "FullCommented **input1**.pdf"
 8 |     if no parameters (mainly for idle test), the pdf filenames re asked for
 9 |     empty to finish
10 | """
11 | import os
12 | import sys
13 | 
14 | import pypdf as PDF
15 | 
16 | if sys.argv[0].upper().find("PYTHON.EXE") >= 0:
17 |     del sys.argv[0]
18 | del sys.argv[0]  # to ignore called program
19 | 
20 | displayOutput = ("-d" in sys.argv) or ("idlelib.run" in sys.modules)
21 | try:
22 |     del sys.argv[sys.argv.index("-d")]
23 | except:
24 |     pass
25 | 
26 | 
27 | if (len(sys.argv) == 0) or (("-o" in sys.argv) and (len(sys.argv) <= 2)):
28 |     print(globals()["__doc__"])
29 |     while True:
30 |         t = input("pdf file to scan:")
31 |         if t == "":
32 |             break
33 |         sys.argv.append(t)
34 | 
35 | if "-o" in sys.argv:
36 |     i = sys.argv.index("-o")
37 |     outFile = sys.argv[i + 1]
38 |     del sys.argv[i]
39 |     del sys.argv[i]
40 | else:
41 |     tempFolder = os.environ["TEMP"].replace("\\", "/")
42 |     if tempFolder[-1] != "/":
43 |         tempFolder += "/"
44 |     outFile = (
45 |         tempFolder
46 |         + "FullCommented "
47 |         + os.path.splitext(os.path.split(sys.argv[0])[-1])[0]
48 |         + ".pdf"
49 |     )
50 | 
51 | pdfO = PDF.PdfFileWriter(None, PDF.PdfFileReader(sys.argv[0]))
52 | del sys.argv[0]
53 | 
54 | pdfS = []
55 | for f in sys.argv:
56 |     pdfS.append(PDF.PdfFileReader(f))
57 |     # check if decryption is required ; normally not required
58 |     if pdfS[-1].isEncrypted:
59 |         pdfS[-1].decrypt("")
60 | 
61 | # we assume that all the documents are commenting the same original document
62 | for i in range(pdfO.numPages):
63 |     po = pdfO.getPage(i)
64 |     for pdfin in pdfS:
65 |         pdfO.addCommentsFromPage(i, pdfin.getPage(i))
66 | 
67 | pdfO.write(outFile)
68 | if displayOutput:
69 |     os.startfile(outFile)
70 | 


--------------------------------------------------------------------------------
/samplecode/PDFComments2XL.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | """
  3 |    test/demo program that extracts comments from a pdf into an Excel file
  4 |    command line:
  5 |    PDFComments2XL [-d] [-o output.xls] [input.pdf]
  6 |    -d: open Excel output at the end of extraction
  7 |    -o: provide the output Excel name/path ; if not present the file is created
  8 |        in temp folder named "comments on **PDFfile**.xlsx"
  9 |     if no parameters (mainly for idle test), the pdf filename is asked for
 10 | """
 11 | from collections import OrderedDict
 12 | from datetime import datetime
 13 | import locale
 14 | import os
 15 | import sys
 16 | 
 17 | from openpyxl import Workbook
 18 | from openpyxl.utils import get_column_letter
 19 | 
 20 | import pypdf as PDF
 21 | 
 22 | locale.setlocale(locale.LC_ALL, locale.getdefaultlocale()[0])
 23 | 
 24 | 
 25 | def ListOutlines(pdfS, outl=None):
 26 |     """
 27 |     provide as a list of the outlines as tuple Title,Page(0 based),Vertical position in %
 28 |     """
 29 |     if outl is None:
 30 |         lst = [
 31 |             ("-", 0, 0),
 32 |         ]
 33 |         outl = pdfS.getOutlines()
 34 |     else:
 35 |         lst = []
 36 |     if isinstance(outl, list):
 37 |         for k in outl:
 38 |             lst += ListOutlines(pdfS, k)
 39 |     else:
 40 |         try:
 41 |             top = outl["/Top"]
 42 |         except:
 43 |             top = 0
 44 |         try:
 45 |             pp = pdfS.MyPages[outl.page.idnum]
 46 |             lst.append((outl.title, pp[0], 100.0 * (1.0 - float(top / pp[1]))))
 47 |         except:
 48 |             print("trouble with page idnum", outl.page.idnum)
 49 |     return lst
 50 | 
 51 | 
 52 | def ListAnnots(pdfS):
 53 |     """
 54 |     provide as a list of the comments with the response saved in .irt_str field, the list is indexed with idnums
 55 |     """
 56 |     lst = OrderedDict()
 57 |     for pn in range(pdfS.numPages):
 58 |         p = pdfS.getPage(pn)
 59 |         try:
 60 |             a = p.get("/Annots").getObject()
 61 |             if not isinstance(a, list):
 62 |                 a = [a]
 63 |             for b in a:
 64 |                 o = b.getObject()
 65 |                 if o["/Subtype"] == "/Text":
 66 |                     try:
 67 |                         o["/P"]  # le champs '/P' etant optionnel on le reconstruit...
 68 |                     except:
 69 |                         o.update({PDF.NameObject("/P"): p.indirectRef})
 70 |                     o.irt = {}
 71 |                     lst[b.idnum] = o
 72 |         except:
 73 |             pass
 74 |     # copy the information into the original comment
 75 |     for k, o in lst.items():
 76 |         if "/IRT" in o:
 77 |             t = o["/Contents"]
 78 |             if isinstance(t, bytes):
 79 |                 t = t.replace(b"\r", b"\n").decode("unicode_escape")
 80 |             lst[o.rawGet("/IRT").idnum].irt[o["/M"]] = "%s (%s):\n%s" % (
 81 |                 o["/T"],
 82 |                 datetime.strptime(o["/M"][2:10], "%Y%m%d").strftime("%x"),
 83 |                 t,
 84 |             )
 85 |     # concat all replied comments into one string to ease insertion later...
 86 |     for o in lst.values():
 87 |         o.irt_str = "\n".join([o.irt[x] for x in sorted(o.irt.keys())])
 88 |     return lst
 89 | 
 90 | 
 91 | def FindOutline(Outlines, pa, pe):
 92 |     """
 93 |     provide the last entry of Outlines whose (page, position) is not after (pa, pe), the position of the comment ; None when no entry matches
 94 |     """
 95 |     m = None
 96 |     for o in Outlines:
 97 |         if (o[1] < pa) or ((o[1] == pa) and (o[2] <= pe)):
 98 |             m = o
 99 |     return m
100 | 
101 | 
102 | if sys.argv[0].upper().find("PYTHON.EXE") >= 0:
103 |     del sys.argv[0]
104 | 
105 | if len(sys.argv) == 1:
106 |     print(globals()["__doc__"])
107 |     sys.argv.append(input("pdf file to scan:"))
108 | 
109 | pdfS = PDF.PdfFileReader(sys.argv[-1])
110 | 
111 | if "-o" in sys.argv:
112 |     xlFile = sys.argv[sys.argv.index("-o") + 1]
113 | else:
114 |     tempFolder = os.environ["TEMP"].replace("\\", "/")
115 |     if tempFolder[-1] != "/":
116 |         tempFolder += "/"
117 |     xlFile = (
118 |         tempFolder
119 |         + "Comments on "
120 |         + os.path.splitext(os.path.split(pdfS.filepath)[-1])[0]
121 |         + ".xlsx"
122 |     )
123 | 
124 | # prepare the destination workbook
125 | wb = Workbook()
126 | ws = wb.active
127 | ws.append(("Page", "Pos", "Chapt", "Originator", "Comment", "Answer"))
128 | ws.column_dimensions[get_column_letter(0 + 1)].width = 5
129 | ws.column_dimensions[get_column_letter(1 + 1)].width = 5
130 | ws.column_dimensions[get_column_letter(2 + 1)].width = 25
131 | ws.column_dimensions[get_column_letter(3 + 1)].width = 15
132 | ws.column_dimensions[get_column_letter(4 + 1)].width = 90
133 | ws.column_dimensions[get_column_letter(5 + 1)].width = 90
134 | 
135 | # check if decryption is required
136 | if pdfS.isEncrypted:
137 |     pdfS.decrypt("")
138 | 
139 | # MyPages will store the matching table page.idnum => pagenumer,page_height
140 | pdfS.MyPages = {}
141 | 
142 | for i, p in enumerate(pdfS.pages):
143 |     pdfS.MyPages[p.indirectRef.idnum] = [i, p["/MediaBox"][3]]
144 | 
145 | # extract the list of OutLines into MyOutlines
146 | pdfS.MyOutlines = ListOutlines(pdfS)
147 | 
148 | # extract the comments into MyAnnots
149 | pdfS.MyAnnots = ListAnnots(pdfS)
150 | 
151 | 
152 | # sort the comments in the order (Page, vertical position, date)
153 | lst = {}
154 | for p in pdfS.MyAnnots.values():
155 |     pp = pdfS.MyPages[p.rawGet("/P").idnum]
156 |     pc = 100.0 * (1.0 - float(int(p["/Rect"][1]) / pp[1]))
157 |     lst[(pp[0], pc, p["/M"])] = p
158 | 
159 | # fill the xl sheet with the comments
160 | for x in sorted(lst.keys()):
161 |     p = lst[x]
162 |     if "/IRT" in p:
163 |         continue  # the comments with IRT are already present in the original comment irt field, we can ignore this one
164 | 
165 |     # print(x[0],',',end='')
166 |     # print('%.0f %%'%pc,',',end='')
167 |     # print(FindOutline(pdfS.MyOutlines,x[0],x[1])[0],',',end='')
168 |     auth = p["/T"]
169 |     if isinstance(auth, bytes):
170 |         auth = auth.decode("unicode_escape")
171 |     cont = p["/Contents"]
172 |     if isinstance(cont, bytes):
173 |         cont = cont.replace(b"\r", b"\n").decode("unicode_escape")
174 |     # print(cont,',',end='')
175 |     if isinstance(p.irt_str, bytes):
176 |         p.irt_str = p.irt_str.replace(b"\r", b"\n").decode("unicode_escape")
177 |     # print(p.irt_str)
178 | 
179 |     ws.append(
180 |         (
181 |             pdfS.getPageLabel(x[0]),
182 |             "%.0f %%" % pc,
183 |             FindOutline(pdfS.MyOutlines, x[0], x[1])[0],
184 |             auth,
185 |             cont,
186 |             p.irt_str,
187 |         )
188 |     )
189 | 
190 | # post insertion formating
191 | for row in ws.iter_rows():
192 |     for cell in row:
193 |         cell.alignment = cell.alignment.copy(wrapText=True, vertical="top")
194 | 
195 | # save and open the file
196 | wb.save(xlFile)
197 | if ("-d" in sys.argv) or ("idlelib.run" in sys.modules):
198 |     os.startfile(xlFile)
199 | 


--------------------------------------------------------------------------------
/samplecode/README.md:
--------------------------------------------------------------------------------
 1 | # PyPDF4 Sample Code Folder
 2 | This will contain demonstrations of the many features PyPDF4 is capable of.
 3 | Example code should make it easy for users to know how to use all aspects of
 4 | PyPDF4.
 5 | 
 6 | ## How to run
 7 | Invoke the Python interpreter you prefer by specifying the script you wish to
 8 | run, e.g.:
 9 | ```
10 | python2 ./samplecode/basic_features.py
11 | python3 ./samplecode/basic_features.py
12 | ``` 
13 | 
14 | ## Contributing to `samplecode`
15 | Feel free to add any type of PDF file or sample code, either by:
16 | 
17 | 	1. Sending it via email to PyPDF4@phaseit.net
18 | 	2. Including it in a pull request on GitHub
19 | 


--------------------------------------------------------------------------------
/samplecode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/__init__.py


--------------------------------------------------------------------------------
/samplecode/basic_features.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Showcases basic features of PyPDF.
  4 | """
  5 | from __future__ import print_function
  6 | 
  7 | from os import pardir
  8 | from os.path import abspath, basename, dirname, join
  9 | from sys import argv, path, stderr
 10 | 
 11 | from pypdf.pdf import PdfFileReader, PdfFileWriter
 12 | 
 13 | SAMPLE_CODE_ROOT = dirname(__file__)
 14 | SAMPLE_PDF_ROOT = join(SAMPLE_CODE_ROOT, "pdfsamples")
 15 | 
 16 | path.append(abspath(join(SAMPLE_CODE_ROOT, pardir)))
 17 | 
 18 | 
 19 | FLAG_HELP = {"-h", "--help"}
 20 | USAGE = """\
 21 | Showcases basic features of PyPDF.
 22 | 
 23 | %(progname)s: <input file> [output file]
 24 | %(progname)s: [-h | --help]
 25 | """ % {
 26 |     "progname": argv[0]
 27 | }
 28 | 
 29 | 
 30 | def main():
 31 |     pagesRequired = 5
 32 |     output = "PyPDF-Features-Output.pdf"
 33 | 
 34 |     if set(argv) & FLAG_HELP:
 35 |         print(USAGE)
 36 |         exit(0)
 37 |     elif len(argv) < 2:
 38 |         print(USAGE)
 39 |         exit(1)
 40 |     else:
 41 |         inputpath = argv[1].strip()
 42 |         filename = basename(inputpath)
 43 | 
 44 |         if len(argv) > 2:
 45 |             output = argv[2].strip()
 46 | 
 47 |     # We can instantiate a PdfFileReader/Writer by giving in a stream object
 48 |     # or a path string
 49 |     reader = PdfFileReader(open(inputpath, "rb"))
 50 |     writer = PdfFileWriter(output)
 51 | 
 52 |     # Check that the PDF file has the required number of pages
 53 |     if reader.numPages < pagesRequired:
 54 |         print(
 55 |             "We require a document with %d pages at least, %s has %d"
 56 |             % (pagesRequired, filename, reader.numPages),
 57 |             file=stderr,
 58 |         )
 59 |         exit(1)
 60 |     else:
 61 |         print("'%s' has %d pages... OK" % (filename, reader.numPages))
 62 | 
 63 |     # Add page 1 from reader to output document, unchanged
 64 |     writer.addPage(reader.getPage(0))
 65 | 
 66 |     # Add page 2 from reader, but rotated clockwise 90 degrees
 67 |     writer.addPage(reader.getPage(1).rotateClockwise(90))
 68 | 
 69 |     # Add page 3 from reader, rotated the other way:
 70 |     writer.addPage(reader.getPage(2).rotateCounterClockwise(90))
 71 |     # Alt.: writer.addPage(reader.getPage(2).rotateClockwise(270))
 72 | 
 73 |     # Add page 4 from reader, but first add a watermark from another PDF:
 74 |     page4 = reader.getPage(3)
 75 |     watermark = PdfFileReader(open(join(SAMPLE_PDF_ROOT, "AutoCad_Diagram.pdf"), "rb"))
 76 |     page4.mergePage(watermark.getPage(0))
 77 |     writer.addPage(page4)
 78 | 
 79 |     # Add page 5 from reader, but crop it to half size:
 80 |     page5 = reader.getPage(4)
 81 |     page5.mediaBox.upperRight = (
 82 |         page5.mediaBox.getUpperRight_x() / 2,
 83 |         page5.mediaBox.getUpperRight_y() / 2,
 84 |     )
 85 |     writer.addPage(page5)
 86 | 
 87 |     # Add some Javascript to launch the print window on opening this PDF.
 88 |     # The password dialog may prevent the print dialog from being shown.
 89 |     # Comment the encrypted lines, if that's the case, to try this out
 90 |     writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
 91 | 
 92 |     # Encrypt your new PDF and add a password
 93 |     password = "secret"
 94 |     writer.encrypt(password)
 95 | 
 96 |     # Finally, write the resulting PDF document to ``output``
 97 |     writer.write()
 98 | 
 99 |     print("Output successfully written to", output)
100 | 
101 |     reader.close()
102 |     writer.close()
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     main()
107 | 


--------------------------------------------------------------------------------
/samplecode/basic_merging.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | Merges three PDF documents input from the command line.
 4 | """
 5 | from __future__ import print_function
 6 | 
 7 | from os import pardir
 8 | from os.path import abspath, dirname, join
 9 | from sys import argv, path
10 | 
11 | from pypdf import PdfFileMerger, PdfFileReader
12 | 
13 | SAMPLE_CODE_ROOT = dirname(__file__)
14 | SAMPLE_PDF_ROOT = join(SAMPLE_CODE_ROOT, "pdfsamples")
15 | 
16 | path.append(abspath(join(SAMPLE_CODE_ROOT, pardir)))
17 | 
18 | 
19 | FLAG_HELP = {"-h", "--help"}
20 | USAGE = """\
21 | Merges three PDF documents input from the command line.
22 | 
23 | %(progname)s: <PDF 1> <PDF 2> <PDF 3> [output filename]
24 | %(progname)s: [-h | --help]
25 | """ % {
26 |     "progname": argv[0]
27 | }
28 | 
29 | 
30 | def main():
31 |     requiredPages = 3
32 |     output = "PyPDF-Merging-Output.pdf"
33 | 
34 |     if set(argv) & FLAG_HELP:
35 |         print(USAGE)
36 |         exit(0)
37 |     elif len(argv) < 4:
38 |         print(USAGE)
39 |         exit(1)
40 |     else:
41 |         files = [f.strip() for f in argv[1:4]]
42 | 
43 |         if len(argv) > 4:
44 |             output = argv[4].strip()
45 | 
46 |     reader1 = PdfFileReader(files[0])
47 |     merger = PdfFileMerger(open(output, "wb"))
48 | 
49 |     if reader1.numPages < requiredPages:
50 |         print(
51 |             "File 1 requires %d pages, but it has just %d"
52 |             % (requiredPages, reader1.numPages)
53 |         )
54 |         exit(1)
55 | 
56 |     input1 = open(files[0], "rb")
57 |     input2 = open(files[1], "rb")
58 |     input3 = open(files[2], "rb")
59 | 
60 |     # Add the first 3 pages of input1 to output
61 |     merger.append(fileobj=input1, pages=(0, 3))
62 | 
63 |     # Insert the first page of input2 into the output beginning after the
64 |     # second page
65 |     merger.merge(position=2, fileobj=input2, pages=(0, 1))
66 | 
67 |     # Append entire input3 document to the end of the output document
68 |     merger.append(input3)
69 | 
70 |     merger.write()
71 |     print("Output successfully written to", output)
72 | 
73 |     merger.close()
74 | 
75 | 
76 | if __name__ == "__main__":
77 |     main()
78 | 


--------------------------------------------------------------------------------
/samplecode/pdfsamples/AutoCad_Diagram.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/AutoCad_Diagram.pdf


--------------------------------------------------------------------------------
/samplecode/pdfsamples/AutoCad_Simple.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/AutoCad_Simple.pdf


--------------------------------------------------------------------------------
/samplecode/pdfsamples/GeoBase_NHNC1_Data_Model_UML_EN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/GeoBase_NHNC1_Data_Model_UML_EN.pdf


--------------------------------------------------------------------------------
/samplecode/pdfsamples/README.md:
--------------------------------------------------------------------------------
 1 | # PDF Sample Folder
 2 | 
 3 | PDF files are generated by a large variety of sources for many different
 4 | purposes. One of the goals of PyPDF4 is to be able to read/write any PDF
 5 | instance that Adobe can.
 6 | 
 7 | This is a catalog of various PDF files. The files may not have worked with
 8 | PyPDF4 but do now, they may be complicated or unconventional files, or they may
 9 | just be good for testing. The purpose is to ensure that when changes to PyPDF4
10 | are made, we keep them in mind.
11 | 
12 | If you have confidential PDFs that don't work with PyPDF4, feel free to still
13 | e-mail them for debugging - we won't add PDFs without express permission.
14 | (This folder is available through GitHub only.)
15 | 
16 | Feel free to add any type of PDF file or sample code, either by:
17 | 
18 | 	1. Sending it via email to PyPDF4@phaseit.net
19 | 	2. Including it in a pull request on GitHub
20 | 


--------------------------------------------------------------------------------
/samplecode/pdfsamples/SF424_page2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/SF424_page2.pdf


--------------------------------------------------------------------------------
/samplecode/pdfsamples/Seige_of_Vicksburg_Sample_OCR.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/Seige_of_Vicksburg_Sample_OCR.pdf


--------------------------------------------------------------------------------
/samplecode/pdfsamples/jpeg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/jpeg.pdf


--------------------------------------------------------------------------------
/scripts/2-up.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import os
 4 | from os.path import abspath, dirname, join
 5 | import sys
 6 | 
 7 | from pypdf import PdfFileReader, PdfFileWriter
 8 | 
 9 | PROJECT_ROOT = abspath(join(dirname(__file__), os.pardir, os.pardir))
10 | sys.path.append(PROJECT_ROOT)
11 | 
12 | 
13 | # TO-DO Decide which one of the two halves below to keep
14 | def main():
15 |     if len(sys.argv) != 3:
16 |         print("usage: python 2-up.py input_file output_file")
17 |         sys.exit(1)
18 | 
19 |     print("2-up input " + sys.argv[1])
20 | 
21 |     input1 = PdfFileReader(open(sys.argv[1], "rb"))
22 |     output = PdfFileWriter()
23 | 
24 |     for iter in range(0, input1.numPages - 1, 2):
25 |         lhs = input1.getPage(iter)
26 |         rhs = input1.getPage(iter + 1)
27 |         lhs.mergeTranslatedPage(rhs, lhs.mediaBox.getUpperRight_x(), 0, True)
28 |         output.addPage(lhs)
29 |         print(str(iter) + " "),
30 |         sys.stdout.flush()
31 | 
32 |     print("writing " + sys.argv[2])
33 |     output_stream = open(sys.argv[2], "wb")
34 |     output.write()
35 |     print("done.")
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     main()
40 | 
41 | 
42 | def main():
43 |     if len(sys.argv) != 3:
44 |         print("usage: python 2-up.py input_file output_file")
45 |         sys.exit(1)
46 | 
47 |     print("2-up input " + sys.argv[1])
48 |     input1 = PdfFileReader(open(sys.argv[1], "rb"))
49 |     output = PdfFileWriter()
50 | 
51 |     for i in range(0, input1.numPages - 1, 2):
52 |         lhs = input1.getPage(i)
53 |         rhs = input1.getPage(i + 1)
54 |         lhs.mergeTranslatedPage(rhs, lhs.mediaBox.getUpperRight_x(), 0, True)
55 |         output.addPage(lhs)
56 |         print(str(i) + " "),
57 |         sys.stdout.flush()
58 | 
59 |     print("writing " + sys.argv[2])
60 |     output_stream = open(sys.argv[2], "wb")
61 |     output.write()
62 |     print("done.")
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     main()
67 | 


--------------------------------------------------------------------------------
/scripts/codecs.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # TO-DO Add license notice
  3 | """
  4 | Encodes/decodes data fed from the command line with PyPDF codecs.
  5 | 
  6 | Although PyPDF4 mandates Python 2 support as well, only Python 3 is supported
  7 | by this script.
  8 | """
  9 | import argparse
 10 | from os.path import abspath, dirname, join, pardir
 11 | from sys import exit, path, stderr
 12 | 
 13 | from pypdf.filters import *
 14 | 
 15 | PROJECT_ROOT = abspath(join(dirname(__file__), pardir))
 16 | path.append(PROJECT_ROOT)
 17 | 
 18 | 
 19 | __version__ = "0.3.0"
 20 | CODECS = {
 21 |     "flate": FlateCodec,
 22 |     "asciihex": ASCIIHexCodec,
 23 |     "lzw": LZWCodec,
 24 |     "ascii85": ASCII85Codec,
 25 |     "dct": DCTCodec,
 26 |     "jpx": JPXCodec,
 27 |     "ccittfax": CCITTFaxCodec,
 28 | }
 29 | 
 30 | ENCODE, DECODE, LIST = ("encode", "decode", "list")
 31 | CODEC_ACTIONS = (ENCODE, DECODE)
 32 | VIEW_ACTIONS = (LIST,)
 33 | 
 34 | 
 35 | def main():
 36 |     """
 37 |     Parses the command line, then encodes/decodes the data or lists the codecs.
 38 |     :return: exit status of program (``0`` with no errors, ``1`` with a generic error).
 39 |     """
 40 |     parser = argparse.ArgumentParser(
 41 |         description="Encodes/decodes some data fed in with PyPDF codecs",
 42 |         epilog="Version %s" % __version__,
 43 |     )
 44 |     subparsers = parser.add_subparsers(title="Commands", dest="action")
 45 |     codec_parser = subparsers.add_parser(
 46 |         ENCODE, aliases=(DECODE,), help="Encode/decode data"
 47 |     )
 48 |     _list_parser = subparsers.add_parser(LIST, help="List available codecs")
 49 | 
 50 |     subparsers.required = True
 51 |     parser.add_argument(
 52 |         "-v", "--version", action="version", version="%(prog)s " + __version__
 53 |     )
 54 | 
 55 |     codec_parser.add_argument("data", help="Data to either encode or decode")
 56 |     # TO-DO Add chained list of encoders/decoders support (like
 57 |     # ASCIIHexDecode(LZWDecode(data))).
 58 |     codec_parser.add_argument(
 59 |         "-c",
 60 |         "--codec",
 61 |         choices=CODECS.keys(),
 62 |         required=True,
 63 |         help="The codec to encode/decode with",
 64 |     )
 65 |     codec_parser.add_argument(
 66 |         "-f",
 67 |         "--file",
 68 |         dest="isfile",
 69 |         action="store_const",
 70 |         const=True,
 71 |         help="Whether the argument provided to DATA should be interpreted as a"
 72 |         " file path",
 73 |     )
 74 | 
 75 |     args = parser.parse_args()
 76 | 
 77 |     # TO-DO Find a proper way of writing bytes directly to the console (perhaps
 78 |     # through bytes streams?). Decoded byte strings are not reliable enough.
 79 |     if args.action in CODEC_ACTIONS:
 80 |         codec = CODECS[args.codec]
 81 | 
 82 |         if args.isfile:
 83 |             try:
 84 |                 with open(args.data, "rb") as instream:
 85 |                     data = instream.read()
 86 |             except IOError as e:
 87 |                 print(e, file=stderr)
 88 |                 return 1
 89 |         else:
 90 |             data = args.data.encode("LATIN1")
 91 | 
 92 |         if args.action == ENCODE:
 93 |             data = codec.encode(data)
 94 |         elif args.action == DECODE:
 95 |             data = codec.decode(data)
 96 | 
 97 |         if isinstance(data, bytes):
 98 |             data = data.decode("LATIN1")
 99 | 
100 |         print(data)
101 |     elif args.action == LIST:
102 |         print("Available codecs:", *CODECS.keys(), sep="\n\t")
103 |     else:
104 |         print("Unrecognized action", args.action, file=stderr)
105 |         return 1
106 | 
107 |     return 0
108 | 
109 | 
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    exit(main())
112 | 


--------------------------------------------------------------------------------
/scripts/pdf-image-extractor.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Extract images from PDFs without resampling or altering.
 3 | 
 4 | Adapted from work by Sylvain Pelissier
 5 | http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
 6 | """
 7 | from __future__ import print_function
 8 | 
 9 | import os
10 | from os.path import abspath, dirname, join
11 | import sys
12 | 
13 | from PIL import Image
14 | from pypdf import PdfFileReader
15 | 
16 | PROJECT_ROOT = abspath(join(dirname(__file__), os.pardir))
17 | sys.path.append(PROJECT_ROOT)
18 | 
19 | 
def _handle_filter(x_object, obj, mode, size, data):
    """
    Write one image to a file in the current working directory.

    The output file is named after ``obj`` with its first character removed
    (presumably the leading ``/`` of a PDF name -- confirm against callers).

    When the image dictionary has a ``/Filter`` equal to ``/DCTDecode``,
    ``/JPXDecode`` or ``/CCITTFaxDecode``, ``data`` is written unchanged to a
    ``.jpg``, ``.jp2`` or ``.tiff`` file respectively. In every other case (no
    ``/Filter`` entry, ``/FlateDecode``, or any filter not listed here)
    ``data`` is handed to PIL as raw pixel data and saved as ``.png``.

    :param x_object: mapping from image names to image dictionaries.
    :param obj: key of the image inside ``x_object``.
    :param mode: PIL image mode, used only by the PNG fallback.
    :param size: ``(width, height)`` pair, used only by the PNG fallback.
    :param data: the image bytes to write.
    """
    # CL will eventually rewrite this so it's even simpler.

    # Filters whose payload is already a complete image file format and can
    # therefore be dumped to disk as is.
    passthrough = (
        ("/DCTDecode", ".jpg"),
        ("/JPXDecode", ".jp2"),
        ("/CCITTFaxDecode", ".tiff"),
    )
    stem = obj[1:]

    if "/Filter" in x_object[obj]:
        x_filter = x_object[obj]["/Filter"]

        # Compared with == (rather than a dict lookup) so that an unhashable
        # filter value simply matches nothing, exactly as before.
        for filter_name, extension in passthrough:
            if x_filter == filter_name:
                # The context manager closes the file even if write() fails.
                with open(stem + extension, "wb") as img:
                    img.write(data)
                return

    img = Image.frombytes(mode, size, data)
    img.save(stem + ".png")
44 | 
45 | 
def main():
    """
    Extract the images of the PDF file named on the command line.

    Exactly one command-line argument is expected: the path of the PDF. Every
    entry of each page's ``/XObject`` resources whose ``/Subtype`` is
    ``/Image`` is passed to ``_handle_filter``, which writes it to a file.
    ``No image found.`` is printed for every page that has no ``/XObject``
    resources.

    :return: ``1`` when the number of command-line arguments is wrong, ``0``
        once every page has been processed.
    """
    if len(sys.argv) != 2:
        print("{}: <filepath>".format(sys.argv[0]))
        return 1

    filepath = sys.argv[1].strip()

    # The file stays open for as long as the reader is used and is closed
    # afterwards; it was previously never closed.
    with open(filepath, "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)

        for page_number in range(reader.numPages):
            page = reader.getPage(page_number)

            if "/XObject" not in page["/Resources"]:
                print("No image found.")
                continue

            x_object = page["/Resources"]["/XObject"].getObject()

            for obj in x_object:
                if x_object[obj]["/Subtype"] == "/Image":
                    size = (x_object[obj]["/Width"], x_object[obj]["/Height"])
                    data = x_object[obj].getData()

                    # Anything that is not DeviceRGB is treated as a
                    # palette image.
                    if x_object[obj]["/ColorSpace"] == "/DeviceRGB":
                        mode = "RGB"
                    else:
                        mode = "P"

                    _handle_filter(x_object, obj, mode, size, data)

    return 0
78 | 
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
81 | 


--------------------------------------------------------------------------------
/scripts/pdfcat:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Concatenate pages from pdf files into a single pdf file.
  4 | 
  5 | Page ranges refer to the previously-named file.
  6 | A file not followed by a page range means all the pages of the file.
  7 | 
  8 | PAGE RANGES are like Python slices.
  9 |         {page_range_help}
 10 | EXAMPLES
 11 |     pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1
 12 |         Concatenate all of head.pdf, all but page seven of content.pdf,
 13 |         and the last page of tail.pdf, producing output.pdf.
 14 | 
 15 |     pdfcat chapter*.pdf >book.pdf
 16 |         You can specify the output file by redirection.
 17 | 
 18 |     pdfcat chapter?.pdf chapter10.pdf >book.pdf
 19 |         In case you don't want chapter 10 before chapter 2.
 20 | """
 21 | # Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
 22 | # All rights reserved. This software is available under a BSD license;
 23 | # see https://github.com/mstamy2/PyPDF2/LICENSE
 24 | 
 25 | from __future__ import print_function
 26 | 
 27 | import argparse
 28 | import os
 29 | import sys
 30 | import traceback
 31 | 
 32 | from io import BytesIO
 33 | from sys import stderr, stdout, exit
 34 | from os.path import dirname, abspath, join
 35 | 
 36 | PROJECT_ROOT = abspath(
 37 |     join(dirname(__file__), os.pardir)
 38 | )
 39 | sys.path.append(PROJECT_ROOT)
 40 | 
 41 | from pypdf import PdfFileReader, PdfFileMerger
 42 | from pypdf.pagerange import PAGE_RANGE_HELP, parseFilenamePageRanges
 43 | 
 44 | 
 45 | def parseArgs():
 46 |     parser = argparse.ArgumentParser(
 47 |         description=__doc__.format(page_range_help=PAGE_RANGE_HELP),
 48 |         formatter_class=argparse.RawDescriptionHelpFormatter
 49 |     )
 50 |     parser.add_argument("-o", "--output", metavar="output_file")
 51 |     parser.add_argument("-v", "--verbose", action="store_true",
 52 |                         help="show page ranges as they are being read")
 53 |     parser.add_argument(
 54 |         "--no-strict", default=False, action="store_true",
 55 |         help="Whether to parse PDF files in strict mode (defaults to True)."
 56 |     )
 57 |     parser.add_argument("first_filename", nargs=1,
 58 |                         metavar="filename [page range...]")
 59 |     # argparse chokes on page ranges like "-2:" unless caught like this:
 60 |     parser.add_argument("fn_pgrgs", nargs=argparse.REMAINDER,
 61 |                         metavar="filenames and/or page ranges")
 62 |     parser.add_argument("-T", "--toc", action="store_true",
 63 |                         help="stores an auto-generated Table of Contents to "
 64 |                              "output file")
 65 |     args = parser.parse_args()
 66 |     args.fn_pgrgs.insert(0, args.first_filename[0])
 67 | 
 68 |     return args
 69 | 
 70 | 
 71 | if __name__ == "__main__":
 72 |     args = parseArgs()
 73 |     filename_page_ranges = parseFilenamePageRanges(args.fn_pgrgs)
 74 | 
 75 |     if args.output:
 76 |         output = open(args.output, "wb")
 77 |     else:
 78 |         output = BytesIO()
 79 | 
 80 |     merger = PdfFileMerger(output, not args.no_strict)
 81 |     in_fs = dict()
 82 |     curr_page = 0
 83 | 
 84 |     try:
 85 |         for (filename, page_range) in filename_page_ranges:
 86 |             if args.verbose:
 87 |                 print(filename, page_range, file=stderr)
 88 |             if filename not in in_fs:
 89 |                 in_fs[filename] = open(filename, "rb")
 90 | 
 91 |             merger.append(in_fs[filename], pages=page_range)
 92 | 
 93 |             if args.toc:
 94 |                 r = PdfFileReader(filename)
 95 |                 # fallbackName equals 'a' if filename == 'a.pdf', # or 'b'
 96 |                 # if filename == 'b.x', or 'c' if filename == '/u/v/w/c.x' ...
 97 |                 fallbackName = basename(filename)
 98 |                 fallbackName = fallbackName[:fallbackName.rfind(".")]
 99 | 
100 |                 merger.addBookmark(
101 |                     getattr(r.documentInfo, "title", fallbackName)
102 |                     or fallbackName, curr_page
103 |                 )
104 | 
105 |                 curr_page += r.numPages
106 |     except Exception:
107 |         print(traceback.format_exc(), file=stderr)
108 |         print("Error while reading " + filename, file=stderr)
109 |         exit(1)
110 | 
111 |     merger.write()
112 | 
113 |     if not args.output:
114 |         output.seek(0, 0)
115 |         stdout.buffer.write(output.read())
116 | 
117 |     merger.close()
118 |     # In 3.0, input files must stay open until output is written.
119 |     # Not closing the in_fs because this script exits now.
120 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from distutils.core import setup
 4 | import re
 5 | 
 6 | long_description = """
 7 | A Pure-Python library built as a PDF toolkit.  It is capable of:
 8 | 
 9 | - extracting document information (title, author, ...)
10 | - splitting documents page by page
11 | - merging documents page by page
12 | - cropping pages
13 | - merging multiple pages into a single page
14 | - encrypting and decrypting PDF files
15 | - and more!
16 | 
17 | By being Pure-Python, it should run on any Python platform without any
18 | dependencies on external libraries.  It can also work entirely on StringIO
19 | objects rather than file streams, allowing for PDF manipulation in memory.
20 | It is therefore a useful tool for websites that manage or manipulate PDFs.
21 | """
22 | 
# The package version is extracted from the version file with a regular
# expression rather than by importing the package.
VERSIONFILE = "pypdf/_version.py"
# The context manager closes the file handle as soon as it has been read.
with open(VERSIONFILE, "rt") as version_file:
    verstrline = version_file.read()
# Matches a line of the form: __version__ = "x.y.z" (single or double quotes).
VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]"
mo = re.search(VSRE, verstrline, re.M)
if mo:
    verstr = mo.group(1)
else:
    raise RuntimeError("Unable to find version string in %s." % (VERSIONFILE))
31 | 
# Package metadata; the version comes from the version file parsed above.
setup(
    name="pypdf4",
    version=verstr,
    description="PDF toolkit",
    long_description=long_description,
    author="Mathieu Fenniak",
    author_email="biziqe@mathieu.fenniak.net",
    maintainer="Phaseit, Inc.",
    maintainer_email="PyPDF4@phaseit.net",
    url="http://claird.github.com/PyPDF4",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: BSD License",
        "Programming Language :: Python :: 2",
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    packages=["pypdf"],
)
53 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/__init__.py


--------------------------------------------------------------------------------
/tests/fixture_data/GeoBase_NHNC1_Data_Model_UML_EN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/GeoBase_NHNC1_Data_Model_UML_EN.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/SF424_page2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/SF424_page2.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/Seige_of_Vicksburg_Sample_OCR.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/Seige_of_Vicksburg_Sample_OCR.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/TheHappyPrince.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | THE HAPPY PRINCE
  3 | By Oscar Wilde
  4 | Retrieved from Project Gutenberg: http://www.gutenberg.org/files/30120/30120.txt
  5 | 
  6 | 
  7 | First published by David Nutt, May, 1888
  8 | 
  9 | Reprinted January, 1889; February, 1902; September, 1905; February,
 10 | 1907; March, 1908; March, 1910
 11 | 
 12 | Reset and published by arrangement with David Nutt by Duckworth & Co.,
 13 | 1920
 14 | 
 15 | Special Edition, reset. With illustrations by Charles Robinson,
 16 | published by arrangement with David Nutt by Duckworth & Co., 1913.
 17 | Reprinted 1920
 18 | 
 19 | Printed in Great Britain
 20 | By Hazell, Watson and Viney, Ld.,
 21 | London and Aylesbury.
 22 | 
 23 | ====
 24 | 
 25 | High above the city, on a tall column, stood the statue of the Happy
 26 | Prince. He was gilded all over with thin leaves of fine gold, for eyes
 27 | he had two bright sapphires, and a large red ruby glowed on his
 28 | sword-hilt.
 29 | 
 30 | He was very much admired indeed. "He is as beautiful as a weathercock,"
 31 | remarked one of the Town Councillors who wished to gain a reputation for
 32 | having artistic tastes; "only not quite so useful," he added, fearing
 33 | lest people should think him unpractical, which he really was not.
 34 | 
 35 | "Why can't you be like the Happy Prince?" asked a sensible mother of her
 36 | little boy who was crying for the moon. "The Happy Prince never dreams
 37 | of crying for anything."
 38 | 
 39 | "I am glad there is some one in the world who is quite happy," muttered
 40 | a disappointed man as he gazed at the wonderful statue.
 41 | 
 42 | "He looks just like an angel," said the Charity Children as they came
 43 | out of the cathedral in their bright scarlet cloaks and their clean
 44 | white pinafores.
 45 | 
 46 | "How do you know?" said the Mathematical Master, "you have never seen
 47 | one."
 48 | 
 49 | "Ah! but we have, in our dreams," answered the children; and the
 50 | Mathematical Master frowned and looked very severe, for he did not
 51 | approve of children dreaming.
 52 | 
 53 | One night there flew over the city a little Swallow. His friends had
 54 | gone away to Egypt six weeks before, but he had stayed behind, for he
 55 | was in love with the most beautiful Reed. He had met her early in the
 56 | spring as he was flying down the river after a big yellow moth, and had
 57 | been so attracted by her slender waist that he had stopped to talk to
 58 | her.
 59 | 
 60 | "Shall I love you?" said the Swallow, who liked to come to the point at
 61 | once, and the Reed made him a low bow. So he flew round and round her,
 62 | touching the water with his wings, and making silver ripples. This was
 63 | his courtship, and it lasted all through the summer.
 64 | 
 65 | "It is a ridiculous attachment," twittered the other Swallows; "she has
 66 | no money, and far too many relations;" and indeed the river was quite
 67 | full of Reeds. Then, when the autumn came they all flew away.
 68 | 
 69 | After they had gone he felt lonely, and began to tire of his lady-love.
 70 | "She has no conversation," he said, "and I am afraid that she is a
 71 | coquette, for she is always flirting with the wind." And certainly,
 72 | whenever the wind blew, the Reed made the most graceful curtseys.
 73 | "I admit that she is domestic," he continued, "but I love travelling,
 74 | and my wife, consequently, should love travelling also."
 75 | 
 76 | "Will you come away with me?" he said finally to her; but the Reed shook
 77 | her head, she was so attached to her home.
 78 | 
 79 | "You have been trifling with me," he cried. "I am off to the Pyramids.
 80 | Good-bye!" and he flew away.
 81 | 
 82 | All day long he flew, and at night-time he arrived at the city. "Where
 83 | shall I put up?" he said; "I hope the town has made preparations."
 84 | 
 85 | Then he saw the statue on the tall column.
 86 | 
 87 | "I will put up there," he cried; "it is a fine position, with plenty of
 88 | fresh air." So he alighted just between the feet of the Happy Prince.
 89 | 
 90 | "I have a golden bedroom," he said softly to himself as he looked round,
 91 | and he prepared to go to sleep; but just as he was putting his head
 92 | under his wing a large drop of water fell on him. "What a curious
 93 | thing!" he cried; "there is not a single cloud in the sky, the stars are
 94 | quite clear and bright, and yet it is raining. The climate in the north
 95 | of Europe is really dreadful. The Reed used to like the rain, but that
 96 | was merely her selfishness."
 97 | 
 98 | Then another drop fell.
 99 | 
100 | "What is the use of a statue if it cannot keep the rain off?" he said;
101 | "I must look for a good chimney-pot," and he determined to fly away.
102 | 
103 | But before he had opened his wings, a third drop fell, and he looked up,
104 | and saw-- Ah! what did he see?
105 | 
106 | The eyes of the Happy Prince were filled with tears, and tears were
107 | running down his golden cheeks. His face was so beautiful in the
108 | moonlight that the little Swallow was filled with pity.
109 | 
110 | "Who are you?" he said.
111 | 
112 | "I am the Happy Prince."
113 | 
114 | "Why are you weeping then?" asked the Swallow; "you have quite
115 | drenched me."
116 | 
117 |   [Illustration: THE PALACE OF SANS-SOUCI]
118 | 
119 | "When I was alive and had a human heart," answered the statue, "I did
120 | not know what tears were, for I lived in the Palace of Sans-Souci, where
121 | sorrow is not allowed to enter. In the daytime I played with my
122 | companions in the garden, and in the evening I led the dance in the
123 | Great Hall. Round the garden ran a very lofty wall, but I never cared
124 | to ask what lay beyond it, everything about me was so beautiful.
125 | My courtiers called me the Happy Prince, and happy indeed I was, if
126 | pleasure be happiness. So I lived, and so I died. And now that I am dead
127 | they have set me up here so high that I can see all the ugliness and all
128 | the misery of my city, and though my heart is made of lead yet I cannot
129 | choose but weep."
130 | 
131 | "What! is he not solid gold?" said the Swallow to himself. He was too
132 | polite to make any personal remarks out loud.
133 | 
134 | "Far away," continued the statue in a low musical voice, "far away in a
135 | little street there is a poor house. One of the windows is open, and
136 | through it I can see a woman seated at a table. Her face is thin and
137 | worn, and she has coarse, red hands, all pricked by the needle, for she
138 | is a seamstress. She is embroidering passion-flowers on a satin gown for
139 | the loveliest of the Queen's maids-of-honour to wear at the next
140 | Court-ball. In a bed in the corner of the room her little boy is lying
141 | ill. He has a fever, and is asking for oranges. His mother has nothing
142 | to give him but river water, so he is crying. Swallow, Swallow, little
143 | Swallow, will you not bring her the ruby out of my sword-hilt? My feet
144 | are fastened to this pedestal and I cannot move."
145 | 
146 | "I am waited for in Egypt," said the Swallow. "My friends are flying up
147 | and down the Nile, and talking to the large lotus-flowers. Soon they
148 | will go to sleep in the tomb of the great King. The King is there
149 | himself in his painted coffin. He is wrapped in yellow linen, and
150 | embalmed with spices. Round his neck is a chain of pale green jade,
151 | and his hands are like withered leaves."
152 | 
153 | "Swallow, Swallow, little Swallow," said the Prince, "will you not stay
154 | with me for one night, and be my messenger? The boy is so thirsty, and
155 | the mother so sad."
156 | 
157 | "I don't think I like boys," answered the Swallow. "Last summer, when I
158 | was staying on the river, there were two rude boys, the miller's sons,
159 | who were always throwing stones at me. They never hit me, of course;
160 | we swallows fly far too well for that, and besides, I come of a family
161 | famous for its agility; but still, it was a mark of disrespect."
162 | 
163 | But the Happy Prince looked so sad that the little Swallow was sorry.
164 | "It is very cold here," he said; "but I will stay with you for one
165 | night, and be your messenger."
166 | 
167 | "Thank you, little Swallow," said the Prince.
168 | 
169 | So the Swallow picked out the great ruby from the Prince's sword,
170 | and flew away with it in his beak over the roofs of the town.
171 | 
172 | He passed by the cathedral tower, where the white marble angels were
173 | sculptured. He passed by the palace and heard the sound of dancing.
174 | A beautiful girl came out on the balcony with her lover. "How wonderful
175 | the stars are," he said to her, "and how wonderful is the power of
176 | love!"
177 | 
178 | "I hope my dress will be ready in time for the State-ball," she
179 | answered; "I have ordered passion-flowers to be embroidered on it;
180 | but the seamstresses are so lazy."
181 | 
182 | He passed over the river, and saw the lanterns hanging to the masts of
183 | the ships. He passed over the Ghetto, and saw the old Jews bargaining
184 | with each other, and weighing out money in copper scales. At last he
185 | came to the poor house and looked in. The boy was tossing feverishly on
186 | his bed, and the mother had fallen asleep, she was so tired. In he
187 | hopped, and laid the great ruby on the table beside the woman's thimble.
188 | Then he flew gently round the bed, fanning the boy's forehead with his
189 | wings. "How cool I feel!" said the boy, "I must be getting better;"
190 | and he sank into a delicious slumber.
191 | 
192 | Then the Swallow flew back to the Happy Prince, and told him what he had
193 | done. "It is curious," he remarked, "but I feel quite warm now, although
194 | it is so cold."
195 | 
196 | "That is because you have done a good action," said the Prince. And the
197 | little Swallow began to think, and then he fell asleep. Thinking always
198 | made him sleepy.
199 | 
200 | When day broke he flew down to the river and had a bath. "What a
201 | remarkable phenomenon," said the Professor of Ornithology as he was
202 | passing over the bridge. "A swallow in winter!" And he wrote a long
203 | letter about it to the local newspaper. Every one quoted it, it was full
204 | of so many words that they could not understand.
205 | 
206 | "To-night I go to Egypt," said the Swallow, and he was in high spirits
207 | at the prospect. He visited all the public monuments, and sat a long
208 | time on top of the church steeple. Wherever he went the Sparrows
209 | chirruped, and said to each other, "What a distinguished stranger!"
210 | so he enjoyed himself very much.
211 | 
212 | When the moon rose he flew back to the Happy Prince. "Have you any
213 | commissions for Egypt?" he cried; "I am just starting."
214 | 
215 | "Swallow, Swallow, little Swallow," said the Prince, "will you not stay
216 | with me one night longer?"
217 | 
218 |   [Illustration: THE LOVELIEST OF THE QUEEN'S MAIDS OF HONOUR]
219 | 
220 | "I am waited for in Egypt," answered the Swallow. "To-morrow my friends
221 | will fly up to the Second Cataract. The river-horse couches there among
222 | the bulrushes, and on a great granite throne sits the God Memnon. All
223 | night long he watches the stars, and when the morning star shines he
224 | utters one cry of joy, and then he is silent. At noon the yellow lions
225 | come down to the water's edge to drink. They have eyes like green
226 | beryls, and their roar is louder than the roar of the cataract."
227 | 
228 | "Swallow, Swallow, little Swallow," said the Prince, "far away across
229 | the city I see a young man in a garret. He is leaning over a desk
230 | covered with papers, and in a tumbler by his side there is a bunch of
231 | withered violets. His hair is brown and crisp, and his lips are red as a
232 | pomegranate, and he has large and dreamy eyes. He is trying to finish a
233 | play for the Director of the Theatre, but he is too cold to write any
234 | more. There is no fire in the grate, and hunger has made him faint."
235 | 
236 | "I will wait with you one night longer," said the Swallow, who really
237 | had a good heart. "Shall I take him another ruby?"
238 | 
239 | "Alas! I have no ruby now," said the Prince; "my eyes are all that I
240 | have left. They are made of rare sapphires, which were brought out of
241 | India a thousand years ago. Pluck out one of them and take it to him. He
242 | will sell it to the jeweller, and buy food and firewood, and finish his
243 | play."
244 | 
245 | "Dear Prince," said the Swallow, "I cannot do that"; and he began to
246 | weep.
247 | 
248 | "Swallow, Swallow, little Swallow," said the Prince, "do as I command
249 | you."
250 | 
251 | So the Swallow plucked out the Prince's eye, and flew away to the
252 | student's garret. It was easy enough to get in, as there was a hole in
253 | the roof. Through this he darted, and came into the room. The young man
254 | had his head buried in his hands, so he did not hear the flutter of the
255 | bird's wings, and when he looked up he found the beautiful sapphire
256 | lying on the withered violets.
257 | 
258 | "I am beginning to be appreciated," he cried; "this is from some great
259 | admirer. Now I can finish my play," and he looked quite happy.
260 | 
261 | The next day the Swallow flew down to the harbour. He sat on the mast of
262 | a large vessel and watched the sailors hauling big chests out of the
263 | hold with ropes. "Heave a-hoy!" they shouted as each chest came up.
264 | "I am going to Egypt!" cried the Swallow, but nobody minded, and when
265 | the moon rose he flew back to the Happy Prince.
266 | 
267 | "I am come to bid you good-bye," he cried.
268 | 
269 | "Swallow, Swallow, little Swallow," said the Prince, "will you not stay
270 | with me one night longer?"
271 | 
272 | "It is winter," answered the Swallow, "and the chill snow will soon be
273 | here. In Egypt the sun is warm on the green palm-trees, and the
274 | crocodiles lie in the mud and look lazily about them. My companions are
275 | building a nest in the Temple of Baalbec, and the pink and white doves
276 | are watching them, and cooing to each other. Dear Prince, I must leave
277 | you, but I will never forget you, and next spring I will bring you back
278 | two beautiful jewels in place of those you have given away. The ruby
279 | shall be redder than a red rose, and the sapphire shall be as blue as
280 | the great sea."
281 | 
282 | "In the square below," said the Happy Prince, "there stands a little
283 | match-girl. She has let her matches fall in the gutter, and they are all
284 | spoiled. Her father will beat her if she does not bring home some money,
285 | and she is crying. She has no shoes or stockings, and her little head is
286 | bare. Pluck out my other eye and give it to her, and her father will not
287 | beat her."
288 | 
289 | "I will stay with you one night longer," said the Swallow, "but I cannot
290 | pluck out your eye. You would be quite blind then."
291 | 
292 | "Swallow, Swallow, little Swallow," said the Prince, "do as I command
293 | you."
294 | 
295 | So he plucked out the Prince's other eye, and darted down with it. He
296 | swooped past the match-girl, and slipped the jewel into the palm of her
297 | hand. "What a lovely bit of glass!" cried the little girl; and she ran
298 | home, laughing.
299 | 
300 | Then the Swallow came back to the Prince. "You are blind now," he said,
301 | "so I will stay with you always."
302 | 
303 | "No, little Swallow," said the poor Prince, "you must go away to Egypt."
304 | 
305 | "I will stay with you always," said the Swallow, and he slept at the
306 | Prince's feet.
307 | 
308 | All the next day he sat on the Prince's shoulder, and told him stories
309 | of what he had seen in strange lands. He told him of the red ibises,
310 | who stand in long rows on the banks of the Nile, and catch gold-fish in
311 | their beaks; of the Sphinx, who is as old as the world itself, and lives
312 | in the desert, and knows everything; of the merchants, who walk slowly
313 | by the side of their camels and carry amber beads in their hands; of the
314 | King of the Mountains of the Moon, who is as black as ebony, and
315 | worships a large crystal; of the great green snake that sleeps in a palm
316 | tree, and has twenty priests to feed it with honey-cakes; and of the
317 | pygmies who sail over a big lake on large flat leaves, and are always at
318 | war with the butterflies.
319 | 
320 | "Dear little Swallow," said the Prince, "you tell me of marvellous
321 | things, but more marvellous than anything is the suffering of men and of
322 | women. There is no Mystery so great as Misery. Fly over my city, little
323 | Swallow, and tell me what you see there."
324 | 
325 |   [Illustration: THE RICH MAKING MERRY IN THEIR BEAUTIFUL HOUSES,
326 |   WHILE THE BEGGARS WERE SITTING AT THE GATES]
327 | 
328 | So the Swallow flew over the great city, and saw the rich making merry
329 | in their beautiful houses, while the beggars were sitting at the gates.
330 | He flew into dark lanes, and saw the white faces of starving children
331 | looking out listlessly at the black streets. Under the archway of a
332 | bridge two little boys were lying in one another's arms to try and keep
333 | themselves warm. "How hungry we are!" they said. "You must not lie
334 | here," shouted the Watchman, and they wandered out into the rain.
335 | 
336 | Then he flew back and told the Prince what he had seen.
337 | 
338 | "I am covered with fine gold," said the Prince, "you must take it off,
339 | leaf by leaf, and give it to my poor; the living always think that gold
340 | can make them happy."
341 | 
342 | Leaf after leaf of the fine gold the Swallow picked off, till the Happy
343 | Prince looked quite dull and grey. Leaf after leaf of the fine gold he
344 | brought to the poor, and the children's faces grew rosier, and they
345 | laughed and played games in the street. "We have bread now!" they cried.
346 | 
347 | Then the snow came, and after the snow came the frost. The streets
348 | looked as if they were made of silver, they were so bright and
349 | glistening; long icicles like crystal daggers hung down from the eaves
350 | of the houses, everybody went about in furs, and the little boys wore
351 | scarlet caps and skated on the ice.
352 | 
353 | The poor little Swallow grew colder and colder, but he would not leave
354 | the Prince, he loved him too well. He picked up crumbs outside the
355 | baker's door when the baker was not looking, and tried to keep himself
356 | warm by flapping his wings.
357 | 
358 | But at last he knew that he was going to die. He had just strength to
359 | fly up to the Prince's shoulder once more. "Good-bye, dear Prince!"
360 | he murmured, "will you let me kiss your hand?"
361 | 
362 | "I am glad that you are going to Egypt at last, little Swallow," said
363 | the Prince, "you have stayed too long here; but you must kiss me on the
364 | lips, for I love you."
365 | 
366 | "It is not to Egypt that I am going," said the Swallow. "I am going to
367 | the House of Death. Death is the brother of Sleep, is he not?"
368 | 
369 | And he kissed the Happy Prince on the lips, and fell down dead at his
370 | feet.
371 | 
372 | At that moment a curious crack sounded inside the statue, as if
373 | something had broken. The fact is that the leaden heart had snapped
374 | right in two. It certainly was a dreadfully hard frost.
375 | 
376 | Early the next morning the Mayor was walking in the square below in
377 | company with the Town Councillors. As they passed the column he looked
378 | up at the statue: "Dear me! how shabby the Happy Prince looks!" he said.
379 | 
380 | "How shabby, indeed!" cried the Town Councillors, who always agreed with
381 | the Mayor; and they went up to look at it.
382 | 
383 | "The ruby has fallen out of his sword, his eyes are gone, and he is
384 | golden no longer," said the Mayor; "in fact, he is little better than a
385 | beggar!"
386 | 
387 | "Little better than a beggar," said the Town Councillors.
388 | 
389 | "And here is actually a dead bird at his feet!" continued the Mayor. "We
390 | must really issue a proclamation that birds are not to be allowed to die
391 | here." And the Town Clerk made a note of the suggestion.
392 | 
393 | So they pulled down the statue of the Happy Prince. "As he is no longer
394 | beautiful he is no longer useful," said the Art Professor at the
395 | University.
396 | 
397 | Then they melted the statue in a furnace, and the Mayor held a meeting
398 | of the Corporation to decide what was to be done with the metal. "We
399 | must have another statue, of course," he said, "and it shall be a statue
400 | of myself."
401 | 
402 | "Of myself," said each of the Town Councillors, and they quarrelled.
403 | When I last heard of them they were quarrelling still.
404 | 
405 | "What a strange thing!" said the overseer of the workmen at the foundry.
406 | "This broken lead heart will not melt in the furnace. We must throw it
407 | away." So they threw it on a dust-heap where the dead Swallow was also
408 | lying.
409 | 
410 | "Bring me the two most precious things in the city," said God to one of
411 | His Angels; and the Angel brought Him the leaden heart and the dead
412 | bird.
413 | 
414 | "You have rightly chosen," said God, "for in my garden of Paradise this
415 | little bird shall sing for evermore, and in my city of gold the Happy
416 | Prince shall praise me."
417 | 
418 | 


--------------------------------------------------------------------------------
/tests/fixture_data/attachment_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/attachment_small.png


--------------------------------------------------------------------------------
/tests/fixture_data/crazyones.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/crazyones.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/jpeg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/jpeg.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/testDecodeStreamData/ASCII85Decode.pdf:
--------------------------------------------------------------------------------
 1 | %PDF-1.4
 2 | % This file was purposefully assembled for PyPDF4 to contain a stream with an
 3 | % ASCII85Decode filter.
 4 | 1 0 obj
 5 |     << /Type /Catalog
 6 |        /Outlines 2 0 R
 7 |        /Pages 3 0 R
 8 |     >>
 9 | endobj
10 | 
11 | 2 0 obj
12 |     << /Type Outlines
13 |        /Count 0
14 |     >>
15 | endobj
16 | 
17 | 3 0 obj
18 |     << /Type /Pages
19 |        /Kids [ 4 0 R ]
20 |        /Count 1
21 |     >>
22 | endobj
23 | 
24 | 4 0 obj
25 |     << /Type /Page
26 |        /Parent 3 0 R
27 |        /MediaBox [ 0 0 612 792 ]
28 |        /Contents 5 0 R
29 |        /Resources << /ProcSet 6 0 >>
30 |     >>
31 | endobj
32 | 
33 | 5 0 obj
34 | 	<< /Length 486 /Filter /ASCII85Decode >>
35 | stream
36 | Gat=f>>)jf(l%MV/)Eo6mIKAcVI%E8JjXX\LYJuV['I-CRhu'N^LhAdW!F"AKY3'f+!CnrjS-:gc;G'?"Hl==+<>3mA"9I518%ESCnVLW>+8W;n:g?Jm8Lf,g;R3j'h:hD+/MCBA4.7R%cB^*K8Fqlg@[FpiFY'd\=Bs0qgO@n"mHUoH<MZ*jjg9%pjX!R.C9\(1%&)F`Pl%a\6X;cnPcuUP!@.OVHe\*T;M8BCeUZ&5(e46Y0*Vr$;Dq_8:'hK8IYQ]7,$#9Nb$f'Dj@BTG]L3-ZaD=!1fBDULj6]N18i!ZdN2UG@)o\aUfEn!b<JR6QoOBp#,A&t*h-:\@kD_q/6?k`-tA-DDM62jbd]4/EfOSG8cC76Oh%5VbU?A^Jrtqc9]V6B9_[H"M1k)arAERd32cD6epim9LU*@#gtLl9^FK%&SPg>A`!)4cZ<Sq7BsQ`"=/<NFgshmDqEPAHla(*V?0td4!sB]icd5G~>
37 | endstream
38 | endobj
39 | 
40 | 6 0 obj
41 |     << /Length 1554 /Filter /ASCII85Decode >>
42 | stream
43 | Gatm:?#SIU'Re<2\BE'[,#oo3Q=T>2>FT(e3i&m5DLc&,M3>DGU-_b#r9ni$I9KD$%Tj.XM(%5-GgADEdK^:I^AXUg!5EF0k0=3]!Lk),V7VE*4hTZW4C.a<Ro'UA13DQ$*TIW7^b,Ns-%J\rIUmWXgat43M<!9M/ASg8J0.=Jo0Z5e.^*7O%rC*i"J>1)SA\XGQ=LT]/;lDZ!5<ZIA[A0$E6p[DJP%5hoedFDFmHq&YR53pS%GWRk"&Y`WB&YgA@*'9VO6<]1H<`0Wdn%cL9lbhdK`N5jH,5*.;>WPQLo$D.]4VVBI52F%&+LE&<9==i6+elI_ZsJCJpV;p&MQi,3H'sCFW?!JdKKm))Yj2X=DqjIo_!+abe^bS:JYdQZtiBJ;e0I-&P#AA614u.%m4J7E?0"iZ3(!U!F;cXs5mlLmfuN.sE^efFp(f1C1gXNQOkH)FJ>&`boU[Fj^[&DN\O*M_/m@E_Gii<,!P!q_64>,$tHdGlL?EoGa'_h!fi*2<B./G;kh(:Xfu]HK9RjA*FJj_EZ7cr.CQ*e9G(7YLV)<E=H+D.8V-K8hqlY:X6qnc)F*f7+,^2c#*]T2jKAt:;"r.3Dg9cZBJAUPIKo##.U*adKp*kK%)Zs0*G%Ci*o>=*nTV<1Srp69_kCJ]r\J=3%dA=U_QuI4$=&L)Hgm;RlE=6]CPlBkb!6l(GC7O<QZ877D#o[O"lfu*dGlVO$pUm1^ub!DksIsXJZ9AKH6a?!^MOI_T8EdPct/<eA_]FeK&l/)'54BZ:`T?CH7dB<M2!FGZr,_;@pNHi?)b-X00.cJrNZW.K;0o?V@]qc?o)iaq!oD?rL%W%!H;I8TQFoAqaTQq6<N`5"M2bP9<H`-qKMI#(@cjj9cgli>e5n%N_"!P]3E-SX)4IbNHb2BL?l0BD&j`O>A_>.On.pAblZbr:TS$8g5@HgA7Y<2U<]4Ctl?%*%D;dN[?5I4h;(uW_&nC#))A4MDLK[$X\+b!ng[E0s*_-/')]%Ut0gtl?!U)aItV.\F./$p]g5hosPeOT>`l#XJ;^s55)Y'WEG3UVN6/u-hpn&OQ(ft<rs)4#^Amrn;6O9Y\/8pas+?0H/H;+/Akl*8BdD%hd-4WoB@"/9ZGS`eg43i,s@Xdc">S3oUQ!u7*u/;YqtG3VYNSm&/FJp*RTR@kMtnp=^Ju]l"@Th`#Q'\g>!:iG^)kFC<ij7MRFe\!HLeS,O*jhE2nnE&ndStR[,fi+`XHC/][fhVob23n"5;Gkn(pp74[a0eV#-kZ$W81D>L@GO2a6UT;36=TD#DMHJW\Q`*Tjl!q2'Kcm+5U):1K9,II2$"E"io2B9LM4`4K!8tn.c'rsQ/Lor&*qNfKgn<:CBN-)DhKklW[JXOsc@UCjh9P.f^'j)?Rbrnp$H_!PD8l<$G'XGZX,*I&B&UTN+&[Wat[>@/C"[q[!lO24!o;Ch%*u\r5P?:a*XCKlR=5i\P><C3b1Yfi[9C+eSqp$$e&uB?,q<bU'<p%sJfhgK\%UZ*D(=Ck/6Sitgn!jV\;"0+qk`g&V03]0o!<~>
44 | endstream
45 | endobj
46 | 
47 | xref
48 | 0 7
49 | 0000000000 65535 f
50 | 0000000111 00000 n
51 | 0000000199 00000 n
52 | 0000000260 00000 n
53 | 0000000342 00000 n
54 | 0000000498 00000 n
55 | 0000001060 00000 n
56 | 
57 | trailer
58 |     << /Size 7
59 |        /Root 1 0 R
60 |     >>
61 | startxref
62 | 2694
63 | %%EOF
64 | 


--------------------------------------------------------------------------------
/tests/fixture_data/testDecodeStreamData/CCITTFaxDecode.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testDecodeStreamData/CCITTFaxDecode.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/testDecodeStreamData/DCTDecode.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testDecodeStreamData/DCTDecode.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/testDecodeStreamData/FlateDecode.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testDecodeStreamData/FlateDecode.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/testDecodeStreamData/LZWDecode.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testDecodeStreamData/LZWDecode.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/testFileLoad/crazyones.txt:
--------------------------------------------------------------------------------
1 | TheCrazyOnesOctober14,1998Herestothecrazyones.Themis˝ts.Therebels.Thetroublemakers.Theroundpegsinthesquareholes.Theoneswhoseethingsdi˙erently.Theyrenotfondofrules.Andtheyhavenorespectforthestatusquo.Youcanquotethem,disagreewiththem,glorifyorvilifythem.Abouttheonlythingyoucantdoisignorethem.Becausetheychangethings.Theyinvent.Theyimagine.Theyheal.Theyexplore.Theycreate.Theyinspire.Theypushthehumanraceforward.Maybetheyhavetobecrazy.Howelsecanyoustareatanemptycanvasandseeaworkofart?Orsitinsilenceandhearasongthatsneverbeenwritten?Orgazeataredplanetandseealaboratoryonwheels?Wemaketoolsforthesekindsofpeople.Whilesomeseethemasthecrazyones,weseegenius.Becausethepeoplewhoarecrazyenoughtothinktheycanchangetheworld,aretheoneswhodo.


--------------------------------------------------------------------------------
/tests/fixture_data/testIsObjectFree/GeoBase_NHNC1_Data_Model_UML_EN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testIsObjectFree/GeoBase_NHNC1_Data_Model_UML_EN.pdf


--------------------------------------------------------------------------------
/tests/fixture_data/testIsObjectFree/SF424_page2.pdf:
--------------------------------------------------------------------------------
  1 | 0 328
  2 | 0000000000 65535 f 
  3 | 0000000009 00000 n 
  4 | 0000000068 00000 n 
  5 | 0000000169 00000 n 
  6 | 0000000218 00000 n 
  7 | 0000000472 00000 n 
  8 | 0000002469 00000 n 
  9 | 0000002556 00000 n 
 10 | 0000003162 00000 n 
 11 | 0000003404 00000 n 
 12 | 0000004048 00000 n 
 13 | 0000004297 00000 n 
 14 | 0000005487 00000 n 
 15 | 0000005729 00000 n 
 16 | 0000005906 00000 n 
 17 | 0000006091 00000 n 
 18 | 0000006200 00000 n 
 19 | 0000006386 00000 n 
 20 | 0000006601 00000 n 
 21 | 0000006859 00000 n 
 22 | 0000011230 00000 n 
 23 | 0000011340 00000 n 
 24 | 0000011416 00000 n 
 25 | 0000011477 00000 n 
 26 | 0000011734 00000 n 
 27 | 0000015083 00000 n 
 28 | 0000015514 00000 n 
 29 | 0000015769 00000 n 
 30 | 0000016099 00000 n 
 31 | 0000016199 00000 n 
 32 | 0000017526 00000 n 
 33 | 0000017702 00000 n 
 34 | 0000017817 00000 n 
 35 | 0000018002 00000 n 
 36 | 0000018215 00000 n 
 37 | 0000018401 00000 n 
 38 | 0000018587 00000 n 
 39 | 0000018844 00000 n 
 40 | 0000019844 00000 n 
 41 | 0000021034 00000 n 
 42 | 0000021082 00000 n 
 43 | 0000021268 00000 n 
 44 | 0000021549 00000 n 
 45 | 0000021752 00000 n 
 46 | 0000021945 00000 n 
 47 | 0000022150 00000 n 
 48 | 0000022446 00000 n 
 49 | 0000028034 00000 n 
 50 | 0000028067 00000 n 
 51 | 0000028110 00000 n 
 52 | 0000028168 00000 n 
 53 | 0000028490 00000 n 
 54 | 0000028865 00000 n 
 55 | 0000029069 00000 n 
 56 | 0000029112 00000 n 
 57 | 0000029170 00000 n 
 58 | 0000029492 00000 n 
 59 | 0000029867 00000 n 
 60 | 0000030072 00000 n 
 61 | 0000030115 00000 n 
 62 | 0000030173 00000 n 
 63 | 0000030495 00000 n 
 64 | 0000030870 00000 n 
 65 | 0000031075 00000 n 
 66 | 0000031118 00000 n 
 67 | 0000031176 00000 n 
 68 | 0000031498 00000 n 
 69 | 0000031873 00000 n 
 70 | 0000032136 00000 n 
 71 | 0000041460 00000 n 
 72 | 0000041506 00000 n 
 73 | 0000041922 00000 n 
 74 | 0000042344 00000 n 
 75 | 0000046101 00000 n 
 76 | 0000046622 00000 n 
 77 | 0000047217 00000 n 
 78 | 0000051163 00000 n 
 79 | 0000051721 00000 n 
 80 | 0000052453 00000 n 
 81 | 0000057379 00000 n 
 82 | 0000057406 00000 n 
 83 | 0000057611 00000 n 
 84 | 0000057654 00000 n 
 85 | 0000057712 00000 n 
 86 | 0000058034 00000 n 
 87 | 0000058409 00000 n 
 88 | 0000058672 00000 n 
 89 | 0000063653 00000 n 
 90 | 0000063680 00000 n 
 91 | 0000063723 00000 n 
 92 | 0000063781 00000 n 
 93 | 0000064103 00000 n 
 94 | 0000064478 00000 n 
 95 | 0000064769 00000 n 
 96 | 0000065036 00000 n 
 97 | 0000065251 00000 n 
 98 | 0000065541 00000 n 
 99 | 0000065826 00000 n 
100 | 0000066095 00000 n 
101 | 0000066310 00000 n 
102 | 0000066600 00000 n 
103 | 0000066948 00000 n 
104 | 0000067216 00000 n 
105 | 0000067432 00000 n 
106 | 0000067723 00000 n 
107 | 0000068015 00000 n 
108 | 0000068280 00000 n 
109 | 0000068495 00000 n 
110 | 0000068783 00000 n 
111 | 0000069172 00000 n 
112 | 0000069437 00000 n 
113 | 0000069652 00000 n 
114 | 0000069940 00000 n 
115 | 0000070310 00000 n 
116 | 0000070577 00000 n 
117 | 0000070792 00000 n 
118 | 0000071082 00000 n 
119 | 0000071314 00000 n 
120 | 0000071518 00000 n 
121 | 0000071776 00000 n 
122 | 0000071875 00000 n 
123 | 0000071977 00000 n 
124 | 0000072157 00000 n 
125 | 0000072335 00000 n 
126 | 0000072530 00000 n 
127 | 0000072789 00000 n 
128 | 0000073069 00000 n 
129 | 0000073318 00000 n 
130 | 0000073417 00000 n 
131 | 0000073620 00000 n 
132 | 0000073824 00000 n 
133 | 0000074028 00000 n 
134 | 0000074217 00000 n 
135 | 0000074410 00000 n 
136 | 0000074601 00000 n 
137 | 0000074798 00000 n 
138 | 0000074993 00000 n 
139 | 0000075181 00000 n 
140 | 0000075367 00000 n 
141 | 0000075556 00000 n 
142 | 0000075886 00000 n 
143 | 0000075948 00000 n 
144 | 0000075969 00000 n 
145 | 0000076316 00000 n 
146 | 0000076534 00000 n 
147 | 0000076740 00000 n 
148 | 0000076955 00000 n 
149 | 0000077302 00000 n 
150 | 0000077359 00000 n 
151 | 0000077380 00000 n 
152 | 0000077726 00000 n 
153 | 0000077938 00000 n 
154 | 0000078186 00000 n 
155 | 0000078430 00000 n 
156 | 0000078507 00000 n 
157 | 0000078587 00000 n 
158 | 0000078823 00000 n 
159 | 0000078900 00000 n 
160 | 0000078980 00000 n 
161 | 0000079191 00000 n 
162 | 0000079565 00000 n 
163 | 0000079615 00000 n 
164 | 0000079923 00000 n 
165 | 0000080350 00000 n 
166 | 0000080775 00000 n 
167 | 0000081137 00000 n 
168 | 0000081174 00000 n 
169 | 0000081623 00000 n 
170 | 0000082078 00000 n 
171 | 0000083307 00000 n 
172 | 0000083443 00000 n 
173 | 0000083964 00000 n 
174 | 0000084487 00000 n 
175 | 0000084760 00000 n 
176 | 0000084874 00000 n 
177 | 0000085149 00000 n 
178 | 0000085244 00000 n 
179 | 0000085509 00000 n 
180 | 0000085782 00000 n 
181 | 0000085881 00000 n 
182 | 0000086093 00000 n 
183 | 0000086366 00000 n 
184 | 0000086465 00000 n 
185 | 0000086677 00000 n 
186 | 0000086950 00000 n 
187 | 0000087049 00000 n 
188 | 0000087261 00000 n 
189 | 0000087535 00000 n 
190 | 0000087634 00000 n 
191 | 0000087846 00000 n 
192 | 0000088121 00000 n 
193 | 0000088220 00000 n 
194 | 0000088432 00000 n 
195 | 0000088707 00000 n 
196 | 0000088806 00000 n 
197 | 0000089018 00000 n 
198 | 0000089293 00000 n 
199 | 0000089392 00000 n 
200 | 0000089604 00000 n 
201 | 0000089703 00000 n 
202 | 0000089915 00000 n 
203 | 0000090205 00000 n 
204 | 0000090426 00000 n 
205 | 0000090644 00000 n 
206 | 0000090962 00000 n 
207 | 0000091019 00000 n 
208 | 0000091387 00000 n 
209 | 0000091493 00000 n 
210 | 0000091735 00000 n 
211 | 0000091834 00000 n 
212 | 0000091936 00000 n 
213 | 0000092175 00000 n 
214 | 0000092274 00000 n 
215 | 0000092376 00000 n 
216 | 0000092685 00000 n 
217 | 0000092801 00000 n 
218 | 0000092920 00000 n 
219 | 0000093134 00000 n 
220 | 0000093437 00000 n 
221 | 0000093553 00000 n 
222 | 0000093672 00000 n 
223 | 0000093886 00000 n 
224 | 0000094182 00000 n 
225 | 0000094298 00000 n 
226 | 0000094417 00000 n 
227 | 0000094631 00000 n 
228 | 0000094926 00000 n 
229 | 0000095042 00000 n 
230 | 0000095161 00000 n 
231 | 0000095375 00000 n 
232 | 0000095670 00000 n 
233 | 0000095786 00000 n 
234 | 0000095905 00000 n 
235 | 0000096119 00000 n 
236 | 0000096420 00000 n 
237 | 0000096536 00000 n 
238 | 0000096655 00000 n 
239 | 0000096869 00000 n 
240 | 0000097196 00000 n 
241 | 0000097312 00000 n 
242 | 0000097575 00000 n 
243 | 0000097694 00000 n 
244 | 0000097804 00000 n 
245 | 0000098018 00000 n 
246 | 0000098354 00000 n 
247 | 0000098619 00000 n 
248 | 0000098834 00000 n 
249 | 0000099123 00000 n 
250 | 0000099433 00000 n 
251 | 0000099532 00000 n 
252 | 0000099634 00000 n 
253 | 0000099876 00000 n 
254 | 0000100216 00000 n 
255 | 0000100484 00000 n 
256 | 0000100700 00000 n 
257 | 0000100990 00000 n 
258 | 0000101286 00000 n 
259 | 0000101554 00000 n 
260 | 0000101770 00000 n 
261 | 0000102060 00000 n 
262 | 0000102369 00000 n 
263 | 0000102634 00000 n 
264 | 0000102848 00000 n 
265 | 0000103135 00000 n 
266 | 0000103447 00000 n 
267 | 0000103713 00000 n 
268 | 0000103929 00000 n 
269 | 0000104217 00000 n 
270 | 0000104525 00000 n 
271 | 0000104789 00000 n 
272 | 0000105005 00000 n 
273 | 0000105292 00000 n 
274 | 0000105630 00000 n 
275 | 0000105978 00000 n 
276 | 0000106204 00000 n 
277 | 0000106431 00000 n 
278 | 0000106654 00000 n 
279 | 0000107023 00000 n 
280 | 0000107044 00000 n 
281 | 0000107067 00000 n 
282 | 0000107407 00000 n 
283 | 0000107623 00000 n 
284 | 0000107886 00000 n 
285 | 0000107963 00000 n 
286 | 0000108043 00000 n 
287 | 0000108294 00000 n 
288 | 0000108371 00000 n 
289 | 0000108451 00000 n 
290 | 0000108679 00000 n 
291 | 0000108928 00000 n 
292 | 0000108996 00000 n 
293 | 0000109212 00000 n 
294 | 0000109425 00000 n 
295 | 0000109736 00000 n 
296 | 0000110145 00000 n 
297 | 0000110166 00000 n 
298 | 0000110514 00000 n 
299 | 0000110825 00000 n 
300 | 0000111176 00000 n 
301 | 0000111511 00000 n 
302 | 0000111859 00000 n 
303 | 0000112062 00000 n 
304 | 0000112373 00000 n 
305 | 0000112456 00000 n 
306 | 0000112882 00000 n 
307 | 0000113180 00000 n 
308 | 0000113263 00000 n 
309 | 0000113535 00000 n 
310 | 0000113635 00000 n 
311 | 0000113899 00000 n 
312 | 0000114197 00000 n 
313 | 0000114297 00000 n 
314 | 0000114561 00000 n 
315 | 0000114859 00000 n 
316 | 0000114959 00000 n 
317 | 0000115222 00000 n 
318 | 0000115521 00000 n 
319 | 0000115633 00000 n 
320 | 0000115897 00000 n 
321 | 0000115997 00000 n 
322 | 0000116261 00000 n 
323 | 0000116486 00000 n 
324 | 0000116549 00000 n 
325 | 0000116814 00000 n 
326 | 0000117080 00000 n 
327 | 0000117336 00000 n 
328 | 0000117614 00000 n 
329 | 0000117895 00000 n 
330 | 


--------------------------------------------------------------------------------
/tests/fixture_data/testIsObjectFree/Seige_of_Vicksburg_Sample_OCR.pdf:
--------------------------------------------------------------------------------
 1 | 38 16
 2 | 0000000023 00000 n
 3 | 0000000593 00000 n
 4 | 0000000962 00000 n
 5 | 0000001150 00000 n
 6 | 0000002182 00000 n
 7 | 0000002299 00000 n
 8 | 0000005902 00000 n
 9 | 0000006033 00000 n
10 | 0000006086 00000 n
11 | 0000006133 00000 n
12 | 0000022142 00000 n
13 | 0000023745 00000 n
14 | 0000036171 00000 n
15 | 0000036218 00000 n
16 | 0000036362 00000 n
17 | 0000000761 00000 n
18 | 0 38
19 | 0000000000 65535 f 
20 | 0000036415 00000 n 
21 | 0000036608 00000 n 
22 | 0000040043 00000 n 
23 | 0000062526 00000 n 
24 | 0000064168 00000 n 
25 | 0000079123 00000 n 
26 | 0000079265 00000 n 
27 | 0000079460 00000 n 
28 | 0000080497 00000 n 
29 | 0000084022 00000 n 
30 | 0000103504 00000 n 
31 | 0000105136 00000 n 
32 | 0000119406 00000 n 
33 | 0000119550 00000 n 
34 | 0000119748 00000 n 
35 | 0000123678 00000 n 
36 | 0000124435 00000 n 
37 | 0000126063 00000 n 
38 | 0000141743 00000 n 
39 | 0000141887 00000 n 
40 | 0000142095 00000 n 
41 | 0000145954 00000 n 
42 | 0000146720 00000 n 
43 | 0000148350 00000 n 
44 | 0000164181 00000 n 
45 | 0000164325 00000 n 
46 | 0000164543 00000 n 
47 | 0000165585 00000 n 
48 | 0000169113 00000 n 
49 | 0000184252 00000 n 
50 | 0000185896 00000 n 
51 | 0000201138 00000 n 
52 | 0000201282 00000 n 
53 | 0000202316 00000 n 
54 | 0000203347 00000 n 
55 | 0000203433 00000 n 
56 | 0000205114 00000 n 
57 | 


--------------------------------------------------------------------------------
/tests/fixture_data/testIsObjectFree/jpeg.pdf:
--------------------------------------------------------------------------------
 1 | 0 17
 2 | 0000000000 65535 f 
 3 | 0000099839 00000 n 
 4 | 0000000019 00000 n 
 5 | 0000000276 00000 n 
 6 | 0000000296 00000 n 
 7 | 0000041872 00000 n 
 8 | 0000091180 00000 n 
 9 | 0000099982 00000 n 
10 | 0000091202 00000 n 
11 | 0000098889 00000 n 
12 | 0000098910 00000 n 
13 | 0000099106 00000 n 
14 | 0000099476 00000 n 
15 | 0000099707 00000 n 
16 | 0000099740 00000 n 
17 | 0000100081 00000 n 
18 | 0000100178 00000 n 
19 | 


--------------------------------------------------------------------------------
/tests/fixture_data/testReadXRefStreamCompressedObjects/crazyones.pdf:
--------------------------------------------------------------------------------
 1 | 8 0 <</Font<</F1 4 0 R/F2 5 0 R/F3 6 0 R>>/ProcSet[/PDF/Text/ImageC/ImageB/ImageI]>>
 2 | 3 81 <</Resources 8 0 R/Type/Page/Parent 10 0 R/Contents[7 0 R]>>
 3 | 10 142 <</Type/Pages/Count 1/Kids[3 0 R]/MediaBox[0 0 612 792]>>
 4 | 2 200 <</Creator( XeTeX output 2015.06.04:1334)/Producer(xdvipdfmx \(20140317\))/CreationDate(D:20150604133406-06'00')>>
 5 | 1 315 <</Pages 10 0 R/Type/Catalog>>
 6 | 11 346 [684.7 0 0 0 0 0 0 0 0 0 0 0 733.6 0 0 0 0 684.7 0 0 0 0 0 0 0 0 0 0 0 0 489.1 0 0 0 440.2 0 0 489.1 0 0 0 0 0 538 0 0 0 403.5 391.3 0 0 0 0 0 464.6 391.3]
 7 | 13 502 [299.9 0 0 0 0 499.9 0 0 499.9 0 0 0 499.9 499.9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 749.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 449.9 449.9 0 449.9 0 0 0 0 0 0 0 0 0 499.9 0 0 412.4 0 324.9]
 8 | 15 698 [599.4 570.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285.4 0 285.4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 485.2 0 770.5 727.8 0 0 0 0 0 770.5 0 0 0 0 941.7 0 799.2 0 0 0 0 742.1 0 0 1055.9 0 770.5 0 0 0 0 0 0 0 513.8 570.8 456.7 570.8 457.1 314 513.8 570.8 285.4 0 542.3 285.4 856.3 570.8 513.8 570.8 542.3 401.9 405.3 399.6 570.8 542.3 742.1 542.3 542.3 456.7]
 9 | 4 1043 <</Type/Font/Subtype/Type1/Widths 11 0 R/FirstChar 67/LastChar 122/Encoding 17 0 R/ToUnicode 18 0 R/BaseFont/HHXGQB+SFTI1440/FontDescriptor 19 0 R>>
10 | 19 1192 <</Type/FontDescriptor/CapHeight 684/Ascent 695/Descent -193/ItalicAngle -14.04/StemV 50/Flags 70/FontBBox[-88 -320 1347 942]/FontFile3 12 0 R/FontName/HHXGQB+SFTI1440>>
11 | 5 1362 <</Type/Font/Subtype/Type1/Widths 13 0 R/FirstChar 44/LastChar 116/Encoding 17 0 R/ToUnicode 18 0 R/BaseFont/YISQAD+SFTI1200/FontDescriptor 20 0 R>>
12 | 20 1511 <</Type/FontDescriptor/CapHeight 684/Ascent 695/Descent -193/ItalicAngle -14.04/StemV 50/Flags 70/FontBBox[-91 -320 1380 938]/FontFile3 14 0 R/FontName/YISQAD+SFTI1200>>
13 | 6 1681 <</Type/Font/Subtype/Type1/Widths 15 0 R/FirstChar 27/LastChar 122/Encoding 17 0 R/ToUnicode 18 0 R/BaseFont/TITXYI+SFRM0900/FontDescriptor 21 0 R>>
14 | 21 1830 <</Type/FontDescriptor/CapHeight 684/Ascent 695/Descent -193/ItalicAngle 0/StemV 50/Flags 6/FontBBox[-196 -322 1502 937]/FontFile3 16 0 R/FontName/TITXYI+SFRM0900>>
15 | 17 1995 <</BaseEncoding/WinAnsiEncoding/Differences[27/ff/fi]>>


--------------------------------------------------------------------------------
/tests/fixture_data/testXRefStreamObjects/crazyones.pdf:
--------------------------------------------------------------------------------
 1 | % The size of the XRef-Stream dictionary is 115 bytes, accounting for 23.000000
 2 | % items
 3 | 0 0 65535
 4 | 1 15 0
 5 | 1 10245 0
 6 | 1 939 0
 7 | 1 2999 0
 8 | 1 4982 0
 9 | 1 9949 0
10 | 1 11160 0
11 | 2 9 4
12 | 2 9 3
13 | 2 9 1
14 | 2 9 8
15 | 2 9 10
16 | 2 9 12
17 | 2 9 0
18 | 2 9 2
19 | 2 9 5
20 | 2 9 6
21 | 2 9 7
22 | 2 9 14
23 | 2 9 9
24 | 2 9 11
25 | 2 9 13
26 | 
27 | 


--------------------------------------------------------------------------------
/tests/fixture_data/testXRefTableObjects/SF424_page2.pdf:
--------------------------------------------------------------------------------
  1 | % This is the XRef Table excerpt of the SF424_page2.pdf file
  2 | 0 328
  3 | 0000000000 65535 f 
  4 | 0000000009 00000 n 
  5 | 0000000068 00000 n 
  6 | 0000000169 00000 n 
  7 | 0000000218 00000 n 
  8 | 0000000472 00000 n 
  9 | 0000002469 00000 n 
 10 | 0000002556 00000 n 
 11 | 0000003162 00000 n 
 12 | 0000003404 00000 n 
 13 | 0000004048 00000 n 
 14 | 0000004297 00000 n 
 15 | 0000005487 00000 n 
 16 | 0000005729 00000 n 
 17 | 0000005906 00000 n 
 18 | 0000006091 00000 n 
 19 | 0000006200 00000 n 
 20 | 0000006386 00000 n 
 21 | 0000006601 00000 n 
 22 | 0000006859 00000 n 
 23 | 0000011230 00000 n 
 24 | 0000011340 00000 n 
 25 | 0000011416 00000 n 
 26 | 0000011477 00000 n 
 27 | 0000011734 00000 n 
 28 | 0000015083 00000 n 
 29 | 0000015514 00000 n 
 30 | 0000015769 00000 n 
 31 | 0000016099 00000 n 
 32 | 0000016199 00000 n 
 33 | 0000017526 00000 n 
 34 | 0000017702 00000 n 
 35 | 0000017817 00000 n 
 36 | 0000018002 00000 n 
 37 | 0000018215 00000 n 
 38 | 0000018401 00000 n 
 39 | 0000018587 00000 n 
 40 | 0000018844 00000 n 
 41 | 0000019844 00000 n 
 42 | 0000021034 00000 n 
 43 | 0000021082 00000 n 
 44 | 0000021268 00000 n 
 45 | 0000021549 00000 n 
 46 | 0000021752 00000 n 
 47 | 0000021945 00000 n 
 48 | 0000022150 00000 n 
 49 | 0000022446 00000 n 
 50 | 0000028034 00000 n 
 51 | 0000028067 00000 n 
 52 | 0000028110 00000 n 
 53 | 0000028168 00000 n 
 54 | 0000028490 00000 n 
 55 | 0000028865 00000 n 
 56 | 0000029069 00000 n 
 57 | 0000029112 00000 n 
 58 | 0000029170 00000 n 
 59 | 0000029492 00000 n 
 60 | 0000029867 00000 n 
 61 | 0000030072 00000 n 
 62 | 0000030115 00000 n 
 63 | 0000030173 00000 n 
 64 | 0000030495 00000 n 
 65 | 0000030870 00000 n 
 66 | 0000031075 00000 n 
 67 | 0000031118 00000 n 
 68 | 0000031176 00000 n 
 69 | 0000031498 00000 n 
 70 | 0000031873 00000 n 
 71 | 0000032136 00000 n 
 72 | 0000041460 00000 n 
 73 | 0000041506 00000 n 
 74 | 0000041922 00000 n 
 75 | 0000042344 00000 n 
 76 | 0000046101 00000 n 
 77 | 0000046622 00000 n 
 78 | 0000047217 00000 n 
 79 | 0000051163 00000 n 
 80 | 0000051721 00000 n 
 81 | 0000052453 00000 n 
 82 | 0000057379 00000 n 
 83 | 0000057406 00000 n 
 84 | 0000057611 00000 n 
 85 | 0000057654 00000 n 
 86 | 0000057712 00000 n 
 87 | 0000058034 00000 n 
 88 | 0000058409 00000 n 
 89 | 0000058672 00000 n 
 90 | 0000063653 00000 n 
 91 | 0000063680 00000 n 
 92 | 0000063723 00000 n 
 93 | 0000063781 00000 n 
 94 | 0000064103 00000 n 
 95 | 0000064478 00000 n 
 96 | 0000064769 00000 n 
 97 | 0000065036 00000 n 
 98 | 0000065251 00000 n 
 99 | 0000065541 00000 n 
100 | 0000065826 00000 n 
101 | 0000066095 00000 n 
102 | 0000066310 00000 n 
103 | 0000066600 00000 n 
104 | 0000066948 00000 n 
105 | 0000067216 00000 n 
106 | 0000067432 00000 n 
107 | 0000067723 00000 n 
108 | 0000068015 00000 n 
109 | 0000068280 00000 n 
110 | 0000068495 00000 n 
111 | 0000068783 00000 n 
112 | 0000069172 00000 n 
113 | 0000069437 00000 n 
114 | 0000069652 00000 n 
115 | 0000069940 00000 n 
116 | 0000070310 00000 n 
117 | 0000070577 00000 n 
118 | 0000070792 00000 n 
119 | 0000071082 00000 n 
120 | 0000071314 00000 n 
121 | 0000071518 00000 n 
122 | 0000071776 00000 n 
123 | 0000071875 00000 n 
124 | 0000071977 00000 n 
125 | 0000072157 00000 n 
126 | 0000072335 00000 n 
127 | 0000072530 00000 n 
128 | 0000072789 00000 n 
129 | 0000073069 00000 n 
130 | 0000073318 00000 n 
131 | 0000073417 00000 n 
132 | 0000073620 00000 n 
133 | 0000073824 00000 n 
134 | 0000074028 00000 n 
135 | 0000074217 00000 n 
136 | 0000074410 00000 n 
137 | 0000074601 00000 n 
138 | 0000074798 00000 n 
139 | 0000074993 00000 n 
140 | 0000075181 00000 n 
141 | 0000075367 00000 n 
142 | 0000075556 00000 n 
143 | 0000075886 00000 n 
144 | 0000075948 00000 n 
145 | 0000075969 00000 n 
146 | 0000076316 00000 n 
147 | 0000076534 00000 n 
148 | 0000076740 00000 n 
149 | 0000076955 00000 n 
150 | 0000077302 00000 n 
151 | 0000077359 00000 n 
152 | 0000077380 00000 n 
153 | 0000077726 00000 n 
154 | 0000077938 00000 n 
155 | 0000078186 00000 n 
156 | 0000078430 00000 n 
157 | 0000078507 00000 n 
158 | 0000078587 00000 n 
159 | 0000078823 00000 n 
160 | 0000078900 00000 n 
161 | 0000078980 00000 n 
162 | 0000079191 00000 n 
163 | 0000079565 00000 n 
164 | 0000079615 00000 n 
165 | 0000079923 00000 n 
166 | 0000080350 00000 n 
167 | 0000080775 00000 n 
168 | 0000081137 00000 n 
169 | 0000081174 00000 n 
170 | 0000081623 00000 n 
171 | 0000082078 00000 n 
172 | 0000083307 00000 n 
173 | 0000083443 00000 n 
174 | 0000083964 00000 n 
175 | 0000084487 00000 n 
176 | 0000084760 00000 n 
177 | 0000084874 00000 n 
178 | 0000085149 00000 n 
179 | 0000085244 00000 n 
180 | 0000085509 00000 n 
181 | 0000085782 00000 n 
182 | 0000085881 00000 n 
183 | 0000086093 00000 n 
184 | 0000086366 00000 n 
185 | 0000086465 00000 n 
186 | 0000086677 00000 n 
187 | 0000086950 00000 n 
188 | 0000087049 00000 n 
189 | 0000087261 00000 n 
190 | 0000087535 00000 n 
191 | 0000087634 00000 n 
192 | 0000087846 00000 n 
193 | 0000088121 00000 n 
194 | 0000088220 00000 n 
195 | 0000088432 00000 n 
196 | 0000088707 00000 n 
197 | 0000088806 00000 n 
198 | 0000089018 00000 n 
199 | 0000089293 00000 n 
200 | 0000089392 00000 n 
201 | 0000089604 00000 n 
202 | 0000089703 00000 n 
203 | 0000089915 00000 n 
204 | 0000090205 00000 n 
205 | 0000090426 00000 n 
206 | 0000090644 00000 n 
207 | 0000090962 00000 n 
208 | 0000091019 00000 n 
209 | 0000091387 00000 n 
210 | 0000091493 00000 n 
211 | 0000091735 00000 n 
212 | 0000091834 00000 n 
213 | 0000091936 00000 n 
214 | 0000092175 00000 n 
215 | 0000092274 00000 n 
216 | 0000092376 00000 n 
217 | 0000092685 00000 n 
218 | 0000092801 00000 n 
219 | 0000092920 00000 n 
220 | 0000093134 00000 n 
221 | 0000093437 00000 n 
222 | 0000093553 00000 n 
223 | 0000093672 00000 n 
224 | 0000093886 00000 n 
225 | 0000094182 00000 n 
226 | 0000094298 00000 n 
227 | 0000094417 00000 n 
228 | 0000094631 00000 n 
229 | 0000094926 00000 n 
230 | 0000095042 00000 n 
231 | 0000095161 00000 n 
232 | 0000095375 00000 n 
233 | 0000095670 00000 n 
234 | 0000095786 00000 n 
235 | 0000095905 00000 n 
236 | 0000096119 00000 n 
237 | 0000096420 00000 n 
238 | 0000096536 00000 n 
239 | 0000096655 00000 n 
240 | 0000096869 00000 n 
241 | 0000097196 00000 n 
242 | 0000097312 00000 n 
243 | 0000097575 00000 n 
244 | 0000097694 00000 n 
245 | 0000097804 00000 n 
246 | 0000098018 00000 n 
247 | 0000098354 00000 n 
248 | 0000098619 00000 n 
249 | 0000098834 00000 n 
250 | 0000099123 00000 n 
251 | 0000099433 00000 n 
252 | 0000099532 00000 n 
253 | 0000099634 00000 n 
254 | 0000099876 00000 n 
255 | 0000100216 00000 n 
256 | 0000100484 00000 n 
257 | 0000100700 00000 n 
258 | 0000100990 00000 n 
259 | 0000101286 00000 n 
260 | 0000101554 00000 n 
261 | 0000101770 00000 n 
262 | 0000102060 00000 n 
263 | 0000102369 00000 n 
264 | 0000102634 00000 n 
265 | 0000102848 00000 n 
266 | 0000103135 00000 n 
267 | 0000103447 00000 n 
268 | 0000103713 00000 n 
269 | 0000103929 00000 n 
270 | 0000104217 00000 n 
271 | 0000104525 00000 n 
272 | 0000104789 00000 n 
273 | 0000105005 00000 n 
274 | 0000105292 00000 n 
275 | 0000105630 00000 n 
276 | 0000105978 00000 n 
277 | 0000106204 00000 n 
278 | 0000106431 00000 n 
279 | 0000106654 00000 n 
280 | 0000107023 00000 n 
281 | 0000107044 00000 n 
282 | 0000107067 00000 n 
283 | 0000107407 00000 n 
284 | 0000107623 00000 n 
285 | 0000107886 00000 n 
286 | 0000107963 00000 n 
287 | 0000108043 00000 n 
288 | 0000108294 00000 n 
289 | 0000108371 00000 n 
290 | 0000108451 00000 n 
291 | 0000108679 00000 n 
292 | 0000108928 00000 n 
293 | 0000108996 00000 n 
294 | 0000109212 00000 n 
295 | 0000109425 00000 n 
296 | 0000109736 00000 n 
297 | 0000110145 00000 n 
298 | 0000110166 00000 n 
299 | 0000110514 00000 n 
300 | 0000110825 00000 n 
301 | 0000111176 00000 n 
302 | 0000111511 00000 n 
303 | 0000111859 00000 n 
304 | 0000112062 00000 n 
305 | 0000112373 00000 n 
306 | 0000112456 00000 n 
307 | 0000112882 00000 n 
308 | 0000113180 00000 n 
309 | 0000113263 00000 n 
310 | 0000113535 00000 n 
311 | 0000113635 00000 n 
312 | 0000113899 00000 n 
313 | 0000114197 00000 n 
314 | 0000114297 00000 n 
315 | 0000114561 00000 n 
316 | 0000114859 00000 n 
317 | 0000114959 00000 n 
318 | 0000115222 00000 n 
319 | 0000115521 00000 n 
320 | 0000115633 00000 n 
321 | 0000115897 00000 n 
322 | 0000115997 00000 n 
323 | 0000116261 00000 n 
324 | 0000116486 00000 n 
325 | 0000116549 00000 n 
326 | 0000116814 00000 n 
327 | 0000117080 00000 n 
328 | 0000117336 00000 n 
329 | 0000117614 00000 n 
330 | 0000117895 00000 n
331 | 


--------------------------------------------------------------------------------
/tests/fixture_data/testXRefTableObjects/Seige_of_Vicksburg_Sample_OCR.pdf:
--------------------------------------------------------------------------------
 1 | % This is the XRef Table excerpt of the Seige_of_Vicksburg_Sample_OCR.pdf file
 2 | 38 16
 3 | 0000000023 00000 n 
 4 | 0000000593 00000 n 
 5 | 0000000962 00000 n 
 6 | 0000001150 00000 n 
 7 | 0000002182 00000 n 
 8 | 0000002299 00000 n 
 9 | 0000005902 00000 n 
10 | 0000006033 00000 n 
11 | 0000006086 00000 n 
12 | 0000006133 00000 n 
13 | 0000022142 00000 n 
14 | 0000023745 00000 n 
15 | 0000036171 00000 n 
16 | 0000036218 00000 n 
17 | 0000036362 00000 n 
18 | 0000000761 00000 n 
19 | 0 38
20 | 0000000000 65535 f 
21 | 0000036415 00000 n 
22 | 0000036608 00000 n 
23 | 0000040043 00000 n 
24 | 0000062526 00000 n 
25 | 0000064168 00000 n 
26 | 0000079123 00000 n 
27 | 0000079265 00000 n 
28 | 0000079460 00000 n 
29 | 0000080497 00000 n 
30 | 0000084022 00000 n 
31 | 0000103504 00000 n 
32 | 0000105136 00000 n 
33 | 0000119406 00000 n 
34 | 0000119550 00000 n 
35 | 0000119748 00000 n 
36 | 0000123678 00000 n 
37 | 0000124435 00000 n 
38 | 0000126063 00000 n 
39 | 0000141743 00000 n 
40 | 0000141887 00000 n 
41 | 0000142095 00000 n 
42 | 0000145954 00000 n 
43 | 0000146720 00000 n 
44 | 0000148350 00000 n 
45 | 0000164181 00000 n 
46 | 0000164325 00000 n 
47 | 0000164543 00000 n 
48 | 0000165585 00000 n 
49 | 0000169113 00000 n 
50 | 0000184252 00000 n 
51 | 0000185896 00000 n 
52 | 0000201138 00000 n 
53 | 0000201282 00000 n 
54 | 0000202316 00000 n 
55 | 0000203347 00000 n 
56 | 0000203433 00000 n 
57 | 0000205114 00000 n 
58 | 


--------------------------------------------------------------------------------
/tests/fixture_data/testXRefTableObjects/jpeg.pdf:
--------------------------------------------------------------------------------
 1 | % This is the XRef Table excerpt of the jpeg.pdf file
 2 | 0 17
 3 | 0000000000 65535 f 
 4 | 0000099839 00000 n 
 5 | 0000000019 00000 n 
 6 | 0000000276 00000 n 
 7 | 0000000296 00000 n 
 8 | 0000041872 00000 n 
 9 | 0000091180 00000 n 
10 | 0000099982 00000 n 
11 | 0000091202 00000 n 
12 | 0000098889 00000 n 
13 | 0000098910 00000 n 
14 | 0000099106 00000 n 
15 | 0000099476 00000 n 
16 | 0000099707 00000 n 
17 | 0000099740 00000 n 
18 | 0000100081 00000 n 
19 | 0000100178 00000 n 
20 | 


--------------------------------------------------------------------------------
/tests/fixture_data/testXTableAgainstXStream/GeoBase_NHNC1_Data_Model_UML_EN.pdf:
--------------------------------------------------------------------------------
  1 | % This is the XRef Table excerpt of the GeoBase_NHNC1_Data_Model_UML_EN.pdf
  2 | % file, encoding objects that are "ignored" by a PDF reader supporting only
  3 | % version 1.4 or lower, but that a reader capable of version 1.5 or higher
  4 | % retrieves from the XRef Stream.
  5 | 0 678
  6 | 0000000119 65535 f
  7 | 0000000017 00000 n
  8 | 0000000126 00000 n
  9 | 0000000309 00000 n
 10 | 0000000642 00000 n
 11 | 0000002376 00000 n
 12 | 0000002536 00000 n
 13 | 0000002760 00000 n
 14 | 0000005960 00000 n
 15 | 0000006125 00000 n
 16 | 0000006354 00000 n
 17 | 0000006530 00000 n
 18 | 0000006777 00000 n
 19 | 0000006915 00000 n
 20 | 0000006945 00000 n
 21 | 0000007111 00000 n
 22 | 0000007185 00000 n
 23 | 0000007432 00000 n
 24 | 0000007603 00000 n
 25 | 0000007845 00000 n
 26 | 0000008005 00000 n
 27 | 0000008165 00000 n
 28 | 0000008925 00000 n
 29 | 0000009176 00000 n
 30 | 0000010704 00000 n
 31 | 0000010879 00000 n
 32 | 0000011118 00000 n
 33 | 0000011379 00000 n
 34 | 0000015477 00000 n
 35 | 0000015601 00000 n
 36 | 0000015631 00000 n
 37 | 0000015783 00000 n
 38 | 0000015857 00000 n
 39 | 0000016100 00000 n
 40 | 0000016225 00000 n
 41 | 0000016255 00000 n
 42 | 0000016408 00000 n
 43 | 0000016482 00000 n
 44 | 0000016713 00000 n
 45 | 0000017071 00000 n
 46 | 0000019176 00000 n
 47 | 0000019302 00000 n
 48 | 0000019592 00000 n
 49 | 0000019898 00000 n
 50 | 0000020024 00000 n
 51 | 0000020150 00000 n
 52 | 0000020461 00000 n
 53 | 0000020590 00000 n
 54 | 0000020868 00000 n
 55 | 0000020999 00000 n
 56 | 0000021304 00000 n
 57 | 0000021434 00000 n
 58 | 0000021603 00000 n
 59 | 0000021837 00000 n
 60 | 0000021967 00000 n
 61 | 0000022283 00000 n
 62 | 0000022413 00000 n
 63 | 0000022720 00000 n
 64 | 0000022850 00000 n
 65 | 0000023170 00000 n
 66 | 0000023490 00000 n
 67 | 0000023800 00000 n
 68 | 0000023930 00000 n
 69 | 0000024250 00000 n
 70 | 0000024570 00000 n
 71 | 0000024890 00000 n
 72 | 0000025188 00000 n
 73 | 0000025318 00000 n
 74 | 0000025663 00000 n
 75 | 0000026673 00000 n
 76 | 0000026727 00000 n
 77 | 0000026781 00000 n
 78 | 0000030750 00000 n
 79 | 0000030882 00000 n
 80 | 0000031011 00000 n
 81 | 0000031041 00000 n
 82 | 0000031198 00000 n
 83 | 0000031272 00000 n
 84 | 0000031520 00000 n
 85 | 0000035757 00000 n
 86 | 0000036682 00000 n
 87 | 0000037439 00000 n
 88 | 0000057203 00000 n
 89 | 0000058791 00000 n
 90 | 0000059493 00000 n
 91 | 0000064855 00000 n
 92 | 0000065023 00000 n
 93 | 0000065253 00000 n
 94 | 0000065377 00000 n
 95 | 0000065407 00000 n
 96 | 0000065559 00000 n
 97 | 0000065633 00000 n
 98 | 0000065876 00000 n
 99 | 0000066039 00000 n
100 | 0000066264 00000 n
101 | 0000066438 00000 n
102 | 0000066676 00000 n
103 | 0000066846 00000 n
104 | 0000067080 00000 n
105 | 0000067769 00000 n
106 | 0000070097 00000 n
107 | 0000070782 00000 n
108 | 0000074703 00000 n
109 | 0000075372 00000 n
110 | 0000079610 00000 n
111 | 0000080530 00000 n
112 | 0000086751 00000 n
113 | 0000087445 00000 n
114 | 0000094019 00000 n
115 | 0000094195 00000 n
116 | 0000094434 00000 n
117 | 0000095146 00000 n
118 | 0000097610 00000 n
119 | 0000098290 00000 n
120 | 0000101036 00000 n
121 | 0000101753 00000 n
122 | 0000106802 00000 n
123 | 0000107713 00000 n
124 | 0000109253 00000 n
125 | 0000000120 65535 f
126 | 0000000121 65535 f
127 | 0000000122 65535 f
128 | 0000000123 65535 f
129 | 0000000124 65535 f
130 | 0000000125 65535 f
131 | 0000000126 65535 f
132 | 0000000127 65535 f
133 | 0000000128 65535 f
134 | 0000000129 65535 f
135 | 0000000130 65535 f
136 | 0000000131 65535 f
137 | 0000000132 65535 f
138 | 0000000133 65535 f
139 | 0000000134 65535 f
140 | 0000000135 65535 f
141 | 0000000136 65535 f
142 | 0000000137 65535 f
143 | 0000000138 65535 f
144 | 0000000139 65535 f
145 | 0000000140 65535 f
146 | 0000000141 65535 f
147 | 0000000142 65535 f
148 | 0000000143 65535 f
149 | 0000000144 65535 f
150 | 0000000145 65535 f
151 | 0000000146 65535 f
152 | 0000000147 65535 f
153 | 0000000148 65535 f
154 | 0000000149 65535 f
155 | 0000000150 65535 f
156 | 0000000151 65535 f
157 | 0000000152 65535 f
158 | 0000000153 65535 f
159 | 0000000154 65535 f
160 | 0000000155 65535 f
161 | 0000000156 65535 f
162 | 0000000157 65535 f
163 | 0000000158 65535 f
164 | 0000000159 65535 f
165 | 0000000160 65535 f
166 | 0000000161 65535 f
167 | 0000000162 65535 f
168 | 0000000163 65535 f
169 | 0000000164 65535 f
170 | 0000000165 65535 f
171 | 0000000166 65535 f
172 | 0000000167 65535 f
173 | 0000000168 65535 f
174 | 0000000169 65535 f
175 | 0000000170 65535 f
176 | 0000000171 65535 f
177 | 0000000172 65535 f
178 | 0000000173 65535 f
179 | 0000000174 65535 f
180 | 0000000175 65535 f
181 | 0000000176 65535 f
182 | 0000000177 65535 f
183 | 0000000178 65535 f
184 | 0000000179 65535 f
185 | 0000000180 65535 f
186 | 0000000181 65535 f
187 | 0000000182 65535 f
188 | 0000000183 65535 f
189 | 0000000184 65535 f
190 | 0000000185 65535 f
191 | 0000000186 65535 f
192 | 0000000187 65535 f
193 | 0000000188 65535 f
194 | 0000000189 65535 f
195 | 0000000190 65535 f
196 | 0000000191 65535 f
197 | 0000000192 65535 f
198 | 0000000193 65535 f
199 | 0000000194 65535 f
200 | 0000000195 65535 f
201 | 0000000196 65535 f
202 | 0000000197 65535 f
203 | 0000000198 65535 f
204 | 0000000199 65535 f
205 | 0000000200 65535 f
206 | 0000000201 65535 f
207 | 0000000202 65535 f
208 | 0000000203 65535 f
209 | 0000000204 65535 f
210 | 0000000205 65535 f
211 | 0000000206 65535 f
212 | 0000000207 65535 f
213 | 0000000208 65535 f
214 | 0000000209 65535 f
215 | 0000000210 65535 f
216 | 0000000211 65535 f
217 | 0000000212 65535 f
218 | 0000000213 65535 f
219 | 0000000214 65535 f
220 | 0000000215 65535 f
221 | 0000000216 65535 f
222 | 0000000217 65535 f
223 | 0000000218 65535 f
224 | 0000000219 65535 f
225 | 0000000220 65535 f
226 | 0000000221 65535 f
227 | 0000000222 65535 f
228 | 0000000223 65535 f
229 | 0000000224 65535 f
230 | 0000000225 65535 f
231 | 0000000226 65535 f
232 | 0000000227 65535 f
233 | 0000000228 65535 f
234 | 0000000229 65535 f
235 | 0000000230 65535 f
236 | 0000000231 65535 f
237 | 0000000232 65535 f
238 | 0000000233 65535 f
239 | 0000000234 65535 f
240 | 0000000235 65535 f
241 | 0000000236 65535 f
242 | 0000000237 65535 f
243 | 0000000238 65535 f
244 | 0000000239 65535 f
245 | 0000000240 65535 f
246 | 0000000241 65535 f
247 | 0000000242 65535 f
248 | 0000000243 65535 f
249 | 0000000244 65535 f
250 | 0000000245 65535 f
251 | 0000000246 65535 f
252 | 0000000247 65535 f
253 | 0000000248 65535 f
254 | 0000000249 65535 f
255 | 0000000250 65535 f
256 | 0000000251 65535 f
257 | 0000000252 65535 f
258 | 0000000253 65535 f
259 | 0000000254 65535 f
260 | 0000000255 65535 f
261 | 0000000256 65535 f
262 | 0000000257 65535 f
263 | 0000000258 65535 f
264 | 0000000259 65535 f
265 | 0000000260 65535 f
266 | 0000000261 65535 f
267 | 0000000262 65535 f
268 | 0000000263 65535 f
269 | 0000000264 65535 f
270 | 0000000265 65535 f
271 | 0000000266 65535 f
272 | 0000000267 65535 f
273 | 0000000268 65535 f
274 | 0000000269 65535 f
275 | 0000000270 65535 f
276 | 0000000271 65535 f
277 | 0000000272 65535 f
278 | 0000000273 65535 f
279 | 0000000274 65535 f
280 | 0000000275 65535 f
281 | 0000000276 65535 f
282 | 0000000277 65535 f
283 | 0000000278 65535 f
284 | 0000000279 65535 f
285 | 0000000280 65535 f
286 | 0000000281 65535 f
287 | 0000000282 65535 f
288 | 0000000283 65535 f
289 | 0000000284 65535 f
290 | 0000000285 65535 f
291 | 0000000286 65535 f
292 | 0000000287 65535 f
293 | 0000000288 65535 f
294 | 0000000289 65535 f
295 | 0000000290 65535 f
296 | 0000000291 65535 f
297 | 0000000292 65535 f
298 | 0000000293 65535 f
299 | 0000000294 65535 f
300 | 0000000295 65535 f
301 | 0000000296 65535 f
302 | 0000000297 65535 f
303 | 0000000298 65535 f
304 | 0000000299 65535 f
305 | 0000000300 65535 f
306 | 0000000301 65535 f
307 | 0000000302 65535 f
308 | 0000000303 65535 f
309 | 0000000304 65535 f
310 | 0000000305 65535 f
311 | 0000000306 65535 f
312 | 0000000307 65535 f
313 | 0000000308 65535 f
314 | 0000000309 65535 f
315 | 0000000310 65535 f
316 | 0000000311 65535 f
317 | 0000000312 65535 f
318 | 0000000313 65535 f
319 | 0000000314 65535 f
320 | 0000000315 65535 f
321 | 0000000316 65535 f
322 | 0000000317 65535 f
323 | 0000000318 65535 f
324 | 0000000319 65535 f
325 | 0000000320 65535 f
326 | 0000000321 65535 f
327 | 0000000322 65535 f
328 | 0000000323 65535 f
329 | 0000000324 65535 f
330 | 0000000325 65535 f
331 | 0000000326 65535 f
332 | 0000000327 65535 f
333 | 0000000328 65535 f
334 | 0000000329 65535 f
335 | 0000000330 65535 f
336 | 0000000331 65535 f
337 | 0000000332 65535 f
338 | 0000000333 65535 f
339 | 0000000334 65535 f
340 | 0000000335 65535 f
341 | 0000000336 65535 f
342 | 0000000337 65535 f
343 | 0000000338 65535 f
344 | 0000000339 65535 f
345 | 0000000340 65535 f
346 | 0000000341 65535 f
347 | 0000000342 65535 f
348 | 0000000343 65535 f
349 | 0000000344 65535 f
350 | 0000000345 65535 f
351 | 0000000346 65535 f
352 | 0000000347 65535 f
353 | 0000000348 65535 f
354 | 0000000349 65535 f
355 | 0000000350 65535 f
356 | 0000000351 65535 f
357 | 0000000352 65535 f
358 | 0000000353 65535 f
359 | 0000000354 65535 f
360 | 0000000355 65535 f
361 | 0000000356 65535 f
362 | 0000000357 65535 f
363 | 0000000358 65535 f
364 | 0000000359 65535 f
365 | 0000000360 65535 f
366 | 0000000361 65535 f
367 | 0000000362 65535 f
368 | 0000000363 65535 f
369 | 0000000364 65535 f
370 | 0000000365 65535 f
371 | 0000000366 65535 f
372 | 0000000367 65535 f
373 | 0000000368 65535 f
374 | 0000000369 65535 f
375 | 0000000370 65535 f
376 | 0000000371 65535 f
377 | 0000000372 65535 f
378 | 0000000373 65535 f
379 | 0000000374 65535 f
380 | 0000000375 65535 f
381 | 0000000376 65535 f
382 | 0000000377 65535 f
383 | 0000000378 65535 f
384 | 0000000379 65535 f
385 | 0000000380 65535 f
386 | 0000000381 65535 f
387 | 0000000382 65535 f
388 | 0000000383 65535 f
389 | 0000000384 65535 f
390 | 0000000385 65535 f
391 | 0000000386 65535 f
392 | 0000000387 65535 f
393 | 0000000388 65535 f
394 | 0000000389 65535 f
395 | 0000000390 65535 f
396 | 0000000391 65535 f
397 | 0000000392 65535 f
398 | 0000000393 65535 f
399 | 0000000394 65535 f
400 | 0000000395 65535 f
401 | 0000000396 65535 f
402 | 0000000397 65535 f
403 | 0000000398 65535 f
404 | 0000000399 65535 f
405 | 0000000400 65535 f
406 | 0000000401 65535 f
407 | 0000000402 65535 f
408 | 0000000403 65535 f
409 | 0000000404 65535 f
410 | 0000000405 65535 f
411 | 0000000406 65535 f
412 | 0000000407 65535 f
413 | 0000000408 65535 f
414 | 0000000409 65535 f
415 | 0000000410 65535 f
416 | 0000000411 65535 f
417 | 0000000412 65535 f
418 | 0000000413 65535 f
419 | 0000000414 65535 f
420 | 0000000415 65535 f
421 | 0000000416 65535 f
422 | 0000000417 65535 f
423 | 0000000418 65535 f
424 | 0000000419 65535 f
425 | 0000000420 65535 f
426 | 0000000421 65535 f
427 | 0000000422 65535 f
428 | 0000000423 65535 f
429 | 0000000424 65535 f
430 | 0000000425 65535 f
431 | 0000000426 65535 f
432 | 0000000427 65535 f
433 | 0000000428 65535 f
434 | 0000000429 65535 f
435 | 0000000430 65535 f
436 | 0000000431 65535 f
437 | 0000000432 65535 f
438 | 0000000433 65535 f
439 | 0000000434 65535 f
440 | 0000000435 65535 f
441 | 0000000436 65535 f
442 | 0000000437 65535 f
443 | 0000000438 65535 f
444 | 0000000439 65535 f
445 | 0000000440 65535 f
446 | 0000000441 65535 f
447 | 0000000442 65535 f
448 | 0000000443 65535 f
449 | 0000000444 65535 f
450 | 0000000445 65535 f
451 | 0000000446 65535 f
452 | 0000000447 65535 f
453 | 0000000448 65535 f
454 | 0000000449 65535 f
455 | 0000000450 65535 f
456 | 0000000451 65535 f
457 | 0000000452 65535 f
458 | 0000000453 65535 f
459 | 0000000454 65535 f
460 | 0000000455 65535 f
461 | 0000000456 65535 f
462 | 0000000457 65535 f
463 | 0000000458 65535 f
464 | 0000000459 65535 f
465 | 0000000460 65535 f
466 | 0000000461 65535 f
467 | 0000000462 65535 f
468 | 0000000463 65535 f
469 | 0000000464 65535 f
470 | 0000000465 65535 f
471 | 0000000466 65535 f
472 | 0000000467 65535 f
473 | 0000000468 65535 f
474 | 0000000469 65535 f
475 | 0000000470 65535 f
476 | 0000000471 65535 f
477 | 0000000472 65535 f
478 | 0000000473 65535 f
479 | 0000000474 65535 f
480 | 0000000475 65535 f
481 | 0000000476 65535 f
482 | 0000000477 65535 f
483 | 0000000478 65535 f
484 | 0000000479 65535 f
485 | 0000000480 65535 f
486 | 0000000481 65535 f
487 | 0000000482 65535 f
488 | 0000000483 65535 f
489 | 0000000484 65535 f
490 | 0000000485 65535 f
491 | 0000000486 65535 f
492 | 0000000487 65535 f
493 | 0000000488 65535 f
494 | 0000000489 65535 f
495 | 0000000490 65535 f
496 | 0000000491 65535 f
497 | 0000000492 65535 f
498 | 0000000493 65535 f
499 | 0000000494 65535 f
500 | 0000000495 65535 f
501 | 0000000496 65535 f
502 | 0000000497 65535 f
503 | 0000000498 65535 f
504 | 0000000499 65535 f
505 | 0000000500 65535 f
506 | 0000000501 65535 f
507 | 0000000502 65535 f
508 | 0000000503 65535 f
509 | 0000000504 65535 f
510 | 0000000505 65535 f
511 | 0000000506 65535 f
512 | 0000000507 65535 f
513 | 0000000508 65535 f
514 | 0000000509 65535 f
515 | 0000000510 65535 f
516 | 0000000511 65535 f
517 | 0000000512 65535 f
518 | 0000000513 65535 f
519 | 0000000514 65535 f
520 | 0000000515 65535 f
521 | 0000000516 65535 f
522 | 0000000517 65535 f
523 | 0000000518 65535 f
524 | 0000000519 65535 f
525 | 0000000520 65535 f
526 | 0000000521 65535 f
527 | 0000000522 65535 f
528 | 0000000523 65535 f
529 | 0000000524 65535 f
530 | 0000000525 65535 f
531 | 0000000526 65535 f
532 | 0000000527 65535 f
533 | 0000000528 65535 f
534 | 0000000529 65535 f
535 | 0000000530 65535 f
536 | 0000000531 65535 f
537 | 0000000532 65535 f
538 | 0000000533 65535 f
539 | 0000000534 65535 f
540 | 0000000535 65535 f
541 | 0000000536 65535 f
542 | 0000000537 65535 f
543 | 0000000538 65535 f
544 | 0000000539 65535 f
545 | 0000000540 65535 f
546 | 0000000541 65535 f
547 | 0000000542 65535 f
548 | 0000000543 65535 f
549 | 0000000544 65535 f
550 | 0000000545 65535 f
551 | 0000000546 65535 f
552 | 0000000547 65535 f
553 | 0000000548 65535 f
554 | 0000000549 65535 f
555 | 0000000550 65535 f
556 | 0000000551 65535 f
557 | 0000000552 65535 f
558 | 0000000553 65535 f
559 | 0000000554 65535 f
560 | 0000000555 65535 f
561 | 0000000556 65535 f
562 | 0000000557 65535 f
563 | 0000000558 65535 f
564 | 0000000559 65535 f
565 | 0000000560 65535 f
566 | 0000000561 65535 f
567 | 0000000562 65535 f
568 | 0000000563 65535 f
569 | 0000000564 65535 f
570 | 0000000565 65535 f
571 | 0000000566 65535 f
572 | 0000000567 65535 f
573 | 0000000568 65535 f
574 | 0000000569 65535 f
575 | 0000000570 65535 f
576 | 0000000571 65535 f
577 | 0000000572 65535 f
578 | 0000000573 65535 f
579 | 0000000574 65535 f
580 | 0000000575 65535 f
581 | 0000000576 65535 f
582 | 0000000577 65535 f
583 | 0000000578 65535 f
584 | 0000000579 65535 f
585 | 0000000580 65535 f
586 | 0000000581 65535 f
587 | 0000000582 65535 f
588 | 0000000583 65535 f
589 | 0000000584 65535 f
590 | 0000000585 65535 f
591 | 0000000586 65535 f
592 | 0000000587 65535 f
593 | 0000000588 65535 f
594 | 0000000589 65535 f
595 | 0000000590 65535 f
596 | 0000000591 65535 f
597 | 0000000592 65535 f
598 | 0000000593 65535 f
599 | 0000000594 65535 f
600 | 0000000595 65535 f
601 | 0000000596 65535 f
602 | 0000000597 65535 f
603 | 0000000598 65535 f
604 | 0000000599 65535 f
605 | 0000000600 65535 f
606 | 0000000601 65535 f
607 | 0000000602 65535 f
608 | 0000000603 65535 f
609 | 0000000604 65535 f
610 | 0000000605 65535 f
611 | 0000000606 65535 f
612 | 0000000607 65535 f
613 | 0000000608 65535 f
614 | 0000000609 65535 f
615 | 0000000610 65535 f
616 | 0000000611 65535 f
617 | 0000000612 65535 f
618 | 0000000613 65535 f
619 | 0000000614 65535 f
620 | 0000000615 65535 f
621 | 0000000616 65535 f
622 | 0000000617 65535 f
623 | 0000000618 65535 f
624 | 0000000619 65535 f
625 | 0000000620 65535 f
626 | 0000000621 65535 f
627 | 0000000622 65535 f
628 | 0000000623 65535 f
629 | 0000000624 65535 f
630 | 0000000625 65535 f
631 | 0000000626 65535 f
632 | 0000000627 65535 f
633 | 0000000628 65535 f
634 | 0000000629 65535 f
635 | 0000000630 65535 f
636 | 0000000631 65535 f
637 | 0000000632 65535 f
638 | 0000000633 65535 f
639 | 0000000634 65535 f
640 | 0000000635 65535 f
641 | 0000000636 65535 f
642 | 0000000637 65535 f
643 | 0000000638 65535 f
644 | 0000000639 65535 f
645 | 0000000640 65535 f
646 | 0000000641 65535 f
647 | 0000000642 65535 f
648 | 0000000643 65535 f
649 | 0000000644 65535 f
650 | 0000000645 65535 f
651 | 0000000646 65535 f
652 | 0000000647 65535 f
653 | 0000000648 65535 f
654 | 0000000000 65535 f
655 | 0000117841 00000 n
656 | 0000118046 00000 n
657 | 0000118346 00000 n
658 | 0000180620 00000 n
659 | 0000181077 00000 n
660 | 0000181665 00000 n
661 | 0000181965 00000 n
662 | 0000238489 00000 n
663 | 0000238886 00000 n
664 | 0000239442 00000 n
665 | 0000239830 00000 n
666 | 0000253551 00000 n
667 | 0000253742 00000 n
668 | 0000253975 00000 n
669 | 0000254214 00000 n
670 | 0000269283 00000 n
671 | 0000269311 00000 n
672 | 0000269612 00000 n
673 | 0000281924 00000 n
674 | 0000281968 00000 n
675 | 0000282268 00000 n
676 | 0000282562 00000 n
677 | 0000282959 00000 n
678 | 0000343200 00000 n
679 | 0000343599 00000 n
680 | 0000343943 00000 n
681 | 0000344223 00000 n
682 | 0000344399 00000 n
683 | 0000344663 00000 n
684 | 


--------------------------------------------------------------------------------
/tests/test_filters.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Performs unit tests for filters.py.
  4 | 
  5 | TO-DO Add license notice, if any.
  6 | """
  7 | from itertools import product as cartesian_product
  8 | from math import floor, log
  9 | from os.path import abspath, dirname, join
 10 | import string
 11 | import sys
 12 | import unittest
 13 | 
 14 | import pytest
 15 | 
 16 | from pypdf.filters import (
 17 |     ASCII85Codec,
 18 |     ASCIIHexCodec,
 19 |     CCITTFaxCodec,
 20 |     DCTCodec,
 21 |     FlateCodec,
 22 |     LZWCodec,
 23 |     decodeStreamData,
 24 | )
 25 | from pypdf.generic import EncodedStreamObject, IndirectObject
 26 | from pypdf.pdf import PdfFileReader
 27 | from pypdf.utils import PdfReadError, PdfStreamError, hexEncode
 28 | from tests.utils import intToBitstring
 29 | 
 30 | TESTS_ROOT = abspath(dirname(__file__))
 31 | TEST_DATA_ROOT = join(TESTS_ROOT, "fixture_data")
 32 | 
 33 | 
 34 | # Establish bytes/str/unicode types.
 35 | try:
 36 |     unicode
 37 | except NameError:
 38 |     # Python 3
 39 |     BytesType = bytes
 40 |     StrType = str
 41 |     UnicodeType = str
 42 | else:
 43 |     # Python 2
 44 |     BytesType = bytes
 45 |     StrType = str
 46 |     UnicodeType = unicode
 47 | 
 48 | 
 49 | class FlateCodecTestCase(unittest.TestCase):
 50 |     """
 51 |     Tests expected results and edge cases of FlateCodec.
 52 |     """
 53 | 
 54 |     @classmethod
 55 |     def setUpClass(cls):
 56 |         cls.filterInputs = [
 57 |             "",
 58 |             "",
 59 |             """""",
 60 |             string.ascii_lowercase,
 61 |             string.ascii_uppercase,
 62 |             string.ascii_letters,
 63 |             string.digits,
 64 |             string.hexdigits,
 65 |             string.punctuation,
 66 |             string.whitespace,  # Add more...
 67 |         ]
 68 |         for f__ in ("TheHappyPrince.txt",):
 69 |             with open(join(TEST_DATA_ROOT, f__)) as infile:
 70 |                 cls.filterInputs.append(infile.read())
 71 | 
 72 |         cls.filterInputs = tuple(s.encode("latin1") for s in cls.filterInputs)
 73 | 
 74 |     def testExpectedResults(self):
 75 |         """
 76 |         Tests FlateCodec decode() and encode() methods.
 77 | 
 78 |         TO-DO Test the result with the omitted predictor values.
 79 |         """
 80 |         codec = FlateCodec()
 81 |         predictors = [1]  # , 10, 11, 12, 13, 14, 15]
 82 | 
 83 |         for predictor, s__ in cartesian_product(predictors, self.filterInputs):
 84 |             self.assertEqual(
 85 |                 s__,
 86 |                 codec.decode(codec.encode(s__), {"/Predictor": predictor}),
 87 |                 "(predictor, s__) = (%d, %s)" % (predictor, s__),
 88 |             )
 89 | 
 90 |     def testInvalidPredictors(self):
 91 |         """
 92 |         Inputs a series of invalid predictor values (outside the
 93 |         {1, 2} U [10, 15] range) checking that ``PdfReadError`` is raised.
 94 |         """
 95 |         codec = FlateCodec()
 96 |         predictors = tuple(set(range(-20, 21)) - {1, 2, 10, 11, 12, 13, 14, 15})
 97 | 
 98 |         for predictor, s__ in cartesian_product(predictors, self.filterInputs):
 99 |             with self.assertRaises(
100 |                 PdfReadError,  # pylint: disable=bad-continuation
101 |                 msg="(predictor, input) = (%d, %s)"  # pylint: disable=bad-continuation
102 |                 % (predictor, s__),  # pylint: disable=bad-continuation
103 |             ):
104 |                 codec.decode(codec.encode(s__), {"/Predictor": predictor})
105 | 
106 | 
107 | class ASCIIHexCodecTestCase(unittest.TestCase):
108 |     """
109 |     Tests primarily the decode() method of ASCIIHexCodec.
110 |     """
111 | 
112 |     @classmethod
113 |     def setUpClass(cls):
114 |         cls.filterInputs = (
115 |             "",
116 |             "",
117 |             """""",
118 |             ">",
119 |             ">>",
120 |             ">>>",
121 |             string.ascii_lowercase,
122 |             string.ascii_uppercase,
123 |             string.ascii_letters,
124 |             string.digits,
125 |             string.hexdigits,
126 |             string.punctuation,
127 |             string.whitespace,  # Add more...
128 |         )
129 | 
130 |     def testExpectedResults(self):
131 |         """
132 |         Feeds a bunch of values to ``ASCIIHexCodec.decode()`` and ensures that
133 |         the correct output is returned.
134 | 
135 |         TO-DO What is decode() supposed to do for such inputs as ">>", ">>>" or
136 |         any other not terminated by ">"? (For the latter case, an exception
137 |         is currently raised.)
138 |         """
139 |         inputs = (
140 |             ">",
141 |             "6162636465666768696a6b6c6d6e6f707172737475767778797a>",
142 |             "4142434445464748494a4b4c4d4e4f505152535455565758595a>",
143 |             "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464"
144 |             "748494a4b4c4d4e4f505152535455565758595a>",
145 |             "30313233343536373839>",
146 |             "3  031323334353637   3839>",  # Same as previous, but whitespaced
147 |             "30313233343536373839616263646566414243444546>",
148 |             hexEncode(string.whitespace) + ">",
149 |         )
150 |         expected_outputs = (
151 |             "",
152 |             string.ascii_lowercase,
153 |             string.ascii_uppercase,
154 |             string.ascii_letters,
155 |             string.digits,
156 |             string.digits,
157 |             string.hexdigits,
158 |             string.whitespace,
159 |         )
160 | 
161 |         for o__, i in zip(expected_outputs, inputs):
162 |             self.assertEqual(
163 |                 o__,
164 |                 ASCIIHexCodec.decode(i),
165 |                 "Expected = %s\tReceived = %s"
166 |                 % (repr(o__), repr(ASCIIHexCodec.decode(i))),
167 |             )
168 | 
169 |     def testNoEod(self):
170 |         """
171 |         Tests when no EOD character is present, ensuring an exception is
172 |         raised.
173 |         """
174 |         inputs = ("", "", """""", """""")
175 | 
176 |         for i in inputs:
177 |             with self.assertRaises(PdfStreamError):
178 |                 ASCIIHexCodec.decode(i)
179 | 
180 | 
class ASCII85CodecTestCase(unittest.TestCase):
    """
    Exercises the ``encode()`` and ``decode()`` methods of ``ASCII85Codec``.
    """

    def testEncodeDecode(self):
        """
        Checks the round trip ``decode(encode(data)) == data`` on a few byte
        strings and on the content of a fixture file, with encode() and
        decode() taken from ASCII85Codec.
        """
        encode, decode = ASCII85Codec.encode, ASCII85Codec.decode
        samples = [
            text.encode("ascii")
            for text in (
                string.ascii_lowercase,
                string.ascii_uppercase,
                string.ascii_letters,
                string.whitespace,
            )
        ]
        samples.append(b"\x00\x00\x00\x00")
        samples.append(2 * b"\x00\x00\x00\x00")

        for filename in ("TheHappyPrince.txt",):
            with open(join(TEST_DATA_ROOT, filename), "rb") as infile:
                samples.append(infile.read())

        for sample in samples:
            expected = sample

            if sys.version_info > (3, 0) and isinstance(sample, str):
                # The Python 3 version of decode() returns a bytes instance
                expected = sample.encode("LATIN1")

            self.assertEqual(expected, decode(encode(sample)))

    def testWithOverflow(self):
        """
        Feeds ``decode()`` a series of two-part inputs, each made of a single
        character followed by the ``~>`` marker, and checks that every one of
        them raises ``ValueError``.

        NOTE(review): every character listed lies outside the ``!``..``u``
        range and is neither ``z`` nor whitespace; presumably that is the
        "overflow" the test name refers to -- confirm against the codec.
        """
        rejected_chars = (
            "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0e\x0f"
            "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a"
            "\x1b\x1c\x1d\x1e\x1fvwxy{|}~\x7f\x80\x81\x82"
            "\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d"
            "\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98"
            "\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0¡¢£¤¥¦§¨©ª«¬"
            "\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇ"
        )

        for char in rejected_chars:
            stream = char + "~>"

            with self.assertRaises(ValueError, msg="char = " + repr(stream)):
                ASCII85Codec.decode(stream)

    def testFiveZeroBytes(self):
        """
        Checks the ``z`` shorthand.  From ISO 32000 (2008) sect. 7.4.3:
        «As a special case, if all five bytes are 0, they shall be represented
        by the character with code 122 (z) instead of by five exclamation
        points (!!!!!).»

        ``z`` must decode like ``!!!!!``, and a run of ``z`` characters must
        decode to as many groups of four zero bytes.
        """
        zero_group = b"\x00\x00\x00\x00"

        self.assertEqual(ASCII85Codec.decode(b"!!!!!~>"), ASCII85Codec.decode(b"z~>"))

        for count, encoded in enumerate((b"z~>", b"zz~>", b"zzz~>"), 1):
            self.assertEqual(zero_group * count, ASCII85Codec.decode(encoded))
249 | 
250 | 
class LZWCodecTestCase(unittest.TestCase):
    """
    Exercises ``LZWCodec``: the bit-level write/read primitives of its
    ``Encoder`` and ``Decoder`` and the ``decode(encode(data))`` round trip.
    """

    def testWriteCode(self):
        """
        Feeds increasing codes to ``Encoder._writeCode()`` and checks that the
        bits stored in ``Encoder.output`` (up to ``Encoder.bitpos``) form the
        expected contiguous bit-stream.
        """
        self.maxDiff = None
        encoder = LZWCodec.Encoder("")
        encoder.output = []

        codes = range(2 ** 8, 2 ** 12 - 1)
        encoder.bitspercode = int(floor(log(codes[0], 2))) + 1
        expected_bits = "".join(
            intToBitstring(code, floor(log(code, 2))) for code in codes
        )

        for code in codes:
            # Widen the code size as soon as the next code no longer fits.
            if floor(log(code, 2)) + 1 > encoder.bitspercode:
                encoder.bitspercode += 1

            encoder._writeCode(code)

        written_bits = "".join(intToBitstring(chunk) for chunk in encoder.output)

        self.assertEqual(expected_bits, written_bits[: encoder.bitpos])

    def testReadCode(self):
        """
        Reads codes with ``Decoder._readCode()`` until it returns ``-1`` and
        checks each one against the matching slice of the input bit-stream.
        """
        raw = bytearray(range(256))
        decoder = LZWCodec.Decoder(raw)
        bitstream = "".join(intToBitstring(byte) for byte in raw)
        position = 0
        # TODO:  this deserves to be an assignment expression.
        code = decoder._readCode()

        while code != -1:
            width = decoder.bitspercode
            end = position + width

            if end >= len(bitstream):
                # The last code may run past the input: pad with zero bits.
                expected = bitstream[position:] + "0" * (end - len(bitstream))
            else:
                expected = bitstream[position:end]

            self.assertEqual(
                expected,
                intToBitstring(code, width),
                msg="(curr, code) = (%d, %d)" % (position, code),
            )

            position = end
            code = decoder._readCode()

    def testEncodeDecode(self):
        """
        Ensures that ``decode(encode(data))`` returns ``data`` for a set of
        in-memory samples and for the contents of two fixture text files.
        """
        self.maxDiff = None
        samples = [
            string.ascii_lowercase,
            string.ascii_uppercase,
            string.whitespace,
            string.ascii_letters,
            2000 * string.ascii_letters,
        ]

        if sys.version_info > (3, 0):
            samples = [text.encode("LATIN1") for text in samples]

        for fixture_name in ("Hamlet.txt", "TheHappyPrince.txt"):
            with open(join(TEST_DATA_ROOT, fixture_name), "rb") as infile:
                # TO-DO If we approach the number of read bytes to 10K the
                # codec stops working correctly. This is a bug to fix!
                samples.append(infile.read())

        for sample in samples:
            # IMPORTANT TODO:  why does this round trip fail for full-length
            # data?  Until we solve this, truncate the inputs to the first
            # 9500 bytes.
            payload = sample[:9500]
            encoder = LZWCodec.Encoder(payload)
            decoder = LZWCodec.Decoder(encoder.encode())

            self.assertEqual(payload, decoder.decode())
345 | 
346 | 
class DecodeStreamDataTestCase(unittest.TestCase):
    """
    Test case intended to test the
    :meth:`decodeStreamData<filters.decodeStreamData>` method. It functions by
    querying known object references, asking ``decodeStreamData()`` to decode
    their stream content, and checking the decoded value against what the
    codec known to be used produces when called directly.
    """

    def testDecodeStreamData(self):
        """
        For every ``(codec, filename, id, generation)`` entry, reads the
        referenced stream object and compares ``decodeStreamData()`` with a
        direct call to the codec's ``decode()``. A new file stream is opened
        for each object reference -- unit tests don't have to be efficient.
        """
        fixtures_dir = join(TEST_DATA_ROOT, self.testDecodeStreamData.__name__)
        cases = (
            # (filter type, filename, id, gen. number)
            (FlateCodec, "FlateDecode.pdf", 4, 0),
            (FlateCodec, "FlateDecode.pdf", 8, 0),
            (FlateCodec, "FlateDecode.pdf", 9, 0),
            # TO-DO No PDF files found with this type of encoding, get them.
            # (ASCIIHexCodec, "ASCIIHexDecode.pdf", ?, ?)
            (LZWCodec, "LZWDecode.pdf", 209, 0),
            (LZWCodec, "LZWDecode.pdf", 210, 0),
            (LZWCodec, "LZWDecode.pdf", 211, 0),
            (ASCII85Codec, "ASCII85Decode.pdf", 5, 0),
            (ASCII85Codec, "ASCII85Decode.pdf", 6, 0),
            (DCTCodec, "DCTDecode.pdf", 4, 0),
            # TO-DO No PDF files found with this type of encoding, get them.
            # (JPXCodec, "JPXDecode.pdf", ?, ?)
            (CCITTFaxCodec, "CCITTFaxDecode.pdf", 46, 0),
        )

        for codec, filename, idnum, generation in cases:
            with open(join(fixtures_dir, filename), "rb") as infile:
                reader = PdfFileReader(infile)
                stream = reader.getObject(IndirectObject(idnum, generation, reader))

                # Ensures that the PdfFileReader reads a stream object
                self.assertEqual(EncodedStreamObject, type(stream))

                # The CCITT codec is additionally given the /Height entry.
                decode_args = [stream._data, stream.get("/DecodeParms")]
                if codec is CCITTFaxCodec:
                    decode_args.append(stream.get("/Height"))

                self.assertEqual(
                    codec.decode(*decode_args), decodeStreamData(stream)
                )
404 | 
405 | 
@pytest.mark.parametrize(
    "data, expected_value, exception",
    (
        (b"<~~>", b"", None),  # Empty input
        (b"<~@:E^~>", b"abc", None),  # Basic decoding
        (u"<~@:E^~>", b"abc", None),  # Handle a str (or unicode) object
        (b"<~@: E^~>", b"abc", None),  # Ignore whitespace
        (b"<~z~>", b"\x00\x00\x00\x00", None),  # Handle 'z'
        (b"~>", b"", None),  # No initial '<~'
        (b"@:E^~>", b"abc", None),  # No initial '<~'
        (b"", None, ValueError),  # Choke on missing '~>'
        (b">", None, ValueError),  # Choke on missing '~>'
        (b"<~<~~>", None, ValueError),  # Don't double-skip '<~'
        (b"<~~~>", None, ValueError),  # Choke on bare '~'
        (b"<~aazaa~>", None, ValueError),  # Choke on mid-group 'z'
        (u"<~\x80~>", None, ValueError),  # Choke on non-ASCII characters
    ),
)
def test_ascii85_decode(data, expected_value, exception):
    """
    Runs ``ASCII85Codec.decode()`` on ``data``. When ``exception`` is given the
    call must raise it; otherwise the result must equal ``expected_value`` and
    be a ``BytesType`` instance.
    """
    if not exception:
        decoded = ASCII85Codec.decode(data)
        assert decoded == expected_value
        assert isinstance(decoded, BytesType)
        return

    with pytest.raises(exception):
        ASCII85Codec.decode(data)
433 | 
434 | 
@pytest.mark.parametrize(
    "data, expected_value",
    (
        (b"", b"<~~>"),
        (b"abc", b"<~@:E^~>"),
        (b"\x00", b"<~!!~>"),
        (b"\xff", b"<~rr~>"),
        (b"\x00\x00\x00\x00", b"<~z~>"),
    ),
)  # pylint: disable=invalid-name
def testASCII85Encode(data, expected_value):
    """
    Runs ``ASCII85Codec.encode()`` on ``data`` and checks that the result
    equals ``expected_value`` and is a ``BytesType`` instance.
    """
    encoded = ASCII85Codec.encode(data)

    assert encoded == expected_value
    assert isinstance(encoded, BytesType)
450 | 


--------------------------------------------------------------------------------
/tests/test_generic.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests the ``pypdf/generic.py`` module.
  3 | """
  4 | import io
  5 | from os.path import abspath, dirname, join, pardir
  6 | import sys
  7 | import unittest
  8 | 
  9 | from pypdf.generic import IndirectObject, ObjectStream, TextStringObject
 10 | from pypdf.pdf import PdfFileReader
 11 | 
# Configure path environment
# PROJECT_ROOT is the parent of the directory holding this file;
# TESTS_DATA_ROOT is the directory the fixture files are read from.
PROJECT_ROOT = abspath(join(dirname(__file__), pardir))
TESTS_DATA_ROOT = join(PROJECT_ROOT, "tests", "fixture_data")

# NOTE(review): this runs after the ``pypdf`` imports at the top of the module,
# so it does not appear to influence how those imports are resolved -- confirm
# whether it is still needed.
sys.path.append(PROJECT_ROOT)
 17 | 
 18 | 
 19 | class ObjectStreamTestCase(unittest.TestCase):
 20 |     """ [EXPLAIN THIS.] """
 21 | 
 22 |     def test_object_ids(self):
 23 |         """
 24 |         Tests the ``ObjectStream.objectIds()`` method.
 25 |         """
 26 |         exp_results = (
 27 |             (8, 3, 10, 2, 1, 11, 13, 15, 4, 19, 5, 20, 6, 21, 17),
 28 |             (
 29 |                 644,
 30 |                 642,
 31 |                 646,
 32 |                 647,
 33 |                 648,
 34 |                 122,
 35 |                 119,
 36 |                 120,
 37 |                 121,
 38 |                 124,
 39 |                 179,
 40 |                 232,
 41 |                 327,
 42 |                 467,
 43 |                 478,
 44 |                 519,
 45 |                 568,
 46 |                 573,
 47 |                 580,
 48 |                 586,
 49 |                 592,
 50 |                 598,
 51 |                 603,
 52 |                 611,
 53 |                 616,
 54 |                 623,
 55 |                 629,
 56 |                 634,
 57 |             ),
 58 |         )
 59 |         # Files we know to have Object Streams within
 60 |         input_data = (
 61 |             # (filename, id, generation number)
 62 |             ("crazyones.pdf", 9, 0),
 63 |             ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", 645, 0),
 64 |         )
 65 | 
 66 |         for o__, d__ in zip(exp_results, input_data):
 67 |             filepath = join(TESTS_DATA_ROOT, d__[0])
 68 |             r__ = PdfFileReader(filepath)
 69 |             ref = IndirectObject(d__[1], d__[2], r__)
 70 |             obj_stm = r__.getObject(ref)
 71 | 
 72 |             r__.close()
 73 | 
 74 |             self.assertIsInstance(obj_stm, ObjectStream)
 75 |             self.assertTupleEqual(tuple(o__), tuple(obj_stm.objectIds))
 76 | 
 77 | 
 78 | class TextStringObjectTestCase(unittest.TestCase):
 79 |     """ [EXPLAIN THIS.] """
 80 | 
 81 |     @staticmethod
 82 |     def _get_output_bytes_for_string(input_string):
 83 |         stream = io.BytesIO()
 84 |         text_string_object = TextStringObject(input_string)
 85 |         text_string_object.writeToStream(stream, encryption_key=None)
 86 |         stream_output = stream.getvalue()
 87 |         return stream_output
 88 | 
 89 |     def test_write_to_stream(self):
 90 |         """
 91 |         Tests the ``TextStringObject.writeToStream()`` method.
 92 |         """
 93 | 
 94 |         output_for_lowercase_letter = self._get_output_bytes_for_string("k")
 95 |         self.assertEqual(output_for_lowercase_letter, b"(k)")
 96 | 
 97 |         output_for_uppercase_letter = self._get_output_bytes_for_string("K")
 98 |         self.assertEqual(output_for_uppercase_letter, b"(K)")
 99 | 
100 |         output_for_digit = self._get_output_bytes_for_string("7")
101 |         self.assertEqual(output_for_digit, b"(7)")
102 | 
103 |         output_for_space = self._get_output_bytes_for_string(" ")
104 |         self.assertEqual(output_for_space, b"( )")
105 | 
106 |         output_for_opening_parentheses = self._get_output_bytes_for_string("(")
107 |         self.assertEqual(output_for_opening_parentheses, b"(\\050)")
108 | 
109 |         output_for_backslash = self._get_output_bytes_for_string("\\")
110 |         self.assertEqual(output_for_backslash, b"(\\134)")
111 | 
112 | 
# Run this module's test cases through unittest when executed as a script.
if __name__ == "__main__":
    unittest.main()
115 | 


--------------------------------------------------------------------------------
/tests/test_pdf.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests PDF primitives from pypdf.pdf.
  3 | 
  4 | Note for future developers: if defining some code in a ``testX()`` method
  5 | that relies on a "fixture data" (e.g. a test file to read from) place it in the
  6 | ``/tests/fixture_data/testX/`` path (see some of the examples below to have a
  7 | hint on how to do this).
  8 | """
  9 | # TODO:  switch dependence to pathlib.
 10 | import binascii
 11 | from io import BytesIO
 12 | import os
 13 | from os.path import abspath, basename, dirname, join, pardir
 14 | import sys
 15 | import tempfile
 16 | import unittest
 17 | 
 18 | from pypdf.generic import IndirectObject, readObject
 19 | from pypdf.pdf import PdfFileReader, PdfFileWriter
 20 | 
# Configure path environment
# PROJECT_ROOT is the parent of the directory holding this file;
# TEST_DATA_ROOT is the directory the fixture files are read from.
PROJECT_ROOT = abspath(join(dirname(__file__), pardir))
TEST_DATA_ROOT = join(PROJECT_ROOT, "tests", "fixture_data")

# NOTE(review): this runs after the ``pypdf`` imports at the top of the module,
# so it does not appear to influence how those imports are resolved -- confirm
# whether it is still needed.
sys.path.append(PROJECT_ROOT)
 26 | 
 27 | 
 28 | class PdfReaderTestCases(unittest.TestCase):
 29 |     """ [EXPLAIN THIS CLASS.] """
 30 | 
 31 |     def setUp(self):
 32 |         # Variable defining the path where the method to be run next can store
 33 |         # its own fixture (test) data.
 34 |         self.localDataRoot = join(TEST_DATA_ROOT, self.id().split(".")[-1])
 35 | 
 36 |     def testDel(self):
 37 |         """
 38 |         Tests the ``__del__()`` method of ``PdfFileReader`` and
 39 |         ``PdfFileWriter`` ensuring that no exceptions are raised.
 40 |         """
 41 |         r = PdfFileReader(join(TEST_DATA_ROOT, "crazyones.pdf"))
 42 |         w = PdfFileWriter(BytesIO(b""))
 43 | 
 44 |         try:
 45 |             r.__del__()
 46 |             self.assertTrue(True)
 47 |         except Exception as e:  # pylint: disable=broad-except
 48 |             self.assertTrue(
 49 |                 False,
 50 |                 "Exception '%s' was raised in %s.__del__()"
 51 |                 % (e, PdfFileReader.__name__),
 52 |             )
 53 | 
 54 |         try:
 55 |             w.__del__()
 56 |             self.assertTrue(True)
 57 |         except Exception as e:  # pylint: disable=broad-except
 58 |             self.assertTrue(
 59 |                 False,
 60 |                 "Exception '%s' was raised in %s.__del__()"
 61 |                 % (e, PdfFileWriter.__name__),
 62 |             )
 63 | 
 64 |     def testFileLoad(self):
 65 |         """
 66 |         Test loading and parsing of a file. Extract text of the file and
 67 |         compare to expected textual output. Expected outcome: file loads, text
 68 |         matches expected.
 69 |         """
 70 |         with open(join(TEST_DATA_ROOT, "crazyones.pdf"), "rb") as inputfile:
 71 |             # Load PDF file from file
 72 |             r = PdfFileReader(inputfile)
 73 |             page1 = r.getPage(0)
 74 | 
 75 |             # Retrieve the text of the PDF
 76 |             with open(join(self.localDataRoot, "crazyones.txt"), "rb") as pdftextFile:
 77 |                 pdftext = pdftextFile.read()
 78 | 
 79 |             page1Text = page1.extractText().replace("\n", "").encode("utf-8")
 80 | 
 81 |             # Compare the text of the PDF to a known source
 82 |             self.assertEqual(
 83 |                 pdftext,
 84 |                 page1Text,
 85 |                 msg="PDF extracted text differs from expected value."
 86 |                 "\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (pdftext, page1Text),
 87 |             )
 88 | 
 89 |             r.close()
 90 | 
 91 |     def testJpegImage(self):
 92 |         """
 93 |         Test loading and parsing of a file. Extract the image of the file and
 94 |         compare to expected textual output. Expected outcome: file loads, image
 95 |         matches expected.
 96 |         """
 97 |         with open(join(TEST_DATA_ROOT, "jpeg.pdf"), "rb") as inputfile:
 98 |             # Load PDF file from file
 99 |             r = PdfFileReader(inputfile)
100 | 
101 |             # Retrieve the text of the image
102 |             with open(join(self.localDataRoot, "jpeg.txt"), "r") as pdftextFile:
103 |                 imagetext = pdftextFile.read()
104 | 
105 |             page1 = r.getPage(0)
106 |             xObject = page1["/Resources"]["/XObject"].getObject()
107 |             data = xObject["/Im4"].getData()
108 | 
109 |             # Compare the text of the PDF to a known source
110 |             self.assertEqual(
111 |                 binascii.hexlify(data).decode(),
112 |                 imagetext,
113 |                 msg="PDF extracted image differs from expected value."
114 |                 "\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
115 |                 % (imagetext, binascii.hexlify(data).decode()),
116 |             )
117 | 
118 |             r.close()
119 | 
120 |     def testXRefTableObjects(self):
121 |         """
122 |         Ensures that after ``PdfFileReader._parsePdfFile()`` all the indirect
123 |         references from the XRef-Table *only* have been loaded as expected.
124 |         Objects from the free entries list are included as well in the test.
125 | 
126 |         This case tests the part of ``PdfFileReader.objects()`` responsible for
127 |         generating the Cross-Reference Table entries too.
128 |         """
129 |         self.maxDiff = None
130 |         inputFiles = (
131 |             "jpeg.pdf",
132 |             "Seige_of_Vicksburg_Sample_OCR.pdf",
133 |             "SF424_page2.pdf",
134 |         )
135 | 
136 |         for filename in inputFiles:
137 |             filepath = join(TEST_DATA_ROOT, filename)
138 |             xtablepath = join(self.localDataRoot, filename)
139 |             r = PdfFileReader(filepath)
140 |             # The two below are (id, gen, byte offset)-valued lists
141 |             actualItems = list()
142 |             expItems = list()
143 | 
144 |             for ref in r.objects(PdfFileReader.R_XTABLE, True):
145 |                 actualItems.append(
146 |                     (
147 |                         ref.idnum,
148 |                         ref.generation,
149 |                         r._xrefTable[ref.generation][ref.idnum][0],
150 |                     )
151 |                 )
152 | 
153 |             r.close()
154 |             # We artificially read the XRef Table entries that we know belong
155 |             # to filepath, and store them into expItems.
156 |             expItems = sorted(self._parseXRefTable(xtablepath, (0, 1, 2)))
157 |             actualItems = sorted(actualItems)
158 |             expItems = sorted(expItems)
159 | 
160 |             self.assertListEqual(
161 |                 expItems, actualItems, "Differences found in " + filename
162 |             )
163 | 
164 |     def testXRefStreamObjects(self):
165 |         """
166 |         Like ``PdfReaderTestCases.testXRefTableObjects()``, except that it
167 |         tests objects referenced by the Cross-Reference Stream.
168 |         ``PdfFileReader.objects()`` second part (dealing with XStream objects)
169 |         is invoked and implicitly tested.
170 |         """
171 |         inputFiles = ("crazyones.pdf",)
172 | 
173 |         for filename in inputFiles:
174 |             filepath = join(self.localDataRoot, filename)
175 |             r = PdfFileReader(join(TEST_DATA_ROOT, filename))
176 |             # Two lists of tuples as explained by Table 18
177 |             actualItems = list()
178 |             expItems = list()
179 | 
180 |             with open(filepath, "r") as instream:
181 |                 for line in instream:
182 |                     if not line or line.isspace() or line.startswith("%"):
183 |                         continue
184 | 
185 |                     this_type, field2, field3 = (int(f) for f in line.split())
186 |                     expItems.append((this_type, field2, field3))
187 | 
188 |             for item in r.objects(PdfFileReader.R_XSTREAM, True):
189 |                 priv8Item = r._xrefStm[item.idnum]
190 | 
191 |                 if priv8Item[0] in {0, 1}:
192 |                     self.assertEqual(priv8Item[2], item.generation)
193 |                 elif priv8Item[0] == 2:
194 |                     self.assertEqual(item.generation, 0)
195 | 
196 |                 actualItems.append(priv8Item)
197 | 
198 |             r.close()
199 |             actualItems = sorted(actualItems)
200 |             expItems = sorted(expItems)
201 | 
202 |             self.assertListEqual(
203 |                 expItems,
204 |                 actualItems,
205 |                 "Didn't correctly read the Cross-Reference Stream",
206 |             )
207 | 
208 |     def testReadXRefStreamCompressedObjects(self):  # pylint: disable=too-many-locals
209 |         """
210 |         Targets the same objects as ``testXRefStreamObjects()``, but instead
211 |         of ensuring an identity between the list of items read and the one
212 |         expected, it verifies that their *contents* are identical.
213 | 
214 |         This method does **not** test ``PdfFileReader.objects()`` as two of the
215 |         previous test cases did.
216 |         """
217 |         self.maxDiff = None
218 |         inputFiles = ("crazyones.pdf",)
219 |         # expItems and actualItems will contain two-element tuples, where the
220 |         # first element is the object ID, used to sort.
221 |         sortKey = lambda e: e[0]
222 |         compressedObj = lambda e: e[1][0] == 2
223 | 
224 |         for filename in inputFiles:
225 |             filepath = join(self.localDataRoot, filename)
226 |             r = PdfFileReader(join(TEST_DATA_ROOT, filename))
227 |             expItems = list()
228 |             actualItems = list()
229 | 
230 |             with open(filepath, "rb") as instream:
231 |                 for line in instream:
232 |                     if not line or line.isspace() or line.startswith(b"%"):
233 |                         continue
234 | 
235 |                     globalId, offset, obj = line.split(b" ", 2)
236 |                     globalId, offset = int(globalId), int(offset)
237 | 
238 |                     with BytesIO(obj) as objStream:
239 |                         obj = readObject(objStream, r)
240 | 
241 |                     expItems.append((globalId, obj))
242 | 
243 |             for itemid, _item in filter(compressedObj, r._xrefStm.items()):
244 |                 # We deal exclusively with compressed objects (from Table 18 of
245 |                 # ISO 32000 reference, 2008) whose generation number is 0
246 |                 actualItems.append(
247 |                     # (ID, PdfObject) tuples
248 |                     (itemid, IndirectObject(itemid, 0, r).getObject())
249 |                 )
250 | 
251 |             r.close()
252 |             expItems = sorted(expItems, key=sortKey)
253 |             actualItems = sorted(actualItems, key=sortKey)
254 | 
255 |             self.assertListEqual(expItems, actualItems)
256 | 
257 |     def testXTableAgainstXStream(self):
258 |         """
259 |         In section 7.5.8.4 of ISO 32000, "Compatibility with Applications That
260 |         Do Not Support Compressed Reference Streams", the standard describes a
261 |         means of crafting PDF files designed for versions 1.5+ that can be
262 |         opened nevertheless by readers that support older versions.
263 | 
264 |         This test case verifies that all the items hidden by the XRef Table in
265 |         non-conforming readers are *all and exactly* loaded into the XRef
266 |         Stream by readers that support PDF 1.5+.
267 |         """
268 |         self.maxDiff = None
269 |         # TO-DO Possibly add a few other files to this test case
270 |         inputFiles = ("GeoBase_NHNC1_Data_Model_UML_EN.pdf",)
271 | 
272 |         for filename in inputFiles:
273 |             filepath = join(self.localDataRoot, filename)
274 |             expItems = {e[0]: e[1:] for e in self._parseXRefTable(filepath, (0, 2, 3))}
275 |             actualItems = list()
276 |             r = PdfFileReader(join(TEST_DATA_ROOT, filename))
277 | 
278 |             for ref in r.objects(PdfFileReader.R_XSTREAM, True):
279 |                 actualItems.append(ref)
280 | 
281 |             r.close()
282 |             actualItems = sorted(actualItems, key=lambda e: e.idnum)
283 |             expKeys = sorted(expItems.keys())
284 |             actualKeys = list(map(lambda e: e.idnum, actualItems))
285 | 
286 |             self.assertListEqual(
287 |                 expKeys, actualKeys, "Lists of item IDs are not identical"
288 |             )
289 | 
290 |             for e, a in zip(expKeys, actualItems):
291 |                 self.assertEqual(e, a.idnum, "Items ID does not correspond")
292 | 
293 |                 # If an item is in use in the XRef Stream, ensure then that it
294 |                 # is marked free in the XRef Table.
295 |                 if r._xrefStm[a.idnum][0] in (2,):
296 |                     self.assertTrue(
297 |                         expItems[e][-1],
298 |                         "Item %d should be hid by the XRef Table, but it was "
299 |                         "not." % e,
300 |                     )
301 | 
302 |     def testIsObjectFree(self):
303 |         """
304 |         Tests the ``PdfFileReader.isObjectFree()` method.
305 |         """
306 |         # TO-DO Find PDF files that feature free-entry lists. We are checking
307 |         # isObjectFree() only against used items.
308 |         inputFiles = (
309 |             "jpeg.pdf",
310 |             "Seige_of_Vicksburg_Sample_OCR.pdf",
311 |             "SF424_page2.pdf",
312 |         )
313 | 
314 |         for filename in inputFiles:
315 |             filepath = join(self.localDataRoot, filename)
316 |             r = PdfFileReader(join(TEST_DATA_ROOT, filename))
317 |             expItems = self._parseXRefTable(filepath, (0, 1, 3))
318 |             actualItems = list()
319 | 
320 |             for ref in r.objects(PdfFileReader.R_XTABLE, True):
321 |                 actualItems.append(
322 |                     # This is where isObjectFree() gets invoked
323 |                     (ref.idnum, ref.generation, r.isObjectFree(ref))
324 |                 )
325 | 
326 |             r.close()
327 |             expItems = sorted(expItems)
328 |             actualItems = sorted(actualItems)
329 | 
330 |             self.assertListEqual(expItems, actualItems)
331 | 
332 |     def testContextManager(self):
333 |         """
334 |         Tests the context manager implementation (the ``with <expr> as
335 |         identifier`` feature) of ``PdfFileReader``.
336 |         """
337 |         inputFiles = (
338 |             "jpeg.pdf",
339 |             "Seige_of_Vicksburg_Sample_OCR.pdf",
340 |             "SF424_page2.pdf",
341 |         )
342 | 
343 |         for filename in inputFiles:
344 |             r = None
345 | 
346 |             with PdfFileReader(join(TEST_DATA_ROOT, filename)) as r:
347 |                 # Test assertions not strictly related to the whole test case
348 |                 self.assertEqual(filename, basename(r.filepath))
349 |                 self.assertFalse(r.isClosed)
350 | 
351 |             self.assertTrue(r.isClosed)
352 | 
353 |     @staticmethod
354 |     def _parseXRefTable(filepath, mask=tuple()):
355 |         """
356 |         Parses a Cross-Reference Table, such as the sampled ones used for
357 |         testing.
358 | 
359 |         :param filepath: file path where the table is stored in.
360 |         :param mask: a list of fields' indices indicating which fields are to
361 |             be returned. For example, ``(0, 2, 3)`` indicates that only the
362 |             ``id``, ``byteOffset`` and ``isFree`` fields have to be returned.
363 |         :return: an iterable of items of the form
364 |             ``(id, gen, byteOffset, isFree)`` if ``mask`` hasn't been set,
365 |             otherwise an iterable of all the items ``mask`` has specified.
366 |         """
367 |         startid = None
368 |         expecteditems = None
369 |         itemssofar = None
370 | 
371 |         if not mask:
372 |             mask = tuple(range(4))
373 | 
374 |         with open(filepath, "r") as instream:
375 |             for line in instream:
376 |                 if not line or line.isspace() or line.startswith("%"):
377 |                     continue
378 | 
379 |                 tokens = line.strip().split()
380 | 
381 |                 # We are beginning a new sub reference section
382 |                 if len(tokens) == 2:
383 |                     if itemssofar != expecteditems:
384 |                         raise ValueError(
385 |                             'Line "%d %d" specified %d items, %d read'  # pylint: disable=bad-string-format-type
386 |                             % (startid, expecteditems, expecteditems, itemssofar)
387 |                         )
388 | 
389 |                     startid = int(tokens[0])
390 |                     expecteditems = int(tokens[1])
391 |                     itemssofar = 0
392 |                 elif len(tokens) == 3:  # New object info to add
393 |                     # We yield an (id, gen, byte offset) tuple
394 |                     output = (
395 |                         startid + itemssofar,
396 |                         int(tokens[1]),
397 |                         int(tokens[0]),
398 |                         tokens[2] == "f",
399 |                     )
400 |                     yield tuple(output[s] for s in mask)
401 | 
402 |                     itemssofar += 1
403 |                 else:
404 |                     raise ValueError("Unexpected token in %s" % filepath)
405 | 
406 |     def testProperties(self):
407 |         """
408 |         The switch from PyPDF2 to PyPDF4 sees many stylistic changes, including
409 |         the use of the ``@property`` decorator (where possible) and pruning out
410 |         of unnecessary arguments to ``property()`` as a function.
411 |         In some cases, functions that previously had a ``@property`` accessor
412 |         have it no more (to remove duplicate accesses).
413 | 
414 |         This test ensures that the two styles, the older and the newer, are
415 |         functionally equivalent.
416 |         """
417 |         properties = (
418 |             "documentInfo",
419 |             "xmpMetadata",
420 |             "numPages",
421 |             "pages",
422 |             "pageLayout",
423 |             "pageMode",
424 |             "isEncrypted",
425 |         )
426 |         methods = ("getNamedDestinations", "getOutlines")
427 | 
428 |         for p in properties:
429 |             self.assertIsInstance(getattr(PdfFileReader, p), property)
430 | 
431 |         for m in methods:
432 |             self.assertTrue(
433 |                 hasattr(PdfFileReader, m),
434 |                 "%s() is not part of %s" % (m, PdfFileReader.__name__),
435 |             )
436 |             self.assertTrue(
437 |                 callable(getattr(PdfFileReader, m)),
438 |                 "%s.%s() is not callable" % (PdfFileReader.__name__, m),
439 |             )
440 | 
441 |     def testAddAttachment(self):
442 |         """
443 |         Tests the addAttachment function for attaching a single file.
444 | 
445 |         Since the Names array in the EmbeddedFiles dictionary contains both the
446 |         name (string) and indirect object (dictionary) for each file, we have
447 |         to check for two entries per attached file.
448 |         """
449 | 
450 |         _, testfile = tempfile.mkstemp()
451 | 
452 |         try:
453 |             # Make PDF with attachment
454 |             with PdfFileReader(join(TEST_DATA_ROOT, "jpeg.pdf")) as reader:
455 |                 with PdfFileWriter(testfile) as writer:
456 |                     writer.appendPagesFromReader(reader)
457 |                     with open(
458 |                         join(  # pylint: disable=bad-continuation
459 |                             TEST_DATA_ROOT, "attachment_small.png"
460 |                         ),
461 |                         "rb",  # pylint: disable=bad-continuation  # pylint: disable=bad-continuation
462 |                     ) as attachment_stream:
463 |                         read_data = attachment_stream.read()
464 |                         writer.addAttachment("attachment_small.png", read_data)
465 |                     writer.write()
466 | 
467 |             # Check for attachment entries
468 |             with PdfFileReader(testfile) as pdf:
469 |                 # For caching _cachedObjects data
470 |                 pdf.numPages  # pylint: disable=pointless-statement
471 |                 for _k, v in pdf._cachedObjects.items():
472 |                     if "/Type" in v:
473 |                         if v["/Type"] == "/Catalog":
474 |                             self.assertIsNotNone(v["/Names"]["/EmbeddedFiles"])
475 |                             real = len(v["/Names"]["/EmbeddedFiles"]["/Names"])
476 |                             self.assertEqual(2, real)
477 |         finally:
478 |             os.remove(testfile)
479 | 
480 |     def testAttachFiles(self):
481 |         """
482 |         Tests the addAttachment function for attaching multiple files.
483 | 
484 |         Since the Names array in the EmbeddedFiles dictionary contains both the
485 |         name (string) and indirect object (dictionary) for each file, we have
486 |         to check for two entries per attached file.
487 |         """
488 | 
489 |         numAttachments = 3
490 |         _, testfile = tempfile.mkstemp()
491 | 
492 |         try:
493 |             # Make PDF with attachment
494 |             with PdfFileReader(join(TEST_DATA_ROOT, "jpeg.pdf")) as reader:
495 |                 with PdfFileWriter(testfile) as writer:
496 |                     writer.appendPagesFromReader(reader)
497 | 
498 |                     writer.attachFiles(
499 |                         [join(TEST_DATA_ROOT, "attachment_small.png")] * numAttachments
500 |                     )
501 |                     writer.write()
502 | 
503 |             # Check for attachment entries
504 |             with PdfFileReader(testfile) as pdf:
505 |                 # For caching _cachedObjects data
506 |                 pdf.numPages  # pylint: disable=pointless-statement
507 |                 for _k, v in pdf._cachedObjects.items():
508 |                     if "/Type" in v:
509 |                         if v["/Type"] == "/Catalog":
510 |                             self.assertIsNotNone(v["/Names"]["/EmbeddedFiles"])
511 |                             real = len(v["/Names"]["/EmbeddedFiles"]["/Names"])
512 |                             self.assertEqual(numAttachments * 2, real)
513 |         finally:
514 |             os.remove(testfile)
515 | 
516 | 
class AddJsTestCase(unittest.TestCase):
    """
    Tests ``PdfFileWriter.addJS()`` on a writer that has been filled with the
    pages of ``crazyones.pdf``.
    """

    def setUp(self):
        """
        Creates, before every test, an in-memory writer holding the pages of
        ``crazyones.pdf``.
        """
        reader = PdfFileReader(join(TEST_DATA_ROOT, "crazyones.pdf"))
        # The reader has to outlive setUp() because the writer was fed from
        # it, so it cannot simply be wrapped in a ``with`` block here.
        # Register the exit hook a ``with`` statement would have invoked so
        # that the reader is still released once the test is over.
        self.addCleanup(reader.__exit__, None, None, None)

        self.writer = PdfFileWriter(BytesIO(b""))
        self.writer.appendPagesFromReader(reader)

    def testAdd(self):
        """
        Checks that a single ``addJS()`` call leaves a ``/Names`` entry in the
        root object, a ``/JavaScript`` entry below ``/Names`` and a
        ``/JavaScript`` entry in the root object itself.
        """
        self.writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

        self.assertIn(
            "/Names",
            self.writer._rootObject,
            "addJS should add a name catalog in the root object.",
        )
        self.assertIn(
            "/JavaScript",
            self.writer._rootObject["/Names"],
            "addJS should add a JavaScript name tree under the name catalog.",
        )
        self.assertIn(
            "/JavaScript",
            self.writer._rootObject,
            "addJS should add a JavaScript action to the catalog.",
        )

    def testOverwrite(self):
        """
        Calls ``addJS()`` twice with the same script and checks that the first
        entry of the JavaScript name tree's ``/Names`` array differs between
        the two calls.
        """
        self.writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
        first_js = self._getJavascriptName()

        self.writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
        second_js = self._getJavascriptName()

        self.assertNotEqual(
            first_js,
            second_js,
            "addJS should overwrite the previous script in the catalog.",
        )

    def _getJavascriptName(self):
        """
        Returns the first entry of the ``/Names`` array stored under
        ``/Names`` -> ``/JavaScript`` in the writer's root object, asserting
        along the way that every level of that path exists.
        """
        self.assertIn("/Names", self.writer._rootObject)
        self.assertIn("/JavaScript", self.writer._rootObject["/Names"])
        self.assertIn("/Names", self.writer._rootObject["/Names"]["/JavaScript"])
        return self.writer._rootObject["/Names"]["/JavaScript"]["/Names"][0]
565 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2018 Acsor <nildexo@yandex.com>
  2 | # Copyright 2019 Kurt McKee <contactme@kurtmckee.org>
  3 | # All rights reserved.
  4 | #
  5 | # Redistribution and use in source and binary forms, with or without
  6 | # modification, are permitted provided that the following conditions are
  7 | # met:
  8 | #
  9 | # * Redistributions of source code must retain the above copyright notice,
 10 | # this list of conditions and the following disclaimer.
 11 | # * Redistributions in binary form must reproduce the above copyright notice,
 12 | # this list of conditions and the following disclaimer in the documentation
 13 | # and/or other materials provided with the distribution.
 14 | # * The name of the author may not be used to endorse or promote products
 15 | # derived from this software without specific prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 18 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 20 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 21 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 22 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 23 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 24 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 26 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 27 | # POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | import io
 30 | import re
 31 | import string
 32 | import unittest
 33 | 
 34 | import pytest
 35 | 
 36 | import pypdf.utils
 37 | from tests.utils import bitstringToInt, intToBitstring
 38 | 
 39 | # Establish the bytes/str/unicode types.
 40 | try:
 41 |     unicode
 42 | except NameError:
 43 |     # Python 3
 44 |     bytes_type = bytes
 45 |     str_type = str
 46 |     unicode_type = str
 47 | else:
 48 |     # Python 2
 49 |     bytes_type = str
 50 |     str_type = str
 51 |     unicode_type = unicode
 52 | 
 53 | 
 54 | class UtilsTestCase(unittest.TestCase):
 55 |     """
 56 |     UtilsTestCase is intended to test the code utilities in utils.py.
 57 |     """
 58 | 
 59 |     def testHexEncode(self):
 60 |         inputs = (
 61 |             string.ascii_lowercase,
 62 |             string.ascii_uppercase,
 63 |             string.ascii_letters,
 64 |             " \t\n\r\x0b\x0c",
 65 |             # All the characters from \x00 to \xff in ascending order
 66 |             "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10"
 67 |             '\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#'
 68 |             "$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`ab"
 69 |             "cdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87"
 70 |             "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97"
 71 |             "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
 72 |             "\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
 73 |             "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
 74 |             "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7"
 75 |             "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7"
 76 |             "\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
 77 |             "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
 78 |         )
 79 |         expOutputs = (
 80 |             "6162636465666768696a6b6c6d6e6f707172737475767778797a",
 81 |             "4142434445464748494a4b4c4d4e4f505152535455565758595a",
 82 |             "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464"
 83 |             "748494a4b4c4d4e4f505152535455565758595a",
 84 |             "20090a0d0b0c",
 85 |             "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2"
 86 |             "02122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f40"
 87 |             "4142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606"
 88 |             "162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f8081"
 89 |             "82838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a"
 90 |             "2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2"
 91 |             "c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e"
 92 |             "3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff",
 93 |         )
 94 | 
 95 |         for o, i in zip(expOutputs, inputs):
 96 |             self.assertEqual(o, pypdf.utils.hexEncode(i))
 97 | 
 98 |     def testPairs(self):
 99 |         """
100 |         Tests ``utils.pairs()``.
101 |         """
102 |         inputs = (range(0), range(6), range(10))
103 |         expOutputs = (
104 |             tuple(),
105 |             ((0, 1), (2, 3), (4, 5)),
106 |             ((0, 1), (2, 3), (4, 5), (6, 7), (8, 9)),
107 |         )
108 | 
109 |         for o, i in zip(expOutputs, inputs):
110 |             self.assertTupleEqual(o, tuple(pypdf.utils.pairs(i)))
111 | 
112 |     def testPairsException(self):
113 |         """
114 |         Tests ``utils.pairs()`` when it is fed unaccepted values.
115 |         """
116 |         inputs = (range(1), range(5), range(11), range(111))
117 | 
118 |         for i in inputs:
119 |             with self.assertRaises(ValueError):
120 |                 list(pypdf.utils.pairs(i))
121 | 
122 | 
class TestUtilsTestCase(unittest.TestCase):
    """
    TestUtilsTestCase covers the helpers that only exist to support the test
    suite, as opposed to the project-wide ones.
    """

    def testIntToBitstringToInt(self):
        """
        Ensures that bitstringToInt(intToBitstring(input)) == input.
        """
        for number in range(2 ** 12 + 1):
            roundtrip = bitstringToInt(intToBitstring(number))
            self.assertEqual(number, roundtrip)

    def testBitstringToInt(self):
        """
        Ensures that bitstringToInt() yields the expected integer for a
        selection of bit strings, with and without leading zeros.
        """
        # Each entry couples a bit string with the integer it stands for.
        cases = (
            ("00000000", 0),
            ("", 0),
            ("00000001", 1),
            ("1", 1),
            ("01010101", 85),
            ("1010101", 85),
            ("10101010", 170),
            ("11111111", 255),
            ("100000000", 256),
            ("0100000000", 256),
            ("00100000000", 256),
            ("000100000000", 256),
            ("100000001", 257),
            ("0100000001", 257),
            ("00100000001", 257),
            ("000100000001", 257),
        )

        for bits, expected in cases:
            self.assertEqual(expected, bitstringToInt(bits))
182 | 
183 | 
@pytest.mark.parametrize(
    "arg, expected",
    [
        (u"str", True),
        ("str", True),
        (123, False),
        # The function *possibly* behaves incorrectly here.
        #
        # Python 2 support ends in 2020, but until then is_string() should
        # arguably return False for Python 2 `bytes` and `str` objects and
        # True for `unicode` objects only.
        #
        # Should that ever get fixed, the following extra test parameter has
        # to be uncommented:
        #
        # (b'bytes', False)
    ],
)
def testIsString(arg, expected):
    """Checks the verdict of ``utils.isString()`` for each argument."""
    verdict = pypdf.utils.isString(arg)
    assert verdict == expected
204 | 
205 | 
@pytest.mark.parametrize(
    "arg, expected",
    [
        (123, True),
        (1 << 100, True),
        (123.123, False),
        ("str", False),
    ],
)
def testIsInt(arg, expected):
    """Checks the verdict of ``utils.isInt()`` for each argument."""
    verdict = pypdf.utils.isInt(arg)
    assert verdict == expected
211 | 
212 | 
@pytest.mark.parametrize(
    "arg, expected",
    [
        (b"bytes", True),
        (u"bytes".encode("utf8"), True),
        (u"str", False),
        (b"str".decode("utf8"), False),
        (10, False),
    ],
)
def testIsBytes(arg, expected):
    """Checks the verdict of ``utils.isBytes()`` for each argument."""
    verdict = pypdf.utils.isBytes(arg)
    assert verdict == expected
225 | 
226 | 
@pytest.mark.parametrize(
    "data, maxchars, expected_value, expected_tell",
    [
        (b"", None, b"", 0),
        (b"abcdef", None, b"abcdef", 6),
        (b"abcdef", 3, b"abc", 3),
        (b"abc def", None, b"abc", 4),
    ],
)
def testReadUntilWhitespace(data, maxchars, expected_value, expected_tell):
    """
    Checks the bytes returned by ``utils.readUntilWhitespace()`` as well as
    the position the stream is left at.
    """
    source = io.BytesIO(data)
    fetched = pypdf.utils.readUntilWhitespace(source, maxchars)
    assert fetched == expected_value
    assert source.tell() == expected_tell
240 | 
241 | 
@pytest.mark.parametrize(
    "data, expected_value, expected_tell",
    [
        (b"", b"", 0),
        (b"      ", b"", 6),
        (b"   a   ", b"a", 4),
    ],
)
def testReadNonWhitespace(data, expected_value, expected_tell):
    """
    Checks the value returned by ``utils.readNonWhitespace()`` as well as the
    position the stream is left at.
    """
    source = io.BytesIO(data)
    fetched = pypdf.utils.readNonWhitespace(source)
    assert fetched == expected_value
    assert source.tell() == expected_tell
250 | 
251 | 
@pytest.mark.parametrize(
    "data, expected_result, expected_tell",
    [
        (b"", False, 0),
        (b"      ", True, 6),
        (b"a   ", False, 1),
        (b" a   ", True, 2),
        (b"  a   ", True, 3),
    ],
)
def testSkipOverWhitespace(data, expected_result, expected_tell):
    """
    Checks the flag returned by ``utils.skipOverWhitespace()`` as well as the
    position the stream is left at.
    """
    source = io.BytesIO(data)
    outcome = pypdf.utils.skipOverWhitespace(source)
    assert outcome == expected_result
    assert source.tell() == expected_tell
266 | 
267 | 
@pytest.mark.parametrize(
    "data, expected_tell",
    [
        (b"", 0),
        (b" ", 0),
        (b"a", 0),
        (b"%a\n\r", 3),
        (b"%a\r\n", 3),
        (b"%aa\r", 4),
    ],
)
def testSkipOverComments(data, expected_tell):
    """
    Checks the position ``utils.skipOverComment()`` leaves the stream at; its
    return value is not inspected.
    """
    source = io.BytesIO(data)
    pypdf.utils.skipOverComment(source)
    position = source.tell()
    assert position == expected_tell
276 | 
277 | 
@pytest.mark.parametrize(
    "data, pattern, expected_value, expected_tell",
    [
        (b"", b"123", b"", 0),
        (b"abc123def", b"123", b"abc", 3),
        (b"abcdef", b"123", b"abcdef", 6),
    ],
)
def testReadUntilRegex(data, pattern, expected_value, expected_tell):
    """
    Checks the bytes returned by ``utils.readUntilRegex()`` when called with
    ``ignore_eof=True``, as well as the position the stream is left at.
    """
    source = io.BytesIO(data)
    fetched = pypdf.utils.readUntilRegex(
        source, re.compile(pattern), ignore_eof=True
    )
    assert fetched == expected_value
    assert source.tell() == expected_tell
291 | 
292 | 
def testReadUntilRegexException():
    """
    Checks that ``utils.readUntilRegex()`` raises ``PdfStreamError`` when the
    pattern is absent from the stream and ``ignore_eof`` is False.
    """
    source = io.BytesIO(b"abcdef")
    absent = re.compile(b"123")
    with pytest.raises(pypdf.utils.PdfStreamError):
        pypdf.utils.readUntilRegex(source, absent, ignore_eof=False)
298 | 
299 | 
def testMatrixMultiply():
    """Checks ``utils.matrixMultiply()`` on a pair of 2x2 integer matrices."""
    left = [[1, 2], [3, 4]]
    right = [[2, 3], [5, 7]]

    product = pypdf.utils.matrixMultiply(left, right)

    # Row-by-column products, e.g. 1 * 2 + 2 * 5 == 12 for the first cell
    assert product == [[12, 17], [26, 37]]
314 | 
315 | 
@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (b"a", b"a"),
        (b"a"[0], b"a"),
        ("a", b"a"),
        (u"a", b"a"),
        (97, b"a"),
    ],
)
def testPypdfBytes(arg, expected_value):
    """Checks both the value and the type of what ``utils.pypdfBytes()`` returns."""
    converted = pypdf.utils.pypdfBytes(arg)
    assert converted == expected_value
    assert isinstance(converted, bytes_type)
324 | 
325 | 
@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (b"abc", "abc"),
        ("abc", "abc"),
        (u"abc", "abc"),
    ],
)
def testPypdfStr(arg, expected_value):
    """Checks both the value and the type of what ``utils.pypdfStr()`` returns."""
    converted = pypdf.utils.pypdfStr(arg)
    assert converted == expected_value
    assert isinstance(converted, str_type)
333 | 
334 | 
@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (b"abc", u"abc"),
        ("abc", u"abc"),
        (u"abc", u"abc"),
        (b"\\u0061bc", u"abc"),
        (u"\\u0061bc", u"\\u0061bc"),
    ],
)
def testPypdfUnicode(arg, expected_value):
    """Checks both the value and the type of what ``utils.pypdfUnicode()`` returns."""
    converted = pypdf.utils.pypdfUnicode(arg)
    assert converted == expected_value
    assert isinstance(converted, unicode_type)
349 | 
350 | 
@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (b"a", 97),
        (b"a"[0], 97),
        ("a", 97),
        ("a"[0], 97),
        (u"a", 97),
        (u"a"[0], 97),
        (97, 97),
    ],
)
def testPypdfOrd(arg, expected_value):
    """Checks both the value and the type of what ``utils.pypdfOrd()`` returns."""
    converted = pypdf.utils.pypdfOrd(arg)
    assert converted == expected_value
    assert isinstance(converted, int)
367 | 
368 | 
@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (97, "a"),
        (b"a", "a"),
        ("a", "a"),
        (u"a", "a"),
    ],
)
def testPypdfChr(arg, expected_value):
    """Checks both the value and the type of what ``utils.pypdfChr()`` returns."""
    converted = pypdf.utils.pypdfChr(arg)
    assert converted == expected_value
    assert isinstance(converted, str_type)
376 | 
377 | 
@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (0x1, "0x1"),
        (1 << 100, "0x10000000000000000000000000"),
    ],
)
def testHexStr(arg, expected_value):
    """Checks both the value and the type of what ``utils.hexStr()`` returns."""
    rendered = pypdf.utils.hexStr(arg)
    assert rendered == expected_value
    assert isinstance(rendered, str_type)
385 | 
386 | 
def testRC4Encode():
    """
    Checks the value and the type of what ``utils.RC4Encrypt()`` returns for
    one fixed pair of arguments.
    """
    encrypted = pypdf.utils.RC4Encrypt("def", "abc")
    assert encrypted == b"\x9e\xa6\xef"
    assert isinstance(encrypted, bytes)
391 | 
392 | 
@pytest.mark.parametrize(
    "filename",
    [
        r"path/to/filename",
        r"path\to\filename",
        r"filename",
    ],
)
def testFormatWarning(filename):
    """
    Checks that ``utils.formatWarning()`` reports only the last component of
    ``filename``, whether it is separated by slashes or by backslashes.
    """
    rendered = pypdf.utils.formatWarning(
        "message", Warning, filename, "lineno", "line"
    )
    assert rendered == "Warning: message [filename:lineno]\n"
400 | 
401 | 
def testWhitespaces():
    """
    Checks that ``utils.WHITESPACES`` holds exactly the expected characters
    and that each of them is a bytes object.
    """
    expected = {b" ", b"\n", b"\r", b"\t", b"\x00"}
    assert expected == set(pypdf.utils.WHITESPACES)
    assert all(isinstance(item, bytes_type) for item in pypdf.utils.WHITESPACES)
407 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | File containing utils intended to be used in unit testing rather than the
 3 | internal project codebase.
 4 | """
 5 | 
 6 | 
 7 | def intToBitstring(n__, fill=8):
 8 |     """
 9 |     Turns an integer ``n`` into its corresponding textual bit representation.
10 | 
11 |     :param fill: number of zeros to pad the bit representation with.
12 |     :raises TypeError: if n is not an integer.
13 |     """
14 |     if not isinstance(n__, int):
15 |         raise TypeError("n must be an integer")
16 | 
17 |     return ("{bits:0>%db}" % fill).format(bits=n__)
18 | 
19 | 
def bitstringToInt(b__):
    """
    Performs the reverse of ``intToBitstring()``.

    :param b__: a string made only of ``"0"`` and ``"1"`` characters; the
        empty string is accepted and stands for zero.
    :return: the integer that ``b__`` represents in base 2.
    :raises TypeError: if ``b__`` is not a ``str``.
    :raises ValueError: if ``b__`` holds any character other than 0 or 1.
    """
    if not isinstance(b__, str):
        raise TypeError("Expected str, got %s" % b__.__class__)
    if set(b__) - {"0", "1"}:
        raise ValueError("b must be a string containing only 0's and 1's")

    # int() refuses the empty string, which stands for zero here.
    return int(b__, 2) if b__ else 0
34 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | # Summer 2020: is it still worth keeping py27 and py35 in the envlist?
 3 | envlist =
 4 |     clean,
 5 |     py27, py35, py36, py37, py38
 6 |     report
 7 | 
 8 | [testenv]
 9 | commands = pytest --cov --cov-append tests/
10 | deps =
11 |     pytest
12 |     pytest-cov
13 | 
14 | [testenv:clean]
15 | commands = python -m coverage erase
16 | 
17 | [testenv:py27]
18 | basepython = python2.7
19 | 
20 | [testenv:py35]
21 | basepython = python3.5
22 | 
23 | [testenv:py36]
24 | basepython = python3.6
25 | 
26 | [testenv:py37]
27 | basepython = python3.7
28 | 
29 | [testenv:report]
30 | commands = python -m coverage html
31 | 


--------------------------------------------------------------------------------