├── .coveragerc ├── .gitignore ├── .isort.cfg ├── .travis.yml ├── CHANGELOG ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── pylintrc ├── pypdf ├── __init__.py ├── _version.py ├── filters.py ├── generic.py ├── merger.py ├── pagerange.py ├── pdf.py ├── utils.py └── xmp.py ├── samplecode ├── MergingComments.py ├── PDFComments2XL.py ├── README.md ├── __init__.py ├── basic_features.py ├── basic_merging.py └── pdfsamples │ ├── AutoCad_Diagram.pdf │ ├── AutoCad_Simple.pdf │ ├── GeoBase_NHNC1_Data_Model_UML_EN.pdf │ ├── README.md │ ├── SF424_page2.pdf │ ├── Seige_of_Vicksburg_Sample_OCR.pdf │ └── jpeg.pdf ├── scripts ├── 2-up.py ├── codecs.py ├── pdf-image-extractor.py └── pdfcat ├── setup.py ├── tests ├── __init__.py ├── fixture_data │ ├── GeoBase_NHNC1_Data_Model_UML_EN.pdf │ ├── Hamlet.txt │ ├── SF424_page2.pdf │ ├── Seige_of_Vicksburg_Sample_OCR.pdf │ ├── TheHappyPrince.txt │ ├── attachment_small.png │ ├── crazyones.pdf │ ├── jpeg.pdf │ ├── testDecodeStreamData │ │ ├── ASCII85Decode.pdf │ │ ├── CCITTFaxDecode.pdf │ │ ├── DCTDecode.pdf │ │ ├── FlateDecode.pdf │ │ └── LZWDecode.pdf │ ├── testFileLoad │ │ └── crazyones.txt │ ├── testIsObjectFree │ │ ├── GeoBase_NHNC1_Data_Model_UML_EN.pdf │ │ ├── SF424_page2.pdf │ │ ├── Seige_of_Vicksburg_Sample_OCR.pdf │ │ └── jpeg.pdf │ ├── testJpegImage │ │ └── jpeg.txt │ ├── testReadXRefStreamCompressedObjects │ │ └── crazyones.pdf │ ├── testXRefStreamObjects │ │ └── crazyones.pdf │ ├── testXRefTableObjects │ │ ├── SF424_page2.pdf │ │ ├── Seige_of_Vicksburg_Sample_OCR.pdf │ │ └── jpeg.pdf │ └── testXTableAgainstXStream │ │ └── GeoBase_NHNC1_Data_Model_UML_EN.pdf ├── test_filters.py ├── test_generic.py ├── test_pdf.py ├── test_utils.py └── utils.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = .tox/* 3 | 4 | # TODO: still need to arrange coverage for samplecode/*.py and scripts/*.py. 
5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | *.pyc 3 | *.sw[op] 4 | .DS_Store 5 | .tox 6 | build 7 | .idea/* 8 | htmlcov/ 9 | .coverage 10 | MANIFEST 11 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | # CL hasn't yet identified a perfect match for Black. This 3 | # comes close. 4 | force_grid_wrap=0 5 | force_sort_within_sections=True 6 | include_trailing_comma=True 7 | # Note 100 is the default for Pylint. Maybe I'll configure Black 8 | # to enforce it, also. 9 | # line_length=100 10 | multi_line_output=3 11 | use_parentheses=True 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: "2.7" 3 | sudo: false 4 | 5 | env: 6 | - TOX_ENV=py27 7 | - TOX_ENV=py33 8 | - TOX_ENV=py34 9 | - TOX_ENV=py35 10 | 11 | install: 12 | - pip install tox --use-mirrors 13 | 14 | script: 15 | - tox -e $TOX_ENV 16 | 17 | matrix: 18 | # Python 3.5 not yet available on travis, watch this to see when it is. 
19 | fast_finish: true 20 | allow_failures: 21 | - env: TOX_ENV=py35 22 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | Version 1.27.0, 2018-08-07 2 | -------------------------- 3 | - NOTE: Active maintenance shifts to PyPDF4 4 | 5 | - No functional changes: just migration to PyPDF4 6 | 7 | 8 | Version 1.26.0, 2016-05-18 9 | -------------------------- 10 | 11 | - NOTE: Active maintenance on PyPDF2 is resuming after a hiatus 12 | 13 | - Fixed a bug where image resources were incorrectly 14 | overwritten when merging pages 15 | 16 | - Added dictionary for JavaScript actions to the root (louib) 17 | 18 | - Added unit tests for the JS functionality (louib) 19 | 20 | - Add more Python 3 compatibility when reading inline images (im2703 21 | and VyacheslavHashov) 22 | 23 | - Return NullObject instead of raising error when failing to resolve 24 | object (ctate) 25 | 26 | - Don't output warning for non-zeroed xref table when strict=False 27 | (BenRussert) 28 | 29 | - Remove extraneous zeroes from output formatting (speedplane) 30 | 31 | - Fix bug where reading an inline image would cut off prematurely 32 | in certain cases (speedplane) 33 | 34 | 35 | Patch 1.25.1, 2015-07-20 36 | 37 | - Fix bug when parsing inline images. Occurred when merging 38 | certain pages with inline images 39 | 40 | - Fixed type error when creating outlines by utilizing the 41 | isString() test 42 | 43 | Version 1.25, 2015-07-07 44 | ------------------------ 45 | 46 | BUGFIXES: 47 | 48 | - Added Python 3 algorithm for ASCII85Decode. Fixes issue when 49 | reading reportlab-generated files with Py 3 (jerickbixly) 50 | 51 | - Recognize more escape sequences which would otherwise throw an 52 | exception (manuelzs, robertsoakes) 53 | 54 | - Fixed overflow error in generic.py. 
Occurred 55 | when reading a too-large int in Python 2 (by Raja Jamwal) 56 | 57 | - Allow access to files which were encrypted with an empty 58 | password. Previously threw a "File has not been decrypted" 59 | exception (Elena Williams) 60 | 61 | - Do not attempt to decode an empty data stream. Previously 62 | would cause an error in decode algorithms (vladir) 63 | 64 | - Fixed some type issues specific to Py 2 or Py 3 65 | 66 | - Fix issue when stream data begins with whitespace (soloma83) 67 | 68 | - Recognize abbreviated filter names (AlmightyOatmeal and 69 | Matthew Weiss) 70 | 71 | - Copy decryption key from PdfFileReader to PdfFileMerger. 72 | Allows usage of PdfFileMerger with encrypted files (twolfson) 73 | 74 | - Fixed bug which occurred when a NameObject is present at end 75 | of a file stream. Threw a "Stream has ended unexpectedly" 76 | exception (speedplane) 77 | 78 | FEATURES: 79 | 80 | - Initial work on a test suite; to be expanded in future. 81 | Tests and Resources directory added, README updated (robertsoakes) 82 | 83 | - Added document cloning methods to PdfFileWriter: 84 | appendPagesFromReader, cloneReaderDocumentRoot, and 85 | cloneDocumentFromReader. See official documentation (robertsoakes) 86 | 87 | - Added method for writing to form fields: updatePageFormFieldValues. 88 | This will be enhanced in the future. See official documentation 89 | (robertsoakes) 90 | 91 | - New addAttachment method. See documentation. Support for adding 92 | and extracting embedded files to be enhanced in the future 93 | (moshekaplan) 94 | 95 | - Added methods to get page number of given PageObject or 96 | Destination: getPageNumber and getDestinationPageNumber. 
97 | See documentation (mozbugbox) 98 | 99 | OTHER ENHANCEMENTS: 100 | 101 | - Enhanced type handling (Brent Amrhein) 102 | 103 | - Enhanced exception handling in NameObject (sbywater) 104 | 105 | - Enhanced extractText method output (peircej) 106 | 107 | - Better exception handling 108 | 109 | - Enhanced regex usage in NameObject class (speedplane) 110 | 111 | 112 | Version 1.24, 2014-12-31 113 | ------------------------ 114 | 115 | - Bugfixes for reading files in Python 3 (by Anthony Tuininga and 116 | pqqp) 117 | 118 | - Appropriate errors are now raised instead of infinite loops (by 119 | naure and Cyrus Vafadari) 120 | 121 | - Bugfix for parsing number tokens with leading spaces (by Maxim 122 | Kamenkov) 123 | 124 | - Don't crash on bad /Outlines reference (by eshellman) 125 | 126 | - Conform tabs/spaces and blank lines to PEP 8 standards 127 | 128 | - Utilize the readUntilRegex method when reading Number Objects 129 | (by Brendan Jurd) 130 | 131 | - More bugfixes for Python 3 and clearer exception handling 132 | 133 | - Fixed encoding issue in merger (with eshellman) 134 | 135 | - Created separate folder for scripts 136 | 137 | 138 | Version 1.23, 2014-08-11 139 | ------------------------ 140 | 141 | - Documentation now available at http://pythonhosted.org//PyPDF2 142 | 143 | - Bugfix in pagerange.py for when __init__.__doc__ has no value (by 144 | Vladir Cruz) 145 | 146 | - Fix typos in OutlinesObject().add() (by shilluc) 147 | 148 | - Re-added a missing return statement in a utils.py method 149 | 150 | - Corrected viewing mode names (by Jason Scheirer) 151 | 152 | - New PdfFileWriter method: addJS() (by vfigueiro) 153 | 154 | - New bookmark features: color, boldness, italics, and page fit 155 | (by Joshua Arnott) 156 | 157 | - New PdfFileReader method: getFields(). Used to extract field 158 | information from PDFs with interactive forms. 
See documentation 159 | for details 160 | 161 | - Converted README file to markdown format (by Stephen Bussard) 162 | 163 | - Several improvements to overall performance and efficiency 164 | (by mozbugbox) 165 | 166 | - Fixed a bug where geospatial information was not scaling along with 167 | its page 168 | 169 | - Fixed a type issue and a Python 3 issue in the decryption algorithms 170 | (with Francisco Vieira and koba-ninkigumi) 171 | 172 | - Fixed a bug causing an infinite loop in the ASCII 85 decoding 173 | algorithm (by madmaardigan) 174 | 175 | - Annotations (links, comment windows, etc.) are now preserved when 176 | pages are merged together 177 | 178 | - Used the Destination class in addLink() and addBookmark() so that 179 | the page fit option could be properly customized 180 | 181 | 182 | Version 1.22, 2014-05-29 183 | ------------------------ 184 | 185 | - Added .DS_Store to .gitignore (for Mac users) (by Steve Witham) 186 | 187 | - Removed __init__() implementation in NameObject (by Steve Witham) 188 | 189 | - Fixed bug (inf. 
loop) when merging pages in Python 3 (by commx) 190 | 191 | - Corrected error when calculating height in scaleTo() 192 | 193 | - Removed unnecessary code from DictionaryObject (by Georges Dubus) 194 | 195 | - Fixed bug where an exception was thrown upon reading a NULL string 196 | (by speedplane) 197 | 198 | - Allow string literals (non-unicode strings in Python 2) to be passed 199 | to PdfFileReader 200 | 201 | - Allow ConvertFunctionsToVirtualList to be indexed with slices and 202 | longs (in Python 2) (by Matt Gilson) 203 | 204 | - Major improvements and bugfixes to addLink() method (see documentation 205 | in source code) (by Henry Keiter) 206 | 207 | - General code clean-up and improvements (with Steve Witham and Henry Keiter) 208 | 209 | - Fixed bug that caused crash when comments are present at end of 210 | dictionary 211 | 212 | 213 | Version 1.21, 2014-04-21 214 | ------------------------ 215 | 216 | - Fix for when /Type isn't present in the Pages dictionary (by Rob1080) 217 | 218 | - More tolerance for extra whitespace in Indirect Objects 219 | 220 | - Improved Exception handling 221 | 222 | - Fixed error in getHeight() method (by Simon Kaempflein) 223 | 224 | - implement use of utils.string_type to resolve Py2-3 compatibility issues 225 | 226 | - Prevent exception for multiple definitions in a dictionary (with carlosfunk) 227 | (only when strict = False) 228 | 229 | - Fixed errors when parsing a slice using pdfcat on command line (by 230 | Steve Witham) 231 | 232 | - Tolerance for EOF markers within 1024 bytes of the actual end of the 233 | file (with David Wolever) 234 | 235 | - Added overwriteWarnings parameter to PdfFileReader constructor, if False 236 | PyPDF2 will NOT overwrite methods from Python's warnings.py module with 237 | a custom implementation. 
238 | 239 | - Fix NumberObject and NameObject constructors for compatibility with PyPy 240 | (Rüdiger Jungbeck, Xavier Dupré, shezadkhan137, Steven Witham) 241 | 242 | - Utilize utils.Str in pdf.py and pagerange.py to resolve type issues (by 243 | egbutter) 244 | 245 | - Improvements in implementing StringIO for Python 2 and BytesIO for 246 | Python 3 (by Xavier Dupré) 247 | 248 | - Added /x00 to Whitespaces, defined utils.WHITESPACES to clarify code (by 249 | Maxim Kamenkov) 250 | 251 | - Bugfix for merging 3 or more resources with the same name (by lucky-user) 252 | 253 | - Improvements to Xref parsing algorithm (by speedplane) 254 | 255 | 256 | Version 1.20, 2014-01-27 257 | ------------------------ 258 | 259 | - Official Python 3+ support (with contributions from TWAC and cgammans) 260 | Support for Python versions 2.6 and 2.7 will be maintained 261 | 262 | - Command line concatenation (see pdfcat in sample code) (by Steve Witham) 263 | 264 | - New FAQ; link included in README 265 | 266 | - Allow more (although unnecessary) escape sequences 267 | 268 | - Prevent exception when reading a null object in decoding parameters 269 | 270 | - Corrected error in reading destination types (added a slash since they 271 | are name objects) 272 | 273 | - Corrected TypeError in scaleTo() method 274 | 275 | - addBookmark() method in PdfFileMerger now returns bookmark (so nested 276 | bookmarks can be created) 277 | 278 | - Additions to Sample Code and Sample PDFs 279 | 280 | - changes to allow 2up script to work (see sample code) (by Dylan McNamee) 281 | 282 | - changes to metadata encoding (by Chris Hiestand) 283 | 284 | - New methods for links: addLink() (by Enrico Lambertini) and removeLinks() 285 | 286 | - Bugfix to handle nested bookmarks correctly (by Jamie Lentin) 287 | 288 | - New methods removeImages() and removeText() available for PdfFileWriter 289 | (by Tien Haï) 290 | 291 | - Exception handling for illegal characters in Name Objects 292 | 293 | 294 | Version 
1.19, 2013-10-08 295 | ------------------------ 296 | 297 | BUGFIXES: 298 | - Removed pop in sweepIndirectReferences to prevent infinite loop 299 | (provided by ian-su-sirca) 300 | 301 | - Fixed bug caused by whitespace when parsing PDFs generated by AutoCad 302 | 303 | - Fixed a bug caused by reading a 'null' ASCII value in a dictionary 304 | object (primarily in PDFs generated by AutoCad). 305 | 306 | FEATURES: 307 | - Added new folders for PyPDF2 sample code and example PDFs; see README 308 | for each folder 309 | 310 | - Added a method for debugging purposes to show current location while 311 | parsing 312 | 313 | - Ability to create custom metadata (by jamma313) 314 | 315 | - Ability to access and customize document layout and view mode 316 | (by Joshua Arnott) 317 | 318 | OTHER: 319 | - Added and corrected some documentation 320 | 321 | - Added some more warnings and exception messages 322 | 323 | - Removed old test/debugging code 324 | 325 | UPCOMING: 326 | - More bugfixes (We have received many problematic PDFs via email, we 327 | will work with them) 328 | 329 | - Documentation - It's time for PyPDF2 to get its own documentation 330 | since it has grown much since the original pyPdf 331 | 332 | - A FAQ to answer common questions 333 | 334 | 335 | Version 1.18, 2013-08-19 336 | ------------------------ 337 | 338 | - Fixed a bug where older versions of objects were incorrectly added to the 339 | cache, resulting in outdated or missing pages, images, and other objects 340 | (from speedplane) 341 | 342 | - Fixed a bug in parsing the xref table where new xref values were 343 | overwritten; also cleaned up code (from speedplane) 344 | 345 | - New method mergeRotatedAroundPointPage which merges a page while rotating 346 | it around a point (from speedplane) 347 | 348 | - Updated Destination syntax to respect PDF 1.6 specifications (from 349 | jamma313) 350 | 351 | - Prevented infinite loop when a PdfFileReader object was instantiated 352 | with an empty file (from 
Jerome Nexedi) 353 | 354 | Other Changes: 355 | 356 | - Downloads now available via PyPI 357 | https://pypi.python.org/pypi?:action=display&name=PyPDF2 358 | 359 | - Installation through pip library is fixed 360 | 361 | 362 | Version 1.17, 2013-07-25 363 | ------------------------ 364 | 365 | - Removed one (from pdf.py) of the two Destination classes. Both 366 | classes had the same name, but were slightly different in content, 367 | causing some errors. (from Janne Vanhala) 368 | 369 | - Corrected and Expanded README file to demonstrate PdfFileMerger 370 | 371 | - Added filter for LZW encoded streams (from Michal Horejsek) 372 | 373 | - PyPDF2 issue tracker enabled on Github to allow community 374 | discussion and collaboration 375 | 376 | 377 | Versions -1.16, -2013-06-30 378 | --------------------------- 379 | 380 | - Note: This ChangeLog has not been kept up-to-date for a while. 381 | Hopefully we can keep better track of it from now on. Some of the 382 | changes listed here come from previous versions 1.14 and 1.15; they 383 | were only vaguely defined. With the new _version.py file we should 384 | have more structured and better documented versioning from now on. 385 | 386 | - Defined PyPDF2.__version__ 387 | 388 | - Fixed encrypt() method (from Martijn The) 389 | 390 | - Improved error handling on PDFs with truncated streams (from cecilkorik) 391 | 392 | - Python 3 support (from kushal-kumaran) 393 | 394 | - Fixed example code in README (from Jeremy Bethmont) 395 | 396 | - Fixed a bug caused by DecimalError Exception (from Adam Morris) 397 | 398 | - Many other bug fixes and features by: 399 | 400 | jeansch 401 | Anton Vlasenko 402 | Joseph Walton 403 | Jan Oliver Oelerich 404 | Fabian Henze 405 | And any others I missed. 406 | Thanks for contributing! 407 | 408 | 409 | Version 1.13, 2010-12-04 410 | ------------------------ 411 | 412 | - Fixed a typo in code for reading a "\b" escape character in strings. 413 | 414 | - Improved __repr__ in FloatObject. 
415 | 416 | - Fixed a bug in reading octal escape sequences in strings. 417 | 418 | - Added getWidth and getHeight methods to the RectangleObject class. 419 | 420 | - Fixed compatibility warnings with Python 2.4 and 2.5. 421 | 422 | - Added addBlankPage and insertBlankPage methods on PdfFileWriter class. 423 | 424 | - Fixed a bug with circular references in page's object trees (typically 425 | annotations) that prevented correctly writing out a copy of those pages. 426 | 427 | - New merge page functions allow application of a transformation matrix. 428 | 429 | - To all patch contributors: I did a poor job of keeping this ChangeLog 430 | up-to-date for this release, so I am missing attributions here for any 431 | changes you submitted. Sorry! I'll do better in the future. 432 | 433 | 434 | Version 1.12, 2008-09-02 435 | ------------------------ 436 | 437 | - Added support for XMP metadata. 438 | 439 | - Fix reading files with xref streams with multiple /Index values. 440 | 441 | - Fix extracting content streams that use graphics operators longer than 2 442 | characters. Affects merging PDF files. 443 | 444 | 445 | Version 1.11, 2008-05-09 446 | ------------------------ 447 | 448 | - Patch from Hartmut Goebel to permit RectangleObjects to accept NumberObject 449 | or FloatObject values. 450 | 451 | - PDF compatibility fixes. 452 | 453 | - Fix to read object xref stream in correct order. 454 | 455 | - Fix for comments inside content streams. 456 | 457 | 458 | Version 1.10, 2007-10-04 459 | ------------------------ 460 | 461 | - Text strings from PDF files are returned as Unicode string objects when 462 | pyPdf determines that they can be decoded (as UTF-16 strings, or as 463 | PDFDocEncoding strings). Unicode objects are also written out when 464 | necessary. This means that string objects in pyPdf can be either 465 | generic.ByteStringObject instances, or generic.TextStringObject instances. 466 | 467 | - The extractText method now returns a unicode string object. 
468 | 469 | - All document information properties now return unicode string objects. In 470 | the event that a document provides docinfo properties that are not decoded by 471 | pyPdf, the raw byte strings can be accessed with an "_raw" property (ie. 472 | title_raw rather than title) 473 | 474 | - generic.DictionaryObject instances have been enhanced to be easier to use. 475 | Values coming out of dictionary objects will automatically be de-referenced 476 | (.getObject will be called on them), unless accessed by the new "raw_get" 477 | method. DictionaryObjects can now only contain PdfObject instances (as keys 478 | and values), making it easier to debug where non-PdfObject values (which 479 | cannot be written out) are entering dictionaries. 480 | 481 | - Support for reading named destinations and outlines in PDF files. Original 482 | patch by Ashish Kulkarni. 483 | 484 | - Stream compatibility reading enhancements for malformed PDF files. 485 | 486 | - Cross reference table reading enhancements for malformed PDF files. 487 | 488 | - Encryption documentation. 489 | 490 | - Replace some "assert" statements with error raising. 491 | 492 | - Minor optimizations to FlateDecode algorithm increase speed when using PNG 493 | predictors. 494 | 495 | Version 1.9, 2006-12-15 496 | ----------------------- 497 | 498 | - Fix several serious bugs introduced in version 1.8, caused by a failure to 499 | run through our PDF test suite before releasing that version. 500 | 501 | - Fix bug in NullObject reading and writing. 502 | 503 | Version 1.8, 2006-12-14 504 | ----------------------- 505 | 506 | - Add support for decryption with the standard PDF security handler. This 507 | allows for decrypting PDF files given the proper user or owner password. 508 | 509 | - Add support for encryption with the standard PDF security handler. 510 | 511 | - Add new pythondoc documentation. 
512 | 513 | - Fix bug in ASCII85 decode that occurs when whitespace exists inside the 514 | two terminating characters of the stream. 515 | 516 | Version 1.7, 2006-12-10 517 | ----------------------- 518 | 519 | - Fix a bug when using a single page object in two PdfFileWriter objects. 520 | 521 | - Adjust PyPDF to be tolerant of whitespace characters that don't belong 522 | during a stream object. 523 | 524 | - Add documentInfo property to PdfFileReader. 525 | 526 | - Add numPages property to PdfFileReader. 527 | 528 | - Add pages property to PdfFileReader. 529 | 530 | - Add extractText function to PdfFileReader. 531 | 532 | 533 | Version 1.6, 2006-06-06 534 | ----------------------- 535 | 536 | - Add basic support for comments in PDF files. This allows us to read some 537 | ReportLab PDFs that could not be read before. 538 | 539 | - Add "auto-repair" for finding xref table at slightly bad locations. 540 | 541 | - New StreamObject backend, cleaner and more powerful. Allows the use of 542 | stream filters more easily, including compressed streams. 543 | 544 | - Add a graphics state push/pop around page merges. Improves quality of 545 | page merges when one page's content stream leaves the graphics 546 | in an abnormal state. 547 | 548 | - Add PageObject.compressContentStreams function, which filters all content 549 | streams and compresses them. This will reduce the size of PDF pages, 550 | especially after they could have been decompressed in a mergePage 551 | operation. 552 | 553 | - Support inline images in PDF content streams. 554 | 555 | - Add support for using .NET framework compression when zlib is not 556 | available. This does not make pyPdf compatible with IronPython, but it 557 | is a first step. 558 | 559 | - Add support for reading the document information dictionary, and extracting 560 | title, author, subject, producer and creator tags. 561 | 562 | - Add patch to support NullObject and multiple xref streams, from Bradley 563 | Lawrence. 
564 | 565 | 566 | Version 1.5, 2006-01-28 567 | ----------------------- 568 | 569 | - Fix a bug where merging pages did not work in "no-rename" cases when the 570 | second page has an array of content streams. 571 | 572 | - Remove some debugging output that should not have been present. 573 | 574 | 575 | Version 1.4, 2006-01-27 576 | ----------------------- 577 | 578 | - Add capability to merge pages from multiple PDF files into a single page 579 | using the PageObject.mergePage function. See example code (README or web 580 | site) for more information. 581 | 582 | - Add ability to modify a page's MediaBox, CropBox, BleedBox, TrimBox, and 583 | ArtBox properties through PageObject. See example code (README or web site) 584 | for more information. 585 | 586 | - Refactor pdf.py into multiple files: generic.py (contains objects like 587 | NameObject, DictionaryObject), filters.py (contains filter code), 588 | utils.py (various). This does not affect importing PdfFileReader 589 | or PdfFileWriter. 590 | 591 | - Add new decoding functions for standard PDF filters ASCIIHexDecode and 592 | ASCII85Decode. 593 | 594 | - Change url and download_url to refer to new pybrary.net web site. 595 | 596 | 597 | Version 1.3, 2006-01-23 598 | ----------------------- 599 | 600 | - Fix new bug introduced in 1.2 where PDF files with \r line endings did not 601 | work properly anymore. A new test suite developed with various PDF files 602 | should prevent regression bugs from now on. 603 | 604 | - Fix a bug where inheriting attributes from page nodes did not work. 605 | 606 | 607 | Version 1.2, 2006-01-23 608 | ----------------------- 609 | 610 | - Improved support for files with CRLF-based line endings, fixing a common 611 | reported problem stating "assertion error: assert line == "%%EOF"". 612 | 613 | - Software author/maintainer is now officially a proud married person, which 614 | is sure to result in better software... somehow. 
615 | 616 | 617 | Version 1.1, 2006-01-18 618 | ----------------------- 619 | 620 | - Add capability to rotate pages. 621 | 622 | - Improved PDF reading support to properly manage inherited attributes from 623 | /Type=/Pages nodes. This means that page groups that are rotated or have 624 | different media boxes or whatever will now work properly. 625 | 626 | - Added PDF 1.5 support. Namely cross-reference streams and object streams. 627 | This release can mangle Adobe's PDFReference16.pdf successfully. 628 | 629 | 630 | Version 1.0, 2006-01-17 631 | ----------------------- 632 | 633 | - First distutils-capable true public release. Supports a wide variety of PDF 634 | files that I found sitting around on my system. 635 | 636 | - Does not support some PDF 1.5 features, such as object streams, 637 | cross-reference streams. 638 | 639 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006-2008, Mathieu Fenniak 2 | Some contributions copyright (c) 2007, Ashish Kulkarni 3 | Some contributions copyright (c) 2014, Steve Witham 4 | 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright notice, 12 | this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | * The name of the author may not be used to endorse or promote products 17 | derived from this software without specific prior written permission. 
18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyPDF4 2 | PyPDF4 is a pure-python PDF library capable of splitting, merging together, 3 | cropping, and transforming the pages of PDF files. It can also add custom data, 4 | viewing options, and passwords to PDF files. It can retrieve text and metadata 5 | from PDFs as well as merge entire files together. 6 | 7 | What happened to PyPDF2? Nothing; it's still available at 8 | https://github.com/mstamy2/PyPDF2. For various reasons @claird will eventually 9 | explain, I've simply decided to mark a new "business model" with a 10 | slightly-renamed project name. 11 | While PyPDF4 will continue to be available at no charge, I have strong plans 12 | for better ongoing support to start in August 2018. 13 | 14 | Homepage (available soon): http://claird.github.io/PyPDF4/. 
15 | 16 | ## Examples 17 | Please see the `samplecode/` folder. 18 | 19 | ## Documentation 20 | Documentation soon will be available, although probably not at 21 | https://pythonhosted.org/PyPDF4/. 22 | 23 | ## FAQ 24 | Please see http://claird.github.io/PyPDF4/FAQ.html (available in early August). 25 | 26 | ## Tests 27 | PyPDF4 includes a modest (but growing!) test suite built on the unittest 28 | framework. All tests are located in the `tests/` folder and are distributed 29 | among dedicated modules. Tox makes running all tests over all versions of Python 30 | quick work: 31 | 32 | ``` 33 | python -m pip install tox 34 | python -m tox 35 | ``` 36 | 37 | Individual tests are accessible as conventional **Pytest** sources; 38 | 39 | ``` 40 | pytest -v tests/test_pdf.py 41 | ``` 42 | 43 | is an example which assumes the `pytest` executable is activated. 44 | 45 | ## Contributing 46 | For an exhaustive overview of what rules you are expected to maintain, please 47 | visit [Contributing](https://github.com/claird/PyPDF4/wiki/Contributing) in the 48 | project Wiki. A quick outline of these is: 49 | 50 | * **Provide test cases** for individual units of development of your own. 51 | Proper testing is highly encouraged: *Code without tests is broken by design* 52 | \- Jacob Kaplan-Moss, Django's original development team member. 53 | * Follow the [PEP 8](https://www.python.org/dev/peps/pep-0008/) style conventions, such as: 54 | * lower_case_with_underscores nomenclature (e.g., `file_name` rather than `fileName`, 55 | and `write_file()` rather than `writeFile()`). 56 | * Line lengths of `79` characters or less. 57 | * Correct spacing between global-scoped classes and functions (two newlines 58 | in between etc.) and within internal code blocks. 59 | * Target your code for Python 3 but maintain retrocompatibility with Python 2 60 | (do we retain Py2? Still under active consideration). 
61 | * Provide [docstring documentation](https://www.python.org/dev/peps/pep-0257/) 62 | for public classes and functions. 63 | * Utilize `# TO-DO` or `TO-DO` markings within 64 | [docstrings](https://www.python.org/dev/peps/pep-0257/) for indicating a 65 | feature that is yet to be implemented or discussed. Some IDEs feature TO-DO 66 | detection consoles. 67 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | # Override Pylint's default configuration. 2 | # 3 | # Reference: . 4 | # 5 | # Generate a default config file with comments by commanding 6 | # pylint --generate-rcfile 7 | 8 | [MESSAGES CONTROL] 9 | # Construct the configuration as exceptions from the well-defined 10 | # "all-enabled" starting point. 11 | enable=all 12 | 13 | disable= 14 | suppressed-message, 15 | locally-disabled 16 | 17 | [BASIC] 18 | # PyPDF4 frequently binds exceptions as "e". Use of "i" as a loop variable is so pervasive 19 | # that we account for it. 
20 | good-names= 21 | e, 22 | i 23 | -------------------------------------------------------------------------------- /pypdf/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | from .generic import * 3 | from .merger import PdfFileMerger 4 | from .pagerange import PageRange 5 | from .pdf import PdfFileReader, PdfFileWriter 6 | 7 | __all__ = [ 8 | # Basic PyPDF elements 9 | "PdfFileReader", 10 | "PdfFileWriter", 11 | "PdfFileMerger", 12 | "PageRange", 13 | # most used elements from generic 14 | "BooleanObject", 15 | "ArrayObject", 16 | "IndirectObject", 17 | "FloatObject", 18 | "NumberObject", 19 | "createStringObject", 20 | "TextStringObject", 21 | "NameObject", 22 | "DictionaryObject", 23 | "TreeObject", 24 | "Destination", 25 | "PageLabel", 26 | "Bookmark", 27 | # PyPDF modules 28 | "pdf", 29 | "generic", 30 | "utils", 31 | "filters", 32 | "merger", 33 | "pagerange", 34 | "xmp", 35 | ] 36 | -------------------------------------------------------------------------------- /pypdf/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.27.0" 2 | -------------------------------------------------------------------------------- /pypdf/pagerange.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Representation and utils for ranges of PDF file pages. 4 | 5 | Copyright (c) 2014, Steve Witham . 6 | All rights reserved. This software is available under a BSD license; 7 | see https://github.com/claird/PyPDF4/blob/master/LICENSE.md 8 | """ 9 | 10 | import re 11 | 12 | from .utils import isString 13 | 14 | _INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0". 
class PageRange(object):
    """
    Slice-like description of a set of zero-based page indices.

    The textual syntax mirrors what goes between the brackets of a Python
    subscript.  ``slice`` itself cannot be subclassed, so this class wraps
    one and converts in both directions:

    * PageRange(str) parses a page-range expression.
    * PageRange(slice) adopts the given slice.
    * _to_slice() hands back the wrapped slice.
    * str() and repr() render the range for printing.
    * indices(n) behaves like slice.indices(n).
    """

    def __init__(self, arg):
        """
        Build a page range from a slice, another PageRange, or a string.

        A slice is stored as-is and a PageRange contributes its wrapped
        slice.  A string must look like
            "int", "[int]:[int]" or "[int]:[int]:[int]",
        where the brackets indicate optional ints.
        {page_range_help}
        Note the difference between this notation and arguments to slice():
            slice(3) means the first three pages;
            PageRange("3") means the range of only the fourth page.
            However PageRange(slice(3)) means the first three pages.

        :raises ParseError: if ``arg`` is none of the accepted forms.
        """
        if isinstance(arg, slice):
            self._slice = arg
        elif isinstance(arg, PageRange):
            self._slice = arg._to_slice()
        else:
            parsed = re.match(PAGE_RANGE_RE, arg) if isString(arg) else None
            if not parsed:
                raise ParseError(arg)

            lone_index = parsed.group(2)
            if lone_index:
                # A bare int selects exactly one page.  "-1" is the last
                # page, so there is no following index to stop at.
                first = int(lone_index)
                last = None if first == -1 else first + 1
                self._slice = slice(first, last)
            else:
                bounds = []
                for text in parsed.group(4, 6, 8):
                    bounds.append(int(text) if text else None)
                self._slice = slice(*bounds)

    # The docstring is absent under ``python -OO``; only format it if present.
    if __init__.__doc__:
        __init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP)

    @staticmethod
    def valid(this_input):
        """ Tell whether ``this_input`` could be passed to PageRange(). """
        if isinstance(this_input, (slice, PageRange)):
            return True
        return isString(this_input) and bool(re.match(PAGE_RANGE_RE, this_input))

    def _to_slice(self):
        """ Return the slice wrapped by this page range. """
        return self._slice

    def __str__(self):
        """Render as text such as "1:2:3"; a one-page range renders as "1"."""
        start = self._slice.start
        stop = self._slice.stop
        step = self._slice.step

        if step is not None:
            parts = (start, stop, step)
        elif start is not None and stop == start + 1:
            return str(start)
        else:
            parts = (start, stop)
        return ":".join("" if part is None else str(part) for part in parts)

    def __repr__(self):
        """Render as text such as "PageRange('1:2:3')"."""
        return "PageRange(%r)" % (str(self),)

    def indices(self, this_n):
        """
        Resolve the range against a document of ``this_n`` pages.

        Returns arguments for ``range()``.  See ``help(slice.indices)``.
        """
        return self._slice.indices(this_n)
def parseFilenamePageRanges(args):
    """
    Pair every filename in ``args`` with the page ranges that follow it.

    The first argument must be a filename; the remaining arguments are
    filenames, page-range expressions, slice objects, or PageRange objects.
    A filename not followed by a page range indicates all pages of the file.

    :param args: any iterable of arguments (a list, a tuple, ...).
    :return: a list of ``(filename, PageRange)`` pairs, in argument order.
    :raises ValueError: if a page range appears before any filename.
    """
    pairs = []
    pdfFilename = None
    didPageRange = False

    # ``list(args)`` lets callers pass tuples or other iterables; the
    # trailing None is a sentinel that flushes the last filename.
    for arg in list(args) + [None]:
        if PageRange.valid(arg):
            if not pdfFilename:
                raise ValueError(
                    "The first argument must be a filename, not a page range."
                )

            pairs.append((pdfFilename, PageRange(arg)))
            didPageRange = True
        else:
            # New filename or end of list: if the previous file got no
            # explicit range, it contributes all of its pages.
            if pdfFilename and not didPageRange:
                pairs.append((pdfFilename, PAGE_RANGE_ALL))

            pdfFilename = arg
            didPageRange = False

    return pairs
"""
Utility functions for PDF library.
"""
from binascii import hexlify
import sys

try:
    import __builtin__ as builtins
except ImportError:  # Py3
    import builtins

__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"


# Python 2 names are looked up on the builtins module so that the Python 3
# equivalents can be substituted when they are missing.
xrange_fn = getattr(builtins, "xrange", range)
_basestring = getattr(builtins, "basestring", str)

bytes_type = type(bytes())  # Works the same in Python 2.X and 3.X
string_type = getattr(builtins, "unicode", str)
# ``long`` is only evaluated on Python 2, where it exists.
int_types = (int, long) if sys.version_info[0] < 3 else (int,)  # noqa: F821


# Make basic type tests more consistent
def isString(s):
    """Test if arg is a string. Compatible with Python 2 and 3."""
    return isinstance(s, _basestring)


def isInt(n):
    """Test if arg is an int. Compatible with Python 2 and 3."""
    return isinstance(n, int_types)


def isBytes(b):
    """Test if arg is a bytes instance. Compatible with Python 2 and 3."""
    return isinstance(b, bytes_type)


# custom implementation of warnings.formatwarning
def formatWarning(message, category, filename, lineno, line=None):
    """
    Format a warning as ``"Category: message [file:lineno]\\n"``.

    Only the last path component of ``filename`` is shown; both ``/`` and
    ``\\`` are treated as separators.  ``line`` is accepted for signature
    compatibility with ``warnings.formatwarning`` and is not used.
    """
    basename = filename.replace("/", "\\").rsplit("\\", 1)[-1]
    return "%s: %s [%s:%s]\n" % (category.__name__, message, basename, lineno)


def readUntilWhitespace(stream, maxchars=None):
    """
    Reads non-whitespace characters and returns them.
    Stops upon encountering whitespace, end of stream, or when maxchars is
    reached.  The terminating whitespace character is consumed.

    :param stream: a file-like object opened in binary mode.
    :param maxchars: optional upper bound on the number of bytes returned.
    :rtype: bytes
    """
    txt = pypdfBytes("")

    while True:
        tok = stream.read(1)

        if tok.isspace() or not tok:
            break

        txt += tok
        if len(txt) == maxchars:
            break

    return txt


def readNonWhitespace(stream):
    """
    Finds and reads the next non-whitespace character (ignores whitespace).

    :param stream: a file-like object.
    :return: the first byte not in ``WHITESPACES``, or an empty value at end
        of stream.
    """
    tok = WHITESPACES[0]

    while tok in WHITESPACES:
        tok = stream.read(1)

    return tok


def skipOverWhitespace(stream):
    """
    Similar to ``readNonWhitespace()``, but reports whether whitespace was
    skipped instead of returning the character.

    The read that ends the loop is counted too, so the result is ``True``
    when at least one whitespace character was consumed before the first
    non-whitespace character (or end of stream).

    :param stream: a file-like object.
    :rtype: bool
    """
    tok = WHITESPACES[0]
    cnt = 0

    while tok in WHITESPACES:
        tok = stream.read(1)
        cnt += 1

    return cnt > 1


def skipOverComment(stream):
    """
    Consume a ``%`` comment, if the stream is positioned at one.

    If the next byte is ``%``, everything up to and including the first
    ``\\n`` or ``\\r`` is consumed; a comment that runs to the end of the
    stream is consumed entirely.  Otherwise the stream position is left
    unchanged.

    :param stream: a seekable file-like object opened in binary mode.
    """
    tok = stream.read(1)

    if not tok:
        # End of stream: nothing was read, so there is nothing to un-read.
        return

    stream.seek(-1, 1)

    if tok == pypdfBytes("%"):
        line_ends = (pypdfBytes("\n"), pypdfBytes("\r"))

        # An empty read means end of stream; without that check a comment
        # lacking a final newline would spin here forever.
        while tok and tok not in line_ends:
            tok = stream.read(1)


def readUntilRegex(stream, regex, ignore_eof=False):
    """
    Reads until the regular expression pattern matched (ignore the match)
    Raise PdfStreamError on premature end-of-file.

    The stream is scanned in 16-byte reads and each read is searched on its
    own, so a match that straddles two reads is not detected.  On a match
    the stream is repositioned to the start of the match.

    :param stream: a seekable file-like object opened in binary mode.
    :param regex: a compiled bytes pattern.
    :param bool ignore_eof: If true, ignore end-of-file and return what was
        read so far.
    :rtype: bytes
    """
    name = pypdfBytes("")

    while True:
        tok = stream.read(16)

        if not tok:
            # stream has truncated prematurely
            if ignore_eof:
                return name
            raise PdfStreamError("Stream has ended unexpectedly")
        m = regex.search(tok)
        if m is not None:
            name += tok[: m.start()]
            stream.seek(m.start() - len(tok), 1)
            break
        name += tok

    return name


class ConvertFunctionsToVirtualList(object):
    """
    Read-only sequence facade over a length function and an item getter.

    Supports ``len()``, integer indexing (including negative indexes) and
    slicing; a slice yields another virtual list that maps back onto this
    one.
    """

    def __init__(self, lengthFunction, getFunction):
        self.lengthFunction = lengthFunction
        self.getFunction = getFunction

    def __len__(self):
        return self.lengthFunction()

    def __getitem__(self, index):
        if isinstance(index, slice):
            indices = xrange_fn(*index.indices(len(self)))
            cls = type(self)
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isInt(index):
            raise TypeError("sequence indices must be integers")

        len_self = len(self)

        if index < 0:
            # support negative indexes
            index = len_self + index
        if index < 0 or index >= len_self:
            raise IndexError("sequence index out of range")

        return self.getFunction(index)


def RC4Encrypt(key, plaintext):
    """
    Apply the RC4 stream cipher to ``plaintext`` using ``key``.

    :param key: non-empty ``bytes`` or LATIN-1 ``str``.
    :param plaintext: ``bytes`` or LATIN-1 ``str`` to process.
    :return: the processed data, the same length as ``plaintext``.
    :rtype: bytes
    """
    # Key scheduling: permute S under control of the key.
    S = list(range(256))
    j = 0

    for i in range(256):
        j = (j + S[i] + pypdfOrd(key[i % len(key)])) % 256
        S[i], S[j] = S[j], S[i]

    # Keystream generation, XORed byte by byte with the input.
    i, j = 0, 0
    retval = []

    for x in range(len(plaintext)):
        i = (i + 1) % 256
        j = (j + S[i]) % 256
        S[i], S[j] = S[j], S[i]
        t = S[(S[i] + S[j]) % 256]
        retval.append(pypdfBytes(chr(pypdfOrd(plaintext[x]) ^ t)))

    return pypdfBytes("").join(retval)


def matrixMultiply(a, b):
    """
    Multiply two matrices given as sequences of rows.

    Every element is converted with ``float()`` before multiplying.

    :return: the product as a list of lists of floats.
    """
    return [
        [sum([float(i) * float(j) for i, j in zip(row, col)]) for col in zip(*b)]
        for row in a
    ]


class PyPdfError(Exception):
    pass


class PdfReadError(PyPdfError):
    pass


class PageSizeNotDefinedError(PyPdfError):
    pass


class PdfReadWarning(UserWarning):
    pass


class PdfStreamError(PdfReadError):
    pass


def pypdfBytes(s):
    """
    Convert an int (one byte value) or a string to ``bytes``.

    Text is encoded as LATIN-1; ``bytes`` input is returned unchanged.

    :type s: Union[bytes, str, int, unicode]
    :rtype: bytes
    """
    if sys.version_info[0] < 3:
        if isinstance(s, int):
            return chr(s)
        if isinstance(s, bytes):
            return s
        return s.encode("latin-1")
    if isinstance(s, int):
        return bytes([s])
    if isinstance(s, bytes):
        return s
    return s.encode("latin-1")


def pypdfUnicode(s):
    """
    Convert to text, decoding byte input with the ``unicode_escape`` codec.

    :type s: Union[bytes, str, unicode]
    :returns: ``unicode`` for Python 2, ``str`` for Python 3.
    :rtype: Union[str, unicode]
    """
    if sys.version_info[0] < 3:
        if isinstance(s, unicode):  # noqa: F821
            return s
        return unicode(s, "unicode_escape")  # noqa: F821
    if isinstance(s, str):
        return s
    return s.decode("unicode_escape")


def pypdfStr(b):
    """
    Convert to the native ``str`` type of the running interpreter, using
    LATIN-1 where a conversion is needed.

    :type b: Union[bytes, str, unicode]
    :rtype: str
    """
    if sys.version_info[0] < 3:
        if isinstance(b, unicode):  # noqa: F821
            return b.encode("latin-1")
        return b
    if isinstance(b, bytes):
        return b.decode("latin-1")
    return b


def pypdfOrd(b):
    """
    Return ``b`` unchanged if it is already an int, else ``ord(b)``.

    :type b: Union[int, bytes, str, unicode]
    :rtype: int
    """
    if isinstance(b, int):
        return b
    return ord(b)


def pypdfChr(c):
    """
    Return the one-character native string for a code point or character.

    :type c: Union[int, bytes, str, unicode]
    :rtype: str
    """
    if isinstance(c, int):
        return chr(c)
    return chr(ord(c))


def pypdfBytearray(b):
    """
    Abstracts the conversion from a ``bytes`` variable to a ``bytearray`` value
    over versions 2.7.x and 3 of Python.

    On Python 2 the input is returned unchanged.
    """
    if sys.version_info[0] < 3:
        return b
    return bytearray(b)


def hexEncode(s):
    """
    Abstracts the conversion from a LATIN 1 string to an hex-valued string
    representation of the former over versions 2.7.x and 3 of Python.

    :param str s: a ``str`` to convert from LATIN 1 to an hexadecimal string
        representation.
    :return: a hex-valued string, e.g. ``hexEncode("$A'") == "244127"``.
    :rtype: str
    """
    if sys.version_info < (3, 0):
        return s.encode("hex")
    if isinstance(s, str):
        s = s.encode("LATIN1")

    # hexlify() emits only ASCII hexadecimal digits, so decoding as ASCII
    # cannot fail; it merely converts the result to str.
    return hexlify(s).decode("ASCII")


def hexStr(num):
    """Return ``hex(num)`` without the ``L`` suffix Python 2 adds to longs."""
    return hex(num).replace("L", "")


WHITESPACES = [pypdfBytes(x) for x in [" ", "\n", "\r", "\t", "\x00"]]


def paethPredictor(left, up, up_left):
    """
    Pick whichever of the three neighbours is closest to
    ``left + up - up_left``; ties prefer ``left``, then ``up``.
    """
    p = left + up - up_left
    dist_left = abs(p - left)
    dist_up = abs(p - up)
    dist_up_left = abs(p - up_left)

    if dist_left <= dist_up and dist_left <= dist_up_left:
        return left
    if dist_up <= dist_up_left:
        return up
    return up_left


def pairs(sequence):
    """
    :param sequence: an indexable sequence value with ``__len__()``.
    :return: an iterable of paired values from ``sequence``.
    :raises ValueError: if ``sequence`` has an odd number of elements (raised
        when iteration starts, since this is a generator).
    """
    if (len(sequence) % 2) != 0:
        raise ValueError("sequence must contain an even number of elements")

    for i in range(0, len(sequence) - 1, 2):
        yield (sequence[i], sequence[i + 1])
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# Date pattern used by the date converter below.  Everything after the year
# is optional; the group names are the ones the converter reads.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)


class XmpInformation(PdfObject):
    """
    An object that represents Adobe XMP metadata. Usually accessed by
    :meth:`xmpMetadata()`
    """

    def __init__(self, stream):
        """
        Parse the XML held by ``stream`` and locate its ``rdf:RDF`` element.

        :param stream: an object providing ``getData()`` (the XML document)
            and ``writeToStream()``.
        """
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        # namespace -> {property name -> converted value}
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        """Delegate serialization to the wrapped stream object."""
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        """
        Yield every attribute node and element called ``name`` in
        ``namespace`` found under the ``rdf:Description`` elements whose
        ``rdf:about`` equals ``aboutUri``.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                attr = desc.getAttributeNodeNS(namespace, name)

                if attr is not None:
                    yield attr
                for element in desc.getElementsByTagNameNS(namespace, name):
                    yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        """
        Yield every attribute and direct child node belonging to
        ``namespace`` on the ``rdf:Description`` elements whose ``rdf:about``
        equals ``aboutUri``.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)

                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _getText(self, element):
        """Concatenate the data of the direct text-node children."""
        text = ""

        for child in element.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text += child.data

        return text

    # The converters and getter factories below are plain functions used
    # while the class body executes; they are not called as bound methods.
    def _converterString(value):
        """Identity converter for textual properties."""
        return value

    def _converterDate(value):
        """
        Convert a date string matching ``iso8601`` to a naive
        ``datetime.datetime`` expressed in UTC.

        Missing month and day default to 1, missing time fields to 0, and a
        missing time zone designator to ``Z``.
        """
        m = iso8601.match(value)
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        second = decimal.Decimal(m.group("second") or "0")
        whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
        # datetime() requires real ints; Decimal values are rejected by
        # current Python 3 releases.
        microseconds = int((second - whole_seconds) * 1000000)
        tzd = m.group("tzd") or "Z"
        dt = datetime.datetime(
            year, month, day, hour, minute, int(whole_seconds), microseconds
        )

        if tzd != "Z":
            # Take the sign from the designator itself so that offsets with
            # a zero hour part ("+00:30") are applied in the right direction.
            sign = -1 if tzd[0] == "-" else 1
            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
            # Local time minus its UTC offset gives UTC.
            dt = dt - sign * offset

        return dt

    _test_converter_date = staticmethod(_converterDate)

    def _getterBag(namespace, name, converter):
        """Build a getter returning the converted ``rdf:Bag`` items as a list."""

        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            retval = []

            # Only non-empty results short-circuit; an empty list is rebuilt.
            if cached:
                return cached

            for element in self.getElement("", namespace, name):
                bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")

                if len(bags):
                    for bag in bags:
                        for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval.append(value)

            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval

            return retval

        return get

    def _getterSeq(namespace, name, converter):
        """
        Build a getter returning the converted ``rdf:Seq`` items as a list;
        an element without a ``Seq`` contributes its own text instead.
        """

        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            retval = []

            if cached:
                return cached

            for element in self.getElement("", namespace, name):
                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")

                if len(seqs):
                    for seq in seqs:
                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval.append(value)
                else:
                    value = converter(self._getText(element))
                    retval.append(value)

            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval

            return retval

        return get

    def _getterLangalt(namespace, name, converter):
        """
        Build a getter returning a dict keyed by ``xml:lang`` from the
        ``rdf:Alt`` items; an element without an ``Alt`` is stored under
        ``"x-default"``.
        """

        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            retval = {}

            if cached:
                return cached
            for element in self.getElement("", namespace, name):
                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
                if len(alts):
                    for alt in alts:
                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval[item.getAttribute("xml:lang")] = value
                else:
                    retval["x-default"] = converter(self._getText(element))

            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval

            return retval

        return get

    def _getterSingle(namespace, name, converter):
        """
        Build a getter returning the converted value of the first matching
        attribute or element, or ``None`` when there is none.
        """

        def get(self):
            cached = self.cache.get(namespace, {}).get(name)

            if cached:
                return cached

            value = None

            for element in self.getElement("", namespace, name):
                if element.nodeType == element.ATTRIBUTE_NODE:
                    value = element.nodeValue
                else:
                    value = self._getText(element)
                # Only the first match is used.
                break

            if value is not None:
                value = converter(value)

            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = value

            return value

        return get

    dc_contributor = property(_getterBag(DC_NAMESPACE, "contributor", _converterString))
    """
    Contributors to the resource (other than the authors). An unsorted array of
    names.
    """

    dc_coverage = property(_getterSingle(DC_NAMESPACE, "coverage", _converterString))
    """
    Text describing the extent or scope of the resource.
    """

    dc_creator = property(_getterSeq(DC_NAMESPACE, "creator", _converterString))
    """
    A sorted array of names of the authors of the resource, listed in order of
    precedence.
    """

    dc_date = property(_getterSeq(DC_NAMESPACE, "date", _converterDate))
    """
    A sorted array of dates (``datetime.datetime`` instances) of significance
    to the resource. The dates and times are in UTC.
    """

    dc_description = property(
        _getterLangalt(DC_NAMESPACE, "description", _converterString)
    )
    """
    A language-keyed dictionary of textual descriptions of the content of the
    resource.
    """

    dc_format = property(_getterSingle(DC_NAMESPACE, "format", _converterString))
    """
    The mime-type of the resource.
    """

    dc_identifier = property(
        _getterSingle(DC_NAMESPACE, "identifier", _converterString)
    )
    """
    Unique identifier of the resource.
    """

    dc_language = property(_getterBag(DC_NAMESPACE, "language", _converterString))
    """
    An unordered array specifying the languages used in the resource.
    """

    dc_publisher = property(_getterBag(DC_NAMESPACE, "publisher", _converterString))
    """
    An unordered array of publisher names.
    """

    dc_relation = property(_getterBag(DC_NAMESPACE, "relation", _converterString))
    """
    An unordered array of text descriptions of relationships to other
    documents.
    """

    dc_rights = property(_getterLangalt(DC_NAMESPACE, "rights", _converterString))
    """
    A language-keyed dictionary of textual descriptions of the rights the user
    has to this resource.
    """

    dc_source = property(_getterSingle(DC_NAMESPACE, "source", _converterString))
    """
    Unique identifier of the work from which this resource was derived.
    """

    dc_subject = property(_getterBag(DC_NAMESPACE, "subject", _converterString))
    """
    An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource.
    """

    dc_title = property(_getterLangalt(DC_NAMESPACE, "title", _converterString))
    """
    A language-keyed dictionary of the title of the resource.
    """

    dc_type = property(_getterBag(DC_NAMESPACE, "type", _converterString))
    """
    An unordered array of textual descriptions of the document type.
    """

    pdf_keywords = property(_getterSingle(PDF_NAMESPACE, "Keywords", _converterString))
    """
    An unformatted text string representing document keywords.
    """

    pdf_pdfversion = property(
        _getterSingle(PDF_NAMESPACE, "PDFVersion", _converterString)
    )
    """
    The PDF file version, for example ``1.0``, ``1.3``.
    """

    pdf_producer = property(_getterSingle(PDF_NAMESPACE, "Producer", _converterString))
    """
    The name of the tool that created the PDF document.
    """

    xmp_createDate = property(
        _getterSingle(XMP_NAMESPACE, "CreateDate", _converterDate)
    )
    """
    The date and time the resource was originally created. The date and time
    are returned as a UTC ``datetime.datetime`` object.
    """

    xmp_modifyDate = property(
        _getterSingle(XMP_NAMESPACE, "ModifyDate", _converterDate)
    )
    """
    The date and time the resource was last modified. The date and time are
    returned as a UTC ``datetime.datetime`` object.
    """

    xmp_metadataDate = property(
        _getterSingle(XMP_NAMESPACE, "MetadataDate", _converterDate)
    )
    """
    The date and time that any metadata for this resource was last changed. The
    date and time are returned as a UTC ``datetime.datetime`` object.
    """

    xmp_creatorTool = property(
        _getterSingle(XMP_NAMESPACE, "CreatorTool", _converterString)
    )
    """
    The name of the first known tool used to create the resource.
    """

    xmpmm_documentId = property(
        _getterSingle(XMPMM_NAMESPACE, "DocumentID", _converterString)
    )
    """
    The common identifier for all versions and renditions of this resource.
    """

    xmpmm_instanceId = property(
        _getterSingle(XMPMM_NAMESPACE, "InstanceID", _converterString)
    )
    """
    An identifier for a specific incarnation of a document, updated each time a
    file is saved.
    """

    @property
    def custom_properties(self):
        """
        Retrieves custom metadata properties defined in the undocumented pdfx
        metadata schema.

        The result is computed on first access and stored on the instance.

        :return: a dictionary of key/value items for custom metadata
            properties.
        :rtype: dict
        """
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}

            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                key = node.localName

                while True:
                    # Keys escape a character as U+2182 followed by four
                    # hexadecimal digits; undo every such escape.
                    idx = key.find(pypdfUnicode("\u2182"))

                    if idx == -1:
                        break

                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._getText(node)
                self._custom_properties[key] = value

        return self._custom_properties
#!/usr/bin/python3
"""
test/demo program that copies all comments from multiple pdf files into one
command line:
    PDFCommentsMerge [-d] [-o output.pdf] [input1.pdf] ... [inputN.pdf]
    -d: open the output file at the end of the merge
    -o: provide the output name/path ; if not present the file is created
        in temp folder named "FullCommented **input1**.pdf"
    if no parameters (mainly for idle test), the pdf filenames are asked for
    empty to finish
"""
import os
import sys
import tempfile

import pypdf as PDF

if sys.argv[0].upper().find("PYTHON.EXE") >= 0:
    del sys.argv[0]
del sys.argv[0]  # to ignore called program

displayOutput = ("-d" in sys.argv) or ("idlelib.run" in sys.modules)
try:
    del sys.argv[sys.argv.index("-d")]
except ValueError:
    # list.index() raises ValueError when "-d" was not given.
    pass


if (len(sys.argv) == 0) or (("-o" in sys.argv) and (len(sys.argv) <= 2)):
    # No input file on the command line: show the usage text and prompt.
    print(globals()["__doc__"])
    while True:
        t = input("pdf file to scan:")
        if t == "":
            break
        sys.argv.append(t)

if "-o" in sys.argv:
    i = sys.argv.index("-o")
    outFile = sys.argv[i + 1]
    # Remove both the flag and its value.
    del sys.argv[i]
    del sys.argv[i]
else:
    # TEMP is normally only set on Windows; fall back to the platform's
    # temporary directory elsewhere.
    tempFolder = (os.environ.get("TEMP") or tempfile.gettempdir()).replace("\\", "/")
    if tempFolder[-1] != "/":
        tempFolder += "/"
    outFile = (
        tempFolder
        + "FullCommented "
        + os.path.splitext(os.path.split(sys.argv[0])[-1])[0]
        + ".pdf"
    )

# The first remaining argument seeds the output document.
pdfO = PDF.PdfFileWriter(None, PDF.PdfFileReader(sys.argv[0]))
del sys.argv[0]

pdfS = []
for f in sys.argv:
    pdfS.append(PDF.PdfFileReader(f))
    # check if decryption is required ; normally not required
    if pdfS[-1].isEncrypted:
        pdfS[-1].decrypt("")

# we assume that all the documents are commenting the same original document
for i in range(pdfO.numPages):
    for pdfin in pdfS:
        pdfO.addCommentsFromPage(i, pdfin.getPage(i))

pdfO.write(outFile)
if displayOutput:
    if hasattr(os, "startfile"):
        os.startfile(outFile)
    else:
        # os.startfile() only exists on Windows.
        print("output written to", outFile)
def ListOutlines(pdfS, outl=None):
    """
    Provide the outlines as a flat list of tuples
    (Title, Page (0 based), Vertical position in %).

    :param pdfS: reader object; must expose ``getOutlines()`` and a
        ``MyPages`` mapping of page idnum => [page number, page height].
    :param outl: outline entry or (nested) list of entries to flatten. When
        ``None`` the whole tree from ``pdfS.getOutlines()`` is used and a
        leading ``("-", 0, 0)`` placeholder entry is emitted.
    :return: list of ``(title, page number, position in %)`` tuples.
    """
    if outl is None:
        lst = [("-", 0, 0)]
        outl = pdfS.getOutlines()
    else:
        lst = []

    if isinstance(outl, list):
        for child in outl:
            lst.extend(ListOutlines(pdfS, child))
        return lst

    # "/Top" is optional on an outline entry; treat a missing one as 0.
    try:
        top = outl["/Top"]
    except Exception:
        top = 0

    try:
        pp = pdfS.MyPages[outl.page.idnum]
        lst.append((outl.title, pp[0], 100.0 * (1.0 - float(top / pp[1]))))
    except Exception:
        # Best effort: report the entry and carry on. The idnum is fetched
        # defensively because a missing page/idnum may be the very reason we
        # got here; dereferencing it again would crash inside the handler.
        idnum = getattr(getattr(outl, "page", None), "idnum", "<unknown>")
        print("trouble with page idnum", idnum)
    return lst
def FindOutline(Outlines, pa, pe):
    """
    Return the outline entry sitting just above a given position.

    Each entry is a tuple whose index 1 is the page number and index 2 the
    vertical position. An entry qualifies when it is on an earlier page than
    ``pa``, or on page ``pa`` at a position not greater than ``pe``. The
    last qualifying entry, in list order, is returned; ``None`` when no
    entry qualifies.
    """
    candidates = [
        entry
        for entry in Outlines
        if entry[1] < pa or (entry[1] == pa and entry[2] <= pe)
    ]
    return candidates[-1] if candidates else None
ws.column_dimensions[get_column_letter(4 + 1)].width = 90 133 | ws.column_dimensions[get_column_letter(5 + 1)].width = 90 134 | 135 | # check if decryption is required 136 | if pdfS.isEncrypted: 137 | pdfS.decrypt("") 138 | 139 | # MyPages will store the matching table page.idnum => pagenumer,page_height 140 | pdfS.MyPages = {} 141 | 142 | for i, p in enumerate(pdfS.pages): 143 | pdfS.MyPages[p.indirectRef.idnum] = [i, p["/MediaBox"][3]] 144 | 145 | # extract the list of OutLines into MyOutlines 146 | pdfS.MyOutlines = ListOutlines(pdfS) 147 | 148 | # extract the comments into MyAnnots 149 | pdfS.MyAnnots = ListAnnots(pdfS) 150 | 151 | 152 | # sort the comments in the order (Page, vertical position, date) 153 | lst = {} 154 | for p in pdfS.MyAnnots.values(): 155 | pp = pdfS.MyPages[p.rawGet("/P").idnum] 156 | pc = 100.0 * (1.0 - float(int(p["/Rect"][1]) / pp[1])) 157 | lst[(pp[0], pc, p["/M"])] = p 158 | 159 | # fill the xl sheet with the comments 160 | for x in sorted(lst.keys()): 161 | p = lst[x] 162 | if "/IRT" in p: 163 | continue # the comments with IRT are already present in the original comment irt field, we can ignore this one 164 | 165 | # print(x[0],',',end='') 166 | # print('%.0f %%'%pc,',',end='') 167 | # print(FindOutline(pdfS.MyOutlines,x[0],x[1])[0],',',end='') 168 | auth = p["/T"] 169 | if isinstance(auth, bytes): 170 | auth = auth.decode("unicode_escape") 171 | cont = p["/Contents"] 172 | if isinstance(cont, bytes): 173 | cont = cont.replace(b"\r", b"\n").decode("unicode_escape") 174 | # print(cont,',',end='') 175 | if isinstance(p.irt_str, bytes): 176 | p.irt_str = p.irt_str.replace(b"\r", b"\n").decode("unicode_escape") 177 | # print(p.irt_str) 178 | 179 | ws.append( 180 | ( 181 | pdfS.getPageLabel(x[0]), 182 | "%.0f %%" % pc, 183 | FindOutline(pdfS.MyOutlines, x[0], x[1])[0], 184 | auth, 185 | cont, 186 | p.irt_str, 187 | ) 188 | ) 189 | 190 | # post insertion formating 191 | for row in ws.iter_rows(): 192 | for cell in row: 193 | cell.alignment 
= cell.alignment.copy(wrapText=True, vertical="top") 194 | 195 | # save and open the file 196 | wb.save(xlFile) 197 | if ("-d" in sys.argv) or ("idlelib.run" in sys.modules): 198 | os.startfile(xlFile) 199 | -------------------------------------------------------------------------------- /samplecode/README.md: -------------------------------------------------------------------------------- 1 | # PyPDF4 Sample Code Folder 2 | This will contain demonstrations of the many features PyPDF4 is capable of. 3 | Example code should make it easy for users to know how to use all aspects of 4 | PyPDF4. 5 | 6 | ## How to run 7 | Invoke the Python interpeter you prefer by specifying the script you wish to 8 | run, e.g.: 9 | ``` 10 | python2 ./samplecode/basic_features.py 11 | python3 ./samplecode/basic_features.py 12 | ``` 13 | 14 | ## Contributing to `samplecode` 15 | Feel free to add any type of PDF file or sample code, either by: 16 | 17 | 1. Sending it via email to PyPDF4@phaseit.net 18 | 2. Including it in a pull request on GitHub 19 | -------------------------------------------------------------------------------- /samplecode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/__init__.py -------------------------------------------------------------------------------- /samplecode/basic_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Showcases basic features of PyPDF. 
def main():
    """
    Builds a five-page demo PDF out of the document named on the command line.

    ``argv[1]`` is the input PDF (at least five pages are required) and the
    optional ``argv[2]`` the output path. Pages 1-5 are respectively copied
    unchanged, rotated clockwise, rotated counter-clockwise, watermarked and
    cropped; a print-on-open script and a password are then added.

    Exits with status ``0`` after printing the usage on ``-h``/``--help``, and
    with status ``1`` on a missing argument or a too short input document.
    """
    # ``exit`` is imported explicitly: the builtin of the same name is
    # injected by the ``site`` module and is missing under ``python -S``.
    from sys import exit

    pagesRequired = 5
    output = "PyPDF-Features-Output.pdf"

    if set(argv) & FLAG_HELP:
        print(USAGE)
        exit(0)
    if len(argv) < 2:
        print(USAGE)
        exit(1)

    inputpath = argv[1].strip()
    filename = basename(inputpath)
    if len(argv) > 2:
        output = argv[2].strip()

    watermarkpath = join(SAMPLE_PDF_ROOT, "AutoCad_Diagram.pdf")

    # We can instantiate a PdfFileReader/Writer by giving in a stream object
    # or a path string
    with open(inputpath, "rb") as instream:
        reader = PdfFileReader(instream)

        # Check that the PDF file has the required number of pages. This is
        # done before the writer is created so that a rejected input never
        # touches the output path.
        if reader.numPages < pagesRequired:
            print(
                "We require a document with %d pages at least, %s has %d"
                % (pagesRequired, filename, reader.numPages),
                file=stderr,
            )
            exit(1)
        print("'%s' has %d pages... OK" % (filename, reader.numPages))

        # NOTE(review): both input streams are kept open until write() has
        # run, on the assumption that page content is read lazily - confirm.
        with open(watermarkpath, "rb") as markstream:
            writer = PdfFileWriter(output)
            try:
                # Add page 1 from reader to output document, unchanged
                writer.addPage(reader.getPage(0))

                # Add page 2 from reader, but rotated clockwise 90 degrees
                writer.addPage(reader.getPage(1).rotateClockwise(90))

                # Add page 3 from reader, rotated the other way:
                writer.addPage(reader.getPage(2).rotateCounterClockwise(90))
                # Alt.: writer.addPage(reader.getPage(2).rotateClockwise(270))

                # Add page 4 from reader, but first add a watermark from
                # another PDF:
                page4 = reader.getPage(3)
                watermark = PdfFileReader(markstream)
                page4.mergePage(watermark.getPage(0))
                writer.addPage(page4)

                # Add page 5 from reader, but crop it to half size:
                page5 = reader.getPage(4)
                page5.mediaBox.upperRight = (
                    page5.mediaBox.getUpperRight_x() / 2,
                    page5.mediaBox.getUpperRight_y() / 2,
                )
                writer.addPage(page5)

                # Add some Javascript to launch the print window on opening
                # this PDF. The password dialog may prevent the print dialog
                # from being shown. Comment the encrypted lines, if that's the
                # case, to try this out
                writer.addJS(
                    "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
                )

                # Encrypt your new PDF and add a password
                password = "secret"
                writer.encrypt(password)

                # Finally, write the resulting PDF document to ``output``
                writer.write()

                print("Output successfully written to", output)
            finally:
                reader.close()
                writer.close()
def main():
    """
    Merges the three PDF documents named on the command line.

    ``argv[1:4]`` are the input files and the optional ``argv[4]`` the output
    path. The first three pages of input 1 are taken, the first page of
    input 2 is inserted after the second page, and input 3 is appended whole.

    Exits with status ``0`` after printing the usage on ``-h``/``--help``, and
    with status ``1`` on missing arguments or when input 1 has fewer than
    three pages.
    """
    # ``exit`` is imported explicitly: the builtin of the same name is
    # injected by the ``site`` module and is missing under ``python -S``.
    from sys import exit

    requiredPages = 3
    output = "PyPDF-Merging-Output.pdf"

    if set(argv) & FLAG_HELP:
        print(USAGE)
        exit(0)
    if len(argv) < 4:
        print(USAGE)
        exit(1)

    files = [f.strip() for f in argv[1:4]]
    if len(argv) > 4:
        output = argv[4].strip()

    # Validate the first input before the output is opened: opening with "wb"
    # truncates an existing file, which must not happen when we bail out.
    reader1 = PdfFileReader(files[0])
    try:
        numPages = reader1.numPages
    finally:
        reader1.close()

    if numPages < requiredPages:
        print(
            "File 1 requires %d pages, but it has just %d"
            % (requiredPages, numPages)
        )
        exit(1)

    streams = []
    try:
        for name in files:
            streams.append(open(name, "rb"))
        input1, input2, input3 = streams

        # Opened last, so a missing input never clobbers the output file.
        outstream = open(output, "wb")
        streams.append(outstream)
        merger = PdfFileMerger(outstream)

        # Add the first 3 pages of input1 to output
        merger.append(fileobj=input1, pages=(0, 3))

        # Insert the first page of input2 into the output beginning after the
        # second page
        merger.merge(position=2, fileobj=input2, pages=(0, 1))

        # Append entire input3 document to the end of the output document
        merger.append(input3)

        merger.write()
        print("Output successfully written to", output)

        merger.close()
    finally:
        # Closing an already closed file object is a no-op, so this is safe
        # whether or not the merger closed some of them itself.
        for stream in streams:
            stream.close()
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/AutoCad_Diagram.pdf -------------------------------------------------------------------------------- /samplecode/pdfsamples/AutoCad_Simple.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/AutoCad_Simple.pdf -------------------------------------------------------------------------------- /samplecode/pdfsamples/GeoBase_NHNC1_Data_Model_UML_EN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/samplecode/pdfsamples/GeoBase_NHNC1_Data_Model_UML_EN.pdf -------------------------------------------------------------------------------- /samplecode/pdfsamples/README.md: -------------------------------------------------------------------------------- 1 | # PDF Sample Folder 2 | 3 | PDF files are generated by a large variety of sources for many different 4 | purposes. One of the goals of PyPDF4 is to be able to read/write any PDF 5 | instance that Adobe can. 6 | 7 | This is a catalog of various PDF files. The files may not have worked with 8 | PyPDF4 but do now, they may be complicated or unconventional files, or they may 9 | just be good for testing. The purpose is to ensure that when changes to PyPDF4 10 | are made, we keep them in mind. 11 | 12 | If you have confidential PDFs that don't work with PyPDF4, feel free to still 13 | e-mail them for debugging - we won't add PDFs without express permission. 14 | (This folder is available through GitHub only.) 15 | 16 | Feel free to add any type of PDF file or sample code, either by: 17 | 18 | 1. Sending it via email to PyPDF4@phaseit.net 19 | 2.
def main():
    """
    Imposes the pages of a PDF two by two, side by side ("2-up").

    ``sys.argv[1]`` is the input file and ``sys.argv[2]`` the output file.
    Each odd page gets the following page merged to its right; a trailing
    unpaired page is kept on its own instead of being dropped. Exits with
    status ``1`` after printing the usage when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print("usage: python 2-up.py input_file output_file")
        sys.exit(1)

    print("2-up input " + sys.argv[1])

    with open(sys.argv[1], "rb") as input_stream, open(
        sys.argv[2], "wb"
    ) as output_stream:
        input1 = PdfFileReader(input_stream)
        # The stream used to be opened after the fact and never handed to the
        # writer, so nothing reached the requested file.
        # NOTE(review): assumes PdfFileWriter takes its destination at
        # construction time, as samplecode/basic_features.py does - confirm.
        output = PdfFileWriter(output_stream)

        for i in range(0, input1.numPages - 1, 2):
            lhs = input1.getPage(i)
            rhs = input1.getPage(i + 1)
            lhs.mergeTranslatedPage(rhs, lhs.mediaBox.getUpperRight_x(), 0, True)
            output.addPage(lhs)
            # Progress on a single line (the Python 2 style trailing comma
            # was a no-op under print_function).
            print(i, end=" ")
            sys.stdout.flush()

        if input1.numPages % 2:
            # Odd page count: the last page has no partner, keep it as is.
            output.addPage(input1.getPage(input1.numPages - 1))

        print()
        print("writing " + sys.argv[2])
        output.write()

    print("done.")


if __name__ == "__main__":
    main()
def main():
    """
    Parses the command line and runs the requested action: encode or decode
    some data with one of the PyPDF codecs, or list the available codecs.

    :return: exit status of program (``0`` with no errors, ``1`` with a generic
        error).
    """
    cli = argparse.ArgumentParser(
        description="Encodes/decodes some data fed in with PyPDF codecs",
        epilog="Version %s" % __version__,
    )
    commands = cli.add_subparsers(title="Commands", dest="action")
    # "decode" is registered as an alias of "encode": both share the very
    # same options and are told apart later through ``action``.
    codec_cli = commands.add_parser(
        ENCODE, aliases=(DECODE,), help="Encode/decode data"
    )
    commands.add_parser(LIST, help="List available codecs")
    commands.required = True

    cli.add_argument(
        "-v", "--version", action="version", version="%(prog)s " + __version__
    )

    codec_cli.add_argument("data", help="Data to either encode or decode")
    # TO-DO Add chained list of encoders/decoders support (like
    # ASCIIHexDecode(LZWDecode(data))).
    codec_cli.add_argument(
        "-c",
        "--codec",
        choices=CODECS.keys(),
        required=True,
        help="The codec to encode/decode with",
    )
    codec_cli.add_argument(
        "-f",
        "--file",
        dest="isfile",
        action="store_const",
        const=True,
        help="Whether the argument provided to DATA should be interpreted as a"
        " file path",
    )

    options = cli.parse_args()
    action = options.action

    if action == LIST:
        print("Available codecs:", *CODECS.keys(), sep="\n\t")
        return 0

    if action not in CODEC_ACTIONS:
        print("Unrecognized action", action, file=stderr)
        return 1

    # TO-DO Find a proper way of writing bytes directly to the console (perhaps
    # throught bytes streams?). Decoded byte strings are not enough reliable.
    codec = CODECS[options.codec]

    if options.isfile:
        try:
            with open(options.data, "rb") as instream:
                payload = instream.read()
        except IOError as e:
            print(e, file=stderr)
            return 1
    else:
        payload = options.data.encode("LATIN1")

    # Only ENCODE and DECODE can reach this point.
    transform = codec.encode if action == ENCODE else codec.decode
    payload = transform(payload)

    if isinstance(payload, bytes):
        payload = payload.decode("LATIN1")

    print(payload)
    return 0
3 | 4 | Adapted from work by Sylvain Pelissier 5 | http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python 6 | """ 7 | from __future__ import print_function 8 | 9 | import os 10 | from os.path import abspath, dirname, join 11 | import sys 12 | 13 | from PIL import Image 14 | from pypdf import PdfFileReader 15 | 16 | PROJECT_ROOT = abspath(join(dirname(__file__), os.pardir)) 17 | sys.path.append(PROJECT_ROOT) 18 | 19 | 20 | def _handle_filter(x_object, obj, mode, size, data): 21 | """ [EXPLAIN.] """ 22 | # CL will eventually rewrite this so it's even simpler. 23 | if "/Filter" in x_object[obj]: 24 | x_filter = x_object[obj]["/Filter"] 25 | if x_filter == "/FlateDecode": 26 | pass 27 | elif x_filter == "/DCTDecode": 28 | img = open(obj[1:] + ".jpg", "wb") 29 | img.write(data) 30 | img.close() 31 | return 32 | elif x_filter == "/JPXDecode": 33 | img = open(obj[1:] + ".jp2", "wb") 34 | img.write(data) 35 | img.close() 36 | return 37 | elif x_filter == "/CCITTFaxDecode": 38 | img = open(obj[1:] + ".tiff", "wb") 39 | img.write(data) 40 | img.close() 41 | return 42 | img = Image.frombytes(mode, size, data) 43 | img.save(obj[1:] + ".png") 44 | 45 | 46 | def main(): 47 | """ [EXPLAIN.] 
""" 48 | if len(sys.argv) != 2: 49 | print("{}: ".format(sys.argv[0])) 50 | return 1 51 | 52 | filepath = sys.argv[1].strip() 53 | r__ = PdfFileReader(open(filepath, "rb")) 54 | page_number = 0 55 | 56 | while page_number < r__.numPages: 57 | page = r__.getPage(page_number) 58 | 59 | if "/XObject" in page["/Resources"]: 60 | x_object = page["/Resources"]["/XObject"].getObject() 61 | 62 | for obj in x_object: 63 | if x_object[obj]["/Subtype"] == "/Image": 64 | size = (x_object[obj]["/Width"], x_object[obj]["/Height"]) 65 | data = x_object[obj].getData() 66 | 67 | if x_object[obj]["/ColorSpace"] == "/DeviceRGB": 68 | mode = "RGB" 69 | else: 70 | mode = "P" 71 | 72 | _handle_filter(x_object, obj, mode, size, data) 73 | else: 74 | print("No image found.") 75 | 76 | page_number += 1 77 | 78 | 79 | if __name__ == "__main__": 80 | sys.exit(main()) 81 | -------------------------------------------------------------------------------- /scripts/pdfcat: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Concatenate pages from pdf files into a single pdf file. 4 | 5 | Page ranges refer to the previously-named file. 6 | A file not followed by a page range means all the pages of the file. 7 | 8 | PAGE RANGES are like Python slices. 9 | {page_range_help} 10 | EXAMPLES 11 | pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1 12 | Concatenate all of head.pdf, all but page seven of content.pdf, 13 | and the last page of tail.pdf, producing output.pdf. 14 | 15 | pdfcat chapter*.pdf >book.pdf 16 | You can specify the output file by redirection. 17 | 18 | pdfcat chapter?.pdf chapter10.pdf >book.pdf 19 | In case you don't want chapter 10 before chapter 2. 20 | """ 21 | # Copyright (c) 2014, Steve Witham . 22 | # All rights reserved. 
This software is available under a BSD license; 23 | # see https://github.com/mstamy2/PyPDF2/LICENSE 24 | 25 | from __future__ import print_function 26 | 27 | import argparse 28 | import os 29 | import sys 30 | import traceback 31 | 32 | from io import BytesIO 33 | from sys import stderr, stdout, exit 34 | from os.path import dirname, abspath, join 35 | 36 | PROJECT_ROOT = abspath( 37 | join(dirname(__file__), os.pardir) 38 | ) 39 | sys.path.append(PROJECT_ROOT) 40 | 41 | from pypdf import PdfFileReader, PdfFileMerger 42 | from pypdf.pagerange import PAGE_RANGE_HELP, parseFilenamePageRanges 43 | 44 | 45 | def parseArgs(): 46 | parser = argparse.ArgumentParser( 47 | description=__doc__.format(page_range_help=PAGE_RANGE_HELP), 48 | formatter_class=argparse.RawDescriptionHelpFormatter 49 | ) 50 | parser.add_argument("-o", "--output", metavar="output_file") 51 | parser.add_argument("-v", "--verbose", action="store_true", 52 | help="show page ranges as they are being read") 53 | parser.add_argument( 54 | "--no-strict", default=False, action="store_true", 55 | help="Whether to parse PDF files in strict mode (defaults to True)." 
if __name__ == "__main__":
    # Needed for the --toc fallback title below; ``basename`` was used
    # without ever being imported, so --toc always ended in a NameError.
    from os.path import basename, splitext

    args = parseArgs()
    filename_page_ranges = parseFilenamePageRanges(args.fn_pgrgs)

    # Without -o the document is built in memory and sent to stdout at the end.
    if args.output:
        output = open(args.output, "wb")
    else:
        output = BytesIO()

    merger = PdfFileMerger(output, not args.no_strict)
    in_fs = dict()  # filename => open input stream, shared between ranges
    curr_page = 0
    filename = None  # keeps the error report below safe if nothing was read

    try:
        for (filename, page_range) in filename_page_ranges:
            if args.verbose:
                print(filename, page_range, file=stderr)
            if filename not in in_fs:
                in_fs[filename] = open(filename, "rb")

            merger.append(in_fs[filename], pages=page_range)

            if args.toc:
                r = PdfFileReader(filename)
                # fallbackName equals 'a' if filename == 'a.pdf', or 'b'
                # if filename == 'b.x', or 'c' if filename == '/u/v/w/c.x' ...
                # splitext() also copes with names that have no extension,
                # where slicing at rfind(".") == -1 cut the last character.
                fallbackName = splitext(basename(filename))[0]

                merger.addBookmark(
                    getattr(r.documentInfo, "title", fallbackName)
                    or fallbackName, curr_page
                )

                # NOTE(review): this advances by the page count of the whole
                # file even when only ``page_range`` of it was appended, so
                # later bookmarks may point too far - confirm and adjust.
                curr_page += r.numPages
    except Exception:
        print(traceback.format_exc(), file=stderr)
        print("Error while reading %s" % filename, file=stderr)
        exit(1)

    merger.write()

    if not args.output:
        output.seek(0, 0)
        stdout.buffer.write(output.read())

    merger.close()
    # In 3.0, input files must stay open until output is written.
    # Not closing the in_fs because this script exits now.
% (VERSIONFILE)) 31 | 32 | setup( 33 | name="pypdf4", 34 | version=verstr, 35 | description="PDF toolkit", 36 | long_description=long_description, 37 | author="Mathieu Fenniak", 38 | author_email="biziqe@mathieu.fenniak.net", 39 | maintainer="Phaseit, Inc.", 40 | maintainer_email="PyPDF4@phaseit.net", 41 | url="http://claird.github.com/PyPDF4", 42 | classifiers = [ 43 | "Development Status :: 5 - Production/Stable", 44 | "Intended Audience :: Developers", 45 | "License :: OSI Approved :: BSD License", 46 | "Programming Language :: Python :: 2", 47 | "Programming Language :: Python :: 3", 48 | "Operating System :: OS Independent", 49 | "Topic :: Software Development :: Libraries :: Python Modules", 50 | ], 51 | packages=["pypdf"], 52 | ) 53 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixture_data/GeoBase_NHNC1_Data_Model_UML_EN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/GeoBase_NHNC1_Data_Model_UML_EN.pdf -------------------------------------------------------------------------------- /tests/fixture_data/SF424_page2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/SF424_page2.pdf -------------------------------------------------------------------------------- /tests/fixture_data/Seige_of_Vicksburg_Sample_OCR.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/Seige_of_Vicksburg_Sample_OCR.pdf -------------------------------------------------------------------------------- /tests/fixture_data/TheHappyPrince.txt: -------------------------------------------------------------------------------- 1 | 2 | THE HAPPY PRINCE 3 | By Oscar Wilde 4 | Retrieved from Project Gutenberg: http://www.gutenberg.org/files/30120/30120.txt 5 | 6 | 7 | First published by David Nutt, May, 1888 8 | 9 | Reprinted January, 1889; February, 1902; September, 1905; February, 10 | 1907; March, 1908; March, 1910 11 | 12 | Reset and published by arrangement with David Nutt by Duckworth & Co., 13 | 1920 14 | 15 | Special Edition, reset. With illustrations by Charles Robinson, 16 | published by arrangement with David Nutt by Duckworth & Co., 1913. 17 | Reprinted 1920 18 | 19 | Printed in Great Britain 20 | By Hazell, Watson and Viney, Ld., 21 | London and Aylesbury. 22 | 23 | ==== 24 | 25 | High above the city, on a tall column, stood the statue of the Happy 26 | Prince. He was gilded all over with thin leaves of fine gold, for eyes 27 | he had two bright sapphires, and a large red ruby glowed on his 28 | sword-hilt. 29 | 30 | He was very much admired indeed. "He is as beautiful as a weathercock," 31 | remarked one of the Town Councillors who wished to gain a reputation for 32 | having artistic tastes; "only not quite so useful," he added, fearing 33 | lest people should think him unpractical, which he really was not. 34 | 35 | "Why can't you be like the Happy Prince?" asked a sensible mother of her 36 | little boy who was crying for the moon. "The Happy Prince never dreams 37 | of crying for anything." 38 | 39 | "I am glad there is some one in the world who is quite happy," muttered 40 | a disappointed man as he gazed at the wonderful statue. 
41 | 42 | "He looks just like an angel," said the Charity Children as they came 43 | out of the cathedral in their bright scarlet cloaks and their clean 44 | white pinafores. 45 | 46 | "How do you know?" said the Mathematical Master, "you have never seen 47 | one." 48 | 49 | "Ah! but we have, in our dreams," answered the children; and the 50 | Mathematical Master frowned and looked very severe, for he did not 51 | approve of children dreaming. 52 | 53 | One night there flew over the city a little Swallow. His friends had 54 | gone away to Egypt six weeks before, but he had stayed behind, for he 55 | was in love with the most beautiful Reed. He had met her early in the 56 | spring as he was flying down the river after a big yellow moth, and had 57 | been so attracted by her slender waist that he had stopped to talk to 58 | her. 59 | 60 | "Shall I love you?" said the Swallow, who liked to come to the point at 61 | once, and the Reed made him a low bow. So he flew round and round her, 62 | touching the water with his wings, and making silver ripples. This was 63 | his courtship, and it lasted all through the summer. 64 | 65 | "It is a ridiculous attachment," twittered the other Swallows; "she has 66 | no money, and far too many relations;" and indeed the river was quite 67 | full of Reeds. Then, when the autumn came they all flew away. 68 | 69 | After they had gone he felt lonely, and began to tire of his lady-love. 70 | "She has no conversation," he said, "and I am afraid that she is a 71 | coquette, for she is always flirting with the wind." And certainly, 72 | whenever the wind blew, the Reed made the most graceful curtseys. 73 | "I admit that she is domestic," he continued, "but I love travelling, 74 | and my wife, consequently, should love travelling also." 75 | 76 | "Will you come away with me?" he said finally to her; but the Reed shook 77 | her head, she was so attached to her home. 78 | 79 | "You have been trifling with me," he cried. 
"I am off to the Pyramids. 80 | Good-bye!" and he flew away. 81 | 82 | All day long he flew, and at night-time he arrived at the city. "Where 83 | shall I put up?" he said; "I hope the town has made preparations." 84 | 85 | Then he saw the statue on the tall column. 86 | 87 | "I will put up there," he cried; "it is a fine position, with plenty of 88 | fresh air." So he alighted just between the feet of the Happy Prince. 89 | 90 | "I have a golden bedroom," he said softly to himself as he looked round, 91 | and he prepared to go to sleep; but just as he was putting his head 92 | under his wing a large drop of water fell on him. "What a curious 93 | thing!" he cried; "there is not a single cloud in the sky, the stars are 94 | quite clear and bright, and yet it is raining. The climate in the north 95 | of Europe is really dreadful. The Reed used to like the rain, but that 96 | was merely her selfishness." 97 | 98 | Then another drop fell. 99 | 100 | "What is the use of a statue if it cannot keep the rain off?" he said; 101 | "I must look for a good chimney-pot," and he determined to fly away. 102 | 103 | But before he had opened his wings, a third drop fell, and he looked up, 104 | and saw-- Ah! what did he see? 105 | 106 | The eyes of the Happy Prince were filled with tears, and tears were 107 | running down his golden cheeks. His face was so beautiful in the 108 | moonlight that the little Swallow was filled with pity. 109 | 110 | "Who are you?" he said. 111 | 112 | "I am the Happy Prince." 113 | 114 | "Why are you weeping then?" asked the Swallow; "you have quite 115 | drenched me." 116 | 117 | [Illustration: THE PALACE OF SANS-SOUCI] 118 | 119 | "When I was alive and had a human heart," answered the statue, "I did 120 | not know what tears were, for I lived in the Palace of Sans-Souci, where 121 | sorrow is not allowed to enter. In the daytime I played with my 122 | companions in the garden, and in the evening I led the dance in the 123 | Great Hall. 
Round the garden ran a very lofty wall, but I never cared 124 | to ask what lay beyond it, everything about me was so beautiful. 125 | My courtiers called me the Happy Prince, and happy indeed I was, if 126 | pleasure be happiness. So I lived, and so I died. And now that I am dead 127 | they have set me up here so high that I can see all the ugliness and all 128 | the misery of my city, and though my heart is made of lead yet I cannot 129 | choose but weep." 130 | 131 | "What! is he not solid gold?" said the Swallow to himself. He was too 132 | polite to make any personal remarks out loud. 133 | 134 | "Far away," continued the statue in a low musical voice, "far away in a 135 | little street there is a poor house. One of the windows is open, and 136 | through it I can see a woman seated at a table. Her face is thin and 137 | worn, and she has coarse, red hands, all pricked by the needle, for she 138 | is a seamstress. She is embroidering passion-flowers on a satin gown for 139 | the loveliest of the Queen's maids-of-honour to wear at the next 140 | Court-ball. In a bed in the corner of the room her little boy is lying 141 | ill. He has a fever, and is asking for oranges. His mother has nothing 142 | to give him but river water, so he is crying. Swallow, Swallow, little 143 | Swallow, will you not bring her the ruby out of my sword-hilt? My feet 144 | are fastened to this pedestal and I cannot move." 145 | 146 | "I am waited for in Egypt," said the Swallow. "My friends are flying up 147 | and down the Nile, and talking to the large lotus-flowers. Soon they 148 | will go to sleep in the tomb of the great King. The King is there 149 | himself in his painted coffin. He is wrapped in yellow linen, and 150 | embalmed with spices. Round his neck is a chain of pale green jade, 151 | and his hands are like withered leaves." 152 | 153 | "Swallow, Swallow, little Swallow," said the Prince, "will you not stay 154 | with me for one night, and be my messenger? 
The boy is so thirsty, and 155 | the mother so sad." 156 | 157 | "I don't think I like boys," answered the Swallow. "Last summer, when I 158 | was staying on the river, there were two rude boys, the miller's sons, 159 | who were always throwing stones at me. They never hit me, of course; 160 | we swallows fly far too well for that, and besides, I come of a family 161 | famous for its agility; but still, it was a mark of disrespect." 162 | 163 | But the Happy Prince looked so sad that the little Swallow was sorry. 164 | "It is very cold here," he said; "but I will stay with you for one 165 | night, and be your messenger." 166 | 167 | "Thank you, little Swallow," said the Prince. 168 | 169 | So the Swallow picked out the great ruby from the Prince's sword, 170 | and flew away with it in his beak over the roofs of the town. 171 | 172 | He passed by the cathedral tower, where the white marble angels were 173 | sculptured. He passed by the palace and heard the sound of dancing. 174 | A beautiful girl came out on the balcony with her lover. "How wonderful 175 | the stars are," he said to her, "and how wonderful is the power of 176 | love!" 177 | 178 | "I hope my dress will be ready in time for the State-ball," she 179 | answered; "I have ordered passion-flowers to be embroidered on it; 180 | but the seamstresses are so lazy." 181 | 182 | He passed over the river, and saw the lanterns hanging to the masts of 183 | the ships. He passed over the Ghetto, and saw the old Jews bargaining 184 | with each other, and weighing out money in copper scales. At last he 185 | came to the poor house and looked in. The boy was tossing feverishly on 186 | his bed, and the mother had fallen asleep, she was so tired. In he 187 | hopped, and laid the great ruby on the table beside the woman's thimble. 188 | Then he flew gently round the bed, fanning the boy's forehead with his 189 | wings. "How cool I feel!" 
said the boy, "I must be getting better;" 190 | and he sank into a delicious slumber. 191 | 192 | Then the Swallow flew back to the Happy Prince, and told him what he had 193 | done. "It is curious," he remarked, "but I feel quite warm now, although 194 | it is so cold." 195 | 196 | "That is because you have done a good action," said the Prince. And the 197 | little Swallow began to think, and then he fell asleep. Thinking always 198 | made him sleepy. 199 | 200 | When day broke he flew down to the river and had a bath. "What a 201 | remarkable phenomenon," said the Professor of Ornithology as he was 202 | passing over the bridge. "A swallow in winter!" And he wrote a long 203 | letter about it to the local newspaper. Every one quoted it, it was full 204 | of so many words that they could not understand. 205 | 206 | "To-night I go to Egypt," said the Swallow, and he was in high spirits 207 | at the prospect. He visited all the public monuments, and sat a long 208 | time on top of the church steeple. Wherever he went the Sparrows 209 | chirruped, and said to each other, "What a distinguished stranger!" 210 | so he enjoyed himself very much. 211 | 212 | When the moon rose he flew back to the Happy Prince. "Have you any 213 | commissions for Egypt?" he cried; "I am just starting." 214 | 215 | "Swallow, Swallow, little Swallow," said the Prince, "will you not stay 216 | with me one night longer?" 217 | 218 | [Illustration: THE LOVELIEST OF THE QUEEN'S MAIDS OF HONOUR] 219 | 220 | "I am waited for in Egypt," answered the Swallow. "To-morrow my friends 221 | will fly up to the Second Cataract. The river-horse couches there among 222 | the bulrushes, and on a great granite throne sits the God Memnon. All 223 | night long he watches the stars, and when the morning star shines he 224 | utters one cry of joy, and then he is silent. At noon the yellow lions 225 | come down to the water's edge to drink. 
They have eyes like green 226 | beryls, and their roar is louder than the roar of the cataract." 227 | 228 | "Swallow, Swallow, little Swallow," said the Prince, "far away across 229 | the city I see a young man in a garret. He is leaning over a desk 230 | covered with papers, and in a tumbler by his side there is a bunch of 231 | withered violets. His hair is brown and crisp, and his lips are red as a 232 | pomegranate, and he has large and dreamy eyes. He is trying to finish a 233 | play for the Director of the Theatre, but he is too cold to write any 234 | more. There is no fire in the grate, and hunger has made him faint." 235 | 236 | "I will wait with you one night longer," said the Swallow, who really 237 | had a good heart. "Shall I take him another ruby?" 238 | 239 | "Alas! I have no ruby now," said the Prince; "my eyes are all that I 240 | have left. They are made of rare sapphires, which were brought out of 241 | India a thousand years ago. Pluck out one of them and take it to him. He 242 | will sell it to the jeweller, and buy food and firewood, and finish his 243 | play." 244 | 245 | "Dear Prince," said the Swallow, "I cannot do that"; and he began to 246 | weep. 247 | 248 | "Swallow, Swallow, little Swallow," said the Prince, "do as I command 249 | you." 250 | 251 | So the Swallow plucked out the Prince's eye, and flew away to the 252 | student's garret. It was easy enough to get in, as there was a hole in 253 | the roof. Through this he darted, and came into the room. The young man 254 | had his head buried in his hands, so he did not hear the flutter of the 255 | bird's wings, and when he looked up he found the beautiful sapphire 256 | lying on the withered violets. 257 | 258 | "I am beginning to be appreciated," he cried; "this is from some great 259 | admirer. Now I can finish my play," and he looked quite happy. 260 | 261 | The next day the Swallow flew down to the harbour. 
He sat on the mast of 262 | a large vessel and watched the sailors hauling big chests out of the 263 | hold with ropes. "Heave a-hoy!" they shouted as each chest came up. 264 | "I am going to Egypt!" cried the Swallow, but nobody minded, and when 265 | the moon rose he flew back to the Happy Prince. 266 | 267 | "I am come to bid you good-bye," he cried. 268 | 269 | "Swallow, Swallow, little Swallow," said the Prince, "will you not stay 270 | with me one night longer?" 271 | 272 | "It is winter," answered the Swallow, "and the chill snow will soon be 273 | here. In Egypt the sun is warm on the green palm-trees, and the 274 | crocodiles lie in the mud and look lazily about them. My companions are 275 | building a nest in the Temple of Baalbec, and the pink and white doves 276 | are watching them, and cooing to each other. Dear Prince, I must leave 277 | you, but I will never forget you, and next spring I will bring you back 278 | two beautiful jewels in place of those you have given away. The ruby 279 | shall be redder than a red rose, and the sapphire shall be as blue as 280 | the great sea." 281 | 282 | "In the square below," said the Happy Prince, "there stands a little 283 | match-girl. She has let her matches fall in the gutter, and they are all 284 | spoiled. Her father will beat her if she does not bring home some money, 285 | and she is crying. She has no shoes or stockings, and her little head is 286 | bare. Pluck out my other eye and give it to her, and her father will not 287 | beat her." 288 | 289 | "I will stay with you one night longer," said the Swallow, "but I cannot 290 | pluck out your eye. You would be quite blind then." 291 | 292 | "Swallow, Swallow, little Swallow," said the Prince, "do as I command 293 | you." 294 | 295 | So he plucked out the Prince's other eye, and darted down with it. He 296 | swooped past the match-girl, and slipped the jewel into the palm of her 297 | hand. "What a lovely bit of glass!" 
cried the little girl; and she ran 298 | home, laughing. 299 | 300 | Then the Swallow came back to the Prince. "You are blind now," he said, 301 | "so I will stay with you always." 302 | 303 | "No, little Swallow," said the poor Prince, "you must go away to Egypt." 304 | 305 | "I will stay with you always," said the Swallow, and he slept at the 306 | Prince's feet. 307 | 308 | All the next day he sat on the Prince's shoulder, and told him stories 309 | of what he had seen in strange lands. He told him of the red ibises, 310 | who stand in long rows on the banks of the Nile, and catch gold-fish in 311 | their beaks; of the Sphinx, who is as old as the world itself, and lives 312 | in the desert, and knows everything; of the merchants, who walk slowly 313 | by the side of their camels and carry amber beads in their hands; of the 314 | King of the Mountains of the Moon, who is as black as ebony, and 315 | worships a large crystal; of the great green snake that sleeps in a palm 316 | tree, and has twenty priests to feed it with honey-cakes; and of the 317 | pygmies who sail over a big lake on large flat leaves, and are always at 318 | war with the butterflies. 319 | 320 | "Dear little Swallow," said the Prince, "you tell me of marvellous 321 | things, but more marvellous than anything is the suffering of men and of 322 | women. There is no Mystery so great as Misery. Fly over my city, little 323 | Swallow, and tell me what you see there." 324 | 325 | [Illustration: THE RICH MAKING MERRY IN THEIR BEAUTIFUL HOUSES, 326 | WHILE THE BEGGARS WERE SITTING AT THE GATES] 327 | 328 | So the Swallow flew over the great city, and saw the rich making merry 329 | in their beautiful houses, while the beggars were sitting at the gates. 330 | He flew into dark lanes, and saw the white faces of starving children 331 | looking out listlessly at the black streets. Under the archway of a 332 | bridge two little boys were lying in one another's arms to try and keep 333 | themselves warm. 
"How hungry we are!" they said. "You must not lie 334 | here," shouted the Watchman, and they wandered out into the rain. 335 | 336 | Then he flew back and told the Prince what he had seen. 337 | 338 | "I am covered with fine gold," said the Prince, "you must take it off, 339 | leaf by leaf, and give it to my poor; the living always think that gold 340 | can make them happy." 341 | 342 | Leaf after leaf of the fine gold the Swallow picked off, till the Happy 343 | Prince looked quite dull and grey. Leaf after leaf of the fine gold he 344 | brought to the poor, and the children's faces grew rosier, and they 345 | laughed and played games in the street. "We have bread now!" they cried. 346 | 347 | Then the snow came, and after the snow came the frost. The streets 348 | looked as if they were made of silver, they were so bright and 349 | glistening; long icicles like crystal daggers hung down from the eaves 350 | of the houses, everybody went about in furs, and the little boys wore 351 | scarlet caps and skated on the ice. 352 | 353 | The poor little Swallow grew colder and colder, but he would not leave 354 | the Prince, he loved him too well. He picked up crumbs outside the 355 | baker's door when the baker was not looking, and tried to keep himself 356 | warm by flapping his wings. 357 | 358 | But at last he knew that he was going to die. He had just strength to 359 | fly up to the Prince's shoulder once more. "Good-bye, dear Prince!" 360 | he murmured, "will you let me kiss your hand?" 361 | 362 | "I am glad that you are going to Egypt at last, little Swallow," said 363 | the Prince, "you have stayed too long here; but you must kiss me on the 364 | lips, for I love you." 365 | 366 | "It is not to Egypt that I am going," said the Swallow. "I am going to 367 | the House of Death. Death is the brother of Sleep, is he not?" 368 | 369 | And he kissed the Happy Prince on the lips, and fell down dead at his 370 | feet. 
371 | 372 | At that moment a curious crack sounded inside the statue, as if 373 | something had broken. The fact is that the leaden heart had snapped 374 | right in two. It certainly was a dreadfully hard frost. 375 | 376 | Early the next morning the Mayor was walking in the square below in 377 | company with the Town Councillors. As they passed the column he looked 378 | up at the statue: "Dear me! how shabby the Happy Prince looks!" he said. 379 | 380 | "How shabby, indeed!" cried the Town Councillors, who always agreed with 381 | the Mayor; and they went up to look at it. 382 | 383 | "The ruby has fallen out of his sword, his eyes are gone, and he is 384 | golden no longer," said the Mayor; "in fact, he is little better than a 385 | beggar!" 386 | 387 | "Little better than a beggar," said the Town Councillors. 388 | 389 | "And here is actually a dead bird at his feet!" continued the Mayor. "We 390 | must really issue a proclamation that birds are not to be allowed to die 391 | here." And the Town Clerk made a note of the suggestion. 392 | 393 | So they pulled down the statue of the Happy Prince. "As he is no longer 394 | beautiful he is no longer useful," said the Art Professor at the 395 | University. 396 | 397 | Then they melted the statue in a furnace, and the Mayor held a meeting 398 | of the Corporation to decide what was to be done with the metal. "We 399 | must have another statue, of course," he said, "and it shall be a statue 400 | of myself." 401 | 402 | "Of myself," said each of the Town Councillors, and they quarrelled. 403 | When I last heard of them they were quarrelling still. 404 | 405 | "What a strange thing!" said the overseer of the workmen at the foundry. 406 | "This broken lead heart will not melt in the furnace. We must throw it 407 | away." So they threw it on a dust-heap where the dead Swallow was also 408 | lying. 
409 | 410 | "Bring me the two most precious things in the city," said God to one of 411 | His Angels; and the Angel brought Him the leaden heart and the dead 412 | bird. 413 | 414 | "You have rightly chosen," said God, "for in my garden of Paradise this 415 | little bird shall sing for evermore, and in my city of gold the Happy 416 | Prince shall praise me." 417 | 418 | -------------------------------------------------------------------------------- /tests/fixture_data/attachment_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/attachment_small.png -------------------------------------------------------------------------------- /tests/fixture_data/crazyones.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/crazyones.pdf -------------------------------------------------------------------------------- /tests/fixture_data/jpeg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/jpeg.pdf -------------------------------------------------------------------------------- /tests/fixture_data/testDecodeStreamData/ASCII85Decode.pdf: -------------------------------------------------------------------------------- 1 | %PDF-1.4 2 | % This file was purposefully assembled for PyPDF4 to contain a stream with an 3 | % ASCII85Decode filter. 
4 | 1 0 obj 5 | << /Type /Catalog 6 | /Outlines 2 0 R 7 | /Pages 3 0 R 8 | >> 9 | endobj 10 | 11 | 2 0 obj 12 | << /Type Outlines 13 | /Count 0 14 | >> 15 | endobj 16 | 17 | 3 0 obj 18 | << /Type /Pages 19 | /Kids [ 4 0 R ] 20 | /Count 1 21 | >> 22 | endobj 23 | 24 | 4 0 obj 25 | << /Type /Page 26 | /Parent 3 0 R 27 | /MediaBox [ 0 0 612 792 ] 28 | /Contents 5 0 R 29 | /Resources << /ProcSet 6 0 >> 30 | >> 31 | endobj 32 | 33 | 5 0 obj 34 | << /Length 486 /Filter /ASCII85Decode >> 35 | stream 36 | Gat=f>>)jf(l%MV/)Eo6mIKAcVI%E8JjXX\LYJuV['I-CRhu'N^LhAdW!F"AKY3'f+!CnrjS-:gc;G'?"Hl==+<>3mA"9I518%ESCnVLW>+8W;n:g?Jm8Lf,g;R3j'h:hD+/MCBA4.7R%cB^*K8Fqlg@[FpiFY'd\=Bs0qgO@n"mHUoHA`!)4cZ 37 | endstream 38 | endobj 39 | 40 | 6 0 obj 41 | << /Length 1554 /Filter /ASCII85Decode >> 42 | stream 43 | Gatm:?#SIU'Re<2\BE'[,#oo3Q=T>2>FT(e3i&m5DLc&,M3>DGU-_b#r9ni$I9KD$%Tj.XM(%5-GgADEdK^:I^AXUg!5EF0k0=3]!Lk),V7VE*4hTZW4C.a1)SA\XGQ=LT]/;lDZ!5WPQLo$D.]4VVBI52F%&+LE&<9==i6+elI_ZsJCJpV;p&MQi,3H'sCFW?!JdKKm))Yj2X=DqjIo_!+abe^bS:JYdQZtiBJ;e0I-&P#AA614u.%m4J7E?0"iZ3(!U!F;cXs5mlLmfuN.sE^efFp(f1C1gXNQOkH)FJ>&`boU[Fj^[&DN\O*M_/m@E_Gii<,!P!q_64>,$tHdGlL?EoGa'_h!fi*2=*nTV<1Srp69_kCJ]r\J=3%dA=U_QuI4$=&L)Hgm;RlE=6]CPlBkb!6l(GC7Oe5n%N_"!P]3E-SX)4IbNHb2BL?l0BD&j`O>A_>.On.pAblZbr:TS$8g5@HgA7Y<2U<]4Ctl?%*%D;dN[?5I4h;(uW_&nC#))A4MDLK[$X\+b!ng[E0s*_-/')]%Ut0gtl?!U)aItV.\F./$p]g5hosPeOT>`l#XJ;^s55)Y'WEG3UVN6/u-hpn&OQ(ftS3oUQ!u7*u/;YqtG3VYNSm&/FJp*RTR@kMtnp=^Ju]l"@Th`#Q'\g>!:iG^)kFCL@GO2a6UT;36=TD#DMHJW\Q`*Tjl!q2'Kcm+5U):1K9,II2$"E"io2B9LM4`4K!8tn.c'rsQ/Lor&*qNfKgn<:CBN-)DhKklW[JXOsc@UCjh9P.f^'j)?Rbrnp$H_!PD8l<$G'XGZX,*I&B&UTN+&[Wat[>@/C"[q[!lO24!o;Ch%*u\r5P?:a*XCKlR=5i\P> 44 | endstream 45 | endobj 46 | 47 | xref 48 | 0 7 49 | 0000000000 65535 f 50 | 0000000111 00000 n 51 | 0000000199 00000 n 52 | 0000000260 00000 n 53 | 0000000342 00000 n 54 | 0000000498 00000 n 55 | 0000001060 00000 n 56 | 57 | trailer 58 | << /Size 7 59 | /Root 1 0 R 60 | >> 61 | startxref 62 | 2694 63 | %%EOF 64 | 
-------------------------------------------------------------------------------- /tests/fixture_data/testDecodeStreamData/CCITTFaxDecode.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testDecodeStreamData/CCITTFaxDecode.pdf -------------------------------------------------------------------------------- /tests/fixture_data/testDecodeStreamData/DCTDecode.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testDecodeStreamData/DCTDecode.pdf -------------------------------------------------------------------------------- /tests/fixture_data/testDecodeStreamData/FlateDecode.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testDecodeStreamData/FlateDecode.pdf -------------------------------------------------------------------------------- /tests/fixture_data/testDecodeStreamData/LZWDecode.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testDecodeStreamData/LZWDecode.pdf -------------------------------------------------------------------------------- /tests/fixture_data/testFileLoad/crazyones.txt: -------------------------------------------------------------------------------- 1 | 
TheCrazyOnesOctober14,1998Herestothecrazyones.Themis˝ts.Therebels.Thetroublemakers.Theroundpegsinthesquareholes.Theoneswhoseethingsdi˙erently.Theyrenotfondofrules.Andtheyhavenorespectforthestatusquo.Youcanquotethem,disagreewiththem,glorifyorvilifythem.Abouttheonlythingyoucantdoisignorethem.Becausetheychangethings.Theyinvent.Theyimagine.Theyheal.Theyexplore.Theycreate.Theyinspire.Theypushthehumanraceforward.Maybetheyhavetobecrazy.Howelsecanyoustareatanemptycanvasandseeaworkofart?Orsitinsilenceandhearasongthatsneverbeenwritten?Orgazeataredplanetandseealaboratoryonwheels?Wemaketoolsforthesekindsofpeople.Whilesomeseethemasthecrazyones,weseegenius.Becausethepeoplewhoarecrazyenoughtothinktheycanchangetheworld,aretheoneswhodo. -------------------------------------------------------------------------------- /tests/fixture_data/testIsObjectFree/GeoBase_NHNC1_Data_Model_UML_EN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/claird/PyPDF4/9c60d9df3a56edd32226c9e76695018f997fafe6/tests/fixture_data/testIsObjectFree/GeoBase_NHNC1_Data_Model_UML_EN.pdf -------------------------------------------------------------------------------- /tests/fixture_data/testIsObjectFree/SF424_page2.pdf: -------------------------------------------------------------------------------- 1 | 0 328 2 | 0000000000 65535 f 3 | 0000000009 00000 n 4 | 0000000068 00000 n 5 | 0000000169 00000 n 6 | 0000000218 00000 n 7 | 0000000472 00000 n 8 | 0000002469 00000 n 9 | 0000002556 00000 n 10 | 0000003162 00000 n 11 | 0000003404 00000 n 12 | 0000004048 00000 n 13 | 0000004297 00000 n 14 | 0000005487 00000 n 15 | 0000005729 00000 n 16 | 0000005906 00000 n 17 | 0000006091 00000 n 18 | 0000006200 00000 n 19 | 0000006386 00000 n 20 | 0000006601 00000 n 21 | 0000006859 00000 n 22 | 0000011230 00000 n 23 | 0000011340 00000 n 24 | 0000011416 00000 n 25 | 0000011477 00000 n 26 | 0000011734 00000 n 27 | 0000015083 00000 n 28 | 0000015514 00000 n 29 | 
0000015769 00000 n 30 | 0000016099 00000 n 31 | 0000016199 00000 n 32 | 0000017526 00000 n 33 | 0000017702 00000 n 34 | 0000017817 00000 n 35 | 0000018002 00000 n 36 | 0000018215 00000 n 37 | 0000018401 00000 n 38 | 0000018587 00000 n 39 | 0000018844 00000 n 40 | 0000019844 00000 n 41 | 0000021034 00000 n 42 | 0000021082 00000 n 43 | 0000021268 00000 n 44 | 0000021549 00000 n 45 | 0000021752 00000 n 46 | 0000021945 00000 n 47 | 0000022150 00000 n 48 | 0000022446 00000 n 49 | 0000028034 00000 n 50 | 0000028067 00000 n 51 | 0000028110 00000 n 52 | 0000028168 00000 n 53 | 0000028490 00000 n 54 | 0000028865 00000 n 55 | 0000029069 00000 n 56 | 0000029112 00000 n 57 | 0000029170 00000 n 58 | 0000029492 00000 n 59 | 0000029867 00000 n 60 | 0000030072 00000 n 61 | 0000030115 00000 n 62 | 0000030173 00000 n 63 | 0000030495 00000 n 64 | 0000030870 00000 n 65 | 0000031075 00000 n 66 | 0000031118 00000 n 67 | 0000031176 00000 n 68 | 0000031498 00000 n 69 | 0000031873 00000 n 70 | 0000032136 00000 n 71 | 0000041460 00000 n 72 | 0000041506 00000 n 73 | 0000041922 00000 n 74 | 0000042344 00000 n 75 | 0000046101 00000 n 76 | 0000046622 00000 n 77 | 0000047217 00000 n 78 | 0000051163 00000 n 79 | 0000051721 00000 n 80 | 0000052453 00000 n 81 | 0000057379 00000 n 82 | 0000057406 00000 n 83 | 0000057611 00000 n 84 | 0000057654 00000 n 85 | 0000057712 00000 n 86 | 0000058034 00000 n 87 | 0000058409 00000 n 88 | 0000058672 00000 n 89 | 0000063653 00000 n 90 | 0000063680 00000 n 91 | 0000063723 00000 n 92 | 0000063781 00000 n 93 | 0000064103 00000 n 94 | 0000064478 00000 n 95 | 0000064769 00000 n 96 | 0000065036 00000 n 97 | 0000065251 00000 n 98 | 0000065541 00000 n 99 | 0000065826 00000 n 100 | 0000066095 00000 n 101 | 0000066310 00000 n 102 | 0000066600 00000 n 103 | 0000066948 00000 n 104 | 0000067216 00000 n 105 | 0000067432 00000 n 106 | 0000067723 00000 n 107 | 0000068015 00000 n 108 | 0000068280 00000 n 109 | 0000068495 00000 n 110 | 0000068783 00000 n 111 | 0000069172 00000 n 
112 | 0000069437 00000 n 113 | 0000069652 00000 n 114 | 0000069940 00000 n 115 | 0000070310 00000 n 116 | 0000070577 00000 n 117 | 0000070792 00000 n 118 | 0000071082 00000 n 119 | 0000071314 00000 n 120 | 0000071518 00000 n 121 | 0000071776 00000 n 122 | 0000071875 00000 n 123 | 0000071977 00000 n 124 | 0000072157 00000 n 125 | 0000072335 00000 n 126 | 0000072530 00000 n 127 | 0000072789 00000 n 128 | 0000073069 00000 n 129 | 0000073318 00000 n 130 | 0000073417 00000 n 131 | 0000073620 00000 n 132 | 0000073824 00000 n 133 | 0000074028 00000 n 134 | 0000074217 00000 n 135 | 0000074410 00000 n 136 | 0000074601 00000 n 137 | 0000074798 00000 n 138 | 0000074993 00000 n 139 | 0000075181 00000 n 140 | 0000075367 00000 n 141 | 0000075556 00000 n 142 | 0000075886 00000 n 143 | 0000075948 00000 n 144 | 0000075969 00000 n 145 | 0000076316 00000 n 146 | 0000076534 00000 n 147 | 0000076740 00000 n 148 | 0000076955 00000 n 149 | 0000077302 00000 n 150 | 0000077359 00000 n 151 | 0000077380 00000 n 152 | 0000077726 00000 n 153 | 0000077938 00000 n 154 | 0000078186 00000 n 155 | 0000078430 00000 n 156 | 0000078507 00000 n 157 | 0000078587 00000 n 158 | 0000078823 00000 n 159 | 0000078900 00000 n 160 | 0000078980 00000 n 161 | 0000079191 00000 n 162 | 0000079565 00000 n 163 | 0000079615 00000 n 164 | 0000079923 00000 n 165 | 0000080350 00000 n 166 | 0000080775 00000 n 167 | 0000081137 00000 n 168 | 0000081174 00000 n 169 | 0000081623 00000 n 170 | 0000082078 00000 n 171 | 0000083307 00000 n 172 | 0000083443 00000 n 173 | 0000083964 00000 n 174 | 0000084487 00000 n 175 | 0000084760 00000 n 176 | 0000084874 00000 n 177 | 0000085149 00000 n 178 | 0000085244 00000 n 179 | 0000085509 00000 n 180 | 0000085782 00000 n 181 | 0000085881 00000 n 182 | 0000086093 00000 n 183 | 0000086366 00000 n 184 | 0000086465 00000 n 185 | 0000086677 00000 n 186 | 0000086950 00000 n 187 | 0000087049 00000 n 188 | 0000087261 00000 n 189 | 0000087535 00000 n 190 | 0000087634 00000 n 191 | 0000087846 00000 n 
192 | 0000088121 00000 n 193 | 0000088220 00000 n 194 | 0000088432 00000 n 195 | 0000088707 00000 n 196 | 0000088806 00000 n 197 | 0000089018 00000 n 198 | 0000089293 00000 n 199 | 0000089392 00000 n 200 | 0000089604 00000 n 201 | 0000089703 00000 n 202 | 0000089915 00000 n 203 | 0000090205 00000 n 204 | 0000090426 00000 n 205 | 0000090644 00000 n 206 | 0000090962 00000 n 207 | 0000091019 00000 n 208 | 0000091387 00000 n 209 | 0000091493 00000 n 210 | 0000091735 00000 n 211 | 0000091834 00000 n 212 | 0000091936 00000 n 213 | 0000092175 00000 n 214 | 0000092274 00000 n 215 | 0000092376 00000 n 216 | 0000092685 00000 n 217 | 0000092801 00000 n 218 | 0000092920 00000 n 219 | 0000093134 00000 n 220 | 0000093437 00000 n 221 | 0000093553 00000 n 222 | 0000093672 00000 n 223 | 0000093886 00000 n 224 | 0000094182 00000 n 225 | 0000094298 00000 n 226 | 0000094417 00000 n 227 | 0000094631 00000 n 228 | 0000094926 00000 n 229 | 0000095042 00000 n 230 | 0000095161 00000 n 231 | 0000095375 00000 n 232 | 0000095670 00000 n 233 | 0000095786 00000 n 234 | 0000095905 00000 n 235 | 0000096119 00000 n 236 | 0000096420 00000 n 237 | 0000096536 00000 n 238 | 0000096655 00000 n 239 | 0000096869 00000 n 240 | 0000097196 00000 n 241 | 0000097312 00000 n 242 | 0000097575 00000 n 243 | 0000097694 00000 n 244 | 0000097804 00000 n 245 | 0000098018 00000 n 246 | 0000098354 00000 n 247 | 0000098619 00000 n 248 | 0000098834 00000 n 249 | 0000099123 00000 n 250 | 0000099433 00000 n 251 | 0000099532 00000 n 252 | 0000099634 00000 n 253 | 0000099876 00000 n 254 | 0000100216 00000 n 255 | 0000100484 00000 n 256 | 0000100700 00000 n 257 | 0000100990 00000 n 258 | 0000101286 00000 n 259 | 0000101554 00000 n 260 | 0000101770 00000 n 261 | 0000102060 00000 n 262 | 0000102369 00000 n 263 | 0000102634 00000 n 264 | 0000102848 00000 n 265 | 0000103135 00000 n 266 | 0000103447 00000 n 267 | 0000103713 00000 n 268 | 0000103929 00000 n 269 | 0000104217 00000 n 270 | 0000104525 00000 n 271 | 0000104789 00000 n 
272 | 0000105005 00000 n 273 | 0000105292 00000 n 274 | 0000105630 00000 n 275 | 0000105978 00000 n 276 | 0000106204 00000 n 277 | 0000106431 00000 n 278 | 0000106654 00000 n 279 | 0000107023 00000 n 280 | 0000107044 00000 n 281 | 0000107067 00000 n 282 | 0000107407 00000 n 283 | 0000107623 00000 n 284 | 0000107886 00000 n 285 | 0000107963 00000 n 286 | 0000108043 00000 n 287 | 0000108294 00000 n 288 | 0000108371 00000 n 289 | 0000108451 00000 n 290 | 0000108679 00000 n 291 | 0000108928 00000 n 292 | 0000108996 00000 n 293 | 0000109212 00000 n 294 | 0000109425 00000 n 295 | 0000109736 00000 n 296 | 0000110145 00000 n 297 | 0000110166 00000 n 298 | 0000110514 00000 n 299 | 0000110825 00000 n 300 | 0000111176 00000 n 301 | 0000111511 00000 n 302 | 0000111859 00000 n 303 | 0000112062 00000 n 304 | 0000112373 00000 n 305 | 0000112456 00000 n 306 | 0000112882 00000 n 307 | 0000113180 00000 n 308 | 0000113263 00000 n 309 | 0000113535 00000 n 310 | 0000113635 00000 n 311 | 0000113899 00000 n 312 | 0000114197 00000 n 313 | 0000114297 00000 n 314 | 0000114561 00000 n 315 | 0000114859 00000 n 316 | 0000114959 00000 n 317 | 0000115222 00000 n 318 | 0000115521 00000 n 319 | 0000115633 00000 n 320 | 0000115897 00000 n 321 | 0000115997 00000 n 322 | 0000116261 00000 n 323 | 0000116486 00000 n 324 | 0000116549 00000 n 325 | 0000116814 00000 n 326 | 0000117080 00000 n 327 | 0000117336 00000 n 328 | 0000117614 00000 n 329 | 0000117895 00000 n 330 | -------------------------------------------------------------------------------- /tests/fixture_data/testIsObjectFree/Seige_of_Vicksburg_Sample_OCR.pdf: -------------------------------------------------------------------------------- 1 | 38 16 2 | 0000000023 00000 n 3 | 0000000593 00000 n 4 | 0000000962 00000 n 5 | 0000001150 00000 n 6 | 0000002182 00000 n 7 | 0000002299 00000 n 8 | 0000005902 00000 n 9 | 0000006033 00000 n 10 | 0000006086 00000 n 11 | 0000006133 00000 n 12 | 0000022142 00000 n 13 | 0000023745 00000 n 14 | 0000036171 
00000 n 15 | 0000036218 00000 n 16 | 0000036362 00000 n 17 | 0000000761 00000 n 18 | 0 38 19 | 0000000000 65535 f 20 | 0000036415 00000 n 21 | 0000036608 00000 n 22 | 0000040043 00000 n 23 | 0000062526 00000 n 24 | 0000064168 00000 n 25 | 0000079123 00000 n 26 | 0000079265 00000 n 27 | 0000079460 00000 n 28 | 0000080497 00000 n 29 | 0000084022 00000 n 30 | 0000103504 00000 n 31 | 0000105136 00000 n 32 | 0000119406 00000 n 33 | 0000119550 00000 n 34 | 0000119748 00000 n 35 | 0000123678 00000 n 36 | 0000124435 00000 n 37 | 0000126063 00000 n 38 | 0000141743 00000 n 39 | 0000141887 00000 n 40 | 0000142095 00000 n 41 | 0000145954 00000 n 42 | 0000146720 00000 n 43 | 0000148350 00000 n 44 | 0000164181 00000 n 45 | 0000164325 00000 n 46 | 0000164543 00000 n 47 | 0000165585 00000 n 48 | 0000169113 00000 n 49 | 0000184252 00000 n 50 | 0000185896 00000 n 51 | 0000201138 00000 n 52 | 0000201282 00000 n 53 | 0000202316 00000 n 54 | 0000203347 00000 n 55 | 0000203433 00000 n 56 | 0000205114 00000 n 57 | -------------------------------------------------------------------------------- /tests/fixture_data/testIsObjectFree/jpeg.pdf: -------------------------------------------------------------------------------- 1 | 0 17 2 | 0000000000 65535 f 3 | 0000099839 00000 n 4 | 0000000019 00000 n 5 | 0000000276 00000 n 6 | 0000000296 00000 n 7 | 0000041872 00000 n 8 | 0000091180 00000 n 9 | 0000099982 00000 n 10 | 0000091202 00000 n 11 | 0000098889 00000 n 12 | 0000098910 00000 n 13 | 0000099106 00000 n 14 | 0000099476 00000 n 15 | 0000099707 00000 n 16 | 0000099740 00000 n 17 | 0000100081 00000 n 18 | 0000100178 00000 n 19 | -------------------------------------------------------------------------------- /tests/fixture_data/testReadXRefStreamCompressedObjects/crazyones.pdf: -------------------------------------------------------------------------------- 1 | 8 0 <>/ProcSet[/PDF/Text/ImageC/ImageB/ImageI]>> 2 | 3 81 <> 3 | 10 142 <> 4 | 2 200 <> 5 | 1 315 <> 6 | 11 346 [684.7 0 0 0 0 0 0 0 
0 0 0 0 733.6 0 0 0 0 684.7 0 0 0 0 0 0 0 0 0 0 0 0 489.1 0 0 0 440.2 0 0 489.1 0 0 0 0 0 538 0 0 0 403.5 391.3 0 0 0 0 0 464.6 391.3] 7 | 13 502 [299.9 0 0 0 0 499.9 0 0 499.9 0 0 0 499.9 499.9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 749.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 449.9 449.9 0 449.9 0 0 0 0 0 0 0 0 0 499.9 0 0 412.4 0 324.9] 8 | 15 698 [599.4 570.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285.4 0 285.4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 485.2 0 770.5 727.8 0 0 0 0 0 770.5 0 0 0 0 941.7 0 799.2 0 0 0 0 742.1 0 0 1055.9 0 770.5 0 0 0 0 0 0 0 513.8 570.8 456.7 570.8 457.1 314 513.8 570.8 285.4 0 542.3 285.4 856.3 570.8 513.8 570.8 542.3 401.9 405.3 399.6 570.8 542.3 742.1 542.3 542.3 456.7] 9 | 4 1043 <> 10 | 19 1192 <> 11 | 5 1362 <> 12 | 20 1511 <> 13 | 6 1681 <> 14 | 21 1830 <> 15 | 17 1995 <> -------------------------------------------------------------------------------- /tests/fixture_data/testXRefStreamObjects/crazyones.pdf: -------------------------------------------------------------------------------- 1 | % The size of the XRef-Stream dictionary is 115 bytes, accounting for 23.000000 2 | % items 3 | 0 0 65535 4 | 1 15 0 5 | 1 10245 0 6 | 1 939 0 7 | 1 2999 0 8 | 1 4982 0 9 | 1 9949 0 10 | 1 11160 0 11 | 2 9 4 12 | 2 9 3 13 | 2 9 1 14 | 2 9 8 15 | 2 9 10 16 | 2 9 12 17 | 2 9 0 18 | 2 9 2 19 | 2 9 5 20 | 2 9 6 21 | 2 9 7 22 | 2 9 14 23 | 2 9 9 24 | 2 9 11 25 | 2 9 13 26 | 27 | -------------------------------------------------------------------------------- /tests/fixture_data/testXRefTableObjects/SF424_page2.pdf: -------------------------------------------------------------------------------- 1 | % This is the XRef Table excerpt of the SF424_page2.pdf file 2 | 0 328 3 | 0000000000 65535 f 4 | 0000000009 00000 n 5 | 0000000068 00000 n 6 | 0000000169 00000 n 7 | 0000000218 00000 n 8 | 0000000472 00000 n 9 | 0000002469 00000 n 10 | 0000002556 00000 n 11 | 0000003162 00000 n 12 | 0000003404 00000 n 13 | 0000004048 00000 n 14 | 0000004297 00000 n 15 | 0000005487 
00000 n 16 | 0000005729 00000 n 17 | 0000005906 00000 n 18 | 0000006091 00000 n 19 | 0000006200 00000 n 20 | 0000006386 00000 n 21 | 0000006601 00000 n 22 | 0000006859 00000 n 23 | 0000011230 00000 n 24 | 0000011340 00000 n 25 | 0000011416 00000 n 26 | 0000011477 00000 n 27 | 0000011734 00000 n 28 | 0000015083 00000 n 29 | 0000015514 00000 n 30 | 0000015769 00000 n 31 | 0000016099 00000 n 32 | 0000016199 00000 n 33 | 0000017526 00000 n 34 | 0000017702 00000 n 35 | 0000017817 00000 n 36 | 0000018002 00000 n 37 | 0000018215 00000 n 38 | 0000018401 00000 n 39 | 0000018587 00000 n 40 | 0000018844 00000 n 41 | 0000019844 00000 n 42 | 0000021034 00000 n 43 | 0000021082 00000 n 44 | 0000021268 00000 n 45 | 0000021549 00000 n 46 | 0000021752 00000 n 47 | 0000021945 00000 n 48 | 0000022150 00000 n 49 | 0000022446 00000 n 50 | 0000028034 00000 n 51 | 0000028067 00000 n 52 | 0000028110 00000 n 53 | 0000028168 00000 n 54 | 0000028490 00000 n 55 | 0000028865 00000 n 56 | 0000029069 00000 n 57 | 0000029112 00000 n 58 | 0000029170 00000 n 59 | 0000029492 00000 n 60 | 0000029867 00000 n 61 | 0000030072 00000 n 62 | 0000030115 00000 n 63 | 0000030173 00000 n 64 | 0000030495 00000 n 65 | 0000030870 00000 n 66 | 0000031075 00000 n 67 | 0000031118 00000 n 68 | 0000031176 00000 n 69 | 0000031498 00000 n 70 | 0000031873 00000 n 71 | 0000032136 00000 n 72 | 0000041460 00000 n 73 | 0000041506 00000 n 74 | 0000041922 00000 n 75 | 0000042344 00000 n 76 | 0000046101 00000 n 77 | 0000046622 00000 n 78 | 0000047217 00000 n 79 | 0000051163 00000 n 80 | 0000051721 00000 n 81 | 0000052453 00000 n 82 | 0000057379 00000 n 83 | 0000057406 00000 n 84 | 0000057611 00000 n 85 | 0000057654 00000 n 86 | 0000057712 00000 n 87 | 0000058034 00000 n 88 | 0000058409 00000 n 89 | 0000058672 00000 n 90 | 0000063653 00000 n 91 | 0000063680 00000 n 92 | 0000063723 00000 n 93 | 0000063781 00000 n 94 | 0000064103 00000 n 95 | 0000064478 00000 n 96 | 0000064769 00000 n 97 | 0000065036 00000 n 98 | 0000065251 00000 n 
99 | 0000065541 00000 n 100 | 0000065826 00000 n 101 | 0000066095 00000 n 102 | 0000066310 00000 n 103 | 0000066600 00000 n 104 | 0000066948 00000 n 105 | 0000067216 00000 n 106 | 0000067432 00000 n 107 | 0000067723 00000 n 108 | 0000068015 00000 n 109 | 0000068280 00000 n 110 | 0000068495 00000 n 111 | 0000068783 00000 n 112 | 0000069172 00000 n 113 | 0000069437 00000 n 114 | 0000069652 00000 n 115 | 0000069940 00000 n 116 | 0000070310 00000 n 117 | 0000070577 00000 n 118 | 0000070792 00000 n 119 | 0000071082 00000 n 120 | 0000071314 00000 n 121 | 0000071518 00000 n 122 | 0000071776 00000 n 123 | 0000071875 00000 n 124 | 0000071977 00000 n 125 | 0000072157 00000 n 126 | 0000072335 00000 n 127 | 0000072530 00000 n 128 | 0000072789 00000 n 129 | 0000073069 00000 n 130 | 0000073318 00000 n 131 | 0000073417 00000 n 132 | 0000073620 00000 n 133 | 0000073824 00000 n 134 | 0000074028 00000 n 135 | 0000074217 00000 n 136 | 0000074410 00000 n 137 | 0000074601 00000 n 138 | 0000074798 00000 n 139 | 0000074993 00000 n 140 | 0000075181 00000 n 141 | 0000075367 00000 n 142 | 0000075556 00000 n 143 | 0000075886 00000 n 144 | 0000075948 00000 n 145 | 0000075969 00000 n 146 | 0000076316 00000 n 147 | 0000076534 00000 n 148 | 0000076740 00000 n 149 | 0000076955 00000 n 150 | 0000077302 00000 n 151 | 0000077359 00000 n 152 | 0000077380 00000 n 153 | 0000077726 00000 n 154 | 0000077938 00000 n 155 | 0000078186 00000 n 156 | 0000078430 00000 n 157 | 0000078507 00000 n 158 | 0000078587 00000 n 159 | 0000078823 00000 n 160 | 0000078900 00000 n 161 | 0000078980 00000 n 162 | 0000079191 00000 n 163 | 0000079565 00000 n 164 | 0000079615 00000 n 165 | 0000079923 00000 n 166 | 0000080350 00000 n 167 | 0000080775 00000 n 168 | 0000081137 00000 n 169 | 0000081174 00000 n 170 | 0000081623 00000 n 171 | 0000082078 00000 n 172 | 0000083307 00000 n 173 | 0000083443 00000 n 174 | 0000083964 00000 n 175 | 0000084487 00000 n 176 | 0000084760 00000 n 177 | 0000084874 00000 n 178 | 0000085149 00000 n 
179 | 0000085244 00000 n 180 | 0000085509 00000 n 181 | 0000085782 00000 n 182 | 0000085881 00000 n 183 | 0000086093 00000 n 184 | 0000086366 00000 n 185 | 0000086465 00000 n 186 | 0000086677 00000 n 187 | 0000086950 00000 n 188 | 0000087049 00000 n 189 | 0000087261 00000 n 190 | 0000087535 00000 n 191 | 0000087634 00000 n 192 | 0000087846 00000 n 193 | 0000088121 00000 n 194 | 0000088220 00000 n 195 | 0000088432 00000 n 196 | 0000088707 00000 n 197 | 0000088806 00000 n 198 | 0000089018 00000 n 199 | 0000089293 00000 n 200 | 0000089392 00000 n 201 | 0000089604 00000 n 202 | 0000089703 00000 n 203 | 0000089915 00000 n 204 | 0000090205 00000 n 205 | 0000090426 00000 n 206 | 0000090644 00000 n 207 | 0000090962 00000 n 208 | 0000091019 00000 n 209 | 0000091387 00000 n 210 | 0000091493 00000 n 211 | 0000091735 00000 n 212 | 0000091834 00000 n 213 | 0000091936 00000 n 214 | 0000092175 00000 n 215 | 0000092274 00000 n 216 | 0000092376 00000 n 217 | 0000092685 00000 n 218 | 0000092801 00000 n 219 | 0000092920 00000 n 220 | 0000093134 00000 n 221 | 0000093437 00000 n 222 | 0000093553 00000 n 223 | 0000093672 00000 n 224 | 0000093886 00000 n 225 | 0000094182 00000 n 226 | 0000094298 00000 n 227 | 0000094417 00000 n 228 | 0000094631 00000 n 229 | 0000094926 00000 n 230 | 0000095042 00000 n 231 | 0000095161 00000 n 232 | 0000095375 00000 n 233 | 0000095670 00000 n 234 | 0000095786 00000 n 235 | 0000095905 00000 n 236 | 0000096119 00000 n 237 | 0000096420 00000 n 238 | 0000096536 00000 n 239 | 0000096655 00000 n 240 | 0000096869 00000 n 241 | 0000097196 00000 n 242 | 0000097312 00000 n 243 | 0000097575 00000 n 244 | 0000097694 00000 n 245 | 0000097804 00000 n 246 | 0000098018 00000 n 247 | 0000098354 00000 n 248 | 0000098619 00000 n 249 | 0000098834 00000 n 250 | 0000099123 00000 n 251 | 0000099433 00000 n 252 | 0000099532 00000 n 253 | 0000099634 00000 n 254 | 0000099876 00000 n 255 | 0000100216 00000 n 256 | 0000100484 00000 n 257 | 0000100700 00000 n 258 | 0000100990 00000 n 
259 | 0000101286 00000 n 260 | 0000101554 00000 n 261 | 0000101770 00000 n 262 | 0000102060 00000 n 263 | 0000102369 00000 n 264 | 0000102634 00000 n 265 | 0000102848 00000 n 266 | 0000103135 00000 n 267 | 0000103447 00000 n 268 | 0000103713 00000 n 269 | 0000103929 00000 n 270 | 0000104217 00000 n 271 | 0000104525 00000 n 272 | 0000104789 00000 n 273 | 0000105005 00000 n 274 | 0000105292 00000 n 275 | 0000105630 00000 n 276 | 0000105978 00000 n 277 | 0000106204 00000 n 278 | 0000106431 00000 n 279 | 0000106654 00000 n 280 | 0000107023 00000 n 281 | 0000107044 00000 n 282 | 0000107067 00000 n 283 | 0000107407 00000 n 284 | 0000107623 00000 n 285 | 0000107886 00000 n 286 | 0000107963 00000 n 287 | 0000108043 00000 n 288 | 0000108294 00000 n 289 | 0000108371 00000 n 290 | 0000108451 00000 n 291 | 0000108679 00000 n 292 | 0000108928 00000 n 293 | 0000108996 00000 n 294 | 0000109212 00000 n 295 | 0000109425 00000 n 296 | 0000109736 00000 n 297 | 0000110145 00000 n 298 | 0000110166 00000 n 299 | 0000110514 00000 n 300 | 0000110825 00000 n 301 | 0000111176 00000 n 302 | 0000111511 00000 n 303 | 0000111859 00000 n 304 | 0000112062 00000 n 305 | 0000112373 00000 n 306 | 0000112456 00000 n 307 | 0000112882 00000 n 308 | 0000113180 00000 n 309 | 0000113263 00000 n 310 | 0000113535 00000 n 311 | 0000113635 00000 n 312 | 0000113899 00000 n 313 | 0000114197 00000 n 314 | 0000114297 00000 n 315 | 0000114561 00000 n 316 | 0000114859 00000 n 317 | 0000114959 00000 n 318 | 0000115222 00000 n 319 | 0000115521 00000 n 320 | 0000115633 00000 n 321 | 0000115897 00000 n 322 | 0000115997 00000 n 323 | 0000116261 00000 n 324 | 0000116486 00000 n 325 | 0000116549 00000 n 326 | 0000116814 00000 n 327 | 0000117080 00000 n 328 | 0000117336 00000 n 329 | 0000117614 00000 n 330 | 0000117895 00000 n 331 | -------------------------------------------------------------------------------- /tests/fixture_data/testXRefTableObjects/Seige_of_Vicksburg_Sample_OCR.pdf: 
-------------------------------------------------------------------------------- 1 | % This is the XRef Table excerpt of the Seige_of_Vicksburg_Sample_OCR.pdf file 2 | 38 16 3 | 0000000023 00000 n 4 | 0000000593 00000 n 5 | 0000000962 00000 n 6 | 0000001150 00000 n 7 | 0000002182 00000 n 8 | 0000002299 00000 n 9 | 0000005902 00000 n 10 | 0000006033 00000 n 11 | 0000006086 00000 n 12 | 0000006133 00000 n 13 | 0000022142 00000 n 14 | 0000023745 00000 n 15 | 0000036171 00000 n 16 | 0000036218 00000 n 17 | 0000036362 00000 n 18 | 0000000761 00000 n 19 | 0 38 20 | 0000000000 65535 f 21 | 0000036415 00000 n 22 | 0000036608 00000 n 23 | 0000040043 00000 n 24 | 0000062526 00000 n 25 | 0000064168 00000 n 26 | 0000079123 00000 n 27 | 0000079265 00000 n 28 | 0000079460 00000 n 29 | 0000080497 00000 n 30 | 0000084022 00000 n 31 | 0000103504 00000 n 32 | 0000105136 00000 n 33 | 0000119406 00000 n 34 | 0000119550 00000 n 35 | 0000119748 00000 n 36 | 0000123678 00000 n 37 | 0000124435 00000 n 38 | 0000126063 00000 n 39 | 0000141743 00000 n 40 | 0000141887 00000 n 41 | 0000142095 00000 n 42 | 0000145954 00000 n 43 | 0000146720 00000 n 44 | 0000148350 00000 n 45 | 0000164181 00000 n 46 | 0000164325 00000 n 47 | 0000164543 00000 n 48 | 0000165585 00000 n 49 | 0000169113 00000 n 50 | 0000184252 00000 n 51 | 0000185896 00000 n 52 | 0000201138 00000 n 53 | 0000201282 00000 n 54 | 0000202316 00000 n 55 | 0000203347 00000 n 56 | 0000203433 00000 n 57 | 0000205114 00000 n 58 | -------------------------------------------------------------------------------- /tests/fixture_data/testXRefTableObjects/jpeg.pdf: -------------------------------------------------------------------------------- 1 | % This is the XRef Table excerpt of the jpeg.pdf file 2 | 0 17 3 | 0000000000 65535 f 4 | 0000099839 00000 n 5 | 0000000019 00000 n 6 | 0000000276 00000 n 7 | 0000000296 00000 n 8 | 0000041872 00000 n 9 | 0000091180 00000 n 10 | 0000099982 00000 n 11 | 0000091202 00000 n 12 | 0000098889 00000 n 13 | 
0000098910 00000 n 14 | 0000099106 00000 n 15 | 0000099476 00000 n 16 | 0000099707 00000 n 17 | 0000099740 00000 n 18 | 0000100081 00000 n 19 | 0000100178 00000 n 20 | -------------------------------------------------------------------------------- /tests/fixture_data/testXTableAgainstXStream/GeoBase_NHNC1_Data_Model_UML_EN.pdf: -------------------------------------------------------------------------------- 1 | % This is the XRef Table excerpt of the GeoBase_NHNC1_Data_Model_UML_EN.pdf 2 | % file, encoding objects that are "ignored" by a PDF reader supporting version 3 | % 1.5 or lower, but stored into the XRef Stream by a reader capable of version 4 | % 1.5 or higher. 5 | 0 678 6 | 0000000119 65535 f 7 | 0000000017 00000 n 8 | 0000000126 00000 n 9 | 0000000309 00000 n 10 | 0000000642 00000 n 11 | 0000002376 00000 n 12 | 0000002536 00000 n 13 | 0000002760 00000 n 14 | 0000005960 00000 n 15 | 0000006125 00000 n 16 | 0000006354 00000 n 17 | 0000006530 00000 n 18 | 0000006777 00000 n 19 | 0000006915 00000 n 20 | 0000006945 00000 n 21 | 0000007111 00000 n 22 | 0000007185 00000 n 23 | 0000007432 00000 n 24 | 0000007603 00000 n 25 | 0000007845 00000 n 26 | 0000008005 00000 n 27 | 0000008165 00000 n 28 | 0000008925 00000 n 29 | 0000009176 00000 n 30 | 0000010704 00000 n 31 | 0000010879 00000 n 32 | 0000011118 00000 n 33 | 0000011379 00000 n 34 | 0000015477 00000 n 35 | 0000015601 00000 n 36 | 0000015631 00000 n 37 | 0000015783 00000 n 38 | 0000015857 00000 n 39 | 0000016100 00000 n 40 | 0000016225 00000 n 41 | 0000016255 00000 n 42 | 0000016408 00000 n 43 | 0000016482 00000 n 44 | 0000016713 00000 n 45 | 0000017071 00000 n 46 | 0000019176 00000 n 47 | 0000019302 00000 n 48 | 0000019592 00000 n 49 | 0000019898 00000 n 50 | 0000020024 00000 n 51 | 0000020150 00000 n 52 | 0000020461 00000 n 53 | 0000020590 00000 n 54 | 0000020868 00000 n 55 | 0000020999 00000 n 56 | 0000021304 00000 n 57 | 0000021434 00000 n 58 | 0000021603 00000 n 59 | 0000021837 00000 n 60 | 0000021967 
00000 n 61 | 0000022283 00000 n 62 | 0000022413 00000 n 63 | 0000022720 00000 n 64 | 0000022850 00000 n 65 | 0000023170 00000 n 66 | 0000023490 00000 n 67 | 0000023800 00000 n 68 | 0000023930 00000 n 69 | 0000024250 00000 n 70 | 0000024570 00000 n 71 | 0000024890 00000 n 72 | 0000025188 00000 n 73 | 0000025318 00000 n 74 | 0000025663 00000 n 75 | 0000026673 00000 n 76 | 0000026727 00000 n 77 | 0000026781 00000 n 78 | 0000030750 00000 n 79 | 0000030882 00000 n 80 | 0000031011 00000 n 81 | 0000031041 00000 n 82 | 0000031198 00000 n 83 | 0000031272 00000 n 84 | 0000031520 00000 n 85 | 0000035757 00000 n 86 | 0000036682 00000 n 87 | 0000037439 00000 n 88 | 0000057203 00000 n 89 | 0000058791 00000 n 90 | 0000059493 00000 n 91 | 0000064855 00000 n 92 | 0000065023 00000 n 93 | 0000065253 00000 n 94 | 0000065377 00000 n 95 | 0000065407 00000 n 96 | 0000065559 00000 n 97 | 0000065633 00000 n 98 | 0000065876 00000 n 99 | 0000066039 00000 n 100 | 0000066264 00000 n 101 | 0000066438 00000 n 102 | 0000066676 00000 n 103 | 0000066846 00000 n 104 | 0000067080 00000 n 105 | 0000067769 00000 n 106 | 0000070097 00000 n 107 | 0000070782 00000 n 108 | 0000074703 00000 n 109 | 0000075372 00000 n 110 | 0000079610 00000 n 111 | 0000080530 00000 n 112 | 0000086751 00000 n 113 | 0000087445 00000 n 114 | 0000094019 00000 n 115 | 0000094195 00000 n 116 | 0000094434 00000 n 117 | 0000095146 00000 n 118 | 0000097610 00000 n 119 | 0000098290 00000 n 120 | 0000101036 00000 n 121 | 0000101753 00000 n 122 | 0000106802 00000 n 123 | 0000107713 00000 n 124 | 0000109253 00000 n 125 | 0000000120 65535 f 126 | 0000000121 65535 f 127 | 0000000122 65535 f 128 | 0000000123 65535 f 129 | 0000000124 65535 f 130 | 0000000125 65535 f 131 | 0000000126 65535 f 132 | 0000000127 65535 f 133 | 0000000128 65535 f 134 | 0000000129 65535 f 135 | 0000000130 65535 f 136 | 0000000131 65535 f 137 | 0000000132 65535 f 138 | 0000000133 65535 f 139 | 0000000134 65535 f 140 | 0000000135 65535 f 141 | 0000000136 65535 f 142 | 
0000000137 65535 f 143 | 0000000138 65535 f 144 | 0000000139 65535 f 145 | 0000000140 65535 f 146 | 0000000141 65535 f 147 | 0000000142 65535 f 148 | 0000000143 65535 f 149 | 0000000144 65535 f 150 | 0000000145 65535 f 151 | 0000000146 65535 f 152 | 0000000147 65535 f 153 | 0000000148 65535 f 154 | 0000000149 65535 f 155 | 0000000150 65535 f 156 | 0000000151 65535 f 157 | 0000000152 65535 f 158 | 0000000153 65535 f 159 | 0000000154 65535 f 160 | 0000000155 65535 f 161 | 0000000156 65535 f 162 | 0000000157 65535 f 163 | 0000000158 65535 f 164 | 0000000159 65535 f 165 | 0000000160 65535 f 166 | 0000000161 65535 f 167 | 0000000162 65535 f 168 | 0000000163 65535 f 169 | 0000000164 65535 f 170 | 0000000165 65535 f 171 | 0000000166 65535 f 172 | 0000000167 65535 f 173 | 0000000168 65535 f 174 | 0000000169 65535 f 175 | 0000000170 65535 f 176 | 0000000171 65535 f 177 | 0000000172 65535 f 178 | 0000000173 65535 f 179 | 0000000174 65535 f 180 | 0000000175 65535 f 181 | 0000000176 65535 f 182 | 0000000177 65535 f 183 | 0000000178 65535 f 184 | 0000000179 65535 f 185 | 0000000180 65535 f 186 | 0000000181 65535 f 187 | 0000000182 65535 f 188 | 0000000183 65535 f 189 | 0000000184 65535 f 190 | 0000000185 65535 f 191 | 0000000186 65535 f 192 | 0000000187 65535 f 193 | 0000000188 65535 f 194 | 0000000189 65535 f 195 | 0000000190 65535 f 196 | 0000000191 65535 f 197 | 0000000192 65535 f 198 | 0000000193 65535 f 199 | 0000000194 65535 f 200 | 0000000195 65535 f 201 | 0000000196 65535 f 202 | 0000000197 65535 f 203 | 0000000198 65535 f 204 | 0000000199 65535 f 205 | 0000000200 65535 f 206 | 0000000201 65535 f 207 | 0000000202 65535 f 208 | 0000000203 65535 f 209 | 0000000204 65535 f 210 | 0000000205 65535 f 211 | 0000000206 65535 f 212 | 0000000207 65535 f 213 | 0000000208 65535 f 214 | 0000000209 65535 f 215 | 0000000210 65535 f 216 | 0000000211 65535 f 217 | 0000000212 65535 f 218 | 0000000213 65535 f 219 | 0000000214 65535 f 220 | 0000000215 65535 f 221 | 0000000216 65535 f 222 | 
0000000217 65535 f 223 | 0000000218 65535 f 224 | 0000000219 65535 f 225 | 0000000220 65535 f 226 | 0000000221 65535 f 227 | 0000000222 65535 f 228 | 0000000223 65535 f 229 | 0000000224 65535 f 230 | 0000000225 65535 f 231 | 0000000226 65535 f 232 | 0000000227 65535 f 233 | 0000000228 65535 f 234 | 0000000229 65535 f 235 | 0000000230 65535 f 236 | 0000000231 65535 f 237 | 0000000232 65535 f 238 | 0000000233 65535 f 239 | 0000000234 65535 f 240 | 0000000235 65535 f 241 | 0000000236 65535 f 242 | 0000000237 65535 f 243 | 0000000238 65535 f 244 | 0000000239 65535 f 245 | 0000000240 65535 f 246 | 0000000241 65535 f 247 | 0000000242 65535 f 248 | 0000000243 65535 f 249 | 0000000244 65535 f 250 | 0000000245 65535 f 251 | 0000000246 65535 f 252 | 0000000247 65535 f 253 | 0000000248 65535 f 254 | 0000000249 65535 f 255 | 0000000250 65535 f 256 | 0000000251 65535 f 257 | 0000000252 65535 f 258 | 0000000253 65535 f 259 | 0000000254 65535 f 260 | 0000000255 65535 f 261 | 0000000256 65535 f 262 | 0000000257 65535 f 263 | 0000000258 65535 f 264 | 0000000259 65535 f 265 | 0000000260 65535 f 266 | 0000000261 65535 f 267 | 0000000262 65535 f 268 | 0000000263 65535 f 269 | 0000000264 65535 f 270 | 0000000265 65535 f 271 | 0000000266 65535 f 272 | 0000000267 65535 f 273 | 0000000268 65535 f 274 | 0000000269 65535 f 275 | 0000000270 65535 f 276 | 0000000271 65535 f 277 | 0000000272 65535 f 278 | 0000000273 65535 f 279 | 0000000274 65535 f 280 | 0000000275 65535 f 281 | 0000000276 65535 f 282 | 0000000277 65535 f 283 | 0000000278 65535 f 284 | 0000000279 65535 f 285 | 0000000280 65535 f 286 | 0000000281 65535 f 287 | 0000000282 65535 f 288 | 0000000283 65535 f 289 | 0000000284 65535 f 290 | 0000000285 65535 f 291 | 0000000286 65535 f 292 | 0000000287 65535 f 293 | 0000000288 65535 f 294 | 0000000289 65535 f 295 | 0000000290 65535 f 296 | 0000000291 65535 f 297 | 0000000292 65535 f 298 | 0000000293 65535 f 299 | 0000000294 65535 f 300 | 0000000295 65535 f 301 | 0000000296 65535 f 302 | 
0000000297 65535 f 303 | 0000000298 65535 f 304 | 0000000299 65535 f 305 | 0000000300 65535 f 306 | 0000000301 65535 f 307 | 0000000302 65535 f 308 | 0000000303 65535 f 309 | 0000000304 65535 f 310 | 0000000305 65535 f 311 | 0000000306 65535 f 312 | 0000000307 65535 f 313 | 0000000308 65535 f 314 | 0000000309 65535 f 315 | 0000000310 65535 f 316 | 0000000311 65535 f 317 | 0000000312 65535 f 318 | 0000000313 65535 f 319 | 0000000314 65535 f 320 | 0000000315 65535 f 321 | 0000000316 65535 f 322 | 0000000317 65535 f 323 | 0000000318 65535 f 324 | 0000000319 65535 f 325 | 0000000320 65535 f 326 | 0000000321 65535 f 327 | 0000000322 65535 f 328 | 0000000323 65535 f 329 | 0000000324 65535 f 330 | 0000000325 65535 f 331 | 0000000326 65535 f 332 | 0000000327 65535 f 333 | 0000000328 65535 f 334 | 0000000329 65535 f 335 | 0000000330 65535 f 336 | 0000000331 65535 f 337 | 0000000332 65535 f 338 | 0000000333 65535 f 339 | 0000000334 65535 f 340 | 0000000335 65535 f 341 | 0000000336 65535 f 342 | 0000000337 65535 f 343 | 0000000338 65535 f 344 | 0000000339 65535 f 345 | 0000000340 65535 f 346 | 0000000341 65535 f 347 | 0000000342 65535 f 348 | 0000000343 65535 f 349 | 0000000344 65535 f 350 | 0000000345 65535 f 351 | 0000000346 65535 f 352 | 0000000347 65535 f 353 | 0000000348 65535 f 354 | 0000000349 65535 f 355 | 0000000350 65535 f 356 | 0000000351 65535 f 357 | 0000000352 65535 f 358 | 0000000353 65535 f 359 | 0000000354 65535 f 360 | 0000000355 65535 f 361 | 0000000356 65535 f 362 | 0000000357 65535 f 363 | 0000000358 65535 f 364 | 0000000359 65535 f 365 | 0000000360 65535 f 366 | 0000000361 65535 f 367 | 0000000362 65535 f 368 | 0000000363 65535 f 369 | 0000000364 65535 f 370 | 0000000365 65535 f 371 | 0000000366 65535 f 372 | 0000000367 65535 f 373 | 0000000368 65535 f 374 | 0000000369 65535 f 375 | 0000000370 65535 f 376 | 0000000371 65535 f 377 | 0000000372 65535 f 378 | 0000000373 65535 f 379 | 0000000374 65535 f 380 | 0000000375 65535 f 381 | 0000000376 65535 f 382 | 
0000000377 65535 f 383 | 0000000378 65535 f 384 | 0000000379 65535 f 385 | 0000000380 65535 f 386 | 0000000381 65535 f 387 | 0000000382 65535 f 388 | 0000000383 65535 f 389 | 0000000384 65535 f 390 | 0000000385 65535 f 391 | 0000000386 65535 f 392 | 0000000387 65535 f 393 | 0000000388 65535 f 394 | 0000000389 65535 f 395 | 0000000390 65535 f 396 | 0000000391 65535 f 397 | 0000000392 65535 f 398 | 0000000393 65535 f 399 | 0000000394 65535 f 400 | 0000000395 65535 f 401 | 0000000396 65535 f 402 | 0000000397 65535 f 403 | 0000000398 65535 f 404 | 0000000399 65535 f 405 | 0000000400 65535 f 406 | 0000000401 65535 f 407 | 0000000402 65535 f 408 | 0000000403 65535 f 409 | 0000000404 65535 f 410 | 0000000405 65535 f 411 | 0000000406 65535 f 412 | 0000000407 65535 f 413 | 0000000408 65535 f 414 | 0000000409 65535 f 415 | 0000000410 65535 f 416 | 0000000411 65535 f 417 | 0000000412 65535 f 418 | 0000000413 65535 f 419 | 0000000414 65535 f 420 | 0000000415 65535 f 421 | 0000000416 65535 f 422 | 0000000417 65535 f 423 | 0000000418 65535 f 424 | 0000000419 65535 f 425 | 0000000420 65535 f 426 | 0000000421 65535 f 427 | 0000000422 65535 f 428 | 0000000423 65535 f 429 | 0000000424 65535 f 430 | 0000000425 65535 f 431 | 0000000426 65535 f 432 | 0000000427 65535 f 433 | 0000000428 65535 f 434 | 0000000429 65535 f 435 | 0000000430 65535 f 436 | 0000000431 65535 f 437 | 0000000432 65535 f 438 | 0000000433 65535 f 439 | 0000000434 65535 f 440 | 0000000435 65535 f 441 | 0000000436 65535 f 442 | 0000000437 65535 f 443 | 0000000438 65535 f 444 | 0000000439 65535 f 445 | 0000000440 65535 f 446 | 0000000441 65535 f 447 | 0000000442 65535 f 448 | 0000000443 65535 f 449 | 0000000444 65535 f 450 | 0000000445 65535 f 451 | 0000000446 65535 f 452 | 0000000447 65535 f 453 | 0000000448 65535 f 454 | 0000000449 65535 f 455 | 0000000450 65535 f 456 | 0000000451 65535 f 457 | 0000000452 65535 f 458 | 0000000453 65535 f 459 | 0000000454 65535 f 460 | 0000000455 65535 f 461 | 0000000456 65535 f 462 | 
0000000457 65535 f 463 | 0000000458 65535 f 464 | 0000000459 65535 f 465 | 0000000460 65535 f 466 | 0000000461 65535 f 467 | 0000000462 65535 f 468 | 0000000463 65535 f 469 | 0000000464 65535 f 470 | 0000000465 65535 f 471 | 0000000466 65535 f 472 | 0000000467 65535 f 473 | 0000000468 65535 f 474 | 0000000469 65535 f 475 | 0000000470 65535 f 476 | 0000000471 65535 f 477 | 0000000472 65535 f 478 | 0000000473 65535 f 479 | 0000000474 65535 f 480 | 0000000475 65535 f 481 | 0000000476 65535 f 482 | 0000000477 65535 f 483 | 0000000478 65535 f 484 | 0000000479 65535 f 485 | 0000000480 65535 f 486 | 0000000481 65535 f 487 | 0000000482 65535 f 488 | 0000000483 65535 f 489 | 0000000484 65535 f 490 | 0000000485 65535 f 491 | 0000000486 65535 f 492 | 0000000487 65535 f 493 | 0000000488 65535 f 494 | 0000000489 65535 f 495 | 0000000490 65535 f 496 | 0000000491 65535 f 497 | 0000000492 65535 f 498 | 0000000493 65535 f 499 | 0000000494 65535 f 500 | 0000000495 65535 f 501 | 0000000496 65535 f 502 | 0000000497 65535 f 503 | 0000000498 65535 f 504 | 0000000499 65535 f 505 | 0000000500 65535 f 506 | 0000000501 65535 f 507 | 0000000502 65535 f 508 | 0000000503 65535 f 509 | 0000000504 65535 f 510 | 0000000505 65535 f 511 | 0000000506 65535 f 512 | 0000000507 65535 f 513 | 0000000508 65535 f 514 | 0000000509 65535 f 515 | 0000000510 65535 f 516 | 0000000511 65535 f 517 | 0000000512 65535 f 518 | 0000000513 65535 f 519 | 0000000514 65535 f 520 | 0000000515 65535 f 521 | 0000000516 65535 f 522 | 0000000517 65535 f 523 | 0000000518 65535 f 524 | 0000000519 65535 f 525 | 0000000520 65535 f 526 | 0000000521 65535 f 527 | 0000000522 65535 f 528 | 0000000523 65535 f 529 | 0000000524 65535 f 530 | 0000000525 65535 f 531 | 0000000526 65535 f 532 | 0000000527 65535 f 533 | 0000000528 65535 f 534 | 0000000529 65535 f 535 | 0000000530 65535 f 536 | 0000000531 65535 f 537 | 0000000532 65535 f 538 | 0000000533 65535 f 539 | 0000000534 65535 f 540 | 0000000535 65535 f 541 | 0000000536 65535 f 542 | 
0000000537 65535 f 543 | 0000000538 65535 f 544 | 0000000539 65535 f 545 | 0000000540 65535 f 546 | 0000000541 65535 f 547 | 0000000542 65535 f 548 | 0000000543 65535 f 549 | 0000000544 65535 f 550 | 0000000545 65535 f 551 | 0000000546 65535 f 552 | 0000000547 65535 f 553 | 0000000548 65535 f 554 | 0000000549 65535 f 555 | 0000000550 65535 f 556 | 0000000551 65535 f 557 | 0000000552 65535 f 558 | 0000000553 65535 f 559 | 0000000554 65535 f 560 | 0000000555 65535 f 561 | 0000000556 65535 f 562 | 0000000557 65535 f 563 | 0000000558 65535 f 564 | 0000000559 65535 f 565 | 0000000560 65535 f 566 | 0000000561 65535 f 567 | 0000000562 65535 f 568 | 0000000563 65535 f 569 | 0000000564 65535 f 570 | 0000000565 65535 f 571 | 0000000566 65535 f 572 | 0000000567 65535 f 573 | 0000000568 65535 f 574 | 0000000569 65535 f 575 | 0000000570 65535 f 576 | 0000000571 65535 f 577 | 0000000572 65535 f 578 | 0000000573 65535 f 579 | 0000000574 65535 f 580 | 0000000575 65535 f 581 | 0000000576 65535 f 582 | 0000000577 65535 f 583 | 0000000578 65535 f 584 | 0000000579 65535 f 585 | 0000000580 65535 f 586 | 0000000581 65535 f 587 | 0000000582 65535 f 588 | 0000000583 65535 f 589 | 0000000584 65535 f 590 | 0000000585 65535 f 591 | 0000000586 65535 f 592 | 0000000587 65535 f 593 | 0000000588 65535 f 594 | 0000000589 65535 f 595 | 0000000590 65535 f 596 | 0000000591 65535 f 597 | 0000000592 65535 f 598 | 0000000593 65535 f 599 | 0000000594 65535 f 600 | 0000000595 65535 f 601 | 0000000596 65535 f 602 | 0000000597 65535 f 603 | 0000000598 65535 f 604 | 0000000599 65535 f 605 | 0000000600 65535 f 606 | 0000000601 65535 f 607 | 0000000602 65535 f 608 | 0000000603 65535 f 609 | 0000000604 65535 f 610 | 0000000605 65535 f 611 | 0000000606 65535 f 612 | 0000000607 65535 f 613 | 0000000608 65535 f 614 | 0000000609 65535 f 615 | 0000000610 65535 f 616 | 0000000611 65535 f 617 | 0000000612 65535 f 618 | 0000000613 65535 f 619 | 0000000614 65535 f 620 | 0000000615 65535 f 621 | 0000000616 65535 f 622 | 
0000000617 65535 f 623 | 0000000618 65535 f 624 | 0000000619 65535 f 625 | 0000000620 65535 f 626 | 0000000621 65535 f 627 | 0000000622 65535 f 628 | 0000000623 65535 f 629 | 0000000624 65535 f 630 | 0000000625 65535 f 631 | 0000000626 65535 f 632 | 0000000627 65535 f 633 | 0000000628 65535 f 634 | 0000000629 65535 f 635 | 0000000630 65535 f 636 | 0000000631 65535 f 637 | 0000000632 65535 f 638 | 0000000633 65535 f 639 | 0000000634 65535 f 640 | 0000000635 65535 f 641 | 0000000636 65535 f 642 | 0000000637 65535 f 643 | 0000000638 65535 f 644 | 0000000639 65535 f 645 | 0000000640 65535 f 646 | 0000000641 65535 f 647 | 0000000642 65535 f 648 | 0000000643 65535 f 649 | 0000000644 65535 f 650 | 0000000645 65535 f 651 | 0000000646 65535 f 652 | 0000000647 65535 f 653 | 0000000648 65535 f 654 | 0000000000 65535 f 655 | 0000117841 00000 n 656 | 0000118046 00000 n 657 | 0000118346 00000 n 658 | 0000180620 00000 n 659 | 0000181077 00000 n 660 | 0000181665 00000 n 661 | 0000181965 00000 n 662 | 0000238489 00000 n 663 | 0000238886 00000 n 664 | 0000239442 00000 n 665 | 0000239830 00000 n 666 | 0000253551 00000 n 667 | 0000253742 00000 n 668 | 0000253975 00000 n 669 | 0000254214 00000 n 670 | 0000269283 00000 n 671 | 0000269311 00000 n 672 | 0000269612 00000 n 673 | 0000281924 00000 n 674 | 0000281968 00000 n 675 | 0000282268 00000 n 676 | 0000282562 00000 n 677 | 0000282959 00000 n 678 | 0000343200 00000 n 679 | 0000343599 00000 n 680 | 0000343943 00000 n 681 | 0000344223 00000 n 682 | 0000344399 00000 n 683 | 0000344663 00000 n 684 | -------------------------------------------------------------------------------- /tests/test_filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Performs unit tests for filters.py. 4 | 5 | TO-DO Add license notice, if any. 
6 | """ 7 | from itertools import product as cartesian_product 8 | from math import floor, log 9 | from os.path import abspath, dirname, join 10 | import string 11 | import sys 12 | import unittest 13 | 14 | import pytest 15 | 16 | from pypdf.filters import ( 17 | ASCII85Codec, 18 | ASCIIHexCodec, 19 | CCITTFaxCodec, 20 | DCTCodec, 21 | FlateCodec, 22 | LZWCodec, 23 | decodeStreamData, 24 | ) 25 | from pypdf.generic import EncodedStreamObject, IndirectObject 26 | from pypdf.pdf import PdfFileReader 27 | from pypdf.utils import PdfReadError, PdfStreamError, hexEncode 28 | from tests.utils import intToBitstring 29 | 30 | TESTS_ROOT = abspath(dirname(__file__)) 31 | TEST_DATA_ROOT = join(TESTS_ROOT, "fixture_data") 32 | 33 | 34 | # Establish bytes/str/unicode types. 35 | try: 36 | unicode 37 | except NameError: 38 | # Python 3 39 | BytesType = bytes 40 | StrType = str 41 | UnicodeType = str 42 | else: 43 | # Python 2 44 | BytesType = bytes 45 | StrType = str 46 | UnicodeType = unicode 47 | 48 | 49 | class FlateCodecTestCase(unittest.TestCase): 50 | """ 51 | Tests expected results and edge cases of FlateCodec. 52 | """ 53 | 54 | @classmethod 55 | def setUpClass(cls): 56 | cls.filterInputs = [ 57 | "", 58 | "", 59 | """""", 60 | string.ascii_lowercase, 61 | string.ascii_uppercase, 62 | string.ascii_letters, 63 | string.digits, 64 | string.hexdigits, 65 | string.punctuation, 66 | string.whitespace, # Add more... 67 | ] 68 | for f__ in ("TheHappyPrince.txt",): 69 | with open(join(TEST_DATA_ROOT, f__)) as infile: 70 | cls.filterInputs.append(infile.read()) 71 | 72 | cls.filterInputs = tuple(s.encode("latin1") for s in cls.filterInputs) 73 | 74 | def testExpectedResults(self): 75 | """ 76 | Tests FlateCodec decode() and encode() methods. 77 | 78 | TO-DO Test the result with the omitted predictor values. 
79 | """ 80 | codec = FlateCodec() 81 | predictors = [1] # , 10, 11, 12, 13, 14, 15] 82 | 83 | for predictor, s__ in cartesian_product(predictors, self.filterInputs): 84 | self.assertEqual( 85 | s__, 86 | codec.decode(codec.encode(s__), {"/Predictor": predictor}), 87 | "(predictor, s__) = (%d, %s)" % (predictor, s__), 88 | ) 89 | 90 | def testInvalidPredictors(self): 91 | """ 92 | Inputs a series of invalid predictor values (outside the 93 | {1, 2} U [10, 15] range) checking that ``PdfReadError`` is raised. 94 | """ 95 | codec = FlateCodec() 96 | predictors = tuple(set(range(-20, 21)) - {1, 2, 10, 11, 12, 13, 14, 15}) 97 | 98 | for predictor, s__ in cartesian_product(predictors, self.filterInputs): 99 | with self.assertRaises( 100 | PdfReadError, # pylint: disable=bad-continuation 101 | msg="(predictor, input) = (%d, %s)" # pylint: disable=bad-continuation 102 | % (predictor, s__), # pylint: disable=bad-continuation 103 | ): 104 | codec.decode(codec.encode(s__), {"/Predictor": predictor}) 105 | 106 | 107 | class ASCIIHexCodecTestCase(unittest.TestCase): 108 | """ 109 | Tests primarily the decode() method of ASCIIHexCodec. 110 | """ 111 | 112 | @classmethod 113 | def setUpClass(cls): 114 | cls.filterInputs = ( 115 | "", 116 | "", 117 | """""", 118 | ">", 119 | ">>", 120 | ">>>", 121 | string.ascii_lowercase, 122 | string.ascii_uppercase, 123 | string.ascii_letters, 124 | string.digits, 125 | string.hexdigits, 126 | string.punctuation, 127 | string.whitespace, # Add more... 128 | ) 129 | 130 | def testExpectedResults(self): 131 | """ 132 | Feeds a bunch of values to ``ASCIIHexCodec.decode()`` and ensures that 133 | the correct output is returned. 134 | 135 | TO-DO What is decode() supposed to do for such inputs as ">>", ">>>" or 136 | any other not terminated by ">"? (For the latter case, an exception 137 | is currently raised.) 
138 | """ 139 | inputs = ( 140 | ">", 141 | "6162636465666768696a6b6c6d6e6f707172737475767778797a>", 142 | "4142434445464748494a4b4c4d4e4f505152535455565758595a>", 143 | "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464" 144 | "748494a4b4c4d4e4f505152535455565758595a>", 145 | "30313233343536373839>", 146 | "3 031323334353637 3839>", # Same as previous, but whitespaced 147 | "30313233343536373839616263646566414243444546>", 148 | hexEncode(string.whitespace) + ">", 149 | ) 150 | expected_outputs = ( 151 | "", 152 | string.ascii_lowercase, 153 | string.ascii_uppercase, 154 | string.ascii_letters, 155 | string.digits, 156 | string.digits, 157 | string.hexdigits, 158 | string.whitespace, 159 | ) 160 | 161 | for o__, i in zip(expected_outputs, inputs): 162 | self.assertEqual( 163 | o__, 164 | ASCIIHexCodec.decode(i), 165 | "Expected = %s\tReceived = %s" 166 | % (repr(o__), repr(ASCIIHexCodec.decode(i))), 167 | ) 168 | 169 | def testNoEod(self): 170 | """ 171 | Tests when no EOD character is present, ensuring an exception is 172 | raised. 173 | """ 174 | inputs = ("", "", """""", """""") 175 | 176 | for i in inputs: 177 | with self.assertRaises(PdfStreamError): 178 | ASCIIHexCodec.decode(i) 179 | 180 | 181 | class ASCII85CodecTestCase(unittest.TestCase): 182 | """ 183 | Tests the ``decode()`` method of ``ASCII85Codec``. 184 | """ 185 | 186 | def testEncodeDecode(self): 187 | """ 188 | Verifies that decode(encode(data)) == data, with encode() and decode() 189 | from ASCII85Codec. 
190 | """ 191 | e__, d__ = ASCII85Codec.encode, ASCII85Codec.decode 192 | inputs = [ 193 | string.ascii_lowercase.encode("ascii"), 194 | string.ascii_uppercase.encode("ascii"), 195 | string.ascii_letters.encode("ascii"), 196 | string.whitespace.encode("ascii"), 197 | b"\x00\x00\x00\x00", 198 | 2 * b"\x00\x00\x00\x00", 199 | ] 200 | 201 | for filename in ("TheHappyPrince.txt",): 202 | with open(join(TEST_DATA_ROOT, filename), "rb") as infile: 203 | inputs.append(infile.read()) 204 | 205 | for i in inputs: 206 | if sys.version_info > (3, 0) and isinstance(i, str): 207 | # The Python 3 version of decode() returns a bytes instance 208 | exp = i.encode("LATIN1") 209 | else: 210 | exp = i 211 | 212 | self.assertEqual(exp, d__(e__(i))) 213 | 214 | def testWithOverflow(self): 215 | """ Feeds decode() one character at a time (non-whitespace control characters, ``v``-``y``, ``{|}~`` and code points 0x7F-0xC7), each followed by ``~>``, and expects ``ValueError``. """ 216 | inputs = ( 217 | v__ + "~>" 218 | for v__ in "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0e\x0f" 219 | "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a" 220 | "\x1b\x1c\x1d\x1e\x1fvwxy{|}~\x7f\x80\x81\x82" 221 | "\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d" 222 | "\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98" 223 | "\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0¡¢£¤¥¦§¨©ª«¬" 224 | "\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇ" 225 | ) 226 | 227 | for i in inputs: 228 | with self.assertRaises(ValueError, msg="char = " + repr(i)): 229 | ASCII85Codec.decode(i) 230 | 231 | def testFiveZeroBytes(self): 232 | """ 233 | From ISO 32000 (2008) sect.
7.4.3: 234 | «As a special case, if all five bytes are 0, they shall be represented 235 | by the character with code 122 (z) instead of by five exclamation 236 | points (!!!!!).» 237 | """ 238 | inputs = (b"z~>", b"zz~>", b"zzz~>") 239 | exp_outputs = ( 240 | b"\x00\x00\x00\x00", 241 | b"\x00\x00\x00\x00" * 2, 242 | b"\x00\x00\x00\x00" * 3, 243 | ) 244 | 245 | self.assertEqual(ASCII85Codec.decode(b"!!!!!~>"), ASCII85Codec.decode(b"z~>")) 246 | 247 | for o__, i in zip(exp_outputs, inputs): 248 | self.assertEqual(o__, ASCII85Codec.decode(i)) 249 | 250 | 251 | class LZWCodecTestCase(unittest.TestCase): 252 | """ 253 | Tests the ``LZWCodec.decode()`` method by means of a LZW Encoder built 254 | specifically for testing it. 255 | """ 256 | 257 | def testWriteCode(self): 258 | """ 259 | Tests that the memorization of bit values performed by ``_writeCode()`` 260 | as a contiguous bit-stream works as intended. 261 | """ 262 | self.maxDiff = None 263 | e__ = LZWCodec.Encoder("") 264 | e__.output = list() 265 | 266 | inputs = range(2 ** 8, 2 ** 12 - 1) 267 | e__.bitspercode = int(floor(log(inputs[0], 2))) + 1 268 | exp_output = "".join(intToBitstring(n__, floor(log(n__, 2))) for n__ in inputs) 269 | 270 | for i in inputs: 271 | if floor(log(i, 2)) + 1 > e__.bitspercode: 272 | e__.bitspercode += 1 273 | 274 | e__._writeCode(i) 275 | 276 | self.assertEqual( 277 | exp_output, "".join(intToBitstring(n__) for n__ in e__.output)[: e__.bitpos] 278 | ) 279 | 280 | def testReadCode(self): 281 | """ 282 | Tests that the interpretation of bit values performed by 283 | ``_readCode()`` as a contiguous bit-stream works as intended. 284 | """ 285 | inputs = bytearray(range(256)) 286 | d__ = LZWCodec.Decoder(inputs) 287 | exp_output_stream = "".join(intToBitstring(b__) for b__ in inputs) 288 | curr = 0 289 | # TODO: this deserves to be an assignment expression. 
290 | code = d__._readCode() 291 | 292 | while code != -1: 293 | if curr + d__.bitspercode >= len(exp_output_stream): 294 | exp_output = exp_output_stream[curr:] + "0" * ( 295 | (curr + d__.bitspercode) - len(exp_output_stream) 296 | ) 297 | else: 298 | exp_output = exp_output_stream[curr : curr + d__.bitspercode] 299 | 300 | self.assertEqual( 301 | exp_output, 302 | intToBitstring(code, d__.bitspercode), 303 | msg="(curr, code) = (%d, %d)" % (curr, code), 304 | ) 305 | 306 | curr += d__.bitspercode 307 | code = d__._readCode() 308 | 309 | def testEncodeDecode(self): 310 | """ 311 | Ensures that the ``decode(encode(data))`` concatenation equals data, 312 | where data can be an arbitrary byte stream. 313 | """ 314 | self.maxDiff = None 315 | inputs = [ 316 | string.ascii_lowercase, 317 | string.ascii_uppercase, 318 | string.whitespace, 319 | string.ascii_letters, 320 | 2000 * string.ascii_letters, 321 | ] 322 | 323 | if sys.version_info > (3, 0): 324 | for index, e in enumerate(inputs): 325 | inputs[index] = e.encode("LATIN1") 326 | 327 | for f__ in ( 328 | "Hamlet.txt", # pylint: disable=bad-continuation 329 | "TheHappyPrince.txt", # pylint: disable=bad-continuation 330 | ): 331 | with open(join(TEST_DATA_ROOT, f__), "rb") as infile: 332 | # TO-DO If we approach the number of read bytes to 10K the 333 | # codec stops working correctly. This is a bug to fix! 334 | inputs.append(infile.read()) 335 | 336 | for b__ in inputs: 337 | # IMPORTANT TODO: why does this round trip fail for full-length 338 | # data? Until we solve this, truncate the inputs to the first 339 | # 9500 bytes. 340 | b__ = b__[:9500] 341 | e__ = LZWCodec.Encoder(b__) 342 | d__ = LZWCodec.Decoder(e__.encode()) 343 | 344 | self.assertEqual(b__, d__.decode()) 345 | 346 | 347 | class DecodeStreamDataTestCase(unittest.TestCase): 348 | """ 349 | Test case intended to test the 350 | :meth:`decodeStreamData` method. 
It functions by 351 | querying known object references, asking ``decodeStreamData()`` to decode 352 | their stream content and checking the decoded value against what would be 353 | produced by the filter that is known to be used. 354 | """ 355 | 356 | def testDecodeStreamData(self): 357 | """ Stores PDF files infos and the coordinates of stream objects. We 358 | don't care if we need to open a new file stream for each obj. 359 | reference -- unit tests don't have to be efficient 360 | """ 361 | this_dir = join(TEST_DATA_ROOT, self.testDecodeStreamData.__name__) 362 | filters = ( 363 | # (filter type, filename, id, gen. number) 364 | (FlateCodec, "FlateDecode.pdf", 4, 0), 365 | (FlateCodec, "FlateDecode.pdf", 8, 0), 366 | (FlateCodec, "FlateDecode.pdf", 9, 0), 367 | # TO-DO No PDF files found with this type of encoding, get them. 368 | # (ASCIIHexCodec, "ASCIIHexDecode.pdf", ?, ?) 369 | (LZWCodec, "LZWDecode.pdf", 209, 0), 370 | (LZWCodec, "LZWDecode.pdf", 210, 0), 371 | (LZWCodec, "LZWDecode.pdf", 211, 0), 372 | (ASCII85Codec, "ASCII85Decode.pdf", 5, 0), 373 | (ASCII85Codec, "ASCII85Decode.pdf", 6, 0), 374 | (DCTCodec, "DCTDecode.pdf", 4, 0), 375 | # TO-DO No PDF files found with this type of encoding, get them. 376 | # (JPXCodec, "JPXDecode.pdf", ?, ?) 377 | (CCITTFaxCodec, "CCITTFaxDecode.pdf", 46, 0), 378 | ) 379 | 380 | for f__ in filters: 381 | with open(join(this_dir, f__[1]), "rb") as infile: 382 | reader = PdfFileReader(infile) 383 | ref = IndirectObject(f__[2], f__[3], reader) 384 | stream = reader.getObject(ref) 385 | 386 | # Ensures that the PdfFileReader reads a stream object 387 | self.assertEqual(EncodedStreamObject, type(stream)) 388 | 389 | # print("Running with %s!"
% f[0].__name__) 390 | if f__[0] is CCITTFaxCodec: 391 | self.assertEqual( 392 | f__[0].decode( 393 | stream._data, 394 | stream.get("/DecodeParms"), 395 | stream.get("/Height"), 396 | ), 397 | decodeStreamData(stream), 398 | ) 399 | else: 400 | self.assertEqual( 401 | f__[0].decode(stream._data, stream.get("/DecodeParms")), 402 | decodeStreamData(stream), 403 | ) 404 | 405 | 406 | @pytest.mark.parametrize( 407 | "data, expected_value, exception", 408 | ( 409 | (b"<~~>", b"", None), # Empty input 410 | (b"<~@:E^~>", b"abc", None), # Basic decoding 411 | (u"<~@:E^~>", b"abc", None), # Handle a str (or unicode) object 412 | (b"<~@: E^~>", b"abc", None), # Ignore whitespace 413 | (b"<~z~>", b"\x00\x00\x00\x00", None), # Handle 'z' 414 | (b"~>", b"", None), # No initial '<~' 415 | (b"@:E^~>", b"abc", None), # No initial '<~' 416 | (b"", None, ValueError), # Choke on missing '~>' 417 | (b">", None, ValueError), # Choke on missing '~>' 418 | (b"<~<~~>", None, ValueError), # Don't double-skip '<~' 419 | (b"<~~~>", None, ValueError), # Choke on bare '~' 420 | (b"<~aazaa~>", None, ValueError), # Choke on mid-group 'z' 421 | (u"<~\x80~>", None, ValueError), # Choke on non-ASCII characters 422 | ), 423 | ) 424 | def test_ascii85_decode(data, expected_value, exception): 425 | """ [EXPLAIN THIS.] """ 426 | if exception: 427 | with pytest.raises(exception): 428 | ASCII85Codec.decode(data) 429 | else: 430 | value = ASCII85Codec.decode(data) 431 | assert value == expected_value 432 | assert isinstance(value, BytesType) 433 | 434 | 435 | @pytest.mark.parametrize( 436 | "data, expected_value", 437 | ( 438 | (b"", b"<~~>"), 439 | (b"abc", b"<~@:E^~>"), 440 | (b"\x00", b"<~!!~>"), 441 | (b"\xff", b"<~rr~>"), 442 | (b"\x00\x00\x00\x00", b"<~z~>"), 443 | ), 444 | ) # pylint: disable=invalid-name 445 | def testASCII85Encode(data, expected_value): 446 | """ [EXPLAIN THIS.] 
""" 447 | value = ASCII85Codec.encode(data) 448 | assert value == expected_value 449 | assert isinstance(value, BytesType) 450 | -------------------------------------------------------------------------------- /tests/test_generic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the ``pypdf/generic.py`` module. 3 | """ 4 | import io 5 | from os.path import abspath, dirname, join, pardir 6 | import sys 7 | import unittest 8 | 9 | from pypdf.generic import IndirectObject, ObjectStream, TextStringObject 10 | from pypdf.pdf import PdfFileReader 11 | 12 | # Configure path environment 13 | PROJECT_ROOT = abspath(join(dirname(__file__), pardir)) 14 | TESTS_DATA_ROOT = join(PROJECT_ROOT, "tests", "fixture_data") 15 | 16 | sys.path.append(PROJECT_ROOT) 17 | 18 | 19 | class ObjectStreamTestCase(unittest.TestCase): 20 | """ [EXPLAIN THIS.] """ 21 | 22 | def test_object_ids(self): 23 | """ 24 | Tests the ``ObjectStream.objectIds()`` method. 25 | """ 26 | exp_results = ( 27 | (8, 3, 10, 2, 1, 11, 13, 15, 4, 19, 5, 20, 6, 21, 17), 28 | ( 29 | 644, 30 | 642, 31 | 646, 32 | 647, 33 | 648, 34 | 122, 35 | 119, 36 | 120, 37 | 121, 38 | 124, 39 | 179, 40 | 232, 41 | 327, 42 | 467, 43 | 478, 44 | 519, 45 | 568, 46 | 573, 47 | 580, 48 | 586, 49 | 592, 50 | 598, 51 | 603, 52 | 611, 53 | 616, 54 | 623, 55 | 629, 56 | 634, 57 | ), 58 | ) 59 | # Files we know to have Object Streams within 60 | input_data = ( 61 | # (filename, id, generation number) 62 | ("crazyones.pdf", 9, 0), 63 | ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", 645, 0), 64 | ) 65 | 66 | for o__, d__ in zip(exp_results, input_data): 67 | filepath = join(TESTS_DATA_ROOT, d__[0]) 68 | r__ = PdfFileReader(filepath) 69 | ref = IndirectObject(d__[1], d__[2], r__) 70 | obj_stm = r__.getObject(ref) 71 | 72 | r__.close() 73 | 74 | self.assertIsInstance(obj_stm, ObjectStream) 75 | self.assertTupleEqual(tuple(o__), tuple(obj_stm.objectIds)) 76 | 77 | 78 | class 
TextStringObjectTestCase(unittest.TestCase): 79 | """ [EXPLAIN THIS.] """ 80 | 81 | @staticmethod 82 | def _get_output_bytes_for_string(input_string): 83 | stream = io.BytesIO() 84 | text_string_object = TextStringObject(input_string) 85 | text_string_object.writeToStream(stream, encryption_key=None) 86 | stream_output = stream.getvalue() 87 | return stream_output 88 | 89 | def test_write_to_stream(self): 90 | """ 91 | Tests the ``TextStringObject.writeToStream()`` method. 92 | """ 93 | 94 | output_for_lowercase_letter = self._get_output_bytes_for_string("k") 95 | self.assertEqual(output_for_lowercase_letter, b"(k)") 96 | 97 | output_for_uppercase_letter = self._get_output_bytes_for_string("K") 98 | self.assertEqual(output_for_uppercase_letter, b"(K)") 99 | 100 | output_for_digit = self._get_output_bytes_for_string("7") 101 | self.assertEqual(output_for_digit, b"(7)") 102 | 103 | output_for_space = self._get_output_bytes_for_string(" ") 104 | self.assertEqual(output_for_space, b"( )") 105 | 106 | output_for_opening_parentheses = self._get_output_bytes_for_string("(") 107 | self.assertEqual(output_for_opening_parentheses, b"(\\050)") 108 | 109 | output_for_backslash = self._get_output_bytes_for_string("\\") 110 | self.assertEqual(output_for_backslash, b"(\\134)") 111 | 112 | 113 | if __name__ == "__main__": 114 | unittest.main() 115 | -------------------------------------------------------------------------------- /tests/test_pdf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests PDF primitives from pypdf.pdf. 3 | 4 | Note for future developers: if defining some code in a ``testX()`` method 5 | that relies on a "fixture data" (e.g. a test file to read from) place it in the 6 | ``/tests/fixture_data/testX/`` path (see some of the examples below to have a 7 | hint on how to do this). 8 | """ 9 | # TODO: switch dependence to pathlib. 
10 | import binascii 11 | from io import BytesIO 12 | import os 13 | from os.path import abspath, basename, dirname, join, pardir 14 | import sys 15 | import tempfile 16 | import unittest 17 | 18 | from pypdf.generic import IndirectObject, readObject 19 | from pypdf.pdf import PdfFileReader, PdfFileWriter 20 | 21 | # Configure path environment 22 | PROJECT_ROOT = abspath(join(dirname(__file__), pardir)) 23 | TEST_DATA_ROOT = join(PROJECT_ROOT, "tests", "fixture_data") 24 | 25 | sys.path.append(PROJECT_ROOT) 26 | 27 | 28 | class PdfReaderTestCases(unittest.TestCase): 29 | """ [EXPLAIN THIS CLASS.] """ 30 | 31 | def setUp(self): 32 | # Variable defining the path where the method to be run next can store 33 | # its own fixture (test) data. 34 | self.localDataRoot = join(TEST_DATA_ROOT, self.id().split(".")[-1]) 35 | 36 | def testDel(self): 37 | """ 38 | Tests the ``__del__()`` method of ``PdfFileReader`` and 39 | ``PdfFileWriter`` ensuring that no exceptions are raised. 40 | """ 41 | r = PdfFileReader(join(TEST_DATA_ROOT, "crazyones.pdf")) 42 | w = PdfFileWriter(BytesIO(b"")) 43 | 44 | try: 45 | r.__del__() 46 | self.assertTrue(True) 47 | except Exception as e: # pylint: disable=broad-except 48 | self.assertTrue( 49 | False, 50 | "Exception '%s' was raised in %s.__del__()" 51 | % (e, PdfFileReader.__name__), 52 | ) 53 | 54 | try: 55 | w.__del__() 56 | self.assertTrue(True) 57 | except Exception as e: # pylint: disable=broad-except 58 | self.assertTrue( 59 | False, 60 | "Exception '%s' was raised in %s.__del__()" 61 | % (e, PdfFileWriter.__name__), 62 | ) 63 | 64 | def testFileLoad(self): 65 | """ 66 | Test loading and parsing of a file. Extract text of the file and 67 | compare to expected textual output. Expected outcome: file loads, text 68 | matches expected. 
69 | """ 70 | with open(join(TEST_DATA_ROOT, "crazyones.pdf"), "rb") as inputfile: 71 | # Load PDF file from file 72 | r = PdfFileReader(inputfile) 73 | page1 = r.getPage(0) 74 | 75 | # Retrieve the text of the PDF 76 | with open(join(self.localDataRoot, "crazyones.txt"), "rb") as pdftextFile: 77 | pdftext = pdftextFile.read() 78 | 79 | page1Text = page1.extractText().replace("\n", "").encode("utf-8") 80 | 81 | # Compare the text of the PDF to a known source 82 | self.assertEqual( 83 | pdftext, 84 | page1Text, 85 | msg="PDF extracted text differs from expected value." 86 | "\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (pdftext, page1Text), 87 | ) 88 | 89 | r.close() 90 | 91 | def testJpegImage(self): 92 | """ 93 | Test loading and parsing of a file. Extract the image of the file and 94 | compare to expected textual output. Expected outcome: file loads, image 95 | matches expected. 96 | """ 97 | with open(join(TEST_DATA_ROOT, "jpeg.pdf"), "rb") as inputfile: 98 | # Load PDF file from file 99 | r = PdfFileReader(inputfile) 100 | 101 | # Retrieve the expected hex-encoded image data 102 | with open(join(self.localDataRoot, "jpeg.txt"), "r") as pdftextFile: 103 | imagetext = pdftextFile.read() 104 | 105 | page1 = r.getPage(0) 106 | xObject = page1["/Resources"]["/XObject"].getObject() 107 | data = xObject["/Im4"].getData() 108 | 109 | # Compare the hex-encoded image data of the PDF to a known source 110 | self.assertEqual( 111 | binascii.hexlify(data).decode(), 112 | imagetext, 113 | msg="PDF extracted image differs from expected value." 114 | "\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n" 115 | % (imagetext, binascii.hexlify(data).decode()), 116 | ) 117 | 118 | r.close() 119 | 120 | def testXRefTableObjects(self): 121 | """ 122 | Ensures that after ``PdfFileReader._parsePdfFile()`` all the indirect 123 | references from the XRef-Table *only* have been loaded as expected. 124 | Objects from the free entries list are included as well in the test.
125 | 126 | This case tests the part of ``PdfFileReader.objects()`` responsible for 127 | generating the Cross-Reference Table entries too. 128 | """ 129 | self.maxDiff = None 130 | inputFiles = ( 131 | "jpeg.pdf", 132 | "Seige_of_Vicksburg_Sample_OCR.pdf", 133 | "SF424_page2.pdf", 134 | ) 135 | 136 | for filename in inputFiles: 137 | filepath = join(TEST_DATA_ROOT, filename) 138 | xtablepath = join(self.localDataRoot, filename) 139 | r = PdfFileReader(filepath) 140 | # The two below are (id, gen, byte offset)-valued lists 141 | actualItems = list() 142 | expItems = list() 143 | 144 | for ref in r.objects(PdfFileReader.R_XTABLE, True): 145 | actualItems.append( 146 | ( 147 | ref.idnum, 148 | ref.generation, 149 | r._xrefTable[ref.generation][ref.idnum][0], 150 | ) 151 | ) 152 | 153 | r.close() 154 | # We artificially read the XRef Table entries that we know belong 155 | # to filepath, and store them into expItems. 156 | expItems = sorted(self._parseXRefTable(xtablepath, (0, 1, 2))) 157 | actualItems = sorted(actualItems) 158 | expItems = sorted(expItems) 159 | 160 | self.assertListEqual( 161 | expItems, actualItems, "Differences found in " + filename 162 | ) 163 | 164 | def testXRefStreamObjects(self): 165 | """ 166 | Like ``PdfReaderTestCases.testXRefTableObjects()``, except that it 167 | tests objects referenced by the Cross-Reference Stream. 168 | ``PdfFileReader.objects()`` second part (dealing with XStream objects) 169 | is invoked and implicitly tested. 
170 | """ 171 | inputFiles = ("crazyones.pdf",) 172 | 173 | for filename in inputFiles: 174 | filepath = join(self.localDataRoot, filename) 175 | r = PdfFileReader(join(TEST_DATA_ROOT, filename)) 176 | # Two lists of tuples as explained by Table 18 177 | actualItems = list() 178 | expItems = list() 179 | 180 | with open(filepath, "r") as instream: 181 | for line in instream: 182 | if not line or line.isspace() or line.startswith("%"): 183 | continue 184 | 185 | this_type, field2, field3 = (int(f) for f in line.split()) 186 | expItems.append((this_type, field2, field3)) 187 | 188 | for item in r.objects(PdfFileReader.R_XSTREAM, True): 189 | priv8Item = r._xrefStm[item.idnum] 190 | 191 | if priv8Item[0] in {0, 1}: 192 | self.assertEqual(priv8Item[2], item.generation) 193 | elif priv8Item[0] == 2: 194 | self.assertEqual(item.generation, 0) 195 | 196 | actualItems.append(priv8Item) 197 | 198 | r.close() 199 | actualItems = sorted(actualItems) 200 | expItems = sorted(expItems) 201 | 202 | self.assertListEqual( 203 | expItems, 204 | actualItems, 205 | "Didn't correctly read the Cross-Reference Stream", 206 | ) 207 | 208 | def testReadXRefStreamCompressedObjects(self): # pylint: disable=too-many-locals 209 | """ 210 | Targets the same objects as ``testXRefStreamObjects()``, but instead 211 | of ensuring an identity between the list of items read and the one 212 | expected, it verifies that their *contents* are identical. 213 | 214 | This method does **not** test ``PdfFileReader.objects()`` as two of the 215 | previous test cases did. 216 | """ 217 | self.maxDiff = None 218 | inputFiles = ("crazyones.pdf",) 219 | # expItems and actualItems will contain two-element tuples, where the 220 | # first element is the object ID, used to sort. 
221 | sortKey = lambda e: e[0] 222 | compressedObj = lambda e: e[1][0] == 2 223 | 224 | for filename in inputFiles: 225 | filepath = join(self.localDataRoot, filename) 226 | r = PdfFileReader(join(TEST_DATA_ROOT, filename)) 227 | expItems = list() 228 | actualItems = list() 229 | 230 | with open(filepath, "rb") as instream: 231 | for line in instream: 232 | if not line or line.isspace() or line.startswith(b"%"): 233 | continue 234 | 235 | globalId, offset, obj = line.split(b" ", 2) 236 | globalId, offset = int(globalId), int(offset) 237 | 238 | with BytesIO(obj) as objStream: 239 | obj = readObject(objStream, r) 240 | 241 | expItems.append((globalId, obj)) 242 | 243 | for itemid, _item in filter(compressedObj, r._xrefStm.items()): 244 | # We deal exclusively with compressed objects (from Table 18 of 245 | # ISO 32000 reference, 2008) whose generation number is 0 246 | actualItems.append( 247 | # (ID, PdfObject) tuples 248 | (itemid, IndirectObject(itemid, 0, r).getObject()) 249 | ) 250 | 251 | r.close() 252 | expItems = sorted(expItems, key=sortKey) 253 | actualItems = sorted(actualItems, key=sortKey) 254 | 255 | self.assertListEqual(expItems, actualItems) 256 | 257 | def testXTableAgainstXStream(self): 258 | """ 259 | In section 7.5.8.4 of ISO 32000, "Compatibility with Applications That 260 | Do Not Support Compressed Reference Streams", the standard describes a 261 | means of crafting PDF files designed for versions 1.5+ that can be 262 | opened nevertheless by readers that support older versions. 263 | 264 | This test case verifies that all the items hidden by the XRef Table in 265 | non-conforming readers are *all and exactly* loaded into the XRef 266 | Stream by readers that support PDF 1.5+. 
267 | """ 268 | self.maxDiff = None 269 | # TO-DO Possibly add a few other files to this test case 270 | inputFiles = ("GeoBase_NHNC1_Data_Model_UML_EN.pdf",) 271 | 272 | for filename in inputFiles: 273 | filepath = join(self.localDataRoot, filename) 274 | expItems = {e[0]: e[1:] for e in self._parseXRefTable(filepath, (0, 2, 3))} 275 | actualItems = list() 276 | r = PdfFileReader(join(TEST_DATA_ROOT, filename)) 277 | 278 | for ref in r.objects(PdfFileReader.R_XSTREAM, True): 279 | actualItems.append(ref) 280 | 281 | r.close() 282 | actualItems = sorted(actualItems, key=lambda e: e.idnum) 283 | expKeys = sorted(expItems.keys()) 284 | actualKeys = list(map(lambda e: e.idnum, actualItems)) 285 | 286 | self.assertListEqual( 287 | expKeys, actualKeys, "Lists of item IDs are not identical" 288 | ) 289 | 290 | for e, a in zip(expKeys, actualItems): 291 | self.assertEqual(e, a.idnum, "Items ID does not correspond") 292 | 293 | # If an item is a type 2 (compressed) entry in the XRef Stream, ensure 294 | # then that it is marked free in the XRef Table. 295 | if r._xrefStm[a.idnum][0] in (2,): 296 | self.assertTrue( 297 | expItems[e][-1], 298 | "Item %d should be hid by the XRef Table, but it was " 299 | "not." % e, 300 | ) 301 | 302 | def testIsObjectFree(self): 303 | """ 304 | Tests the ``PdfFileReader.isObjectFree()`` method. 305 | """ 306 | # TO-DO Find PDF files that feature free-entry lists. We are checking 307 | # isObjectFree() only against used items.
308 | inputFiles = ( 309 | "jpeg.pdf", 310 | "Seige_of_Vicksburg_Sample_OCR.pdf", 311 | "SF424_page2.pdf", 312 | ) 313 | 314 | for filename in inputFiles: 315 | filepath = join(self.localDataRoot, filename) 316 | r = PdfFileReader(join(TEST_DATA_ROOT, filename)) 317 | expItems = self._parseXRefTable(filepath, (0, 1, 3)) 318 | actualItems = list() 319 | 320 | for ref in r.objects(PdfFileReader.R_XTABLE, True): 321 | actualItems.append( 322 | # This is where isObjectFree() gets invoked 323 | (ref.idnum, ref.generation, r.isObjectFree(ref)) 324 | ) 325 | 326 | r.close() 327 | expItems = sorted(expItems) 328 | actualItems = sorted(actualItems) 329 | 330 | self.assertListEqual(expItems, actualItems) 331 | 332 | def testContextManager(self): 333 | """ 334 | Tests the context manager implementation (the ``with as 335 | identifier`` feature) of ``PdfFileReader``. 336 | """ 337 | inputFiles = ( 338 | "jpeg.pdf", 339 | "Seige_of_Vicksburg_Sample_OCR.pdf", 340 | "SF424_page2.pdf", 341 | ) 342 | 343 | for filename in inputFiles: 344 | r = None 345 | 346 | with PdfFileReader(join(TEST_DATA_ROOT, filename)) as r: 347 | # Test assertions not strictly related to the whole test case 348 | self.assertEqual(filename, basename(r.filepath)) 349 | self.assertFalse(r.isClosed) 350 | 351 | self.assertTrue(r.isClosed) 352 | 353 | @staticmethod 354 | def _parseXRefTable(filepath, mask=tuple()): 355 | """ 356 | Parses a Cross-Reference Table, such as the sampled ones used for 357 | testing. 358 | 359 | :param filepath: file path where the table is stored in. 360 | :param mask: a list of fields' indices indicating which fields are to 361 | be returned. For example, ``(0, 2, 3)`` indicates that only the 362 | ``id``, ``byteOffset`` and ``isFree`` fields have to be returned. 363 | :return: an iterable of items of the form 364 | ``(id, gen, byteOffset, isFree)`` if ``mask`` hasn't been set, 365 | otherwise an iterable of all the items ``mask`` has specified. 
366 | """ 367 | startid = None 368 | expecteditems = None 369 | itemssofar = None 370 | 371 | if not mask: 372 | mask = tuple(range(4)) 373 | 374 | with open(filepath, "r") as instream: 375 | for line in instream: 376 | if not line or line.isspace() or line.startswith("%"): 377 | continue 378 | 379 | tokens = line.strip().split() 380 | 381 | # We are beginning a new sub reference section 382 | if len(tokens) == 2: 383 | if itemssofar != expecteditems: 384 | raise ValueError( 385 | 'Line "%d %d" specified %d items, %d read' # pylint: disable=bad-string-format-type 386 | % (startid, expecteditems, expecteditems, itemssofar) 387 | ) 388 | 389 | startid = int(tokens[0]) 390 | expecteditems = int(tokens[1]) 391 | itemssofar = 0 392 | elif len(tokens) == 3: # New object info to add 393 | # Build the full (id, gen, byte offset, isFree) record; only the fields selected by mask are yielded 394 | output = ( 395 | startid + itemssofar, 396 | int(tokens[1]), 397 | int(tokens[0]), 398 | tokens[2] == "f", 399 | ) 400 | yield tuple(output[s] for s in mask) 401 | 402 | itemssofar += 1 403 | else: 404 | raise ValueError("Unexpected token in %s" % filepath) 405 | 406 | def testProperties(self): 407 | """ 408 | The switch from PyPDF2 to PyPDF4 sees many stylistic changes, including 409 | the use of the ``@property`` decorator (where possible) and pruning out 410 | of unnecessary arguments to ``property()`` as a function. 411 | In some cases, functions that previously had a ``@property`` accessor 412 | have it no more (to remove duplicate accesses). 413 | 414 | This test ensures that the two styles, the older and the newer, are 415 | functionally equivalent.
def testAddAttachment(self):
    """
    Tests the addAttachment function for attaching a single file.

    Since the Names array in the EmbeddedFiles dictionary contains both the
    name (string) and indirect object (dictionary) for each file, we have
    to check for two entries per attached file.
    """
    fd, testfile = tempfile.mkstemp()

    try:
        # mkstemp() returns an already-open OS-level descriptor. Only the
        # path is needed here, so release the descriptor right away
        # instead of leaking it (an open handle can also make the final
        # os.remove() fail on Windows).
        os.close(fd)

        attachment_path = join(TEST_DATA_ROOT, "attachment_small.png")

        # Make PDF with attachment
        with PdfFileReader(join(TEST_DATA_ROOT, "jpeg.pdf")) as reader:
            with PdfFileWriter(testfile) as writer:
                writer.appendPagesFromReader(reader)
                with open(attachment_path, "rb") as attachment_stream:
                    read_data = attachment_stream.read()
                writer.addAttachment("attachment_small.png", read_data)
                writer.write()

        # Check for attachment entries
        with PdfFileReader(testfile) as pdf:
            # For caching _cachedObjects data
            pdf.numPages  # pylint: disable=pointless-statement
            for _k, v in pdf._cachedObjects.items():
                if "/Type" in v and v["/Type"] == "/Catalog":
                    self.assertIsNotNone(v["/Names"]["/EmbeddedFiles"])
                    real = len(v["/Names"]["/EmbeddedFiles"]["/Names"])
                    self.assertEqual(2, real)
    finally:
        os.remove(testfile)


def testAttachFiles(self):
    """
    Tests the attachFiles function for attaching multiple files.

    Since the Names array in the EmbeddedFiles dictionary contains both the
    name (string) and indirect object (dictionary) for each file, we have
    to check for two entries per attached file.
    """
    numAttachments = 3
    fd, testfile = tempfile.mkstemp()

    try:
        # Only the path is used below; close the descriptor that
        # mkstemp() opened so it is not leaked.
        os.close(fd)

        # Make PDF with attachments
        with PdfFileReader(join(TEST_DATA_ROOT, "jpeg.pdf")) as reader:
            with PdfFileWriter(testfile) as writer:
                writer.appendPagesFromReader(reader)

                writer.attachFiles(
                    [join(TEST_DATA_ROOT, "attachment_small.png")] * numAttachments
                )
                writer.write()

        # Check for attachment entries
        with PdfFileReader(testfile) as pdf:
            # For caching _cachedObjects data
            pdf.numPages  # pylint: disable=pointless-statement
            for _k, v in pdf._cachedObjects.items():
                if "/Type" in v and v["/Type"] == "/Catalog":
                    self.assertIsNotNone(v["/Names"]["/EmbeddedFiles"])
                    real = len(v["/Names"]["/EmbeddedFiles"]["/Names"])
                    self.assertEqual(numAttachments * 2, real)
    finally:
        os.remove(testfile)
""" 528 | self.writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") 529 | 530 | self.assertIn( 531 | "/Names", 532 | self.writer._rootObject, 533 | "addJS should add a name catalog in the root object.", 534 | ) 535 | self.assertIn( 536 | "/JavaScript", 537 | self.writer._rootObject["/Names"], 538 | "addJS should add a JavaScript name tree under the name catalog.", 539 | ) 540 | self.assertIn( 541 | "/JavaScript", 542 | self.writer._rootObject, 543 | "addJS should add a JavaScript action to the catalog.", 544 | ) 545 | 546 | def testOverwrite(self): 547 | """ [EXPLAIN THIS TEST.] """ 548 | self.writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") 549 | first_js = self._getJavascriptName() 550 | 551 | self.writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") 552 | second_js = self._getJavascriptName() 553 | 554 | self.assertNotEqual( 555 | first_js, 556 | second_js, 557 | "addJS should overwrite the previous script in the catalog.", 558 | ) 559 | 560 | def _getJavascriptName(self): 561 | self.assertIn("/Names", self.writer._rootObject) 562 | self.assertIn("/JavaScript", self.writer._rootObject["/Names"]) 563 | self.assertIn("/Names", self.writer._rootObject["/Names"]["/JavaScript"]) 564 | return self.writer._rootObject["/Names"]["/JavaScript"]["/Names"][0] 565 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Acsor 2 | # Copyright 2019 Kurt McKee 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are 7 | # met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 
11 | # * Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # * The name of the author may not be used to endorse or promote products 15 | # derived from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 21 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | # POSSIBILITY OF SUCH DAMAGE. 28 | 29 | import io 30 | import re 31 | import string 32 | import unittest 33 | 34 | import pytest 35 | 36 | import pypdf.utils 37 | from tests.utils import bitstringToInt, intToBitstring 38 | 39 | # Establish the bytes/str/unicode types. 40 | try: 41 | unicode 42 | except NameError: 43 | # Python 3 44 | bytes_type = bytes 45 | str_type = str 46 | unicode_type = str 47 | else: 48 | # Python 2 49 | bytes_type = str 50 | str_type = str 51 | unicode_type = unicode 52 | 53 | 54 | class UtilsTestCase(unittest.TestCase): 55 | """ 56 | UtilsTestCase is intended to test the code utilities in utils.py. 
57 | """ 58 | 59 | def testHexEncode(self): 60 | inputs = ( 61 | string.ascii_lowercase, 62 | string.ascii_uppercase, 63 | string.ascii_letters, 64 | " \t\n\r\x0b\x0c", 65 | # All the characters from \x00 to \xff in ascending order 66 | "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10" 67 | '\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#' 68 | "$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`ab" 69 | "cdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87" 70 | "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97" 71 | "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" 72 | "\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" 73 | "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" 74 | "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7" 75 | "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7" 76 | "\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" 77 | "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", 78 | ) 79 | expOutputs = ( 80 | "6162636465666768696a6b6c6d6e6f707172737475767778797a", 81 | "4142434445464748494a4b4c4d4e4f505152535455565758595a", 82 | "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464" 83 | "748494a4b4c4d4e4f505152535455565758595a", 84 | "20090a0d0b0c", 85 | "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2" 86 | "02122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f40" 87 | "4142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606" 88 | "162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f8081" 89 | "82838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9fa0a1a" 90 | "2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2" 91 | "c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e" 92 | "3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff", 93 | ) 94 | 95 | for o, i in zip(expOutputs, inputs): 96 | 
class TestUtilsTestCase(unittest.TestCase):
    """
    Covers the helpers imported from ``tests.utils`` (utilities that exist
    only to support the test suite), not the project-wide ``pypdf.utils``.
    """

    def testIntToBitstringToInt(self):
        """
        Ensures that bitstringToInt(intToBitstring(input)) == input for
        every integer from 0 up to and including 2 ** 12.
        """
        upper = 2 ** 12

        for number in range(upper + 1):
            encoded = intToBitstring(number)
            self.assertEqual(number, bitstringToInt(encoded))

    def testBitstringToInt(self):
        """
        Checks bitstringToInt() against a table of (bitstring, value) pairs,
        including the empty string and inputs with leading zeros.
        """
        cases = (
            ("00000000", 0),
            ("", 0),
            ("00000001", 1),
            ("1", 1),
            ("01010101", 85),
            ("1010101", 85),
            ("10101010", 170),
            ("11111111", 255),
            ("100000000", 256),
            ("0100000000", 256),
            ("00100000000", 256),
            ("000100000000", 256),
            ("100000001", 257),
            ("0100000001", 257),
            ("00100000001", 257),
            ("000100000001", 257),
        )

        for bits, expected in cases:
            self.assertEqual(expected, bitstringToInt(bits))
@pytest.mark.parametrize(
    "arg, expected",
    [
        (123, True),
        (1 << 100, True),
        (123.123, False),
        ("str", False),
    ],
)
def testIsInt(arg, expected):
    """
    Compares pypdf.utils.isInt() with the expected verdict for a small int,
    a very large int, a float and a string.
    """
    verdict = pypdf.utils.isInt(arg)
    assert verdict == expected


@pytest.mark.parametrize(
    "arg, expected",
    [
        (b"bytes", True),
        (u"bytes".encode("utf8"), True),
        (u"str", False),
        (b"str".decode("utf8"), False),
        (10, False),
    ],
)
def testIsBytes(arg, expected):
    """
    Compares pypdf.utils.isBytes() with the expected verdict for bytes
    values (literal and encoded), unicode values (literal and decoded) and
    an integer.
    """
    verdict = pypdf.utils.isBytes(arg)
    assert verdict == expected
@pytest.mark.parametrize(
    "data, pattern, expected_value, expected_tell",
    [
        (b"", b"123", b"", 0),
        (b"abc123def", b"123", b"abc", 3),
        (b"abcdef", b"123", b"abcdef", 6),
    ],
)
def testReadUntilRegex(data, pattern, expected_value, expected_tell):
    """
    Runs readUntilRegex() with ignore_eof=True and checks both the bytes it
    returns and the stream position it leaves behind.
    """
    buffer = io.BytesIO(data)
    compiled = re.compile(pattern)

    consumed = pypdf.utils.readUntilRegex(buffer, compiled, ignore_eof=True)

    assert consumed == expected_value
    assert buffer.tell() == expected_tell


def testReadUntilRegexException():
    """
    Expects PdfStreamError from readUntilRegex() when the pattern never
    matches and ignore_eof is False.
    """
    compiled = re.compile(b"123")
    buffer = io.BytesIO(b"abcdef")

    with pytest.raises(pypdf.utils.PdfStreamError):
        pypdf.utils.readUntilRegex(buffer, compiled, ignore_eof=False)


def testMatrixMultiply():
    """
    Multiplies two 2x2 matrices with matrixMultiply() and compares the
    outcome with the hand-computed product.
    """
    left = [[1, 2], [3, 4]]
    right = [[2, 3], [5, 7]]
    product = [[12, 17], [26, 37]]

    assert pypdf.utils.matrixMultiply(left, right) == product


@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (b"a", b"a"),
        (b"a"[0], b"a"),
        ("a", b"a"),
        (u"a", b"a"),
        (97, b"a"),
    ],
)
def testPypdfBytes(arg, expected_value):
    """
    Checks that pypdfBytes() yields the expected value, as an instance of
    the module-level ``bytes_type``, for each input form.
    """
    converted = pypdf.utils.pypdfBytes(arg)

    assert converted == expected_value
    assert isinstance(converted, bytes_type)
@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (b"abc", u"abc"),
        ("abc", u"abc"),
        (u"abc", u"abc"),
        (b"\\u0061bc", u"abc"),
        (u"\\u0061bc", u"\\u0061bc"),
    ],
)
def testPypdfUnicode(arg, expected_value):
    """
    Checks that pypdfUnicode() yields the expected value as an instance of
    the module-level ``unicode_type``.
    """
    converted = pypdf.utils.pypdfUnicode(arg)

    assert converted == expected_value
    assert isinstance(converted, unicode_type)


@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (b"a", 97),
        (b"a"[0], 97),
        ("a", 97),
        ("a"[0], 97),
        (u"a", 97),
        (u"a"[0], 97),
        (97, 97),
    ],
)
def testPypdfOrd(arg, expected_value):
    """
    Checks that pypdfOrd() yields the expected int for each input form.
    """
    code = pypdf.utils.pypdfOrd(arg)

    assert code == expected_value
    assert isinstance(code, int)


@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (97, "a"),
        (b"a", "a"),
        ("a", "a"),
        (u"a", "a"),
    ],
)
def testPypdfChr(arg, expected_value):
    """
    Checks that pypdfChr() yields the expected value as an instance of the
    module-level ``str_type``.
    """
    character = pypdf.utils.pypdfChr(arg)

    assert character == expected_value
    assert isinstance(character, str_type)


@pytest.mark.parametrize(
    "arg, expected_value",
    [
        (0x1, "0x1"),
        (1 << 100, "0x10000000000000000000000000"),
    ],
)
def testHexStr(arg, expected_value):
    """
    Checks that hexStr() yields the expected text, as an instance of the
    module-level ``str_type``, for a small and a very large integer.
    """
    rendered = pypdf.utils.hexStr(arg)

    assert rendered == expected_value
    assert isinstance(rendered, str_type)


def testRC4Encode():
    """
    Calls RC4Encrypt("def", "abc") and checks the result against a fixed
    three-byte ciphertext, which must be a ``bytes`` instance.
    """
    ciphertext = pypdf.utils.RC4Encrypt("def", "abc")

    assert ciphertext == b"\x9e\xa6\xef"
    assert isinstance(ciphertext, bytes)
def intToBitstring(n__, fill=8):
    """
    Turns a non-negative integer ``n__`` into its textual bit representation.

    :param n__: the integer to convert; must not be negative.
    :param fill: minimum width of the result. Shorter representations are
        left-padded with zeros; longer ones are returned in full.
    :return: a string consisting only of ``"0"`` and ``"1"`` characters.
    :raises TypeError: if n is not an integer.
    :raises ValueError: if n is negative. Formatting a negative number
        would place the minus sign after the zero padding (for example
        ``"0000-101"``), which is not a bitstring and cannot be read back
        by ``bitstringToInt()``.
    """
    if not isinstance(n__, int):
        raise TypeError("n must be an integer")
    if n__ < 0:
        raise ValueError("n must be a non-negative integer")

    # Right-align the binary digits in a zero-filled field of width ``fill``.
    return "{0:0>{1}b}".format(n__, fill)


def bitstringToInt(b__):
    """
    Performs the reverse of ``intToBitstring()``.

    :param b__: a string containing only ``"0"`` and ``"1"`` characters;
        leading zeros are allowed and the empty string maps to 0.
    :return: the integer encoded by ``b__`` (most significant bit first).
    :raises TypeError: if b is not a ``str``.
    :raises ValueError: if b contains characters other than 0 and 1.
    """
    if not isinstance(b__, str):
        raise TypeError("Expected str, got %s" % b__.__class__)
    if not set(b__).issubset({"0", "1"}):
        raise ValueError("b must be a string containing only 0's and 1's")

    # int() refuses an empty literal, but the empty bitstring means 0 here.
    return int(b__, 2) if b__ else 0
3 | envlist = 4 | clean, 5 | py27, py35, py36, py37, py38 6 | report 7 | 8 | [testenv] 9 | commands = pytest --cov --cov-append tests/ 10 | deps = 11 | pytest 12 | pytest-cov 13 | 14 | [testenv:clean] 15 | commands = python -m coverage erase 16 | 17 | [testenv:py27] 18 | basepython = python2.7 19 | 20 | [testenv:py35] 21 | basepython = python3.5 22 | 23 | [testenv:py36] 24 | basepython = python3.6 25 | 26 | [testenv:py37] 27 | basepython = python3.7 28 | 29 | [testenv:report] 30 | commands = python -m coverage html 31 | --------------------------------------------------------------------------------