├── .gitignore ├── .travis.yml ├── CHANGES.txt ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── appveyor.yml ├── dev_requirements.txt ├── dist ├── pdfquery-0.1.0.tar.gz ├── pdfquery-0.1.1.tar.gz ├── pdfquery-0.1.2.tar.gz ├── pdfquery-0.1.3.tar.gz ├── pdfquery-0.2.1.tar.gz ├── pdfquery-0.2.2.tar.gz ├── pdfquery-0.2.3.tar.gz ├── pdfquery-0.2.4.tar.gz ├── pdfquery-0.2.5.tar.gz ├── pdfquery-0.2.6.tar.gz ├── pdfquery-0.2.7.tar.gz ├── pdfquery-0.2.tar.gz ├── pdfquery-0.3.0.tar.gz ├── pdfquery-0.3.1.tar.gz ├── pdfquery-0.4.0.tar.gz ├── pdfquery-0.4.1.tar.gz ├── pdfquery-0.4.2.tar.gz └── pdfquery-0.4.3.tar.gz ├── pdfquery.egg-info └── pbr.json ├── pdfquery ├── __init__.py ├── cache.py ├── pdfquery.py └── pdftranslator.py ├── requirements_py2.txt ├── requirements_py3.txt ├── setup.py └── tests ├── __init__.py ├── samples ├── IRS_1040A.pdf ├── bug11.pdf ├── bug15.pdf ├── bug17.pdf ├── bug18.pdf ├── bug28.pdf ├── bug37.pdf ├── bug39.pdf └── bug42.pdf ├── saved_output ├── IRS_1040A_output.xml └── bug28_output.xml ├── test_main.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | .DS_Store* 3 | build/* 4 | *.pyc 5 | tests/*failed_output.xml 6 | pdfquery.egg-info 7 | .eggs -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | env: CFLAGS="-O0" 8 | 9 | cache: 10 | directories: 11 | - $HOME/.cache/pip 12 | 13 | install: 14 | - if [[ $TRAVIS_PYTHON_VERSION < 3 ]]; then pip install -r requirements_py2.txt; fi 15 | - if [[ $TRAVIS_PYTHON_VERSION > 3 ]]; then pip install -r requirements_py3.txt; fi 16 | - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi 17 | script: 18 | python setup.py test 19 | after_success: 20 | - coveralls 21 | 22 | # See: http://docs.travis-ci.com/user/migrating-from-legacy/ 23 | sudo: false -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | v0.4.3, 2016-03-27 -- Add laparams parameter to __init__. 2 | v0.4.2, 2016-02-07 -- Annotations bugfix. 3 | v0.4.1, 2015-12-21 -- Annotations bugfix. 4 | v0.4.0, 2015-12-21 -- 5 | - Python 3 support. 6 | - Some changes to attribute formatting. 7 | - Fix handling of invalid XML characters. 8 | v0.3.1, 2015-07-22 -- Annotations bugfix. 9 | v0.3.0, 2015-06-26 -- Include PDF annotations in xml structure. 10 | v0.2.7, 2014-09-22 -- Better Unicode text handling. 11 | v0.2.6, 2014-07-05 -- Yet more doc info handling. 12 | v0.2.5, 2014-06-29 -- Add Python 2.6 support; improve doc info handling. 13 | v0.2.4, 2014-05-29 -- Fix unicode and page number bugs. 14 | v0.2.3, 2014-05-15 -- Fix compatibility with new pdfminer. 15 | v0.2.2, 2013-12-04 -- Disable cache by default. 16 | v0.2.1, 2013-11-30 -- Fix compatibility with new pdfminer. 17 | v0.2.0, 2013-11-03 -- Bug fixes, compatibility with lxml 3.0, caching. 18 | v0.1.0, 2012-04-13 -- Initial release. 
19 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 Jack Cushman 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt *.rst 2 | recursive-exclude tests * -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | PDFQuery 3 | ======== 4 | ------------------------------------------------------------ 5 | Concise, friendly PDF scraping using JQuery or XPath syntax. 6 | ------------------------------------------------------------ 7 | 8 | .. image:: https://travis-ci.org/jcushman/pdfquery.png 9 | :alt: Travis Build Status 10 | :target: https://travis-ci.org/jcushman/pdfquery 11 | .. image:: https://ci.appveyor.com/api/projects/status/d9or9795d9b66ai7?svg=true 12 | :alt: Appveyor Build Status 13 | :target: https://ci.appveyor.com/project/jcushman/pdfquery 14 | 15 | 16 | PDFQuery is a light wrapper around pdfminer, lxml and pyquery. It's designed to reliably extract data from sets of 17 | PDFs with as little code as possible. 18 | 19 | .. contents:: **Table of Contents** 20 | 21 | Installation 22 | ============ 23 | 24 | ``easy_install pdfquery`` or ``pip install pdfquery``. 25 | 26 | Quick Start 27 | =========== 28 | 29 | The basic idea is to transform a PDF document into an element tree so we can find items with JQuery-like selectors 30 | using pyquery. Suppose we're trying to extract a name from a set of PDFs, but all we know is that it appears 31 | underneath the words "Your first name and initial" in each PDF:: 32 | 33 | >>> pdf = pdfquery.PDFQuery("tests/samples/IRS_1040A.pdf") 34 | >>> pdf.load() 35 | >>> label = pdf.pq('LTTextLineHorizontal:contains("Your first name and initial")') 36 | >>> left_corner = float(label.attr('x0')) 37 | >>> bottom_corner = float(label.attr('y0')) 38 | >>> name = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner, bottom_corner-30, left_corner+150, bottom_corner)).text() 39 | >>> name 40 | 'John E.' 41 | 42 | Note that we don't have to know where the name is on the page, or what page it's on, 43 | or how the PDF has it stored internally. 
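Here is the same approach as a complete, runnable script -- a minimal sketch, assuming pdfquery is installed and the repository's sample PDF is available at the path below (the helper name ``find_value_below`` and the 150x30-point search box are illustrative choices, not part of the API)::

    import pdfquery

    def find_value_below(pdf_path, label_text, width=150, height=30):
        # Load the PDF, locate the label, then read whatever text sits in a
        # box extending `height` points below and `width` points right of
        # the label's lower-left corner.
        pdf = pdfquery.PDFQuery(pdf_path)
        pdf.load()
        label = pdf.pq('LTTextLineHorizontal:contains("%s")' % label_text)
        x0 = float(label.attr('x0'))
        y0 = float(label.attr('y0'))
        return pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (
            x0, y0 - height, x0 + width, y0)).text()

    print(find_value_below("tests/samples/IRS_1040A.pdf",
                           "Your first name and initial"))  # John E.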
44 | 45 | *Performance Note:* The initial call to pdf.load() runs very slowly, because the underlying 46 | pdfminer library has to compare every element on the page to every other element. 47 | See the Caching section to avoid this on subsequent runs. 48 | 49 | Now let's extract and format a bunch of data all at once:: 50 | 51 | >>> pdf = pdfquery.PDFQuery("tests/samples/IRS_1040A.pdf") 52 | >>> pdf.extract( [ 53 | ('with_parent', 'LTPage[pageid="1"]'), 54 | ('with_formatter', 'text'), 55 | 56 | ('last_name', 'LTTextLineHorizontal:in_bbox("315,680,395,700")'), 57 | ('spouse', 'LTTextLineHorizontal:in_bbox("170,650,220,680")'), 58 | 59 | ('with_parent', 'LTPage[pageid="2"]'), 60 | 61 | ('oath', 'LTTextLineHorizontal:contains("perjury")', lambda match: match.text()[:30]+"..."), 62 | ('year', 'LTTextLineHorizontal:contains("Form 1040A (")', lambda match: int(match.text()[-5:-1])) 63 | ]) 64 | 65 | Result:: 66 | 67 | {'last_name': 'Michaels', 68 | 'spouse': 'Susan R.', 69 | 'year': 2007, 70 | 'oath': 'Under penalties of perjury, I ...',} 71 | 72 | ------ 73 | Usage 74 | ------ 75 | 76 | Data Models 77 | =========== 78 | 79 | PDFQuery works by loading a PDF as a pdfminer layout, converting the layout to an etree with lxml.etree, 80 | and then applying a pyquery wrapper. All three underlying libraries are exposed, so you can use any of their 81 | interfaces to get at the data you want. 82 | 83 | First pdfminer opens the document and reads its layout. 84 | You can access the pdfminer document at ``pdf.doc``:: 85 | 86 | >>> pdf = pdfquery.PDFQuery("tests/samples/IRS_1040A.pdf") 87 | >>> pdf.doc 88 | <pdfquery.pdfquery.QPDFDocument object at 0x...> 89 | >>> pdf.doc.catalog # fetch attribute of underlying pdfminer document 90 | {'JT': <PDFObjRef:...>, 'PageLabels': <PDFObjRef:...>, 'Type': /Catalog, 'Pages': <PDFObjRef:...>, 'Metadata': <PDFObjRef:...>} 91 | 92 | Next the layout is turned into an lxml.etree with a pyquery wrapper. After you call ``pdf.load()`` (by far the most 93 | expensive operation in the process), you can access the etree at ``pdf.tree``, and the pyquery wrapper at ``pdf.pq``:: 94 | 95 | >>> pdf.load() 96 | >>> pdf.tree 97 | <lxml.etree._ElementTree object at 0x...> 98 | >>> pdf.tree.write("test2.xml", pretty_print=True, encoding="utf-8") 99 | >>> pdf.tree.xpath('//*/LTPage') 100 | [<Element LTPage at 0x...>, <Element LTPage at 0x...>] 101 | >>> pdf.pq('LTPage[pageid=1] :contains("Your first name")') 102 | [<LTTextLineHorizontal>] 103 | 104 | You'll save some time and memory if you call ``load()`` with only the page numbers you need. For example:: 105 | 106 | >>> pdf.load(0, 2, 3, range(4,8)) 107 | 108 | *Performance Note:* The initial call to pdf.load() runs very slowly, because the underlying 109 | pdfminer library has to compare every element on the page to every other element. 110 | See the Caching section to avoid this on subsequent runs. 111 | 112 | Under the hood, pdf.tree is basically an XML representation of the layout tree generated by pdfminer.pdfinterp. By 113 | default the tree is processed to combine individual character nodes, remove extra spaces, 114 | and sort the tree spatially. You can always get back to the original pdfminer Layout object from an element fetched 115 | by xpath or pyquery:: 116 | 117 | >>> pdf.pq(':contains("Your first name and initial")')[0].layout 118 | <LTTextLineHorizontal 143.651,714.694,... 'Your first name and initial'> 119 | 120 | Finding what you want 121 | ========================= 122 | 123 | PDFs are internally messy, so it's usually not helpful to find things based on document structure or element classes 124 | the way you would with HTML. Instead the most reliable selectors are the static labels on the page, 125 | which you can find by searching for their text contents, and physical location on the page. 
PDF coordinates are given 126 | in points (72 to the inch) starting from the bottom left corner. PDFMiner (and so PDFQuery) describes page locations 127 | in terms of bounding boxes, or bboxes. A bbox consists of four coordinates: the X and Y of the lower left 128 | corner, and the X and Y of the upper right corner. 129 | 130 | If you're scraping text that's always in the same place on the page, the easiest way is to use Acrobat Pro's 131 | Measurement Tool, Photoshop, or a similar tool to measure distances (in points) from the lower left corner of the 132 | page, and use those distances to craft a selector like ``:in_bbox("x0,y0,x1,y1")`` (see below for more on ``in_bbox``). 133 | 134 | If you're scraping text that might be in different parts of the page, the same basic technique applies, 135 | but you'll first have to find an element with consistent text that appears a consistent distance from the text you 136 | want, and then calculate the bbox relative to that element. See the Quick Start for an example of that approach. 137 | 138 | If both of those fail, your best bet is to dump the xml using ``pdf.tree.write(filename, pretty_print=True)``, 139 | and see if you can find any other structure, tags or elements that reliably identify the part you're looking for. 140 | This is also helpful when you're trying to figure out why your selectors don't match ... 141 | 142 | Custom Selectors 143 | ==================== 144 | 145 | The version of pyquery returned by pdf.pq supports some PDF-specific selectors to find elements by location on the 146 | page. 147 | 148 | * \:in_bbox("x0,y0,x1,y1"): Matches only elements that fit entirely within the given bbox. 149 | 150 | * \:overlaps_bbox("x0,y0,x1,y1"): Matches any elements that overlap the given bbox. 151 | 152 | If you need a selector that isn't supported, you can write a filtering function returning a boolean:: 153 | 154 | >>> def big_elements(): 155 | return float(this.get('width',0)) * float(this.get('height',0)) > 40000 156 | >>> pdf.pq('LTPage[page_index="1"] *').filter(big_elements) 157 | [<LTTextBoxHorizontal>, <LTRect>, <LTRect>] 158 | 159 | (If you come up with any particularly useful filters, patch them into pdfquery.py as selectors and submit a pull 160 | request ...) 161 | 162 | Caching 163 | ==================== 164 | 165 | PDFQuery accepts an optional caching argument that will store the results of PDF parsing, 166 | so subsequent runs on the same file will be much quicker. For example:: 167 | 168 | from pdfquery.cache import FileCache 169 | pdfquery.PDFQuery("tests/samples/IRS_1040A.pdf", parse_tree_cacher=FileCache("/tmp/")) 170 | 171 | Bulk Data Scraping 172 | ==================== 173 | 174 | Often you're going to want to grab a bunch of different data from a PDF, using the same repetitive process: 175 | (1) find an element of the document using a pyquery selector or XPath; (2) parse the resulting text; and (3) store it 176 | in a dict to be used later. 177 | 178 | The ``extract`` method simplifies that process. Given a list of keywords and selectors:: 179 | 180 | >>> pdf.extract([ 181 | ('last_name', ':in_bbox("315,680,395,700")'), 182 | ('year', ':contains("Form 1040A (")', lambda match: int(match.text()[-5:-1])) 183 | ]) 184 | 185 | the ``extract`` method returns a dictionary (by default) with a pyquery result set for each keyword, 186 | optionally processed through the supplied formatting function. 
In this example the result is:: 187 | 188 | {'last_name': [<LTTextLineHorizontal>], 'year': 2007} 189 | 190 | (It's often helpful to start with ``('with_formatter', 'text')`` so you get results like "Michaels" instead of 191 | ``[<LTTextLineHorizontal>]``. See Special Keywords below for more.) 192 | 193 | Search Target 194 | ~~~~~~~~~~~~~ 195 | 196 | By default, ``extract`` searches the entire tree (or the part of the document loaded earlier by ``load()``, 197 | if it was limited to particular pages). If you want to limit the search to a part of the tree that you fetched with 198 | ``pdf.pq()`` earlier, pass that in as the second parameter after the list of searches. 199 | 200 | Formatting Functions 201 | ~~~~~~~~~~~~~~~~~~~~ 202 | 203 | Notice that the 'year' example above contains an optional third parameter -- a formatting function. The formatting 204 | function will be passed a pyquery match result, so ``lambda match: match.text()`` will return the text contents of the 205 | matched elements. 206 | 207 | Filtering Functions 208 | ~~~~~~~~~~~~~~~~~~~ 209 | 210 | Instead of a string, the selector can be a filtering function returning a boolean:: 211 | 212 | >>> pdf.extract([('big', big_elements)]) 213 | {'big': [<LTTextBoxHorizontal>, <LTRect>, <LTRect>, <LTRect>, <LTRect>, <LTRect>, <LTRect>]} 214 | 215 | (See Custom Selectors above for how to define functions like ``big_elements``.) 216 | 217 | Special Keywords 218 | ~~~~~~~~~~~~~~~~ 219 | 220 | ``extract`` also looks for two special keywords in the list of searches that set defaults for the searches listed 221 | afterward. Note that you can include the same special keyword more than once to change the setting, as demonstrated 222 | in the Quick Start section. The keywords are: 223 | 224 | with_parent 225 | +++++++++++ 226 | 227 | The ``with_parent`` keyword limits the following searches to children of the parent search. For example:: 228 | 229 | >>> pdf.extract([ 230 | ('with_parent','LTPage[page_index="1"]'), 231 | ('last_name', ':in_bbox("315,680,395,700")') # only matches elements on page 1 232 | ]) 233 | 234 | with_formatter 235 | ++++++++++++++ 236 | 237 | The ``with_formatter`` keyword sets a default formatting function that will be called unless a specific one is supplied. 238 | For example:: 239 | 240 | ('with_formatter', lambda match: int(match.text())) 241 | 242 | will attempt to convert all of the following search results to integers. If you supply a string instead of a function, 243 | it will be interpreted as a method name to call on the pyquery search results. For example, the following two lines 244 | are equivalent:: 245 | 246 | ('with_formatter', lambda match: match.text()) 247 | ('with_formatter', 'text') 248 | 249 | If you want to stop formatting results, you can use:: 250 | 251 | ('with_formatter', None) 252 | 253 | ---------------- 254 | Object Reference 255 | ---------------- 256 | 257 | Public Methods 258 | ================ 259 | 260 | :: 261 | 262 | PDFQuery( file, 263 | merge_tags=('LTChar', 'LTAnno'), 264 | round_floats=True, 265 | round_digits=3, 266 | input_text_formatter=None, 267 | normalize_spaces=True, 268 | resort=True, 269 | parse_tree_cacher=None, 270 | laparams={'all_texts':True, 'detect_vertical':True}) 271 | 272 | Initialization function. Usually you'll only need to pass in the file (file object or path). The rest of the arguments 273 | control preprocessing of the element tree: 274 | 275 | * merge_tags: consecutive runs of these elements will be merged together, with the text of following elements 276 | appended to the first element. 
This is useful for keeping the size of the tree down, 277 | but it might help to turn it off if you want to select individual characters regardless of their containers. 278 | 279 | * round_floats and round_digits: if round_floats is True, numbers will be rounded to round_digits places. This is 280 | almost always good. 281 | 282 | * input_text_formatter: a function that takes a string and returns a modified string, 283 | to be applied to the text content of elements. 284 | 285 | * normalize_spaces: if True (and input_text_formatter isn't otherwise set), sets input_text_formatter to replace \s+ 286 | with a single space. 287 | 288 | * resort: if True, elements will be sorted such that any element fully within the bounding box of another element 289 | becomes a child of that element, and elements on the same level are sorted top to bottom, left to right. 290 | 291 | * parse_tree_cacher: an object that knows how to save and load results of parsing a given page range from a given PDF. 292 | Pass in FileCache('/tmp/') to save caches to the filesystem. 293 | 294 | * laparams: parameters for the ``pdfminer.layout.LAParams`` object used to initialize 295 | ``pdfminer.converter.PDFPageAggregator``. Can be `dict`, `LAParams()`, or `None`. 296 | 297 | :: 298 | 299 | extract( searches, 300 | tree=None, 301 | as_dict=True) 302 | 303 | See "Bulk Data Scraping." 304 | 305 | * searches: list of searches to run, each consisting of a keyword, selector, and optional formatting function. 306 | * tree: pyquery tree to run searches against. By default, targets entire tree loaded by pdf.load(). 307 | * as_dict: if changed to False, will return a list instead of a dict to preserve the order of the results. 308 | 309 | :: 310 | 311 | load(*page_numbers) 312 | 313 | Initialize the pdf.tree and pdf.pq objects. This will be called implicitly by pdf.extract(), 314 | but it's more efficient to call it explicitly with just the page numbers you need. Page numbers can be any 315 | combination of integers and lists, e.g. ``pdf.load(0,2,3,[4,5,6],range(10,15))``. 316 | 317 | You can call ``pdf.load(None)`` if for some reason you want to initialize without loading *any* pages 318 | (like you are only interested in the document info). 319 | 320 | Public But Less Useful Methods 321 | ================================ 322 | 323 | These are mostly used internally, but might be helpful sometimes ... 324 | 325 | :: 326 | 327 | get_layout(page) 328 | 329 | Given a page number (zero-indexed) or pdfminer PDFPage object, return the LTPage layout object for that page. 330 | 331 | :: 332 | 333 | get_layouts() 334 | 335 | Return list of all layouts (equivalent to calling get_layout() for each page). 336 | 337 | :: 338 | 339 | get_page(page_number) 340 | 341 | Given a page number, return the appropriate pdfminer PDFPage object. 342 | 343 | :: 344 | 345 | get_pyquery(tree=None, page_numbers=None) 346 | 347 | Wrap a given lxml element tree in pyquery. 348 | If no tree is supplied, will generate one from given page numbers, or all page numbers. 349 | 350 | :: 351 | 352 | get_tree(*page_numbers) 353 | 354 | Generate an etree for the given page numbers. ``*page_numbers`` can be the same form as in ``load()``. 355 | 356 | 357 | ---------------------------------------- 358 | Documentation for Underlying Libraries 359 | ---------------------------------------- 360 | 361 | * PDFMiner (pdf.doc): pdfminer_homepage_, pdfminer_documentation_. 362 | 363 | .. _pdfminer_homepage: http://www.unixuser.org/~euske/python/pdfminer/ 364 | .. 
_pdfminer_documentation: http://www.unixuser.org/~euske/python/pdfminer/programming.html 365 | 366 | * LXML.etree (pdf.tree): lxml_homepage_, tutorial_. 367 | 368 | .. _lxml_homepage: http://lxml.de/index.html 369 | .. _tutorial: http://lxml.de/tutorial.html 370 | 371 | * PyQuery (pdf.pq): pyquery_documentation_. 372 | 373 | .. _pyquery_documentation: http://packages.python.org/pyquery/ 374 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | # https://www.appveyor.com/docs/windows-images-software/#python 4 | # currently lxml does not successfully install in 3.5 and 3.8 5 | # - PYTHON: "C:\\Python35" 6 | - PYTHON: "C:\\Python36" 7 | - PYTHON: "C:\\Python37" 8 | # - PYTHON: "C:\\Python38" 9 | 10 | build: off 11 | 12 | test_script: 13 | - "%PYTHON%\\python.exe setup.py test" -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /dist/pdfquery-0.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.1.0.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.1.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.1.1.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.1.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.1.2.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.1.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.1.3.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.2.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.2.1.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.2.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.2.2.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.2.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.2.3.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.2.4.tar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.2.4.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.2.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.2.5.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.2.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.2.6.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.2.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.2.7.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.2.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.3.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.3.0.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.3.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.3.1.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.4.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.4.0.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.4.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.4.1.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.4.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.4.2.tar.gz -------------------------------------------------------------------------------- /dist/pdfquery-0.4.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/dist/pdfquery-0.4.3.tar.gz -------------------------------------------------------------------------------- /pdfquery.egg-info/pbr.json: -------------------------------------------------------------------------------- 1 | {"is_release": false, "git_version": "6e7907c"} -------------------------------------------------------------------------------- 
/pdfquery/__init__.py: -------------------------------------------------------------------------------- 1 | from .pdfquery import PDFQuery -------------------------------------------------------------------------------- /pdfquery/cache.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import zipfile 3 | from lxml import etree 4 | 5 | class BaseCache(object): 6 | 7 | def __init__(self): 8 | self.hash_key = None 9 | 10 | def set_hash_key(self, file): 11 | """Calculate and store hash key for file.""" 12 | filehasher = hashlib.md5() 13 | while True: 14 | data = file.read(8192) 15 | if not data: 16 | break 17 | filehasher.update(data) 18 | file.seek(0) 19 | self.hash_key = filehasher.hexdigest() 20 | 21 | def set(self, page_range_key, tree): 22 | """write tree to key""" 23 | pass 24 | 25 | def get(self, page_range_key): 26 | """load tree from key, or None if cache miss""" 27 | return None 28 | 29 | 30 | class DummyCache(BaseCache): 31 | pass 32 | 33 | 34 | class FileCache(BaseCache): 35 | 36 | def __init__(self, directory='/tmp/'): 37 | self.directory = directory 38 | super(FileCache, self).__init__() 39 | 40 | def get_cache_filename(self, page_range_key): 41 | return "pdfquery_{hash_key}{page_range_key}.xml".format( 42 | hash_key=self.hash_key, 43 | page_range_key=page_range_key 44 | ) 45 | 46 | def get_cache_file(self, page_range_key, mode): 47 | try: 48 | return zipfile.ZipFile(self.directory+self.get_cache_filename(page_range_key)+".zip", mode) 49 | except IOError: 50 | return None 51 | 52 | def set(self, page_range_key, tree): 53 | xml = etree.tostring(tree, encoding='utf-8', pretty_print=False, xml_declaration=True) 54 | cache_file = self.get_cache_file(page_range_key, 'w') 55 | cache_file.writestr(self.get_cache_filename(page_range_key), xml) 56 | cache_file.close() 57 | 58 | def get(self, page_range_key): 59 | cache_file = self.get_cache_file(page_range_key, 'r') 60 | if cache_file: 61 | return etree.fromstring(cache_file.read(self.get_cache_filename(page_range_key))) -------------------------------------------------------------------------------- /pdfquery/pdfquery.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | # -*- coding: utf-8 -*- 3 | 4 | # builtins 5 | import codecs 6 | import json 7 | import numbers 8 | import re 9 | import chardet 10 | try: 11 | from collections import OrderedDict 12 | except ImportError: 13 | OrderedDict = dict # sorry py2.6! Ordering isn't that important for our purposes anyway. 
14 | 15 | # pdfminer 16 | from pdfminer.psparser import PSLiteral 17 | from pdfminer.pdfparser import PDFParser 18 | try: 19 | # pdfminer < 20131022 20 | from pdfminer.pdfparser import PDFDocument, PDFPage 21 | except ImportError: 22 | # pdfminer >= 20131022 23 | from pdfminer.pdfdocument import PDFDocument 24 | from pdfminer.pdfpage import PDFPage 25 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 26 | from pdfminer.layout import LAParams, LTChar, LTImage, LTPage 27 | from pdfminer.converter import PDFPageAggregator 28 | from pdfminer.pdftypes import resolve1 29 | 30 | # other dependencies 31 | from pyquery import PyQuery 32 | from lxml import etree 33 | import cssselect 34 | import six 35 | from six.moves import map 36 | from six.moves import zip 37 | 38 | # local imports 39 | from .pdftranslator import PDFQueryTranslator 40 | from .cache import DummyCache 41 | 42 | 43 | # Re-sort the PDFMiner Layout tree so elements that fit inside other elements 44 | # will be children of them 45 | def _append_sorted(root, el, comparator): 46 | """ Add el as a child of root, or as a child of one of root's children. 47 | Comparator is a function(a, b) returning > 0 if a is a child of b, < 0 if 48 | b is a child of a, 0 if neither. 49 | """ 50 | for child in root: 51 | rel = comparator(el, child) 52 | if rel > 0: 53 | # el fits inside child, add to child and return 54 | _append_sorted(child, el, comparator) 55 | return 56 | if rel < 0: 57 | # child fits inside el, move child into el (may move more than one) 58 | _append_sorted(el, child, comparator) 59 | # we weren't added to a child, so add to root 60 | root.append(el) 61 | 62 | 63 | def _box_in_box(el, child): 64 | """ Return True if child is contained within el. """ 65 | return all([ 66 | float(el.get('x0')) <= float(child.get('x0')), 67 | float(el.get('x1')) >= float(child.get('x1')), 68 | float(el.get('y0')) <= float(child.get('y0')), 69 | float(el.get('y1')) >= float(child.get('y1')), 70 | ]) 71 | 72 | 73 | _comp_bbox_keys_required = set(['x0', 'x1', 'y0', 'y1']) 74 | def _comp_bbox(el, el2): 75 | """ Return 1 if el in el2, -1 if el2 in el, else 0""" 76 | # only compare if both elements have x/y coordinates 77 | if _comp_bbox_keys_required <= set(el.keys()) and \ 78 | _comp_bbox_keys_required <= set(el2.keys()): 79 | if _box_in_box(el2, el): 80 | return 1 81 | if _box_in_box(el, el2): 82 | return -1 83 | return 0 84 | 85 | 86 | # assorted helpers 87 | def _flatten(l, ltypes=(list, tuple)): 88 | # via http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html 89 | ltype = type(l) 90 | l = list(l) 91 | i = 0 92 | while i < len(l): 93 | while isinstance(l[i], ltypes): 94 | if not l[i]: 95 | l.pop(i) 96 | i -= 1 97 | break 98 | else: 99 | l[i:i + 1] = l[i] 100 | i += 1 101 | return ltype(l) 102 | 103 | # these might have to be removed from the start of a decoded string after 104 | # conversion 105 | bom_headers = set([ 106 | six.text_type(codecs.BOM_UTF8, 'utf8'), 107 | six.text_type(codecs.BOM_UTF16_LE, 'utf-16LE'), 108 | six.text_type(codecs.BOM_UTF16_BE, 'utf-16BE'), 109 | six.text_type(codecs.BOM_UTF32_LE, 'utf-32LE'), 110 | six.text_type(codecs.BOM_UTF32_BE, 'utf-32BE'), 111 | ]) 112 | 113 | 114 | def smart_unicode_decode(encoded_string): 115 | """ 116 | Given an encoded string of unknown format, detect the format with 117 | chardet and return the unicode version. 
118 | Example input from bug #11: 119 | ('\xfe\xff\x00I\x00n\x00s\x00p\x00e\x00c\x00t\x00i\x00o\x00n\x00' 120 | '\x00R\x00e\x00p\x00o\x00r\x00t\x00 \x00v\x002\x00.\x002') 121 | """ 122 | if not encoded_string: 123 | return u'' 124 | 125 | # optimization -- first try ascii 126 | try: 127 | return encoded_string.decode('ascii') 128 | except UnicodeDecodeError: 129 | pass 130 | 131 | # detect encoding 132 | detected_encoding = chardet.detect(encoded_string) 133 | # bug 54 -- depending on chardet version, if encoding is not guessed, 134 | # either detected_encoding will be None or detected_encoding['encoding'] will be None 135 | detected_encoding = detected_encoding['encoding'] if detected_encoding and detected_encoding.get('encoding') else 'utf8' 136 | decoded_string = six.text_type( 137 | encoded_string, 138 | encoding=detected_encoding, 139 | errors='replace' 140 | ) 141 | 142 | # unicode string may still have useless BOM character at the beginning 143 | if decoded_string and decoded_string[0] in bom_headers: 144 | decoded_string = decoded_string[1:] 145 | 146 | return decoded_string 147 | 148 | def prepare_for_json_encoding(obj): 149 | """ 150 | Convert an arbitrary object into just JSON data types (list, dict, unicode str, int, bool, null). 151 | """ 152 | obj_type = type(obj) 153 | if obj_type == list or obj_type == tuple: 154 | return [prepare_for_json_encoding(item) for item in obj] 155 | if obj_type == dict: 156 | # alphabetizing keys lets us compare attributes for equality across runs 157 | return OrderedDict( 158 | (prepare_for_json_encoding(k), 159 | prepare_for_json_encoding(obj[k])) for k in sorted(obj.keys()) 160 | ) 161 | if obj_type == six.binary_type: 162 | return smart_unicode_decode(obj) 163 | if obj_type == bool or obj is None or obj_type == six.text_type or isinstance(obj, numbers.Number): 164 | return obj 165 | if obj_type == PSLiteral: 166 | # special case because pdfminer.six currently adds extra quotes to PSLiteral.__repr__ 167 | return u"/%s" % obj.name 168 | return six.text_type(obj) 169 | 170 | def obj_to_string(obj, top=True): 171 | """ 172 | Turn an arbitrary object into a unicode string. If complex (dict/list/tuple), will be json-encoded. 173 | """ 174 | obj = prepare_for_json_encoding(obj) 175 | if type(obj) == six.text_type: 176 | return obj 177 | return json.dumps(obj) 178 | 179 | 180 | # via http://stackoverflow.com/a/25920392/307769 181 | invalid_xml_chars_re = re.compile(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+') 182 | def strip_invalid_xml_chars(s): 183 | return invalid_xml_chars_re.sub(r'', s) 184 | 185 | 186 | # custom PDFDocument class 187 | class QPDFDocument(PDFDocument): 188 | def get_page_number(self, index): 189 | """ 190 | Given an index, return page label as specified by 191 | catalog['PageLabels']['Nums'] 192 | 193 | In a PDF, page labels are stored as a list of pairs, like 194 | [starting_index, label_format, starting_index, label_format ...] 195 | 196 | For example: 197 | [0, {'S': 'D', 'St': 151}, 4, {'S':'R', 'P':'Foo'}] 198 | 199 | So we have to first find the correct label_format based on the closest 200 | starting_index lower than the requested index, then use the 201 | label_format to convert the index to a page label. 
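For example, with the Nums list above: index 2 falls in the first range, so its label is 2 - 0 + 151 = 153 rendered as decimal arabic, i.e. "153"; index 5 falls in the second range, so its label is 5 - 4 + 1 = 2 rendered as uppercase roman with the "Foo" prefix, i.e. "FooII".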
202 | 203 | Label format meaning: 204 | /S = [ 205 | D Decimal arabic numerals 206 | R Uppercase roman numerals 207 | r Lowercase roman numerals 208 | A Uppercase letters (A to Z for the first 26 pages, AA to ZZ 209 | for the next 26, and so on) 210 | a Lowercase letters (a to z for the first 26 pages, aa to zz 211 | for the next 26, and so on) 212 | ] (if no /S, just use prefix ...) 213 | /P = text string label 214 | /St = integer start value 215 | """ 216 | 217 | # get and cache page ranges 218 | if not hasattr(self, 'page_range_pairs'): 219 | try: 220 | page_ranges = resolve1(self.catalog['PageLabels'])['Nums'] 221 | assert len(page_ranges) > 1 and len(page_ranges) % 2 == 0 222 | self.page_range_pairs = list( 223 | reversed(list(zip(page_ranges[::2], page_ranges[1::2])))) 224 | except: 225 | self.page_range_pairs = [] 226 | 227 | if not self.page_range_pairs: 228 | return "" 229 | 230 | # find page range containing index 231 | for starting_index, label_format in self.page_range_pairs: 232 | if starting_index <= index: 233 | break # we found correct label_format 234 | label_format = resolve1(label_format) 235 | 236 | page_label = "" 237 | 238 | # handle numeric part of label 239 | if 'S' in label_format: 240 | 241 | # first find number for this page ... 242 | page_label = index - starting_index 243 | if 'St' in label_format: # alternate start value 244 | page_label += label_format['St'] 245 | else: 246 | page_label += 1 247 | 248 | # ... then convert to correct format 249 | num_type = label_format['S'].name 250 | 251 | # roman (upper or lower) 252 | if num_type.lower() == 'r': 253 | import roman 254 | page_label = roman.toRoman(page_label) 255 | if num_type == 'r': 256 | page_label = page_label.lower() 257 | 258 | # letters 259 | elif num_type.lower() == 'a': 260 | # a to z for the first 26 pages, aa to zz for the next 26, and 261 | # so on 262 | letter = chr(page_label % 26 + 65) 263 | letter *= page_label / 26 + 1 264 | if num_type == 'a': 265 | letter = letter.lower() 266 | page_label = letter 267 | 268 | # decimal arabic 269 | else: # if num_type == 'D': 270 | page_label = obj_to_string(page_label) 271 | 272 | # handle string prefix 273 | if 'P' in label_format: 274 | page_label = smart_unicode_decode(label_format['P']) + page_label 275 | 276 | return page_label 277 | 278 | 279 | # create etree parser using custom Element class 280 | 281 | class LayoutElement(etree.ElementBase): 282 | @property 283 | def layout(self): 284 | if not hasattr(self, '_layout'): 285 | self._layout = None 286 | return self._layout 287 | 288 | @layout.setter 289 | def layout(self, value): 290 | self._layout = value 291 | parser_lookup = etree.ElementDefaultClassLookup(element=LayoutElement) 292 | parser = etree.XMLParser() 293 | parser.set_element_class_lookup(parser_lookup) 294 | 295 | 296 | # main class 297 | class PDFQuery(object): 298 | def __init__( 299 | self, 300 | file, 301 | merge_tags=('LTChar', 'LTAnno'), 302 | round_floats=True, 303 | round_digits=3, 304 | input_text_formatter=None, 305 | normalize_spaces=True, 306 | resort=True, 307 | parse_tree_cacher=None, 308 | laparams={'all_texts':True, 'detect_vertical':True}, 309 | password='' 310 | ): 311 | # store input 312 | self.merge_tags = merge_tags 313 | self.round_floats = round_floats 314 | self.round_digits = round_digits 315 | self.resort = resort 316 | 317 | # set up input text formatting function, if any 318 | if input_text_formatter: 319 | self.input_text_formatter = input_text_formatter 320 | elif normalize_spaces: 321 | r = 
re.compile(r'\s+') 322 | self.input_text_formatter = lambda s: re.sub(r, ' ', s) 323 | else: 324 | self.input_text_formatter = None 325 | 326 | # open doc 327 | if not hasattr(file, 'read'): 328 | try: 329 | file = open(file, 'rb') 330 | except TypeError: 331 | raise TypeError("File must be file object or filepath string.") 332 | 333 | parser = PDFParser(file) 334 | if hasattr(QPDFDocument, 'set_parser'): 335 | # pdfminer < 20131022 336 | doc = QPDFDocument() 337 | parser.set_document(doc) 338 | doc.set_parser(parser) 339 | else: 340 | # pdfminer >= 20131022 341 | doc = QPDFDocument(parser, password) 342 | parser.set_document(doc) 343 | if hasattr(doc, 'initialize'): 344 | # as of pdfminer==20140328, "PDFDocument.initialize() method is 345 | # removed and no longer needed." 346 | doc.initialize() 347 | self.doc = doc 348 | self.parser = parser 349 | self.tree = None 350 | self.pq = None 351 | self.file = file 352 | 353 | if parse_tree_cacher: 354 | self._parse_tree_cacher = parse_tree_cacher 355 | self._parse_tree_cacher.set_hash_key(self.file) 356 | else: 357 | self._parse_tree_cacher = DummyCache() 358 | 359 | # set up layout parsing 360 | rsrcmgr = PDFResourceManager() 361 | if type(laparams) == dict: 362 | laparams = LAParams(**laparams) 363 | self.device = PDFPageAggregator(rsrcmgr, laparams=laparams) 364 | self.interpreter = PDFPageInterpreter(rsrcmgr, self.device) 365 | 366 | # caches 367 | self._pages = [] 368 | self._pages_iter = None 369 | self._elements = [] 370 | 371 | def load(self, *page_numbers): 372 | """ 373 | Load etree and pyquery object for entire document, or given page 374 | numbers (ints or lists). After this is called, objects are 375 | available at pdf.tree and pdf.pq. 376 | 377 | >>> pdf.load() 378 | >>> pdf.tree 379 | <lxml.etree._ElementTree object at ...> 380 | >>> pdf.pq('LTPage') 381 | [<LTPage>, <LTPage>] 382 | >>> pdf.load(1) 383 | >>> pdf.pq('LTPage') 384 | [<LTPage>] 385 | >>> pdf.load(0, 1) 386 | >>> pdf.pq('LTPage') 387 | [<LTPage>, <LTPage>] 388 | """ 389 | self.tree = self.get_tree(*_flatten(page_numbers)) 390 | self.pq = self.get_pyquery(self.tree) 391 | 392 | def extract(self, searches, tree=None, as_dict=True): 393 | """ 394 | >>> foo = pdf.extract([['pages', 'LTPage']]) 395 | >>> foo 396 | {'pages': [<LTPage>, <LTPage>]} 397 | >>> pdf.extract([['bar', ':in_bbox("100,100,400,400")']], foo['pages'][0]) 398 | {'bar': [<LTTextLineHorizontal>, <LTTextLineHorizontal>,... 399 | """ 400 | if self.tree is None or self.pq is None: 401 | self.load() 402 | if tree is None: 403 | pq = self.pq 404 | else: 405 | pq = PyQuery(tree, css_translator=PDFQueryTranslator()) 406 | results = [] 407 | formatter = None 408 | parent = pq 409 | for search in searches: 410 | if len(search) < 3: 411 | search = list(search) + [formatter] 412 | key, search, tmp_formatter = search 413 | if key == 'with_formatter': 414 | if isinstance(search, six.string_types): 415 | # is a pyquery method name, e.g. 
'text' 416 | formatter = lambda o, search=search: getattr(o, search)() 417 | elif hasattr(search, '__call__') or not search: 418 | # is a method, or None to end formatting 419 | formatter = search 420 | else: 421 | raise TypeError("Formatter should be either a pyquery " 422 | "method name or a callable function.") 423 | elif key == 'with_parent': 424 | parent = pq(search) if search else pq 425 | else: 426 | try: 427 | result = parent("*").filter(search) if \ 428 | hasattr(search, '__call__') else parent(search) 429 | except cssselect.SelectorSyntaxError as e: 430 | raise cssselect.SelectorSyntaxError( 431 | "Error applying selector '%s': %s" % (search, e)) 432 | if tmp_formatter: 433 | result = tmp_formatter(result) 434 | results += result if type(result) == tuple else [[key, result]] 435 | if as_dict: 436 | results = dict(results) 437 | return results 438 | 439 | # tree building stuff 440 | def get_pyquery(self, tree=None, page_numbers=None): 441 | """ 442 | Wrap given tree in pyquery and return. 443 | If no tree supplied, will generate one from given page_numbers, or 444 | all page numbers. 445 | """ 446 | if not page_numbers: 447 | page_numbers = [] 448 | if tree is None: 449 | if not page_numbers and self.tree is not None: 450 | tree = self.tree 451 | else: 452 | tree = self.get_tree(page_numbers) 453 | if hasattr(tree, 'getroot'): 454 | tree = tree.getroot() 455 | return PyQuery(tree, css_translator=PDFQueryTranslator()) 456 | 457 | def get_tree(self, *page_numbers): 458 | """ 459 | Return lxml.etree.ElementTree for entire document, or page numbers 460 | given if any. 461 | """ 462 | cache_key = "_".join(map(str, _flatten(page_numbers))) 463 | tree = self._parse_tree_cacher.get(cache_key) 464 | if tree is None: 465 | # set up root 466 | root = parser.makeelement("pdfxml") 467 | if self.doc.info: 468 | for k, v in list(self.doc.info[0].items()): 469 | k = obj_to_string(k) 470 | v = obj_to_string(resolve1(v)) 471 | try: 472 | root.set(k, v) 473 | except ValueError as e: 474 | # Sometimes keys have a character in them, like ':', 475 | # that isn't allowed in XML attribute names. 476 | # If that happens we just replace non-word characters 477 | # with '_'. 478 | if "Invalid attribute name" in e.args[0]: 479 | k = re.sub(r'\W', '_', k) 480 | root.set(k, v) 481 | 482 | # Parse pages and append to root. 483 | # If nothing was passed in for page_numbers, we do this for all 484 | # pages, but if None was explicitly passed in, we skip it. 485 | if not(len(page_numbers) == 1 and page_numbers[0] is None): 486 | if page_numbers: 487 | pages = [[n, self.get_layout(self.get_page(n))] for n in 488 | _flatten(page_numbers)] 489 | else: 490 | pages = enumerate(self.get_layouts()) 491 | for n, page in pages: 492 | page = self._xmlize(page) 493 | if self.resort: 494 | self._sort(page) 495 | page.set('page_index', obj_to_string(n)) 496 | page.set('page_label', self.doc.get_page_number(n)) 497 | root.append(page) 498 | self._clean_text(root) 499 | 500 | # wrap root in ElementTree 501 | tree = etree.ElementTree(root) 502 | self._parse_tree_cacher.set(cache_key, tree) 503 | 504 | return tree 505 | 506 | def _clean_text(self, branch): 507 | """ 508 | Remove text from node if same text exists in its children. 509 | Apply string formatter if set. 
510 | """ 511 | if branch.text and self.input_text_formatter: 512 | branch.text = self.input_text_formatter(branch.text) 513 | try: 514 | for child in branch: 515 | self._clean_text(child) 516 | if branch.text and branch.text.find(child.text) >= 0: 517 | branch.text = branch.text.replace(child.text, '', 1) 518 | except TypeError: # not an iterable node 519 | pass 520 | 521 | def _xmlize(self, node, root=None): 522 | if isinstance(node, LayoutElement): 523 | # Already an XML element we can use 524 | branch = node 525 | else: 526 | # collect attributes of current node 527 | tags = self._getattrs( 528 | node, 'y0', 'y1', 'x0', 'x1', 'width', 'height', 'bbox', 529 | 'linewidth', 'pts', 'index', 'name', 'matrix', 'word_margin' 530 | ) 531 | if type(node) == LTImage: 532 | tags.update(self._getattrs( 533 | node, 'colorspace', 'bits', 'imagemask', 'srcsize', 534 | 'stream', 'name', 'pts', 'linewidth') 535 | ) 536 | elif type(node) == LTChar: 537 | tags.update(self._getattrs( 538 | node, 'fontname', 'adv', 'upright', 'size') 539 | ) 540 | elif type(node) == LTPage: 541 | tags.update(self._getattrs(node, 'pageid', 'rotate')) 542 | 543 | # create node 544 | branch = parser.makeelement(node.__class__.__name__, tags) 545 | 546 | branch.layout = node 547 | self._elements += [branch] # make sure layout keeps state 548 | if root is None: 549 | root = branch 550 | 551 | # add text 552 | if hasattr(node, 'get_text'): 553 | branch.text = strip_invalid_xml_chars(node.get_text()) 554 | 555 | # add children if node is an iterable 556 | if hasattr(node, '__iter__'): 557 | last = None 558 | for child in node: 559 | child = self._xmlize(child, root) 560 | if self.merge_tags and child.tag in self.merge_tags: 561 | if branch.text and child.text in branch.text: 562 | continue 563 | elif last is not None and last.tag in self.merge_tags: 564 | last.text += child.text 565 | last.set( 566 | '_obj_id', 567 | last.get('_obj_id','') + "," + child.get('_obj_id','') 568 | ) 569 | continue 570 | # sort children by bounding boxes 571 | if self.resort: 572 | _append_sorted(root, child, _comp_bbox) 573 | else: 574 | branch.append(child) 575 | last = child 576 | return branch 577 | 578 | def _sort(self, tree): 579 | """ Sort same-level elements top to bottom and left to right. """ 580 | children = list(tree) 581 | if children: 582 | tree[:] = sorted(children, key=lambda child: (-float(child.get('y1')), float(child.get('x0')))) 583 | for child in children: 584 | self._sort(child) 585 | 586 | def _getattrs(self, obj, *attrs): 587 | """ Return dictionary of given attrs on given object, if they exist, 588 | processing through _filter_value(). 589 | """ 590 | filtered_attrs = {} 591 | for attr in attrs: 592 | if hasattr(obj, attr): 593 | filtered_attrs[attr] = obj_to_string( 594 | self._filter_value(getattr(obj, attr)) 595 | ) 596 | return filtered_attrs 597 | 598 | def _filter_value(self, val): 599 | if self.round_floats: 600 | if type(val) == float: 601 | val = round(val, self.round_digits) 602 | elif hasattr(val, '__iter__') and not isinstance(val, six.string_types): 603 | val = [self._filter_value(item) for item in val] 604 | return val 605 | 606 | # page access stuff 607 | def get_page(self, page_number): 608 | """ Get PDFPage object -- 0-indexed.""" 609 | return self._cached_pages(target_page=page_number) 610 | 611 | def get_layout(self, page): 612 | """ Get PDFMiner Layout object for given page object or page number. 
""" 613 | if type(page) == int: 614 | page = self.get_page(page) 615 | self.interpreter.process_page(page) 616 | layout = self.device.get_result() 617 | layout = self._add_annots(layout, page.annots) 618 | return layout 619 | 620 | def get_layouts(self): 621 | """ Get list of PDFMiner Layout objects for each page. """ 622 | return (self.get_layout(page) for page in self._cached_pages()) 623 | 624 | def _cached_pages(self, target_page=-1): 625 | """ 626 | Get a page or all pages from page generator, caching results. 627 | This is necessary because PDFMiner searches recursively for pages, 628 | so we won't know how many there are until we parse the whole document, 629 | which we don't want to do until we need to. 630 | """ 631 | try: 632 | # pdfminer < 20131022 633 | self._pages_iter = self._pages_iter or self.doc.get_pages() 634 | except AttributeError: 635 | # pdfminer >= 20131022 636 | self._pages_iter = self._pages_iter or \ 637 | PDFPage.create_pages(self.doc) 638 | 639 | if target_page >= 0: 640 | while len(self._pages) <= target_page: 641 | next_page = next(self._pages_iter) 642 | if not next_page: 643 | return None 644 | next_page.page_number = 0 645 | self._pages += [next_page] 646 | try: 647 | return self._pages[target_page] 648 | except IndexError: 649 | return None 650 | self._pages += list(self._pages_iter) 651 | return self._pages 652 | 653 | def _add_annots(self, layout, annots): 654 | """Adds annotations to the layout object 655 | """ 656 | if annots: 657 | for annot in resolve1(annots): 658 | annot = resolve1(annot) 659 | if annot.get('Rect') is not None: 660 | annot['bbox'] = annot.pop('Rect') # Rename key 661 | annot = self._set_hwxy_attrs(annot) 662 | try: 663 | annot['URI'] = resolve1(annot['A'])['URI'] 664 | except KeyError: 665 | pass 666 | for k, v in six.iteritems(annot): 667 | if not isinstance(v, six.string_types): 668 | annot[k] = obj_to_string(v) 669 | elem = parser.makeelement('Annot', annot) 670 | layout.add(elem) 671 | return layout 672 | 673 | @staticmethod 674 | def _set_hwxy_attrs(attr): 675 | """Using the bbox attribute, set the h, w, x0, x1, y0, and y1 676 | attributes. 677 | """ 678 | bbox = attr['bbox'] 679 | attr['x0'] = bbox[0] 680 | attr['x1'] = bbox[2] 681 | attr['y0'] = bbox[1] 682 | attr['y1'] = bbox[3] 683 | attr['height'] = attr['y1'] - attr['y0'] 684 | attr['width'] = attr['x1'] - attr['x0'] 685 | return attr 686 | 687 | 688 | if __name__ == "__main__": 689 | import doctest 690 | pdf = PDFQuery("../examples/sample.pdf") 691 | doctest.testmod(extraglobs={'pdf': pdf}, optionflags=doctest.ELLIPSIS) 692 | -------------------------------------------------------------------------------- /pdfquery/pdftranslator.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | # 3 | # Copyright (C) 2008 - Olivier Lauzanne 4 | # 5 | # Distributed under the BSD license, see LICENSE.txt 6 | from cssselect import xpath as cssselect_xpath 7 | 8 | 9 | class PDFQueryTranslator(cssselect_xpath.GenericTranslator): 10 | 11 | def xpath_in_bbox_function(self, xpath, fn): 12 | if len(fn.arguments) > 1: 13 | x0,y0,x1,y1 = [float(t.value) for t in fn.arguments] 14 | else: 15 | x0,y0,x1,y1 = map(float, fn.arguments[0].value.split(",")) 16 | # TODO: seems to be doing < rather than <= ??? 
17 | xpath.add_condition("@x0 >= %s" % x0) 18 | xpath.add_condition("@y0 >= %s" % y0) 19 | xpath.add_condition("@x1 <= %s" % x1) 20 | xpath.add_condition("@y1 <= %s" % y1) 21 | return xpath 22 | 23 | def xpath_overlaps_bbox_function(self, xpath, fn): 24 | if len(fn.arguments) > 1: 25 | x0,y0,x1,y1 = [float(t.value) for t in fn.arguments] 26 | else: 27 | x0,y0,x1,y1 = map(float, fn.arguments[0].value.split(",")) 28 | # TODO: seems to be doing < rather than <= ??? 29 | xpath.add_condition("@x0 <= %s" % x1) 30 | xpath.add_condition("@y0 <= %s" % y1) 31 | xpath.add_condition("@x1 >= %s" % x0) 32 | xpath.add_condition("@y1 >= %s" % y0) 33 | return xpath -------------------------------------------------------------------------------- /requirements_py2.txt: -------------------------------------------------------------------------------- 1 | cssselect>=0.7.1 2 | chardet 3 | lxml>=3.0 4 | pdfminer>=20110515 5 | six 6 | pyquery>=1.2.2 7 | roman>=1.4.0 8 | -------------------------------------------------------------------------------- /requirements_py3.txt: -------------------------------------------------------------------------------- 1 | cssselect>=0.7.1 2 | chardet 3 | lxml>=3.0 4 | pdfminer.six 5 | pyquery>=1.2.2 6 | roman>=1.4.0 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | # set up tests 6 | if sys.version_info[:2] < (2, 7): 7 | test_suite = 'unittest2.collector' 8 | else: 9 | test_suite = 'tests' 10 | 11 | # Work around a traceback on Python < 2.7.4 and < 3.3.1 12 | # http://bugs.python.org/issue15881#msg170215 13 | try: 14 | import multiprocessing # noqa: unused 15 | except ImportError: 16 | pass 17 | 18 | setup( 19 | name='pdfquery', 20 | version='0.4.3', 21 | author=u'Jack Cushman', 22 | author_email='jcushman@gmail.com', 23 | packages=find_packages(), 24 | url='https://github.com/jcushman/pdfquery', 25 | license='MIT', 26 | description='Concise and friendly PDF scraper using JQuery or XPath selectors.', 27 | keywords='', 28 | long_description=open('README.rst').read(), 29 | install_requires = open('requirements_py3.txt').read() if sys.version_info >= (3, 0) else open('requirements_py2.txt').read(), 30 | classifiers=[ 31 | "Development Status :: 4 - Beta", 32 | "Topic :: Text Processing", 33 | "Topic :: Utilities", 34 | "License :: OSI Approved :: MIT License", 35 | "Intended Audience :: Developers", 36 | "Operating System :: OS Independent", 37 | "Programming Language :: Python", 38 | "Programming Language :: Python :: 2", 39 | "Programming Language :: Python :: 2.6", 40 | "Programming Language :: Python :: 2.7", 41 | "Programming Language :: Python :: 3", 42 | "Programming Language :: Python :: 3.3", 43 | "Programming Language :: Python :: 3.4", 44 | "Programming Language :: Python :: 3.5", 45 | ], 46 | 47 | test_suite=test_suite, 48 | ) 49 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/__init__.py -------------------------------------------------------------------------------- /tests/samples/IRS_1040A.pdf: -------------------------------------------------------------------------------- 
/requirements_py2.txt:
--------------------------------------------------------------------------------
1 | cssselect>=0.7.1
2 | chardet
3 | lxml>=3.0
4 | pdfminer>=20110515
5 | six
6 | pyquery>=1.2.2
7 | roman>=1.4.0
8 | 
--------------------------------------------------------------------------------
/requirements_py3.txt:
--------------------------------------------------------------------------------
1 | cssselect>=0.7.1
2 | chardet
3 | lxml>=3.0
4 | pdfminer.six
5 | pyquery>=1.2.2
6 | roman>=1.4.0
7 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys
3 | from setuptools import setup, find_packages
4 | 
5 | # set up tests
6 | if sys.version_info[:2] < (2, 7):
7 |     test_suite = 'unittest2.collector'
8 | else:
9 |     test_suite = 'tests'
10 | 
11 | # Work around a traceback on Python < 2.7.4 and < 3.3.1
12 | # http://bugs.python.org/issue15881#msg170215
13 | try:
14 |     import multiprocessing  # noqa: unused
15 | except ImportError:
16 |     pass
17 | 
18 | setup(
19 |     name='pdfquery',
20 |     version='0.4.3',
21 |     author=u'Jack Cushman',
22 |     author_email='jcushman@gmail.com',
23 |     packages=find_packages(),
24 |     url='https://github.com/jcushman/pdfquery',
25 |     license='MIT',
26 |     description='Concise and friendly PDF scraper using JQuery or XPath selectors.',
27 |     keywords='',
28 |     long_description=open('README.rst').read(),
29 |     install_requires=open('requirements_py3.txt').read() if sys.version_info >= (3, 0) else open('requirements_py2.txt').read(),
30 |     classifiers=[
31 |         "Development Status :: 4 - Beta",
32 |         "Topic :: Text Processing",
33 |         "Topic :: Utilities",
34 |         "License :: OSI Approved :: MIT License",
35 |         "Intended Audience :: Developers",
36 |         "Operating System :: OS Independent",
37 |         "Programming Language :: Python",
38 |         "Programming Language :: Python :: 2",
39 |         "Programming Language :: Python :: 2.6",
40 |         "Programming Language :: Python :: 2.7",
41 |         "Programming Language :: Python :: 3",
42 |         "Programming Language :: Python :: 3.3",
43 |         "Programming Language :: Python :: 3.4",
44 |         "Programming Language :: Python :: 3.5",
45 |     ],
46 | 
47 |     test_suite=test_suite,
48 | )
49 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/__init__.py
--------------------------------------------------------------------------------
/tests/samples/IRS_1040A.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/IRS_1040A.pdf
--------------------------------------------------------------------------------
/tests/samples/bug11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/bug11.pdf
--------------------------------------------------------------------------------
/tests/samples/bug15.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/bug15.pdf
--------------------------------------------------------------------------------
/tests/samples/bug17.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/bug17.pdf
--------------------------------------------------------------------------------
/tests/samples/bug18.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/bug18.pdf
--------------------------------------------------------------------------------
/tests/samples/bug28.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/bug28.pdf
--------------------------------------------------------------------------------
/tests/samples/bug37.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/bug37.pdf
--------------------------------------------------------------------------------
/tests/samples/bug39.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/bug39.pdf
--------------------------------------------------------------------------------
/tests/samples/bug42.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcushman/pdfquery/7f83848e6ead157ecfb8563d592550b1fdda6020/tests/samples/bug42.pdf
--------------------------------------------------------------------------------
/tests/saved_output/bug28_output.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | PUBLISHED OPINIONS
4 | KENTUCKY SUPREME COURT
5 | MAY 2015
6 | 
7 | I. CRIMINAL LAW:
8 | 
9 | A. Jeremy Caraway v. Commonwealth of Kentucky
10 | 
11 | 2013-SC-000610-MR
12 | 
13 | 
14 | 
15 | 
16 | May 14, 2015
17 | 
18 | Opinion of the Court by Justice Noble Affirming. All sitting; all concur. Caraway was convicted of various sex offenses and was sentenced to 20 years’ imprisonment. In affirming his convictions and sentence, the Court held that Caraway had accepted a juror’s qualifications during voir dire, thereby waiving any objection to the alleged partiality of the juror, and was thus barred from seeking appellate review on those grounds; that his direct appeal ineffective assistance of counsel claim was premature; that the trial court’s refusal to hear additional testimony at the sentencing hearing after the penalty phase of trial was not error and did not deny Caraway of meaningful judicial sentencing; and that, in light of the 2011 amendments to KRS 532.120(3), the trial court was not required or authorized to order credit for time served in custody before sentencing.
19 | 
20 | B. Jose Lopez v. Commonwealth of Kentucky
21 | 
22 | 2013-SC-000795-MR
23 | 
24 | 
25 | 
26 | 
27 | May 14, 2015
28 | 
29 | Opinion of the Court by Justice Keller. All sitting; all concur. . Lopez was convicted of rape, incest, sexual abuse, and unlawful transaction with a minor. His convictions arose from a sexual relationship he admitted to having with his under 16-year-old stepdaughter. On appeal, Lopez primarily argued that he did not receive pre-trial due process or a fair trial because he was not provided a qualified translator. The Court noted that Lopez raised a number of issues regarding what constitutes a qualified translator. However, because Lopez had not properly preserved those issues and had not shown how he was harmed by any error related to the translations, the Court did not substantively address them. Lopez also argued that his statement, which was taken in the presence of a translator and contained the translator's translation, should have been excluded as hearsay. The Court held that Lopez's translated statement was an admissible statement against interest and the fact that a translator was involved did not alter the nature of the statement. During the penalty phase, the jurors indicated that they could not agree regarding the length of certain sentences; however, they had agreed that any sentences should run concurrently. The trial court declared a deadlock and imposed sentences that ran consecutively rather than concurrently. Because Lopez had not preserved the issue, the Court looked for palpable error, which it could not find. Finally, Lopez argued that testimony by his stepdaughter about uncharged sexual activity amounted to impermissible KRE 404(b) evidence. The Court held, as it did in Noel v. Commonwealth, 76 S.W.3d 923 (Ky. 2002), that evidence regarding similar acts perpetrated against the same victim are almost always admissible to prove intent, plan, or absence of mistake. Therefore, the complained of testimony was properly admitted.
30 | 1
31 | 
32 | 
33 | 
34 | 
--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
1 | # to run:
2 | # python setup.py test
3 | #
4 | # to debug:
5 | # pip install nose
6 | # nosetests --pdb
7 | 
8 | import sys
9 | import pdfquery
10 | from pdfquery.cache import FileCache
11 | 
12 | from .utils import BaseTestCase
13 | 
14 | ### helpers ###
15 | 
16 | 
17 | 
18 | 
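# Note: FileCache (imported above) is not exercised by the tests shown here.
# Per the README it can cache parsed trees between runs; a hypothetical sketch:
#
#     pdf = pdfquery.PDFQuery("tests/samples/IRS_1040A.pdf",
#                             parse_tree_cacher=FileCache("/tmp/"))
#     pdf.load()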
22 | """ 23 | 24 | @classmethod 25 | def setUpClass(cls): 26 | cls.pdf = pdfquery.PDFQuery("tests/samples/IRS_1040A.pdf") 27 | cls.pdf.load() 28 | 29 | def test_xml_conversion(self): 30 | """ 31 | Test that converted XML hasn't changed from saved version. 32 | """ 33 | self.assertValidOutput(self.pdf, "IRS_1040A_output") 34 | 35 | def test_selectors(self): 36 | """ 37 | Test the :contains and :in_bbox selectors. 38 | """ 39 | label = self.pdf.pq('LTTextLineHorizontal:contains("Your first name ' 40 | 'and initial")') 41 | self.assertEqual(len(label), 1) 42 | 43 | left_corner = float(label.attr('x0')) 44 | self.assertEqual(left_corner, 143.651) 45 | 46 | bottom_corner = float(label.attr('y0')) 47 | self.assertEqual(bottom_corner, 714.694) 48 | 49 | name = self.pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % 50 | (left_corner, 51 | bottom_corner - 30, 52 | left_corner + 150, 53 | bottom_corner) 54 | ).text() 55 | self.assertEqual(name, "John E.") 56 | 57 | def test_extract(self): 58 | """ 59 | Test the extract() function. 60 | """ 61 | values = self.pdf.extract([ 62 | ('with_parent', 'LTPage[pageid="1"]'), 63 | ('with_formatter', 'text'), 64 | 65 | ('last_name', 'LTTextLineHorizontal:in_bbox("315,680,395,700")'), 66 | ('spouse', 'LTTextLineHorizontal:in_bbox("170,650,220,680")'), 67 | 68 | ('with_parent', 'LTPage[pageid="2"]'), 69 | 70 | ('oath', 'LTTextLineHorizontal:contains("perjury")', 71 | lambda match: match.text()[:30] + "..."), 72 | ('year', 'LTTextLineHorizontal:contains("Form 1040A (")', 73 | lambda match: int(match.text()[-5:-1])) 74 | ]) 75 | 76 | self.assertDictEqual(values, { 77 | 'last_name': 'Michaels', 78 | 'spouse': 'Susan R.', 79 | 'oath': u'Under penalties of perjury, I ...', 80 | 'year': 2007 81 | }) 82 | 83 | def test_page_numbers(self): 84 | self.assertEqual(self.pdf.tree.getroot()[0].get('page_label'), '1') 85 | 86 | 87 | class TestDocInfo(BaseTestCase): 88 | 89 | def test_docinfo(self): 90 | 91 | doc_info_results = [ 92 | ["tests/samples/bug11.pdf", 93 | {'Producer': 'Mac OS X 10.9.3 Quartz PDFContext', 94 | 'Title': u'\u262d\U0001f61c\U0001f4a9Unicode is fun!', 95 | 'Author': 'Russkel', 'Creator': 'Firefox', 96 | 'ModDate': "D:20140528141914+08'00'", 97 | 'CreationDate': 'D:20140528061106Z', 'Subject': ''}], 98 | ["tests/samples/bug15.pdf", 99 | {'Producer': 'Mac OS X 10.9.3 Quartz PDFContext', 100 | 'Author': 'Brepols Publishers', 101 | 'Creator': 'PDFsharp 1.2.1269-g (www.pdfsharp.com)', 102 | 'AAPL_Keywords': '["Brepols", "Publishers", "CTLO"]', 103 | 'Title': 'Exporter', 104 | 'ModDate': "D:20140614192741Z00'00'", 105 | 'Keywords': 'Brepols, Publishers, CTLO', 106 | 'CreationDate': "D:20140614192741Z00'00'", 107 | 'Subject': 'Extrait de la Library of Latin Texts - Series A'}], 108 | ["tests/samples/bug17.pdf", 109 | {'CreationDate': 'D:20140328164512Z', 110 | 'Creator': 'Adobe InDesign CC (Macintosh)', 111 | 'ModDate': 'D:20140328164513Z', 112 | 'Producer': 'Adobe PDF Library 10.0.1', 'Trapped': '/False'}] 113 | ] 114 | 115 | for file_path, expected_results in doc_info_results: 116 | pdf = pdfquery.PDFQuery(file_path) 117 | pdf.load(None) 118 | docinfo = dict(pdf.tree.getroot().attrib) 119 | self.assertDictEqual(docinfo,expected_results) 120 | 121 | 122 | class TestUnicode(BaseTestCase): 123 | 124 | def test_unicode_text(self): 125 | pdf = pdfquery.PDFQuery("tests/samples/bug18.pdf") 126 | pdf.load() 127 | self.assertEqual( 128 | pdf.pq('LTTextLineHorizontal:contains("Hop Hing Oils")').text(), 129 | (u'5 Hop Hing Oils and Fats (Hong Kong) Ltd \uf06c ' 
130 |              u'\u7279\u5bf6\u7cbe\u88fd\u8c6c\u6cb9')
131 |         )
132 | 
133 |     def test_invalid_xml_characters(self):
134 |         pdf = pdfquery.PDFQuery("tests/samples/bug39.pdf")
135 |         pdf.load(2)  # throws error if we fail to strip ascii control characters -- see issue #39
136 | 
137 | 
138 | class TestAnnotations(BaseTestCase):
139 |     """
140 |     Ensure that annotations such as links are getting added to the PDFs
141 |     properly, as discussed in issue #28.
142 |     """
143 | 
144 |     def test_xml_conversion(self):
145 |         """
146 |         Test that converted XML hasn't changed from saved version.
147 |         """
148 |         pdf = pdfquery.PDFQuery("tests/samples/bug28.pdf")
149 |         pdf.load()
150 |         self.assertValidOutput(pdf, "bug28_output")
151 | 
152 |     def test_annot_dereferencing(self):
153 |         """
154 |         See issues #37, #42.
155 |         """
156 |         pdf = pdfquery.PDFQuery("tests/samples/bug37.pdf")
157 |         pdf.load()
158 |         pdf = pdfquery.PDFQuery("tests/samples/bug42.pdf")
159 |         pdf.load()
160 | 
--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
1 | from lxml import etree
2 | 
3 | from six import BytesIO
4 | import sys
5 | 
6 | if sys.version_info[:2] < (2, 7):
7 |     import unittest2 as unittest
8 | else:
9 |     import unittest
10 | 
11 | # ignore index= attribute in xml comparison, as it is not stable between python versions
12 | IGNORE_ATTRIBS = {'index'}
13 | 
14 | class BaseTestCase(unittest.TestCase):
15 | 
16 |     def assertValidOutput(self, pdf, output_name):
17 |         """
18 |         Test that converted XML hasn't changed from saved version.
19 |         """
20 |         # Just skip this test if we're on python 2.6 -- float handling makes element sort ordering unpredictable,
21 |         # causing intermittent test failures.
22 |         if sys.version_info[:2] < (2, 7):
23 |             return
24 | 
25 |         # get current XML for sample file
26 |         tree_string = BytesIO()
27 |         pdf.tree.write(tree_string, pretty_print=True, encoding="utf-8")
28 |         tree_string = tree_string.getvalue()
29 | 
30 |         # get previous XML
31 |         # this varies by Python version, because the float handling isn't quite
32 |         # the same
33 |         comparison_file = "tests/saved_output/%s.xml" % (output_name,)
34 |         with open(comparison_file, 'rb') as f:
35 |             saved_string = f.read()
36 | 
37 |         # compare current to previous
38 |         try:
39 |             self.xml_strings_equal(saved_string, tree_string)
40 |         except self.failureException as e:
41 |             output_path = "tests/%s_failed_output.xml" % output_name
42 |             with open(output_path, "wb") as out:
43 |                 out.write(tree_string)
44 |             # for debugging: run `pytest --lf --pdb` and then use etree.dump(e1), etree.dump(e2)
45 |             e1, e2 = e.args[1:3]
46 |             raise self.failureException("XML conversion of sample pdf has changed! Compare %s to %s" % (comparison_file, output_path)) from e
47 | 
48 |     def xml_strings_equal(self, s1, s2, ignore_attribs=IGNORE_ATTRIBS):
49 |         """
50 |         Assert that two xml strings are semantically equivalent, ignoring attribute ordering, whitespace, and any attributes in ignore_attribs; raises failureException on the first mismatch.
51 | """ 52 | # via http://stackoverflow.com/a/24349916/307769 53 | def elements_equal(e1, e2): 54 | if e1.tag != e2.tag: raise self.failureException("Mismatched tags", e1, e2) 55 | if e1.text != e2.text: raise self.failureException("Mismatched text", e1, e2) 56 | if e1.tail != e2.tail: raise self.failureException("Mismatched tail", e1, e2) 57 | if set(e1.attrib) - ignore_attribs != set(e2.attrib) - ignore_attribs: raise self.failureException("Mismatched attributes %s and %s" % (e1.attrib, e2.attrib), e1, e2) 58 | if len(e1) != len(e2): raise self.failureException("Mismatched children", e1, e2) 59 | for c1, c2 in zip(e1, e2): 60 | elements_equal(c1, c2) 61 | 62 | e1 = etree.XML(s1, parser=etree.XMLParser(remove_blank_text=True)) 63 | e2 = etree.XML(s2, parser=etree.XMLParser(remove_blank_text=True)) 64 | 65 | return elements_equal(e1, e2) 66 | --------------------------------------------------------------------------------