├── .editorconfig
├── .gitattributes
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── setup.py
├── src
    └── hocr
    │   ├── __init__.py
    │   ├── commands.py
    │   ├── meta.py
    │   ├── overlay.py
    │   ├── page.py
    │   └── parser.py
└── tests
    ├── conftest.py
    └── hocr
        ├── example.html
        └── test_parse.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # editorconfig.org
 2 | root = true
 3 | 
 4 | [*]
 5 | indent_style = space
 6 | indent_size = 4
 7 | end_of_line = lf
 8 | charset = utf-8
 9 | trim_trailing_whitespace = true
10 | insert_final_newline = true
11 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Dot files
 2 | .*
 3 | !.editorconfig
 4 | !.gitattributes
 5 | !.gitignore
 6 | !.travis*
 7 | 
 8 | # Python
 9 | *.pyc
10 | *.pyo
11 | *.egg-info
12 | 
13 | # Git files
14 | *.orig
15 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - '3.3'
 4 | 
 5 | install:
 6 |   - 'travis_retry pip install -e ".[test]" --use-mirrors'
 7 |   - 'travis_retry pip install coveralls --use-mirrors'
 8 | 
 9 | script: 'py.test --pep8 --cov hocr'
10 | 
11 | after_success: 'coveralls'
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright © 2012-2013 by Concordus Applications, Inc.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 7 | of the Software, and to permit persons to whom the Software is furnished to
 8 | do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # python-hocr
 2 | [![Build Status](https://travis-ci.org/concordusapps/python-hocr.png?branch=master)](https://travis-ci.org/concordusapps/python-hocr)
 3 | [![Coverage Status](https://coveralls.io/repos/concordusapps/python-hocr/badge.png?branch=master)](https://coveralls.io/r/concordusapps/python-hocr?branch=master)
 4 | [![PyPi Version](https://pypip.in/v/hocr/badge.png)](https://pypi.python.org/pypi/hocr)
 5 | ![PyPi Downloads](https://pypip.in/d/hocr/badge.png)
 6 | > HOCR manipulation and utility library; provides hocr2pdf binary.
 7 | 
 8 | ## License
 9 | 
10 | Unless otherwise noted, all files contained within this project are licensed under the MIT open-source license. See the included file LICENSE or visit [opensource.org][] for more information.
11 | 
12 | [opensource.org]: http://opensource.org/licenses/MIT
13 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from setuptools import setup, find_packages
 4 | from pkgutil import get_importer
 5 | 
 6 | 
# Navigate, import, and retrieve the metadata of the project.
# The ``meta`` module is loaded directly from the ``src/hocr`` directory
# rather than through ``import hocr``; ``hocr/__init__.py`` imports the
# package's other modules, which this avoids.
# NOTE(review): ``find_module``/``load_module`` are legacy importer APIs --
# confirm they exist on every Python version this project must support.
meta = get_importer('src/hocr').find_module('meta').load_module('meta')


setup(
    name='hocr',
    # Version and description come from src/hocr/meta.py.
    version=meta.version,
    description=meta.description,
    author='Concordus Applications',
    author_email='support@concordusapps.com',
    url='https://github.com/concordusapps/python-hocr',
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Intended Audience :: System Administrators',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 3.3',
    ],
    # The package sources live under src/.
    package_dir={'hocr': 'src/hocr'},
    packages=find_packages('src'),
    # Installs the ``hocr2pdf`` command, backed by hocr.commands.hocr2pdf.
    entry_points={
        'console_scripts': ['hocr2pdf = hocr.commands:hocr2pdf']
    },
    # VCS locations for the two requirements pinned to the egg versions
    # named in the fragments below ('beautifulsoup == 4.0', 'chardet == dev').
    dependency_links=[
        'bzr+lp:beautifulsoup#egg=beautifulsoup-4.0',
        'git+git://github.com/bsidhom/python3-chardet.git@master#egg=chardet-dev',
    ],
    install_requires=[
        'six',
        'lxml >= 3.2.3, < 4.0.0',
        'chardet == dev',
        'beautifulsoup == 4.0',
        'hummus >= 0.2.0',
        'filemagic',
        'pillow'
    ],
    # Installed with ``pip install -e ".[test]"`` (see .travis.yml).
    extras_require={
        'test': [
            'pytest',
            'pytest-pep8',
            'pytest-cov'
        ],
    },
)
52 | 


--------------------------------------------------------------------------------
/src/hocr/__init__.py:
--------------------------------------------------------------------------------
 1 | from .page import Page
 2 | from .parser import parse
 3 | from .meta import version as __version__, description as __doc__  # NOQA
 4 | from .overlay import overlay
 5 | 
# Names exported by ``from hocr import *``.
__all__ = [
    'parse',
    'Page',
    'overlay',
]
11 | 


--------------------------------------------------------------------------------
/src/hocr/commands.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from argparse import ArgumentParser
 3 | import re
 4 | from hocr import overlay
 5 | import sys
 6 | import os
 7 | 
 8 | 
 9 | def hocr2pdf():
10 |     # Build command arguments.
11 |     parser = ArgumentParser()
12 |     parser.add_argument('source')
13 |     parser.add_argument('hocr')
14 |     parser.add_argument('-o', '--output')
15 | 
16 |     # Parse command arguments.
17 |     arguments = parser.parse_args()
18 | 
19 |     # Parse source.
20 |     source_filename = arguments.source
21 |     source_index = 0
22 |     match = re.match(r'^(.*)\[(.*)\]$', arguments.source)
23 |     if match is not None:
24 |         source_filename, source_index = match.groups()
25 | 
26 |     # Decide on output.
27 |     if arguments.output:
28 |         output_stream = open(arguments.output, 'wb')
29 | 
30 |     else:
31 |         output_stream = os.fdopen(sys.stdout.fileno(), 'wb')
32 | 
33 |     # Invoke the overlay method.
34 |     overlay(output=output_stream,
35 |             source=source_filename,
36 |             text=arguments.hocr,
37 |             index=int(source_index))
38 | 
39 |     # Close the stream.
40 |     output_stream.close()
41 | 


--------------------------------------------------------------------------------
/src/hocr/meta.py:
--------------------------------------------------------------------------------
# Project metadata. setup.py loads this module to obtain the package version
# and description; hocr/__init__.py re-exports them as __version__/__doc__.
version = '0.2.11'
description = ('HOCR manipulation and utility library; '
               'provides hocr2pdf binary.')
4 | 


--------------------------------------------------------------------------------
/src/hocr/overlay.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import itertools
  3 | import six
  4 | from hummus import Document, Font, Text, Image
  5 | from .parser import parse
  6 | from PIL import ImageFont
  7 | import magic
  8 | 
  9 | 
 10 | class Line:
 11 | 
 12 |     def __init__(self):
 13 |         self.words = []
 14 | 
 15 | 
 16 | def _is_within(value, compare, threshold=0.005):
 17 |     return (compare * (1 - threshold)) < value < (compare * (1 + threshold))
 18 | 
 19 | 
 20 | def _collect_lines(words):
 21 |     words.sort(key=lambda w: w.box.top)
 22 |     lines = []
 23 |     current_line = []
 24 |     current_top = None
 25 |     for word in words:
 26 |         if current_top is None:
 27 |             current_line.append(word)
 28 |             current_top = word.box.top
 29 | 
 30 |         elif _is_within(word.box.top, current_top):
 31 |             current_line.append(word)
 32 |             current_top = (current_top + word.box.top) / 2.0
 33 | 
 34 |         else:
 35 |             lines.append(current_line)
 36 |             current_line = []
 37 |             current_line.append(word)
 38 |             current_top = word.box.top
 39 |     if current_line:
 40 |         lines.append(current_line)
 41 |     return lines
 42 | 
 43 | 
 44 | def _join_words(groups):
 45 |     words = []
 46 |     for group in groups:
 47 |         group.sort(key=lambda w: w.box.left)
 48 |         text = ""
 49 |         top = group[0].box.top
 50 |         left = group[0].box.left
 51 |         for word in group:
 52 |             text = text + " " + word.text
 53 |             top = (top + word.box.top) / 2.0
 54 |         top = int(round(top))
 55 |         word = group[0]
 56 |         word.box.top = top
 57 |         word.box.right = group[-1].box.right
 58 |         word.box.left = left
 59 |         word.text = text
 60 |         words.append(word)
 61 |     return words
 62 | 
 63 | 
 64 | def _align_words(groups):
 65 |     words = []
 66 |     for group in groups:
 67 |         group.sort(key=lambda w: w.box.left)
 68 |         top = group[0].box.top
 69 |         for word in group:
 70 |             word.box.top = top
 71 |             words.append(word)
 72 | 
 73 |     return words
 74 | 
 75 | 
 76 | def _split_lines(lines):
 77 |     chunks = []
 78 |     for line in lines:
 79 |         blocks = []
 80 |         cur_block = []
 81 |         cur_right = None
 82 |         line.sort(key=lambda w: w.box.left)
 83 |         for word in line:
 84 |             if cur_right is None:
 85 |                 cur_block.append(word)
 86 |                 cur_right = word.box.right
 87 | 
 88 |             elif _is_within(word.box.left, cur_right, threshold=0.25):
 89 |                 cur_right = word.box.right
 90 |                 cur_block.append(word)
 91 | 
 92 |             else:
 93 |                 blocks.append(cur_block)
 94 |                 cur_right = word.box.right
 95 |                 cur_block = [word]
 96 |         if cur_block:
 97 |             blocks.append(cur_block)
 98 |         chunks.append(blocks)
 99 |     return list(itertools.chain(*chunks))
100 | 
101 | 
def _is_document(source):
    """Check if the source refers to a PDF document or not.

    @param[in] source
        A filename (``str``), a seekable file-like object, or any other
        value, which is handed to ``id_buffer`` unchanged.

    @return
        True when the detected MIME type is ``application/pdf``.
    """
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        if isinstance(source, str):
            return m.id_filename(source) == 'application/pdf'

        # A file-like object is not itself a buffer: sniff its leading bytes
        # and rewind, so whoever reads the stream next starts where it was.
        # NOTE(review): assumes ``id_buffer`` expects raw data rather than a
        # stream object -- confirm against the filemagic version in use.
        seekable = getattr(source, 'seekable', None)
        if hasattr(source, 'read') and seekable is not None and seekable():
            position = source.tell()
            try:
                head = source.read(4096)
            finally:
                source.seek(position)
            return m.id_buffer(head) == 'application/pdf'

        return m.id_buffer(source) == 'application/pdf'
108 | 
109 | 
def overlay(output, source, text, index=0, font='TimesNewRoman', dpi=72.0):
    """Overlay a PDF document or JPEG image with the text from a HOCR file.

    Writes the overlaid source as a PDF file to the output filename or
    file-like object. Only the first page found in the HOCR text is used.

    @param[in] output
        Destination handed to the ``Document`` that is written.

    @param[in] source
        Either a file-like object or a filename of the image or PDF document
        to embed as the background.

    @param[in] text
        Either a file-like object or a filename of the HOCR text.

    @param[in] index
        Selects the page of a PDF source (``target[index]``); for any other
        source it is passed to ``Image`` as ``index``.

    @param[in] font
        Font name passed to ``Font`` for every piece of text written.

    @param[in] dpi
        Divisor applied to both the measured and the expected text width
        when the font size is estimated.
    """

    # Parse the HOCR text and keep its first page.
    page = parse(text)[0]

    # Initialize PDF document.
    with Document(output) as document:

        # Initialize a new page and begin its context.
        with document.Page() as ctx:

            # Prepare to embed the target as the background of the
            # new PDF.
            if _is_document(source):
                with Document(source, 'r') as target:

                    # Set the box to be equivalent as the source.
                    target_page = target[index]
                    ctx.box = target_page.box

                    # Embed the target.
                    ctx.embed(target_page)

            else:
                # Anything that is not detected as a PDF is assumed to be
                # an image; no further check is made here.
                with Image(source, index=index) as target:

                    # Set the box to be equivalent as the source.
                    ctx.box = target.box

                    # Embed the target.
                    ctx.embed(target)

            # Figure out the scale between HOCR and PDF page coordinates
            # from the right edges of the two boxes.
            scale = ctx.box.right / page.box.right

            # Filter out any words that are "empty"
            words = list(filter(lambda w: bool(w.text.strip()), page.words))

            # Collect the words into the lines of the page.
            lines = _collect_lines(words)

            # Split each line into runs of horizontally adjacent words.
            lines = _split_lines(lines)

            # Merge every run into a single word carrying the joined text.
            words = _join_words(lines)

            # Iterate through the merged words.
            for word in words:

                # Skip if we don't have text.
                # (From here on ``text`` no longer refers to the parameter.)
                text = word.text.strip()
                if not text:
                    continue

                # Get x,y position where text should begin.
                x, y = word.box.left, word.box.top

                # Apply the scale factor.
                x *= scale
                y *= scale

                # Mirror the Y axis, as HOCR and PDF use different
                # vertical orientations.
                y = ctx.box.bottom - y

                # Build a font object.
                fobj = Font(font, bold=word.bold, italic=word.italic)

                # Approximate the font size by measuring the width of
                # the text using pillow at size 10 and scaling that size by
                # the ratio of expected to measured width.
                pil_font = ImageFont.truetype(fobj.file, 10)
                base_width, _ = pil_font.getsize(text)
                base_width /= dpi
                expected_width = (word.box.width * scale) / dpi
                scale_width = expected_width / base_width
                fsize = 10 * scale_width

                # Measure the font again and shift it down.
                pil_font = ImageFont.truetype(fobj.file, int(fsize))
                _, actual_height = pil_font.getsize(text)
                y -= actual_height

                # Write text.
                # print(text, x, y)
                ctx.add(Text(text, fobj, size=fsize, x=x, y=y, mode=7))
211 | 


--------------------------------------------------------------------------------
/src/hocr/page.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | 
  4 | class Box:
  5 | 
  6 |     def __init__(self, text=None, *, left=0, right=0, top=0, bottom=0):
  7 | 
  8 |         # Parse the text string representation if given.
  9 |         if text is not None:
 10 |             left, top, right, bottom = map(int, text.split())
 11 | 
 12 |         self.left = left
 13 |         self.right = right
 14 |         self.top = top
 15 |         self.bottom = bottom
 16 | 
 17 |     @property
 18 |     def width(self):
 19 |         return self.right - self.left
 20 | 
 21 |     @property
 22 |     def height(self):
 23 |         return self.bottom - self.top
 24 | 
 25 |     def __repr__(self):
 26 |         return '<Box(%r, %r, %r, %r)>' % (
 27 |             self.left, self.top, self.right, self.bottom)
 28 | 
 29 | 
 30 | class Base:
 31 | 
 32 |     _allowed_ocr_classes = {}
 33 | 
 34 |     def __init__(self, element):
 35 |         """
 36 |         @param[in] element
 37 |             XML node for the OCR element.
 38 |         """
 39 | 
 40 |         # Store the element for later reference.
 41 |         self._element = element
 42 | 
 43 |         # Create an element cache.
 44 |         self._cache = {}
 45 | 
 46 |         # Parse the properties of the HOCR element.
 47 |         properties = element.get('title', '').split(';')
 48 |         for prop in properties:
 49 |             name, value = prop.split(maxsplit=1)
 50 |             if name == 'bbox':
 51 |                 self.box = Box(value)
 52 | 
 53 |             elif name == 'image':
 54 |                 self.image = value.strip('" ')
 55 | 
 56 |     def __dir__(self):
 57 |         return super().__dir__() + list(self._allowed_ocr_classes)
 58 | 
 59 |     def __getattr__(self, name):
 60 |         # Return the cached version if present.
 61 |         if name in self._cache:
 62 |             return self._cache[name]
 63 | 
 64 |         # Parse the named OCR elements.
 65 |         if name in self._allowed_ocr_classes:
 66 |             ref = OCR_CLASSES[name]
 67 |             nodes = self._element.find_all(class_=re.compile(ref['name']))
 68 |             self._cache[name] = elements = list(map(ref['class'], nodes))
 69 |             return elements
 70 | 
 71 |         # Attribute is not present.
 72 |         raise AttributeError(name)
 73 | 
 74 | 
 75 | class Word(Base):
 76 | 
 77 |     _allowed_ocr_classes = {}
 78 | 
 79 |     def __init__(self, element):
 80 |         # Initialize the base.
 81 |         super().__init__(element)
 82 | 
 83 |         # Discover if we are "bold".
 84 |         # A word element is bold if its text node is wrapped in a <strong/>.
 85 |         self.bold = bool(element.find('strong'))
 86 | 
 87 |         # Discover if we are "italic".
 88 |         # A word element is italic if its text node is wrapped in a <em/>.
 89 |         self.italic = bool(element.find('em'))
 90 | 
 91 |         # Find the text node.
 92 |         self.text = element.text
 93 | 
 94 |     def __str__(self):
 95 |         return '<Word(%r, %r)>' % (self.text, self.box)
 96 | 
 97 | 
class Line(Base):
    # A line element; exposes its ``words``.
    _allowed_ocr_classes = {'words'}
100 | 
101 | 
class Paragraph(Base):
    # A paragraph element; exposes its ``lines`` and ``words``.
    _allowed_ocr_classes = {'lines', 'words'}
104 | 
105 | 
class Block(Base):
    # A block element; exposes its ``paragraphs``, ``lines`` and ``words``.
    _allowed_ocr_classes = {'paragraphs', 'lines', 'words'}
108 | 
109 | 
class Page(Base):
    # A page element; exposes ``blocks``, ``paragraphs``, ``lines`` and
    # ``words``.
    _allowed_ocr_classes = {'blocks', 'paragraphs', 'lines', 'words'}
112 | 
113 | 
# Maps each collection attribute name to the regular expression matched
# against an element's ``class`` ('name') and to the wrapper type built for
# every matching node ('class'). Consulted by Base.__getattr__.
OCR_CLASSES = {
    'words': {'name': 'ocr.?_word', 'class': Word},
    'lines': {'name': 'ocr_line', 'class': Line},
    'paragraphs': {'name': 'ocr_par', 'class': Paragraph},
    'blocks': {'name': 'ocr_carea', 'class': Block}
}
120 | 


--------------------------------------------------------------------------------
/src/hocr/parser.py:
--------------------------------------------------------------------------------
 1 | from .page import Page
 2 | import six
 3 | from bs4 import UnicodeDammit, BeautifulSoup
 4 | # from lxml.etree import fromstring
 5 | 
 6 | 
 7 | def parse(source):
 8 |     """Parse a HOCR stream into page elements.
 9 | 
10 |     @param[in] source
11 |         Either a file-like object or a filename of the HOCR text.
12 |     """
13 | 
14 |     # Corece the source into content.
15 |     if isinstance(source, six.string_types):
16 |         with open(source, 'rb') as stream:
17 |             content = stream.read()
18 | 
19 |     else:
20 |         content = source.read()
21 | 
22 |     # Parse the HOCR xml stream.
23 |     ud = UnicodeDammit(content, is_html=True)
24 |     soup = BeautifulSoup(ud.unicode_markup, 'lxml')
25 | 
26 |     # Get all the pages and parse them into page elements.
27 |     return [Page(x) for x in soup.find_all(class_='ocr_page')]
28 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys
3 | from os import path
4 | 
# Append the project's ``src`` directory (a sibling of this tests directory)
# to the module search path so the tests can ``import hocr``.
sys.path.append(path.join(path.dirname(__file__), '..', 'src'))
7 | 


--------------------------------------------------------------------------------
/tests/hocr/test_parse.py:
--------------------------------------------------------------------------------
  1 | import hocr
  2 | from os import path
  3 | from pytest import raises
  4 | 
  5 | BASE_DIR = path.dirname(__file__)
  6 | 
  7 | 
  8 | def parse(filename='example.html'):
  9 |     return hocr.parse(path.join(BASE_DIR, filename))
 10 | 
 11 | 
 12 | def test_parse_from_stream():
 13 |     with open(path.join(BASE_DIR, 'example.html'), 'rb') as stream:
 14 |         pages = hocr.parse(stream)
 15 | 
 16 |         assert len(pages) == len(parse('example.html'))
 17 | 
 18 | 
 19 | def test_get_number_of_pages():
 20 |     assert len(parse()) == 1
 21 | 
 22 | 
 23 | def test_parse_return_datastructure_is_pages():
 24 |     for item in parse():
 25 |         assert isinstance(item, hocr.Page)
 26 | 
 27 | 
 28 | # def test_page_has_page_number():
 29 | #     pages = parse()
 30 | 
 31 | #     for item in pages:
 32 | #         assert hasattr(item, 'index')
 33 | #         assert item.index >= 0
 34 | 
 35 | 
 36 | # def test_page_has_unique_page_number():
 37 | #     pages = parse()
 38 | #     numbers = {x.index for x in pages}
 39 | 
 40 | #     assert len(pages) == len(numbers)
 41 | 
 42 | 
 43 | def test_page_elements_in_dir():
 44 |     page = parse()[0]
 45 | 
 46 |     assert 'words' in dir(page)
 47 |     assert 'blocks' in dir(page)
 48 | 
 49 | 
 50 | def test_page_has_proper_attribute_error():
 51 |     page = parse()[0]
 52 | 
 53 |     with raises(AttributeError):
 54 |         page.shjgioda
 55 | 
 56 | 
 57 | def test_page_has_bounding_box():
 58 |     for page in parse():
 59 |         assert page.box.left >= 0
 60 | 
 61 | 
 62 | def test_page_bounding_box_has_correct_value():
 63 |     page = parse()[0]
 64 | 
 65 |     assert page.box.left == 0
 66 |     assert page.box.top == 0
 67 |     assert page.box.right == 5100
 68 |     assert page.box.bottom == 6600
 69 | 
 70 | 
 71 | def test_page_has_image_name():
 72 |     page = parse()[0]
 73 | 
 74 |     assert page.image == '/tmp/tmpepham8.tiff'
 75 | 
 76 | 
 77 | def test_page_has_blocks():
 78 |     page = parse()[0]
 79 | 
 80 |     assert len(page.blocks) == 3
 81 | 
 82 | 
 83 | def test_page_blocks_have_paragraphs():
 84 |     page = parse()[0]
 85 | 
 86 |     assert len(page.blocks[0].paragraphs) == 1
 87 |     assert len(page.blocks[1].paragraphs) == 50
 88 |     assert len(page.blocks[2].paragraphs) == 1
 89 | 
 90 | 
 91 | def test_page_block_paragraphs_have_lines():
 92 |     page = parse()[0]
 93 | 
 94 |     assert len(page.blocks[1].paragraphs[0].lines) == 2
 95 |     assert len(page.blocks[1].paragraphs[10].lines) == 1
 96 |     assert len(page.blocks[1].paragraphs[20].lines) == 1
 97 |     assert len(page.blocks[2].paragraphs[0].lines) == 1
 98 | 
 99 | 
def test_page_block_paragraph_lines_have_words():
    """The first line of selected paragraphs holds the expected word count."""
    blocks = parse()[0].blocks

    # (block index, paragraph index, expected words in the first line)
    expectations = (
        (0, 0, 3),
        (1, 0, 3),
        (1, 10, 54),
    )
    for block, paragraph, count in expectations:
        first_line = blocks[block].paragraphs[paragraph].lines[0]
        assert len(first_line.words) == count
106 | 
107 | 
def test_page_has_words():
    """All words of the example page are found."""
    words = parse()[0].words

    assert len(words) == 2665
112 | 
113 | 
def test_words_have_text():
    """Selected words carry the expected text."""
    words = parse()[0].words

    for index, expected in ((0, 'TABLE'), (2, 'CONTENTS'), (102, '.')):
        assert words[index].text == expected
120 | 
121 | 
def test_words_have_boldness():
    """The first word is bold while word 73 is not."""
    words = parse()[0].words

    assert words[0].bold
    assert not words[73].bold
127 | 
128 | 
def test_words_have_italicness():
    """Only word 2 of the sampled words is italic."""
    words = parse()[0].words

    assert not words[0].italic
    assert words[2].italic
    assert not words[73].italic
135 | 
136 | 
def test_words_have_bounding_box():
    """The box of the first word matches the known coordinates."""
    box = parse()[0].words[0].box

    edges = (box.left, box.top, box.right, box.bottom)
    assert edges == (2216, 1049, 2449, 1098)
144 | 


--------------------------------------------------------------------------------