├── .editorconfig ├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── setup.py ├── src └── hocr │ ├── __init__.py │ ├── commands.py │ ├── meta.py │ ├── overlay.py │ ├── page.py │ └── parser.py └── tests ├── conftest.py └── hocr ├── example.html └── test_parse.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = space 6 | indent_size = 4 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dot files 2 | .* 3 | !.editorconfig 4 | !.gitattributes 5 | !.gitignore 6 | !.travis* 7 | 8 | # Python 9 | *.pyc 10 | *.pyo 11 | *.egg-info 12 | 13 | # Git files 14 | *.orig 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.3' 4 | 5 | install: 6 | - 'travis_retry pip install -e ".[test]" --use-mirrors' 7 | - 'travis_retry pip install coveralls --use-mirrors' 8 | 9 | script: 'py.test --pep8 --cov hocr' 10 | 11 | after_success: 'coveralls' 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2012-2013 by Concordus Applications, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to 8 | do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-hocr 2 | [![Build Status](https://travis-ci.org/concordusapps/python-hocr.png?branch=master)](https://travis-ci.org/concordusapps/python-hocr) 3 | [![Coverage Status](https://coveralls.io/repos/concordusapps/python-hocr/badge.png?branch=master)](https://coveralls.io/r/concordusapps/python-hocr?branch=master) 4 | [![PyPi Version](https://pypip.in/v/hocr/badge.png)](https://pypi.python.org/pypi/hocr) 5 | ![PyPi Downloads](https://pypip.in/d/hocr/badge.png) 6 | > HOCR manipulation and utility library; provides hocr2pdf binary. 7 | 8 | ## License 9 | 10 | Unless otherwise noted, all files contained within this project are liensed under the MIT opensource license. See the included file LICENSE or visit [opensource.org][] for more information. 11 | 12 | [opensource.org]: http://opensource.org/licenses/MIT 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from setuptools import setup, find_packages 4 | from pkgutil import get_importer 5 | 6 | 7 | # Navigate, import, and retrieve the metadata of the project. 8 | meta = get_importer('src/hocr').find_module('meta').load_module('meta') 9 | 10 | 11 | setup( 12 | name='hocr', 13 | version=meta.version, 14 | description=meta.description, 15 | author='Concordus Applications', 16 | author_email='support@concordusapps.com', 17 | url='https://github.com/concordusapps/python-hocr', 18 | classifiers=[ 19 | 'Development Status :: 3 - Alpha', 20 | 'Intended Audience :: Developers', 21 | 'Intended Audience :: System Administrators', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Operating System :: OS Independent', 24 | 'Programming Language :: Python :: 3.3', 25 | ], 26 | package_dir={'hocr': 'src/hocr'}, 27 | packages=find_packages('src'), 28 | entry_points={ 29 | 'console_scripts': ['hocr2pdf = hocr.commands:hocr2pdf'] 30 | }, 31 | dependency_links=[ 32 | 'bzr+lp:beautifulsoup#egg=beautifulsoup-4.0', 33 | 'git+git://github.com/bsidhom/python3-chardet.git@master#egg=chardet-dev', 34 | ], 35 | install_requires=[ 36 | 'six', 37 | 'lxml >= 3.2.3, < 4.0.0', 38 | 'chardet == dev', 39 | 'beautifulsoup == 4.0', 40 | 'hummus >= 0.2.0', 41 | 'filemagic', 42 | 'pillow' 43 | ], 44 | extras_require={ 45 | 'test': [ 46 | 'pytest', 47 | 'pytest-pep8', 48 | 'pytest-cov' 49 | ], 50 | }, 51 | ) 52 | -------------------------------------------------------------------------------- /src/hocr/__init__.py: -------------------------------------------------------------------------------- 1 | from .page import Page 2 | from .parser import parse 3 | from .meta import version as __version__, description as __doc__ # NOQA 4 | from .overlay import overlay 5 | 6 | __all__ = [ 7 | 'parse', 8 | 'Page', 9 | 'overlay', 10 | ] 11 | -------------------------------------------------------------------------------- /src/hocr/commands.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | import re 4 | from hocr import overlay 5 | import sys 6 | import os 7 | 8 | 9 | def hocr2pdf(): 10 | # Build command arguments. 11 | parser = ArgumentParser() 12 | parser.add_argument('source') 13 | parser.add_argument('hocr') 14 | parser.add_argument('-o', '--output') 15 | 16 | # Parse command arguments. 17 | arguments = parser.parse_args() 18 | 19 | # Parse source. 20 | source_filename = arguments.source 21 | source_index = 0 22 | match = re.match(r'^(.*)\[(.*)\]$', arguments.source) 23 | if match is not None: 24 | source_filename, source_index = match.groups() 25 | 26 | # Decide on output. 27 | if arguments.output: 28 | output_stream = open(arguments.output, 'wb') 29 | 30 | else: 31 | output_stream = os.fdopen(sys.stdout.fileno(), 'wb') 32 | 33 | # Invoke the overlay method. 34 | overlay(output=output_stream, 35 | source=source_filename, 36 | text=arguments.hocr, 37 | index=int(source_index)) 38 | 39 | # Close the stream. 40 | output_stream.close() 41 | -------------------------------------------------------------------------------- /src/hocr/meta.py: -------------------------------------------------------------------------------- 1 | version = '0.2.11' 2 | description = ('HOCR manipulation and utility library; ' 3 | 'provides hocr2pdf binary.') 4 | -------------------------------------------------------------------------------- /src/hocr/overlay.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import itertools 3 | import six 4 | from hummus import Document, Font, Text, Image 5 | from .parser import parse 6 | from PIL import ImageFont 7 | import magic 8 | 9 | 10 | class Line: 11 | 12 | def __init__(self): 13 | self.words = [] 14 | 15 | 16 | def _is_within(value, compare, threshold=0.005): 17 | return (compare * (1 - threshold)) < value < (compare * (1 + threshold)) 18 | 19 | 20 | def _collect_lines(words): 21 | words.sort(key=lambda w: w.box.top) 22 | lines = [] 23 | current_line = [] 24 | current_top = None 25 | for word in words: 26 | if current_top is None: 27 | current_line.append(word) 28 | current_top = word.box.top 29 | 30 | elif _is_within(word.box.top, current_top): 31 | current_line.append(word) 32 | current_top = (current_top + word.box.top) / 2.0 33 | 34 | else: 35 | lines.append(current_line) 36 | current_line = [] 37 | current_line.append(word) 38 | current_top = word.box.top 39 | if current_line: 40 | lines.append(current_line) 41 | return lines 42 | 43 | 44 | def _join_words(groups): 45 | words = [] 46 | for group in groups: 47 | group.sort(key=lambda w: w.box.left) 48 | text = "" 49 | top = group[0].box.top 50 | left = group[0].box.left 51 | for word in group: 52 | text = text + " " + word.text 53 | top = (top + word.box.top) / 2.0 54 | top = int(round(top)) 55 | word = group[0] 56 | word.box.top = top 57 | word.box.right = group[-1].box.right 58 | word.box.left = left 59 | word.text = text 60 | words.append(word) 61 | return words 62 | 63 | 64 | def _align_words(groups): 65 | words = [] 66 | for group in groups: 67 | group.sort(key=lambda w: w.box.left) 68 | top = group[0].box.top 69 | for word in group: 70 | word.box.top = top 71 | words.append(word) 72 | 73 | return words 74 | 75 | 76 | def _split_lines(lines): 77 | chunks = [] 78 | for line in lines: 79 | blocks = [] 80 | cur_block = [] 81 | cur_right = None 82 | line.sort(key=lambda w: w.box.left) 83 | for word in line: 84 | if cur_right is None: 85 | cur_block.append(word) 86 | cur_right = word.box.right 87 | 88 | elif _is_within(word.box.left, cur_right, threshold=0.25): 89 | cur_right = word.box.right 90 | cur_block.append(word) 91 | 92 | else: 93 | blocks.append(cur_block) 94 | cur_right = word.box.right 95 | cur_block = [word] 96 | if cur_block: 97 | blocks.append(cur_block) 98 | chunks.append(blocks) 99 | return list(itertools.chain(*chunks)) 100 | 101 | 102 | def _is_document(source): 103 | """Check if the source refers to a PDF document or not. 104 | """ 105 | test = 'id_filename' if isinstance(source, str) else 'id_buffer' 106 | with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m: 107 | return getattr(m, test)(source) == 'application/pdf' 108 | 109 | 110 | def overlay(output, source, text, index=0, font='TimesNewRoman', dpi=72.0): 111 | """Overlay a PDF document or JPEG image with the text from a HOCR file. 112 | 113 | Writes the overlaid source as a PDF file to the output filename or 114 | file-like object. 115 | 116 | @param[in] source 117 | Either a file-like object or a filename of the image or PDF document 118 | to embed as the background. 119 | 120 | @param[in] text 121 | Either a file-like object or a filename of the HOCR text. 122 | """ 123 | 124 | # Parse the HOCR text. 125 | page = parse(text)[0] 126 | 127 | # Initialize PDF document. 128 | with Document(output) as document: 129 | 130 | # Initialize a new page and begin its context. 131 | with document.Page() as ctx: 132 | 133 | # Prepare to embed the target as the background of the 134 | # new PDF. 135 | if _is_document(source): 136 | with Document(source, 'r') as target: 137 | 138 | # Set the box to be equivalent as the source. 139 | target_page = target[index] 140 | ctx.box = target_page.box 141 | 142 | # Embed the target. 143 | ctx.embed(target_page) 144 | 145 | else: 146 | # Assume we have an image to embed. This will do 147 | # hilarious things if we "dont" have an image as 148 | # image magick.. is magick. 149 | with Image(source, index=index) as target: 150 | 151 | # Set the box to be equivalent as the source. 152 | ctx.box = target.box 153 | 154 | # Embed the target. 155 | ctx.embed(target) 156 | 157 | # Figure out scale. 158 | scale = ctx.box.right / page.box.right 159 | 160 | # Filter out any words that are "empty" 161 | words = list(filter(lambda w: bool(w.text.strip()), page.words)) 162 | 163 | # Collect the words into the lines of the page. 164 | lines = _collect_lines(words) 165 | 166 | # Split the lines if it does go across the whole page. 167 | lines = _split_lines(lines) 168 | 169 | # Join the list of list of boxes 170 | words = _join_words(lines) 171 | 172 | # Iterate through words in the HOCR page. 173 | for word in words: 174 | 175 | # Skip if we don't have text. 176 | text = word.text.strip() 177 | if not text: 178 | continue 179 | 180 | # Get x,y position where text should begin. 181 | x, y = word.box.left, word.box.top 182 | 183 | # Apply the scale factor. 184 | x *= scale 185 | y *= scale 186 | 187 | # Mirror the Y axis as HOCR and PDF are in differnet 188 | # quadrants because. 189 | y = ctx.box.bottom - y 190 | 191 | # Build a font object. 192 | fobj = Font(font, bold=word.bold, italic=word.italic) 193 | 194 | # Approximate the font size by measuring the width of 195 | # the text using pillow. 196 | pil_font = ImageFont.truetype(fobj.file, 10) 197 | base_width, _ = pil_font.getsize(text) 198 | base_width /= dpi 199 | expected_width = (word.box.width * scale) / dpi 200 | scale_width = expected_width / base_width 201 | fsize = 10 * scale_width 202 | 203 | # Measure the font again and shift it down. 204 | pil_font = ImageFont.truetype(fobj.file, int(fsize)) 205 | _, actual_height = pil_font.getsize(text) 206 | y -= actual_height 207 | 208 | # Write text. 209 | # print(text, x, y) 210 | ctx.add(Text(text, fobj, size=fsize, x=x, y=y, mode=7)) 211 | -------------------------------------------------------------------------------- /src/hocr/page.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class Box: 5 | 6 | def __init__(self, text=None, *, left=0, right=0, top=0, bottom=0): 7 | 8 | # Parse the text string representation if given. 9 | if text is not None: 10 | left, top, right, bottom = map(int, text.split()) 11 | 12 | self.left = left 13 | self.right = right 14 | self.top = top 15 | self.bottom = bottom 16 | 17 | @property 18 | def width(self): 19 | return self.right - self.left 20 | 21 | @property 22 | def height(self): 23 | return self.bottom - self.top 24 | 25 | def __repr__(self): 26 | return '' % ( 27 | self.left, self.top, self.right, self.bottom) 28 | 29 | 30 | class Base: 31 | 32 | _allowed_ocr_classes = {} 33 | 34 | def __init__(self, element): 35 | """ 36 | @param[in] element 37 | XML node for the OCR element. 38 | """ 39 | 40 | # Store the element for later reference. 41 | self._element = element 42 | 43 | # Create an element cache. 44 | self._cache = {} 45 | 46 | # Parse the properties of the HOCR element. 47 | properties = element.get('title', '').split(';') 48 | for prop in properties: 49 | name, value = prop.split(maxsplit=1) 50 | if name == 'bbox': 51 | self.box = Box(value) 52 | 53 | elif name == 'image': 54 | self.image = value.strip('" ') 55 | 56 | def __dir__(self): 57 | return super().__dir__() + list(self._allowed_ocr_classes) 58 | 59 | def __getattr__(self, name): 60 | # Return the cached version if present. 61 | if name in self._cache: 62 | return self._cache[name] 63 | 64 | # Parse the named OCR elements. 65 | if name in self._allowed_ocr_classes: 66 | ref = OCR_CLASSES[name] 67 | nodes = self._element.find_all(class_=re.compile(ref['name'])) 68 | self._cache[name] = elements = list(map(ref['class'], nodes)) 69 | return elements 70 | 71 | # Attribute is not present. 72 | raise AttributeError(name) 73 | 74 | 75 | class Word(Base): 76 | 77 | _allowed_ocr_classes = {} 78 | 79 | def __init__(self, element): 80 | # Initialize the base. 81 | super().__init__(element) 82 | 83 | # Discover if we are "bold". 84 | # A word element is bold if its text node is wrapped in a . 85 | self.bold = bool(element.find('strong')) 86 | 87 | # Discover if we are "italic". 88 | # A word element is italic if its text node is wrapped in a . 89 | self.italic = bool(element.find('em')) 90 | 91 | # Find the text node. 92 | self.text = element.text 93 | 94 | def __str__(self): 95 | return '' % (self.text, self.box) 96 | 97 | 98 | class Line(Base): 99 | _allowed_ocr_classes = {'words'} 100 | 101 | 102 | class Paragraph(Base): 103 | _allowed_ocr_classes = {'lines', 'words'} 104 | 105 | 106 | class Block(Base): 107 | _allowed_ocr_classes = {'paragraphs', 'lines', 'words'} 108 | 109 | 110 | class Page(Base): 111 | _allowed_ocr_classes = {'blocks', 'paragraphs', 'lines', 'words'} 112 | 113 | 114 | OCR_CLASSES = { 115 | 'words': {'name': 'ocr.?_word', 'class': Word}, 116 | 'lines': {'name': 'ocr_line', 'class': Line}, 117 | 'paragraphs': {'name': 'ocr_par', 'class': Paragraph}, 118 | 'blocks': {'name': 'ocr_carea', 'class': Block} 119 | } 120 | -------------------------------------------------------------------------------- /src/hocr/parser.py: -------------------------------------------------------------------------------- 1 | from .page import Page 2 | import six 3 | from bs4 import UnicodeDammit, BeautifulSoup 4 | # from lxml.etree import fromstring 5 | 6 | 7 | def parse(source): 8 | """Parse a HOCR stream into page elements. 9 | 10 | @param[in] source 11 | Either a file-like object or a filename of the HOCR text. 12 | """ 13 | 14 | # Corece the source into content. 15 | if isinstance(source, six.string_types): 16 | with open(source, 'rb') as stream: 17 | content = stream.read() 18 | 19 | else: 20 | content = source.read() 21 | 22 | # Parse the HOCR xml stream. 23 | ud = UnicodeDammit(content, is_html=True) 24 | soup = BeautifulSoup(ud.unicode_markup, 'lxml') 25 | 26 | # Get all the pages and parse them into page elements. 27 | return [Page(x) for x in soup.find_all(class_='ocr_page')] 28 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from os import path 4 | 5 | # Append the source directory to PATH. 6 | sys.path.append(path.join(path.dirname(__file__), '..', 'src')) 7 | -------------------------------------------------------------------------------- /tests/hocr/test_parse.py: -------------------------------------------------------------------------------- 1 | import hocr 2 | from os import path 3 | from pytest import raises 4 | 5 | BASE_DIR = path.dirname(__file__) 6 | 7 | 8 | def parse(filename='example.html'): 9 | return hocr.parse(path.join(BASE_DIR, filename)) 10 | 11 | 12 | def test_parse_from_stream(): 13 | with open(path.join(BASE_DIR, 'example.html'), 'rb') as stream: 14 | pages = hocr.parse(stream) 15 | 16 | assert len(pages) == len(parse('example.html')) 17 | 18 | 19 | def test_get_number_of_pages(): 20 | assert len(parse()) == 1 21 | 22 | 23 | def test_parse_return_datastructure_is_pages(): 24 | for item in parse(): 25 | assert isinstance(item, hocr.Page) 26 | 27 | 28 | # def test_page_has_page_number(): 29 | # pages = parse() 30 | 31 | # for item in pages: 32 | # assert hasattr(item, 'index') 33 | # assert item.index >= 0 34 | 35 | 36 | # def test_page_has_unique_page_number(): 37 | # pages = parse() 38 | # numbers = {x.index for x in pages} 39 | 40 | # assert len(pages) == len(numbers) 41 | 42 | 43 | def test_page_elements_in_dir(): 44 | page = parse()[0] 45 | 46 | assert 'words' in dir(page) 47 | assert 'blocks' in dir(page) 48 | 49 | 50 | def test_page_has_proper_attribute_error(): 51 | page = parse()[0] 52 | 53 | with raises(AttributeError): 54 | page.shjgioda 55 | 56 | 57 | def test_page_has_bounding_box(): 58 | for page in parse(): 59 | assert page.box.left >= 0 60 | 61 | 62 | def test_page_bounding_box_has_correct_value(): 63 | page = parse()[0] 64 | 65 | assert page.box.left == 0 66 | assert page.box.top == 0 67 | assert page.box.right == 5100 68 | assert page.box.bottom == 6600 69 | 70 | 71 | def test_page_has_image_name(): 72 | page = parse()[0] 73 | 74 | assert page.image == '/tmp/tmpepham8.tiff' 75 | 76 | 77 | def test_page_has_blocks(): 78 | page = parse()[0] 79 | 80 | assert len(page.blocks) == 3 81 | 82 | 83 | def test_page_blocks_have_paragraphs(): 84 | page = parse()[0] 85 | 86 | assert len(page.blocks[0].paragraphs) == 1 87 | assert len(page.blocks[1].paragraphs) == 50 88 | assert len(page.blocks[2].paragraphs) == 1 89 | 90 | 91 | def test_page_block_paragraphs_have_lines(): 92 | page = parse()[0] 93 | 94 | assert len(page.blocks[1].paragraphs[0].lines) == 2 95 | assert len(page.blocks[1].paragraphs[10].lines) == 1 96 | assert len(page.blocks[1].paragraphs[20].lines) == 1 97 | assert len(page.blocks[2].paragraphs[0].lines) == 1 98 | 99 | 100 | def test_page_block_paragraph_lines_have_words(): 101 | page = parse()[0] 102 | 103 | assert len(page.blocks[0].paragraphs[0].lines[0].words) == 3 104 | assert len(page.blocks[1].paragraphs[0].lines[0].words) == 3 105 | assert len(page.blocks[1].paragraphs[10].lines[0].words) == 54 106 | 107 | 108 | def test_page_has_words(): 109 | page = parse()[0] 110 | 111 | assert len(page.words) == 2665 112 | 113 | 114 | def test_words_have_text(): 115 | page = parse()[0] 116 | 117 | assert page.words[0].text == 'TABLE' 118 | assert page.words[2].text == 'CONTENTS' 119 | assert page.words[102].text == '.' 120 | 121 | 122 | def test_words_have_boldness(): 123 | page = parse()[0] 124 | 125 | assert page.words[0].bold 126 | assert not page.words[73].bold 127 | 128 | 129 | def test_words_have_italicness(): 130 | page = parse()[0] 131 | 132 | assert not page.words[0].italic 133 | assert page.words[2].italic 134 | assert not page.words[73].italic 135 | 136 | 137 | def test_words_have_bounding_box(): 138 | page = parse()[0] 139 | 140 | assert page.words[0].box.left == 2216 141 | assert page.words[0].box.top == 1049 142 | assert page.words[0].box.right == 2449 143 | assert page.words[0].box.bottom == 1098 144 | --------------------------------------------------------------------------------