├── .gitignore ├── tests.py ├── README.md └── ocr.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | 2 | from ocr import compute_dpi, get_paper_type 3 | import unittest 4 | 5 | 6 | class ResolutionTests(unittest.TestCase): 7 | 8 | def test_compute_dpi(self): 9 | self.assertEqual(compute_dpi(611, 792, 2544, 3300), (300, 300)) 10 | 11 | def test_paper_type(self): 12 | self.assertEqual(get_paper_type(611, 792), 'US Letter') 13 | self.assertEqual(get_paper_type(792, 611), 'US Letter, landscape') 14 | self.assertEqual(get_paper_type(598, 842), 'A4') 15 | 16 | 17 | if __name__ == '__main__': 18 | unittest.main() 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pdf-ocr-overlay 2 | =============== 3 | 4 | Simple way to make scanned PDFs searchable based on Tesseract. 5 | 6 | $ ./ocr.py scanned.pdf output.pdf 7 | 8 | Adds an overlay to a scanned PDF so that your document archive can be indexed and 9 | text can be found more easily in large documents. 
10 | 11 | Dependencies 12 | ------------ 13 | 14 | * python 15 | * tesseract-ocr (tesseract) 16 | * exactimage (hocr2pdf) 17 | * poppler-utils (pdfimages) 18 | * ghostscript (gs) 19 | 20 | Alternatives 21 | ------------ 22 | 23 | [OCRFeeder](https://live.gnome.org/OCRFeeder) lets you rebuild documents from 24 | scanned images and documents. It has layout analysis, frontend-editing, 25 | spell-checking and much more. If you want to modify the extracted texts then 26 | this is the way to go. Exported PDFs with an overlay tend to be very large and 27 | it takes some time to process a bunch of documents. 28 | 29 | [Google Docs](https://docs.google.com) does OCR for uploaded documents. I've 30 | not tested this extensively but it's probably good and will probably get better 31 | over time. If you want to upload your documents to Google anyway, go for that. 32 | 33 | [pdfocr](https://github.com/gkovacs/pdfocr) by Geza Kovacs is quite similar to 34 | this project, although I haven't used it yet. It's based on cuneiform and 35 | implemented in Ruby. 36 | -------------------------------------------------------------------------------- /ocr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright (c) 2012 Ludwig Haehne 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from argparse import ArgumentParser
from glob import glob
import logging
import os
from shutil import rmtree
from subprocess import check_call, check_output, Popen, PIPE, STDOUT
from tempfile import mkdtemp
from threading import Thread
import time

try:
    from queue import Queue  # Python 3
except ImportError:
    from Queue import Queue  # Python 2


logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    datefmt='%H:%M:%S',
                    )


class Timer(object):
    """
    Context manager that measures the wall-clock duration of its body.

    After the ``with`` block exits (normally or through an exception),
    ``start`` and ``end`` hold the ``time.time()`` stamps and ``interval``
    holds their difference in seconds.
    """

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.interval = self.end - self.start


# Command used to query each external tool for its version banner.
_VERSION_COMMANDS = (
    ('pdfimages', ['pdfimages', '-v']),
    ('tesseract', ['tesseract', '-v']),
    # hocr2pdf doesn't support a --version flag but prints the version
    # when asked for help.
    ('hocr2pdf', ['hocr2pdf', '--help']),
    ('gs', ['gs', '--version']),
    ('identify', ['identify', '--version']),
    ('convert', ['convert', '--version']),
)


def _tool_output(command):
    """
    Run `command` and return its combined stdout/stderr as a list of lines.

    Returns None when the executable cannot be started at all.
    """
    try:
        proc = Popen(command, stdout=PIPE, stderr=STDOUT,
                     universal_newlines=True)
    except OSError:
        return None
    return proc.communicate()[0].splitlines()


def system_info():
    """
    Collect version banners of the external tools this script calls.

    Returns a list of ``(name, version)`` string tuples. A tool that cannot
    be started is reported with the version ``'not found'`` instead of
    aborting. hocr2pdf is only listed when its output contains a line
    mentioning ``version``.
    """
    deps = []
    for name, command in _VERSION_COMMANDS:
        lines = _tool_output(command)
        if lines is None:
            deps.append((name, 'not found'))
        elif name == 'hocr2pdf':
            # Search line by line; iterating the raw string would walk over
            # single characters and never match.
            versions = [line for line in lines if 'version' in line]
            if versions:
                deps.append((name, versions[0].strip()))
        else:
            deps.append((name, lines[0] if lines else ''))
    return deps


def extract_images(pdf, output):
    """
    Run ``pdfimages`` on `pdf` using `output` as the image root.

    Returns the sorted list of ``*.ppm``, ``*.pbm`` and ``*.jpg`` files found
    in the directory `output` afterwards. Raises CalledProcessError if
    pdfimages fails.
    """
    check_call(["pdfimages", pdf, output])
    images = []
    for filetype in ('*.ppm', '*.pbm', '*.jpg'):
        images.extend(glob(os.path.join(output, filetype)))
    images.sort()
    return images


def compute_dpi(pdf_w, pdf_h, image_w, image_h):
    """
    Deduce scan resolution from PDF and image size.
    http://stackoverflow.com/a/576816/63392

    `pdf_w` and `pdf_h` are the page size in 1/72 inch units, `image_w` and
    `image_h` the image size in pixels. Returns ``(dpi_w, dpi_h)`` rounded to
    integers.
    """
    dpi_w = int(round(image_w * 72. / pdf_w))
    dpi_h = int(round(image_h * 72. / pdf_h))
    return dpi_w, dpi_h


def get_resolution(filename):
    """
    Return resolution per page.

    Asks ImageMagick's ``identify`` for ``width,height`` of every page/frame
    in `filename` and returns them as a list of ``(int, int)`` tuples.
    Raises CalledProcessError if identify fails.
    """
    # universal_newlines=True yields text on both Python 2 and Python 3.
    output = check_output(["identify", "-format", "%w,%h;", filename],
                          universal_newlines=True)
    sizes = []
    for entry in output.split(';'):
        if entry.strip():
            width, height = entry.split(',')
            sizes.append((int(width), int(height)))
    return sizes


# Portrait paper sizes in millimetres (width, height).
PAPER_SIZES = {
    (841, 1189): 'A0',
    (594, 841): 'A1',
    (420, 594): 'A2',
    (297, 420): 'A3',
    (210, 297): 'A4',
    (148, 210): 'A5',
    (105, 148): 'A6',
    (74, 105): 'A7',
    (52, 74): 'A8',
    (250, 353): 'B4',
    (176, 250): 'B5',
    (125, 176): 'B6',
    (215, 279): 'US Letter',
}

# Maximum deviation per dimension for a page to still count as a known size.
_PAPER_TOLERANCE_MM = 2.0


def _match_paper(w_mm, h_mm):
    """Return the name of the closest entry in PAPER_SIZES, or None."""
    best_name = None
    best_error = None
    for (paper_w, paper_h), name in PAPER_SIZES.items():
        error = max(abs(w_mm - paper_w), abs(h_mm - paper_h))
        if error > _PAPER_TOLERANCE_MM:
            continue
        if best_error is None or error < best_error:
            best_name, best_error = name, error
    return best_name


def get_paper_type(wdots, hdots):
    """
    Name the paper format of a page given in 1/72 inch units.

    Returns the format name for portrait pages, the name followed by
    ``', landscape'`` for rotated pages and ``''`` for unknown sizes. Sizes
    are compared with a small tolerance so that e.g. both 595x842 and
    598x842 are recognised as A4.
    """
    w_mm = wdots / 7.2 * 2.54
    h_mm = hdots / 7.2 * 2.54
    name = _match_paper(w_mm, h_mm)
    if name is not None:
        return name
    name = _match_paper(h_mm, w_mm)
    if name is not None:
        return name + ', landscape'
    return ''


def ocr_page(image, lang='eng', width=-1, height=-1):
    """
    Run OCR on one extracted page image and build a single-page PDF.

    `image` is converted to PNG, recognised with tesseract (language `lang`)
    and combined with the recognised text by hocr2pdf. `width` and `height`
    are the page size used to derive the scan resolution. Returns the path
    of the PDF written next to `image`. Raises CalledProcessError if one of
    the external tools fails.
    """
    base = os.path.splitext(image)[0]
    png = base + '.png'
    pdf = base + '.pdf'
    w, h = get_resolution(image)[0]
    dpi_w, dpi_h = compute_dpi(width, height, w, h)
    logging.debug("Page={}x{} Image={}x{} DPI={}x{}".format(
        width, height, w, h, dpi_w, dpi_h))
    check_call(["convert", image, png])
    with open(os.devnull, 'w') as devnull:
        check_call(["tesseract", png, base, '-l', lang, 'hocr'],
                   stdout=devnull)
    hocr = base + '.html'
    if not os.path.exists(hocr):
        # NOTE(review): newer tesseract releases are understood to write
        # '<base>.hocr' instead of '<base>.html' -- confirm against the
        # installed version.
        hocr = base + '.hocr'
    # Reduce resolution of images to 1/2 that have 600dpi or more.
    if dpi_w >= 600:
        check_call(["mogrify", "-resize", "50%", png])
        # Floor division keeps the value an int on Python 3 as well.
        dpi_w = dpi_w // 2
    with open(hocr, 'rb') as html:
        check_call(['hocr2pdf', '-r', str(dpi_w), '-i', png, '-o', pdf],
                   stdin=html)
    return pdf


def process_page(index, queue, lang, resolution):
    """
    Pull page from the queue and perform OCR. Make sure to acknowledge the
    message in the queue even if there is an exception so that the main thread
    does not block on a queue that will never be empty.

    A failing page is logged and skipped; the worker keeps running so the
    remaining pages in the queue are still processed. `index` is the worker
    number, `resolution` the per-page sizes indexed by page number - 1.
    This function never returns.
    """
    while True:
        page, image = queue.get()
        try:
            logging.info("Page {:>2}: Run OCR ...".format(page))
            width, height = resolution[page-1]
            with Timer() as t:
                ocr_page(image, lang=lang, width=width, height=height)
            logging.info("Page {:>2}: OCR took {:.2f}s".format(page, t.interval))
        except Exception:
            # Do not let the worker thread die: a dead worker never
            # acknowledges the remaining items and queue.join() would hang.
            logging.exception("Page {:>2}: OCR failed".format(page))
        finally:
            queue.task_done()


def merge_pdf(pages, output_filename):
    """
    Concatenate the PDF files in `pages` into `output_filename` using
    ghostscript. Raises CalledProcessError if gs fails.
    """
    check_call(['gs',
                '-q',
                '-dNOPAUSE',
                '-dBATCH',
                '-sDEVICE=pdfwrite',
                '-dCompatibilityLevel=1.4',
                '-sOutputFile={}'.format(output_filename)] +
               list(pages),
               )


def start_workers(num_workers, queue, lang, resolution):
    """
    Start `num_workers` daemon threads running process_page on `queue`.

    Returns the list of started threads.
    """
    workers = []
    for tid in range(num_workers):
        args = (tid, queue, lang, resolution)
        worker = Thread(target=process_page, args=args)
        worker.daemon = True
        worker.start()
        workers.append(worker)
    return workers


def process(input_file, output_file, lang='eng', jobs=4):
    """
    Add an OCR overlay to `input_file` and write the result to `output_file`.

    Page images are extracted into a temporary directory, recognised by up
    to `jobs` worker threads and merged back into one PDF. The temporary
    directory is removed afterwards in any case. Raises RuntimeError when no
    page could be extracted or processed.
    """
    tmp = os.path.join(mkdtemp(), '')
    try:
        resolution = get_resolution(input_file)
        w, h = resolution[0]
        logging.info("{} pages, {}mm*{}mm {}".format(len(resolution),
                                                     int(w / 7.2 * 2.54),
                                                     int(h / 7.2 * 2.54),
                                                     get_paper_type(w, h)))
        logging.info("Extract pages from {}".format(input_file))
        images = extract_images(input_file, tmp)
        if not images:
            raise RuntimeError(
                "No images could be extracted from '{}'".format(input_file))
        # At least one worker is required, otherwise queue.join() below
        # would wait forever (e.g. for jobs=0).
        num_workers = max(1, min(len(images), jobs))
        queue = Queue()
        start_workers(num_workers, queue, lang, resolution)
        logging.info("Process {} pages with {} threads".format(len(images),
                                                               num_workers))
        for idx, image in enumerate(images, start=1):
            queue.put((idx, image))
        queue.join()
        pages = sorted(glob(os.path.join(tmp, '*.pdf')))
        if not pages:
            raise RuntimeError(
                "OCR failed for all pages of '{}'".format(input_file))
        if len(pages) != len(images):
            logging.warning("Only {} of {} pages were processed".format(
                len(pages), len(images)))
        logging.info("OCR complete. Merge pages into '{}'".format(output_file))
        merge_pdf(pages, output_file)
        check_call(['ls', '-lh', input_file, output_file])
    finally:
        rmtree(tmp)


def main():
    """Parse the command line, log tool versions and run the OCR process."""
    parser = ArgumentParser(description="Add OCR to overlay to scanned PDF")
    parser.add_argument('input', nargs=1, help='Scanned PDF')
    parser.add_argument('output', nargs=1, help='Output PDF')
    parser.add_argument('-l', '--lang', default='eng',
                        help='3-letter tesseract language code (default "eng")')
    parser.add_argument('-j', '--jobs', default=4, type=int,
                        help='Specifies the number of pages to process simultaneously')
    args = parser.parse_args()
    for name, version in system_info():
        logging.info('{:<12}: {}'.format(name, version))
    process(args.input[0], args.output[0], lang=args.lang, jobs=args.jobs)


if __name__ == '__main__':
    main()