├── .gitignore
├── tests.py
├── README.md
└── ocr.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[co]
 2 | 
 3 | # Packages
 4 | *.egg
 5 | *.egg-info
 6 | dist
 7 | build
 8 | eggs
 9 | parts
10 | bin
11 | var
12 | sdist
13 | develop-eggs
14 | .installed.cfg
15 | 
16 | # Installer logs
17 | pip-log.txt
18 | 
19 | # Unit test / coverage reports
20 | .coverage
21 | .tox
22 | 
23 | #Translations
24 | *.mo
25 | 
26 | #Mr Developer
27 | .mr.developer.cfg
28 | 


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from ocr import compute_dpi, get_paper_type
 3 | import unittest
 4 | 
 5 | 
 6 | class ResolutionTests(unittest.TestCase):
 7 | 
 8 |     def test_compute_dpi(self):
 9 |         self.assertEqual(compute_dpi(611, 792, 2544, 3300), (300, 300))
10 | 
11 |     def test_paper_type(self):
12 |         self.assertEqual(get_paper_type(611, 792), 'US Letter')
13 |         self.assertEqual(get_paper_type(792, 611), 'US Letter, landscape')
14 |         self.assertEqual(get_paper_type(598, 842), 'A4')
15 | 
16 | 
# Run the test cases above when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | pdf-ocr-overlay
 2 | ===============
 3 | 
 4 | Simple way to make scanned PDFs searchable based on Tesseract.
 5 | 
 6 | $ ./ocr.py scanned.pdf output.pdf
 7 | 
 8 | Adds a text overlay to a scanned PDF so that your document archive can be
 9 | indexed and text can be found more easily in large documents.
10 | 
11 | Dependencies
12 | ------------
13 | 
14 |  * python
15 |  * tesseract-ocr (tesseract)
16 |  * exactimage (hocr2pdf)
17 |  * poppler-utils (pdfimages)
18 |  * ghostscript (gs)
19 | 
20 | Alternatives
21 | ------------
22 | 
23 | [OCRFeeder](https://live.gnome.org/OCRFeeder) lets you rebuild documents from
24 | scanned images and documents. It has layout analysis, frontend-editing,
25 | spell-checking and much more. If you want to modify the extracted texts then
26 | this is the way to go. Exported PDFs with an overlay tend to be very large and
27 | it takes some time to process a bunch of documents.
28 | 
29 | [Google Docs](https://docs.google.com) does OCR for uploaded documents. I've
30 | not tested this extensively but it's probably good and will probably get better
31 | over time. If you want to upload your documents to Google anyway, go for that.
32 | 
33 | [pdfocr](https://github.com/gkovacs/pdfocr) by Geza Kovacs is quite similar to
34 | this project, although I haven't used it yet. It's based on cuneiform and
35 | implemented in Ruby.
36 | 


--------------------------------------------------------------------------------
/ocr.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | #
  3 | # Copyright (c) 2012 Ludwig Haehne
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | from argparse import ArgumentParser
 25 | from glob import glob
 26 | import logging
 27 | import os
 28 | from Queue import Queue
 29 | from shutil import rmtree
 30 | from subprocess import check_call, check_output, Popen, PIPE
 31 | from tempfile import mkdtemp
 32 | from threading import Thread
 33 | import time
 34 | 
 35 | 
# Configure the root logger once at import time: emit DEBUG and above, and
# prefix every record with an HH:MM:SS timestamp and its level name.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    datefmt='%H:%M:%S',
                    )
 40 | 
 41 | 
 42 | class Timer:
 43 |     def __enter__(self):
 44 |         self.start = time.time()
 45 |         return self
 46 | 
 47 |     def __exit__(self, *args):
 48 |         self.end = time.time()
 49 |         self.interval = self.end - self.start
 50 | 
 51 | 
 52 | def system_info():
 53 |     deps = []
 54 | 
 55 |     pdfimages = Popen(['pdfimages', '-v'], stderr=PIPE).communicate()[1]
 56 |     deps.append(('pdfimages', pdfimages.split('\n')[0]))
 57 | 
 58 |     tesseract = Popen(['tesseract', '-v'], stderr=PIPE).communicate()[1]
 59 |     deps.append(('tesseract', tesseract.split('\n')[0]))
 60 | 
 61 |     # hocr2pdf doesn't support a --version flag but prints the version to
 62 |     # stderr if called without arguments
 63 |     hocr2pdf = Popen(['hocr2pdf', '--help'], stdout=PIPE, stderr=PIPE)
 64 |     hocr2pdf = hocr2pdf.communicate()[1]
 65 |     hocr2pdf = [line for line in hocr2pdf if 'version' in line]
 66 |     if hocr2pdf:
 67 |         deps.append(('hocr2pdf', hocr2pdf))
 68 | 
 69 |     gs = check_output(['gs', '--version']).split('\n')[0]
 70 |     deps.append(('gs', gs))
 71 | 
 72 |     identify = check_output(['identify', '--version']).split('\n')[0]
 73 |     deps.append(('identify', identify))
 74 | 
 75 |     convert = check_output(['convert', '--version']).split('\n')[0]
 76 |     deps.append(('convert', convert))
 77 | 
 78 |     return deps
 79 | 
 80 | 
 81 | def extract_images(pdf, output):
 82 |     check_call(["pdfimages", pdf, output])
 83 |     images = []
 84 |     for filetype in ('*.ppm', '*.pbm', '*.jpg'):
 85 |         images.extend(glob(os.path.join(output, filetype)))
 86 |     images.sort()
 87 |     return images
 88 | 
 89 | 
 90 | def compute_dpi(pdf_w, pdf_h, image_w, image_h):
 91 |     """
 92 |     Deduce scan resolution from PDF and image size.
 93 |     http://stackoverflow.com/a/576816/63392
 94 |     """
 95 |     dpi_w = int(round(image_w*72./pdf_w))
 96 |     dpi_h = int(round(image_h*72./pdf_h))
 97 |     return dpi_w, dpi_h
 98 | 
 99 | 
def get_resolution(filename):
    """
    Return a list with one ``(width, height)`` integer tuple per page.

    The values are whatever ``identify`` prints for its ``%w`` and ``%h``
    format escapes; entries are separated by ``;`` in its output and blank
    entries are ignored.
    """
    raw = check_output(["identify", "-format", "%w,%h;", filename])
    entries = [entry.split(',') for entry in raw.split(';') if entry.strip()]
    sizes = []
    for width, height in entries:
        sizes.append((int(width), int(height)))
    return sizes
108 | 
109 | 
# Known paper formats keyed by (width, height) in whole millimetres,
# portrait orientation (width < height).
PAPER_SIZES = {
    (841, 1189): 'A0',
    (594, 841): 'A1',
    (420, 594): 'A2',
    (297, 420): 'A3',
    (210, 297): 'A4',
    (148, 210): 'A5',
    (105, 148): 'A6',
    (74, 105): 'A7',
    (52, 74): 'A8',
    (250, 353): 'B4',
    (176, 250): 'B5',
    (125, 176): 'B6',
    (215, 279): 'US Letter',
    }


def get_paper_type(wdots, hdots, tolerance_mm=2.0):
    """
    Name the paper format of a page given in units of 1/72 inch.

    The size is converted to millimetres and compared with every entry of
    ``PAPER_SIZES``; the closest entry whose width and height both lie within
    *tolerance_mm* wins. Matching with a tolerance rather than truncating to
    whole millimetres matters because page sizes are rarely exact: a 595 unit
    wide A4 page is 209.9 mm, which truncation turns into 209 and misses.

    Parameters:
        wdots:        page width in 1/72 inch units.
        hdots:        page height in 1/72 inch units.
        tolerance_mm: largest accepted deviation per dimension, in mm.

    Returns:
        The format name, the name followed by ``', landscape'`` when the page
        only matches with width and height swapped, or ``''`` if nothing
        matches.
    """
    # 72 units per inch and 25.4 mm per inch: units / 7.2 * 2.54 gives mm.
    w_mm = wdots / 7.2 * 2.54
    h_mm = hdots / 7.2 * 2.54

    def closest(width, height):
        # Return the name of the nearest format within tolerance, or None.
        best_name, best_deviation = None, None
        for (paper_w, paper_h), name in PAPER_SIZES.items():
            deviation = max(abs(width - paper_w), abs(height - paper_h))
            if deviation > tolerance_mm:
                continue
            if best_deviation is None or deviation < best_deviation:
                best_name, best_deviation = name, deviation
        return best_name

    # Portrait takes precedence over landscape.
    portrait = closest(w_mm, h_mm)
    if portrait is not None:
        return portrait
    landscape = closest(h_mm, w_mm)
    if landscape is not None:
        return landscape + ', landscape'
    return ''
137 | 
def ocr_page(image, lang='eng', width=-1, height=-1):
    """
    Run OCR on one extracted page image and build a single-page PDF from it.

    Steps: convert the image to PNG, let ``tesseract`` write hOCR output for
    it, halve the PNG if the scan is 600 dpi or more, then feed the hOCR to
    ``hocr2pdf`` together with the PNG.

    Parameters:
        image:  path of the extracted page image.
        lang:   language code passed to ``tesseract -l``.
        width:  page width, used with the image width to compute the DPI.
        height: page height, used with the image height to compute the DPI.

    Returns:
        Path of the generated PDF (same base name as *image*, ``.pdf``).

    Raises:
        subprocess.CalledProcessError: if any external tool exits non-zero.
    """
    base = os.path.splitext(image)[0]
    png = base + '.png'
    hocr = base + '.html'
    pdf = base + '.pdf'
    w, h = get_resolution(image)[0]
    dpi_w, dpi_h = compute_dpi(width, height, w, h)
    logging.debug("Page={}x{} Image={}x{} DPI={}x{}".format(
                  width, height, w, h, dpi_w, dpi_h))
    check_call(["convert", image, png])
    # Discard tesseract's stdout; the context manager closes the handle
    # again so it is not leaked once per page.
    with open(os.devnull, 'w') as devnull:
        check_call(["tesseract", png, base, '-l', lang, 'hocr'],
                   stdout=devnull)
    # Reduce resolution of images to 1/2 that have 600dpi or more.
    # This happens after OCR, so tesseract still sees the full-size image.
    if dpi_w >= 600:
        check_call(["mogrify", "-resize", "50%", png])
        # Integer division keeps the value passed to hocr2pdf a whole number.
        dpi_w = dpi_w // 2
    # hocr2pdf reads the hOCR document from stdin; the file is closed even
    # when hocr2pdf fails.
    with open(hocr, 'rb') as html:
        check_call(['hocr2pdf', '-r', str(dpi_w), '-i', png, '-o', pdf],
                   stdin=html)
    return pdf
160 | 
161 | 
def process_page(index, queue, lang, resolution):
    """
    Worker loop: pull pages from the queue and perform OCR on them, forever.

    Every item is acknowledged with ``task_done()`` even if OCR fails, and a
    failure is logged instead of being allowed to end the loop. Both are
    needed so the main thread's ``queue.join()`` cannot block forever: an
    unacknowledged item would never be counted as done, and a worker that
    died from an exception would stop consuming the remaining items.

    Parameters:
        index:      worker number (not used inside the loop).
        queue:      queue of ``(page, image)`` tuples; *page* is 1-based.
        lang:       language code forwarded to ``ocr_page``.
        resolution: sequence of per-page ``(width, height)`` tuples, indexed
                    with ``page - 1``.
    """
    while True:
        page, image = queue.get()
        try:
            logging.info("Page {:>2}: Run OCR ...".format(page))
            width, height = resolution[page-1]
            with Timer() as t:
                ocr_page(image, lang=lang, width=width, height=height)
            logging.info("Page {:>2}: OCR took {:.2f}s".format(page, t.interval))
        except Exception:
            # Keep this worker alive for the remaining pages; the traceback
            # is written to the log.
            logging.exception("Page {:>2}: OCR failed".format(page))
        finally:
            queue.task_done()
178 | 
179 | 
def merge_pdf(pages, output_filename):
    """
    Concatenate the PDF files in *pages* into *output_filename* using
    Ghostscript's ``pdfwrite`` device.

    *pages* is a list of input paths, merged in the given order.
    """
    gs_options = [
        'gs',
        '-q',
        '-dNOPAUSE',
        '-dBATCH',
        '-sDEVICE=pdfwrite',
        '-dCompatibilityLevel=1.4',
        '-sOutputFile={}'.format(output_filename),
    ]
    check_call(gs_options + pages)
190 | 
191 | 
def start_workers(num_workers, queue, lang, resolution):
    """
    Start *num_workers* daemon threads that each run ``process_page``.

    Each thread receives its own number plus the shared *queue*, *lang* and
    *resolution*. The threads are daemons, so they do not keep the process
    alive after the main thread ends.
    """
    worker_id = 0
    while worker_id < num_workers:
        thread = Thread(target=process_page,
                        args=(worker_id, queue, lang, resolution))
        thread.daemon = True
        thread.start()
        worker_id += 1
198 | 
199 | 
def process(input_file, output_file, lang='eng', jobs=4):
    """
    Add an OCR text overlay to *input_file* and write it to *output_file*.

    The page images are extracted into a temporary directory, OCR'ed by a
    pool of worker threads, and the resulting single-page PDFs are merged in
    sorted file-name order. The temporary directory is removed afterwards,
    whether or not processing succeeded.

    Parameters:
        input_file:  path of the scanned PDF.
        output_file: path of the PDF to create.
        lang:        language code forwarded to the OCR step.
        jobs:        maximum number of pages processed at once; values below
                     1 are treated as 1.

    Raises:
        subprocess.CalledProcessError: if an external tool exits non-zero.
    """
    # The trailing separator makes the directory itself the output prefix
    # handed to pdfimages.
    tmp = os.path.join(mkdtemp(), '')
    try:
        resolution = get_resolution(input_file)
        w, h = resolution[0]
        logging.info("{} pages, {}mm*{}mm {}".format(len(resolution),
                                                     int(w / 7.2 * 2.54),
                                                     int(h / 7.2 * 2.54),
                                                     get_paper_type(w, h)))
        logging.info("Extract pages from {}".format(input_file))
        images = extract_images(input_file, tmp)
        # Never more workers than pages, but at least one when there is work:
        # with jobs <= 0 nothing would consume the queue and join() below
        # would block forever.
        num_workers = min(len(images), max(jobs, 1))
        queue = Queue()
        start_workers(num_workers, queue, lang, resolution)
        logging.info("Process {} pages with {} threads".format(len(images),
                                                                   num_workers))
        for idx, image in enumerate(images, start=1):
            queue.put((idx, image))
        # Wait until every queued page has been acknowledged by a worker.
        queue.join()
        pages = sorted(glob(os.path.join(tmp, '*.pdf')))
        logging.info("OCR complete. Merge pages into '{}'".format(output_file))
        merge_pdf(pages, output_file)
        # Show input and output sizes side by side.
        check_call(['ls', '-lh', input_file, output_file])
    finally:
        rmtree(tmp)
225 | 
226 | 
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, log the versions of the
    # external tools, then run the OCR pipeline.
    parser = ArgumentParser(description="Add OCR overlay to scanned PDF")
    # nargs=1 makes argparse store each positional as a one-element list,
    # hence the [0] when the values are used below.
    parser.add_argument('input', nargs=1, help='Scanned PDF')
    parser.add_argument('output', nargs=1, help='Output PDF')
    parser.add_argument('-l', '--lang', default='eng',
                        help='3-letter tesseract language code (default "eng")')
    parser.add_argument('-j', '--jobs', default=4, type=int,
                        help='Specifies the number of pages to process simultaneously')
    args = parser.parse_args()
    for name, version in system_info():
        logging.info('{:<12}: {}'.format(name, version))
    process(args.input[0], args.output[0], lang=args.lang, jobs=args.jobs)
239 | 


--------------------------------------------------------------------------------