├── .gitignore
├── LICENSE
├── README.md
└── extract_text


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[co]
 2 | 
 3 | # Packages
 4 | *.egg
 5 | *.egg-info
 6 | dist
 7 | build
 8 | eggs
 9 | parts
10 | bin
11 | var
12 | sdist
13 | develop-eggs
14 | .installed.cfg
15 | 
16 | # Installer logs
17 | pip-log.txt
18 | 
19 | # Unit test / coverage reports
20 | .coverage
21 | .tox
22 | 
23 | #Translations
24 | *.mo
25 | 
26 | #Mr Developer
27 | .mr.developer.cfg
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012, Jason Funk <jasonlfunk@gmail.com>
 2 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software
 3 | and associated documentation files (the "Software"), to deal in the Software without restriction,
 4 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 5 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
 6 | subject to the following conditions:
 7 | 
 8 | The above copyright notice and this permission notice shall be included in all copies or substantial
 9 | portions of the Software.
10 |  
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
12 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
13 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
14 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
15 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ocr-text-extraction
 2 | ===================
 3 | 
 4 | I am not actively supporting this script. It was just an experiment.
 5 | 
 6 | Processes an image to extract its text regions. Intended primarily
 7 | as a pre-processing step before running OCR.
 8 | 
 9 | Implemented in Python using OpenCV.
10 | 
11 | Based on the paper "Font and Background Color Independent Text Binarization" by
12 | T Kasar, J Kumar and A G Ramakrishnan
13 | http://www.m.cs.osakafu-u.ac.jp/cbdar2007/proceedings/papers/O1-1.pdf
14 | 
15 | Copyright (c) 2012, Jason Funk <jasonlfunk@gmail.com>
16 | 


--------------------------------------------------------------------------------
/extract_text:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # Processes an image to extract its text regions. Intended primarily
  4 | # as a pre-processing step before running OCR.
  5 | 
  6 | # Based on the paper "Font and Background Color Independent Text Binarization" by
  7 | # T Kasar, J Kumar and A G Ramakrishnan
  8 | # http://www.m.cs.osakafu-u.ac.jp/cbdar2007/proceedings/papers/O1-1.pdf
  9 | 
 10 | # Copyright (c) 2012, Jason Funk <jasonlfunk@gmail.com>
 11 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software
 12 | # and associated documentation files (the "Software"), to deal in the Software without restriction,
 13 | # including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 14 | # and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
 15 | # subject to the following conditions:
 16 | #
 17 | # The above copyright notice and this permission notice shall be included in all copies or substantial
 18 | # portions of the Software.
 19 | #
 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
 21 | # LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 22 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 23 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 24 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 25 | 
 26 | import cv2
 27 | import numpy as np
 28 | import sys
 29 | import os.path
 30 | 
 31 | if len(sys.argv) != 3:
 32 |     print "%s input_file output_file" % (sys.argv[0])
 33 |     sys.exit()
 34 | else:
 35 |     input_file = sys.argv[1]
 36 |     output_file = sys.argv[2]
 37 | 
 38 | if not os.path.isfile(input_file):
 39 |     print "No such file '%s'" % input_file
 40 |     sys.exit()
 41 | 
 42 | DEBUG = 0
 43 | 
 44 | 
 45 | # Determine pixel intensity.
 46 | # Human eyes are not equally sensitive to red, green and blue, so the
 47 | # channels are weighted with the formula used for television luminance:
 48 | # pixel intensity = 0.30R + 0.59G + 0.11B
 49 | def ii(xx, yy):
 50 |     global img, img_y, img_x
 51 |     if yy >= img_y or xx >= img_x:
 52 |         #print "pixel out of bounds ("+str(y)+","+str(x)+")"
 53 |         return 0
 54 |     pixel = img[yy][xx]
 55 |     return 0.30 * pixel[2] + 0.59 * pixel[1] + 0.11 * pixel[0]
 56 | 
 57 | 
 58 | # A quick test to check whether the contour is
 59 | # a connected shape
 60 | def connected(contour):
 61 |     first = contour[0][0]
 62 |     last = contour[len(contour) - 1][0]
 63 |     return abs(first[0] - last[0]) <= 1 and abs(first[1] - last[1]) <= 1
 64 | 
 65 | 
 66 | # Helper function to return a given contour
 67 | def c(index):
 68 |     global contours
 69 |     return contours[index]
 70 | 
 71 | 
 72 | # Count the number of real children
 73 | def count_children(index, h_, contour):
 74 |     # No children
 75 |     if h_[index][2] < 0:
 76 |         return 0
 77 |     else:
 78 |         #If the first child is a contour we care about
 79 |         # then count it, otherwise don't
 80 |         if keep(c(h_[index][2])):
 81 |             count = 1
 82 |         else:
 83 |             count = 0
 84 | 
 85 |             # Also count all of the child's siblings and their children
 86 |         count += count_siblings(h_[index][2], h_, contour, True)
 87 |         return count
 88 | 
 89 | 
 90 | # Quick check to test if the contour is a child
 91 | def is_child(index, h_):
 92 |     return get_parent(index, h_) > 0
 93 | 
 94 | 
 95 | # Get the first parent of the contour that we care about
 96 | def get_parent(index, h_):
 97 |     parent = h_[index][3]
 98 |     while not keep(c(parent)) and parent > 0:
 99 |         parent = h_[parent][3]
100 | 
101 |     return parent
102 | 
103 | 
104 | # Count the number of relevant siblings of a contour
def count_siblings(index, h_, contour, inc_children=False):
    """Count the siblings of contour ``index`` that keep() accepts.

    ``h_`` is the hierarchy table: ``h_[i][0]`` is the sibling after contour
    i and ``h_[i][1]`` the sibling before it; a negative value ends the
    chain.  Contour ``index`` itself is not counted.

    inc_children: when true, count_children() of ``index`` and of every
    sibling visited is added to the total as well.
    ``contour`` is only passed through to count_children().
    """
    # Include the children if necessary
    count = count_children(index, h_, contour) if inc_children else 0

    # Walk the sibling chain forwards (column 0), then backwards (column 1)
    for link in (0, 1):
        sibling = h_[index][link]
        # Only a negative link ends the chain; 0 is a valid contour index
        # and has to be visited like any other.
        while sibling >= 0:
            if keep(c(sibling)):
                count += 1
            if inc_children:
                count += count_children(sibling, h_, contour)
            sibling = h_[sibling][link]
    return count
130 | 
131 | 
132 | # Whether we care about this contour
def keep(contour):
    """Return whether the contour passes both the box test and the closure test.

    keep_box() is consulted first; connected() is only evaluated for
    contours whose bounding box was accepted.
    """
    box_ok = keep_box(contour)
    if not box_ok:
        return box_ok
    return connected(contour)
135 | 
136 | 
137 | # Whether we should keep the containing box of this
138 | # contour based on its shape
def keep_box(contour):
    """Decide from the bounding box alone whether the contour may be a character.

    The box is rejected when its width/height ratio is below 0.1 or above
    10, when its area exceeds one fifth of the image area
    (``img_x * img_y``) or when its area is below 15.  With DEBUG set the
    reason for a rejection is printed.

    Returns True when the box passes both tests, False otherwise.
    """
    xx, yy, w_, h_ = cv2.boundingRect(contour)

    # width and height need to be floats so the ratio is not truncated
    w_ *= 1.0
    h_ *= 1.0
    ratio = w_ / h_

    # Test its shape - if it's too oblong or tall it's
    # probably not a real character
    if ratio < 0.1 or ratio > 10:
        if DEBUG:
            # Single-argument print(...) runs on Python 2 and Python 3 alike
            print("\t Rejected because of shape: (" + str(xx) + "," + str(yy) + "," + str(w_) + "," + str(h_) + ")" +
                  str(ratio))
        return False

    # check size of the box
    area = w_ * h_
    if (area > ((img_x * img_y) / 5)) or (area < 15):
        if DEBUG:
            print("\t Rejected because of size")
        return False

    return True
161 | 
162 | 
def include_box(index, h_, contour):
    """Decide from the contour hierarchy whether contour ``index`` is a character.

    index: position of the contour in the contour list.
    h_: hierarchy table handed on to is_child(), get_parent() and
        count_children().
    contour: passed through to count_children().

    Returns False when the contour is a child whose parent has at most two
    children (treated as the interior of a letter) or when the contour
    itself has more than two children (treated as a container of letters);
    True otherwise.  With DEBUG set the decision is printed.
    """
    if DEBUG:
        # Single-argument print(...) runs on Python 2 and Python 3 alike
        print(str(index) + ":")
        if is_child(index, h_):
            print("\tIs a child")
            print("\tparent " + str(get_parent(index, h_)) + " has " + str(
                count_children(get_parent(index, h_), h_, contour)) + " children")
            print("\thas " + str(count_children(index, h_, contour)) + " children")

    if is_child(index, h_) and count_children(get_parent(index, h_), h_, contour) <= 2:
        if DEBUG:
            print("\t skipping: is an interior to a letter")
        return False

    if count_children(index, h_, contour) > 2:
        if DEBUG:
            print("\t skipping, is a container of letters")
        return False

    if DEBUG:
        print("\t keeping")
    return True
185 | 
# Load the image. cv2.imread does not raise on an unreadable or unsupported
# file, it returns None, so that case has to be checked explicitly.
orig_img = cv2.imread(input_file)
if orig_img is None:
    print("Could not read '%s' as an image" % input_file)
    sys.exit(1)

# Add a border to the image for processing sake
img = cv2.copyMakeBorder(orig_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT)

# Calculate the width and height of the (bordered) image
img_y = len(img)
img_x = len(img[0])

if DEBUG:
    print("Image is " + str(len(img)) + "x" + str(len(img[0])))

# Split out each channel
blue, green, red = cv2.split(img)

# Run canny edge detection on each channel
blue_edges = cv2.Canny(blue, 200, 250)
green_edges = cv2.Canny(green, 200, 250)
red_edges = cv2.Canny(red, 200, 250)

# Join edges back into image
edges = blue_edges | green_edges | red_edges

# Find the contours. OpenCV 3.x returns (image, contours, hierarchy) while
# 2.x and 4.x return (contours, hierarchy); the last two items are the same
# in every version.
found = cv2.findContours(edges.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
contours, hierarchy = found[-2], found[-1]

if hierarchy is None:
    # No contour at all: keep both sequences empty so the loops below
    # simply do nothing and a blank page is written.
    contours, hierarchy = [], []
else:
    # The hierarchy comes wrapped in an outer array of length one
    hierarchy = hierarchy[0]

if DEBUG:
    processed = edges.copy()
    rejected = edges.copy()
218 | 
# These are the boxes that we are determining; every entry is
# [contour, [x, y, w, h]]
keepers = []

# For each contour, find the bounding rectangle and decide
# if it's one we care about
for index_, contour_ in enumerate(contours):
    if DEBUG:
        print("Processing #%d" % index_)

    x, y, w, h = cv2.boundingRect(contour_)

    # Check the contour and its bounding box
    accepted = keep(contour_) and include_box(index_, hierarchy, contour_)
    if accepted:
        # It's a winner!
        keepers.append([contour_, [x, y, w, h]])

    if DEBUG:
        # Outline and number the box on whichever diagnostic image matches
        # the decision.
        canvas = processed if accepted else rejected
        cv2.rectangle(canvas, (x, y), (x + w, y + h), (100, 100, 100), 1)
        cv2.putText(canvas, str(index_), (x, y - 5), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255))
241 | 
# Make a white copy of our image
new_image = edges.copy()
new_image.fill(255)
boxes = []

# For each box, find the foreground and background intensities
for index_, (contour_, box) in enumerate(keepers):

    # Find the average intensity of the edge pixels to
    # determine the foreground intensity
    fg_int = 0.0
    for p in contour_:
        fg_int += ii(p[0][0], p[0][1])

    fg_int /= len(contour_)
    if DEBUG:
        print("FG Intensity for #%d = %d" % (index_, fg_int))

    # Find the intensity of three pixels going around the
    # outside of each corner of the bounding box to determine
    # the background intensity. (x_, y_) is the box corner with the
    # smallest coordinates; image rows grow downwards, so y_ - 1 lies
    # above the box.
    x_, y_, width, height = box
    bg_int = \
        [
            # top left corner 3 pixels
            ii(x_ - 1, y_ - 1),
            ii(x_ - 1, y_),
            ii(x_, y_ - 1),

            # top right corner 3 pixels
            ii(x_ + width + 1, y_ - 1),
            ii(x_ + width, y_ - 1),
            ii(x_ + width + 1, y_),

            # bottom left corner 3 pixels
            ii(x_ - 1, y_ + height + 1),
            ii(x_ - 1, y_ + height),
            ii(x_, y_ + height + 1),

            # bottom right corner 3 pixels
            ii(x_ + width + 1, y_ + height + 1),
            ii(x_ + width, y_ + height + 1),
            ii(x_ + width + 1, y_ + height)
        ]

    # Find the median of the background
    # pixels determined above
    bg_int = np.median(bg_int)

    if DEBUG:
        print("BG Intensity for #%d = %s" % (index_, repr(bg_int)))

    # Determine if the box should be inverted
    if fg_int >= bg_int:
        fg = 255
        bg = 0
    else:
        fg = 0
        bg = 255

    # Color every pixel of the box in one array operation instead of a
    # Python loop per pixel. The weights are the ones ii() applies (channels
    # are stored blue, green, red) and slicing clips the box to the image,
    # so out-of-range pixels are skipped just as before.
    region = img[y_:y_ + height, x_:x_ + width]
    intensity = 0.30 * region[:, :, 2] + 0.59 * region[:, :, 1] + 0.11 * region[:, :, 0]
    new_image[y_:y_ + height, x_:x_ + width] = np.where(intensity > fg_int, bg, fg)
314 | 
# blur a bit to improve ocr accuracy
new_image = cv2.blur(new_image, (2, 2))

# cv2.imwrite reports an unsuccessful write through its return value
# rather than an exception, so remember the result.
written = cv2.imwrite(output_file, new_image)

if DEBUG:
    cv2.imwrite('edges.png', edges)
    cv2.imwrite('processed.png', processed)
    cv2.imwrite('rejected.png', rejected)

if not written:
    # Exit non-zero so callers do not go on to use a missing output file
    print("Could not write '%s'" % output_file)
    sys.exit(1)
322 | 


--------------------------------------------------------------------------------