├── .gitignore ├── LICENSE ├── README.md └── extract_text /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Jason Funk 2 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software 3 | and associated documentation files (the "Software"), to deal in the Software without restriction, 4 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 5 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 6 | subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial 9 | portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 12 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 13 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 14 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 15 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ocr-text-extraction 2 | =================== 3 | 4 | I am not actively supporting this script. It was just an experiment. 5 | 6 | Processes an image to extract the text portions. Primarily 7 | used for pre-processing for performing OCR. 8 | 9 | Implemented in Python using OpenCV. 10 | 11 | Based on the paper "Font and Background Color Independent Text Binarization" by 12 | T Kasar, J Kumar and A G Ramakrishnan 13 | http://www.m.cs.osakafu-u.ac.jp/cbdar2007/proceedings/papers/O1-1.pdf 14 | 15 | Copyright (c) 2012, Jason Funk 16 | -------------------------------------------------------------------------------- /extract_text: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Processes an image to extract the text portions. Primarily 4 | # used for pre-processing for performing OCR. 5 | 6 | # Based on the paper "Font and Background Color Independent Text Binarization" by 7 | # T Kasar, J Kumar and A G Ramakrishnan 8 | # http://www.m.cs.osakafu-u.ac.jp/cbdar2007/proceedings/papers/O1-1.pdf 9 | 10 | # Copyright (c) 2012, Jason Funk 11 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software 12 | # and associated documentation files (the "Software"), to deal in the Software without restriction, 13 | # including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 14 | # and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | # 17 | # The above copyright notice and this permission notice shall be included in all copies or substantial 18 | # portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import os.path
import sys

import cv2
import numpy as np

# Command line: <script> input_file output_file
# Exit with a non-zero status on misuse so calling scripts can detect failure.
if len(sys.argv) != 3:
    print("%s input_file output_file" % (sys.argv[0]))
    sys.exit(1)
else:
    input_file = sys.argv[1]
    output_file = sys.argv[2]

if not os.path.isfile(input_file):
    print("No such file '%s'" % input_file)
    sys.exit(1)

# Set to a true value to print per-contour decisions and to write the
# edges/processed/rejected debug images at the end of the run.
DEBUG = 0


# Determine pixel intensity
# Apparently human eyes register colors differently.
# TVs use this formula to determine
# pixel intensity = 0.30R + 0.59G + 0.11B
#
# xx is the column and yy the row in the bordered image `img`, whose
# channels are ordered blue, green, red. Coordinates outside the image
# give an intensity of 0.
def ii(xx, yy):
    # Negative coordinates have to be rejected explicitly: numpy indexing
    # would otherwise wrap around and read from the opposite edge.
    if xx < 0 or yy < 0 or yy >= img_y or xx >= img_x:
        return 0
    pixel = img[yy][xx]
    return 0.30 * pixel[2] + 0.59 * pixel[1] + 0.11 * pixel[0]


# A quick test to check whether the contour is
# a connected shape: its first and last points must be at most
# one pixel apart in both x and y.
def connected(contour):
    first = contour[0][0]
    last = contour[-1][0]
    return abs(first[0] - last[0]) <= 1 and abs(first[1] - last[1]) <= 1


# Helper function to return a given contour
def c(index):
    return contours[index]


# Count the number of real children
#
# h_ is the contour hierarchy; this script reads each row as
# [next sibling, previous sibling, first child, parent], with a negative
# value meaning "none". `contour` is unused and only kept so the
# signature stays the same.
def count_children(index, h_, contour):
    first_child = h_[index][2]

    # No children
    if first_child < 0:
        return 0

    # If the first child is a contour we care about
    # then count it, otherwise don't
    count = 1 if keep(c(first_child)) else 0

    # Also count all of the child's siblings and their children
    count += count_siblings(first_child, h_, contour, True)
    return count


# Quick check to test if the contour is a child
# (index 0 is a valid parent, only negative values mean "no parent")
def is_child(index, h_):
    return get_parent(index, h_) >= 0


# Get the first parent of the contour that we care about.
# Returns a negative value when no ancestor passes keep().
def get_parent(index, h_):
    parent = h_[index][3]
    # Test the sentinel first so that c() is never called with a negative
    # index, which would silently pick a contour from the end of the list.
    while parent >= 0 and not keep(c(parent)):
        parent = h_[parent][3]

    return parent


# Count the number of relevant siblings of a contour
# When inc_children is true the children of `index` and of every kept
# sibling are counted as well.
def count_siblings(index, h_, contour, inc_children=False):
    # Include the children if necessary
    if inc_children:
        count = count_children(index, h_, contour)
    else:
        count = 0

    # Look ahead (column 0), then behind (column 1). Index 0 is a valid
    # contour, so only negative values end the walk.
    for link in (0, 1):
        sibling = h_[index][link]
        while sibling >= 0:
            if keep(c(sibling)):
                count += 1
                if inc_children:
                    count += count_children(sibling, h_, contour)
            sibling = h_[sibling][link]
    return count


# Whether we care about this contour
def keep(contour):
    return keep_box(contour) and connected(contour)


# Whether we should keep the containing box of this
# contour based on its shape
def keep_box(contour):
    xx, yy, w_, h_ = cv2.boundingRect(contour)

    # width and height need to be floats
    w_ *= 1.0
    h_ *= 1.0
    ratio = w_ / h_

    # Test its shape - if it's too oblong or tall it's
    # probably not a real character
    if ratio < 0.1 or ratio > 10:
        if DEBUG:
            print("\t Rejected because of shape: (" + str(xx) + "," + str(yy) + "," + str(w_) + "," + str(h_) + ")" +
                  str(ratio))
        return False

    # check size of the box: reject anything larger than a fifth of the
    # image or smaller than 15 pixels. Floor division keeps the Python 2
    # integer semantics under Python 3 as well.
    area = w_ * h_
    if area > ((img_x * img_y) // 5) or area < 15:
        if DEBUG:
            print("\t Rejected because of size")
        return False

    return True


# Decide whether the contour at `index` is a character outline: it is
# rejected when it is the interior of a letter (its kept parent has at most
# two children) or a container of letters (it has more than two children).
def include_box(index, h_, contour):
    child = is_child(index, h_)
    # The recursive child counts are expensive, so the parent's count is
    # computed once and reused for both the debug output and the decision.
    parent_children = count_children(get_parent(index, h_), h_, contour) if child else 0

    if DEBUG:
        print(str(index) + ":")
        if child:
            print("\tIs a child")
            print("\tparent " + str(get_parent(index, h_)) + " has " + str(parent_children) + " children")
        print("\thas " + str(count_children(index, h_, contour)) + " children")

    if child and parent_children <= 2:
        if DEBUG:
            print("\t skipping: is an interior to a letter")
        return False

    if count_children(index, h_, contour) > 2:
        if DEBUG:
            print("\t skipping, is a container of letters")
        return False

    if DEBUG:
        print("\t keeping")
    return True

# Load the image
orig_img = cv2.imread(input_file)

# imread reports an unreadable or unsupported file by returning None
# instead of raising, so check before using the result.
if orig_img is None:
    print("Unable to read image '%s'" % input_file)
    sys.exit(1)

# Add a border to the image for processing sake
img = cv2.copyMakeBorder(orig_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT)

# Calculate the width and height of the image
img_y = len(img)
img_x = len(img[0])

if DEBUG:
    print("Image is " + str(len(img)) + "x" + str(len(img[0])))

# Split out each channel
blue, green, red = cv2.split(img)

# Run canny edge detection on each channel
blue_edges = cv2.Canny(blue, 200, 250)
green_edges = cv2.Canny(green, 200, 250)
red_edges = cv2.Canny(red, 200, 250)

# Join edges back into image
edges = blue_edges | green_edges | red_edges

# Find the contours
# OpenCV 3.x returns (image, contours, hierarchy) while other versions
# return (contours, hierarchy); the last two items are the same in both.
contours, hierarchy = cv2.findContours(edges.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)[-2:]

# hierarchy is None when no contour was found at all
hierarchy = hierarchy[0] if hierarchy is not None else []

if DEBUG:
    processed = edges.copy()
    rejected = edges.copy()

# These are the boxes that we are determining
keepers = []

# For each contour, find the bounding rectangle and decide
# if it's one we care about
for index_, contour_ in enumerate(contours):
    if DEBUG:
        print("Processing #%d" % index_)

    x, y, w, h = cv2.boundingRect(contour_)

    # Check the contour and its bounding box
    if keep(contour_) and include_box(index_, hierarchy, contour_):
        # It's a winner!
        keepers.append([contour_, [x, y, w, h]])
        if DEBUG:
            cv2.rectangle(processed, (x, y), (x + w, y + h), (100, 100, 100), 1)
            cv2.putText(processed, str(index_), (x, y - 5), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255))
    elif DEBUG:
        cv2.rectangle(rejected, (x, y), (x + w, y + h), (100, 100, 100), 1)
        cv2.putText(rejected, str(index_), (x, y - 5), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255))

# Make a white copy of our image
new_image = edges.copy()
new_image.fill(255)
boxes = []

# For each box, find the foreground and background intensities
for index_, (contour_, box) in enumerate(keepers):

    # Find the average intensity of the edge pixels to
    # determine the foreground intensity
    fg_int = sum((ii(p[0][0], p[0][1]) for p in contour_), 0.0)
    fg_int /= len(contour_)
    if DEBUG:
        print("FG Intensity for #%d = %d" % (index_, fg_int))

    # Find the intensity of three pixels going around the
    # outside of each corner of the bounding box to determine
    # the background intensity
    x_, y_, width, height = box
    bg_int = \
        [
            # 3 pixels around the (x_, y_) corner
            ii(x_ - 1, y_ - 1),
            ii(x_ - 1, y_),
            ii(x_, y_ - 1),

            # 3 pixels around the (x_ + width, y_) corner
            ii(x_ + width + 1, y_ - 1),
            ii(x_ + width, y_ - 1),
            ii(x_ + width + 1, y_),

            # 3 pixels around the (x_, y_ + height) corner
            ii(x_ - 1, y_ + height + 1),
            ii(x_ - 1, y_ + height),
            ii(x_, y_ + height + 1),

            # 3 pixels around the (x_ + width, y_ + height) corner
            ii(x_ + width + 1, y_ + height + 1),
            ii(x_ + width, y_ + height + 1),
            ii(x_ + width + 1, y_ + height),
        ]

    # Find the median of the background
    # pixels determined above
    bg_int = np.median(bg_int)

    if DEBUG:
        print("BG Intensity for #%d = %s" % (index_, repr(bg_int)))

    # Determine if the box should be inverted
    if fg_int >= bg_int:
        fg = 255
        bg = 0
    else:
        fg = 0
        bg = 255

    # Color every pixel in the box: pixels brighter than the foreground
    # intensity get the background color, all others the foreground color.
    # This applies the same formula as ii() to the whole box at once instead
    # of calling it per pixel; slicing clips at the image edge, which covers
    # the out-of-bounds pixels the per-pixel version skipped.
    region = img[y_:y_ + height, x_:x_ + width]
    intensity = 0.30 * region[:, :, 2] + 0.59 * region[:, :, 1] + 0.11 * region[:, :, 0]
    new_image[y_:y_ + height, x_:x_ + width] = np.where(intensity > fg_int, bg, fg)

# blur a bit to improve ocr accuracy
new_image = cv2.blur(new_image, (2, 2))
cv2.imwrite(output_file, new_image)
if DEBUG:
    cv2.imwrite('edges.png', edges)
    cv2.imwrite('processed.png', processed)
    cv2.imwrite('rejected.png', rejected)