├── .DS_Store ├── IDtesseract.py ├── README.md ├── canny.py ├── passport.py ├── passportRecognizeNew.py └── single_test.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iChenwin/pytesseractID/5ded3b644c0b1ca34515134a6ea13286bfbe686d/.DS_Store -------------------------------------------------------------------------------- /IDtesseract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding=utf-8 3 | 4 | """ Use pytesseract(Google OCR library) recognize ID number. 5 | 使用 pytesseract 识别 18 位身份证号。 6 | """ 7 | 8 | from PIL import Image 9 | import pytesseract 10 | import cv2 11 | import os 12 | import string 13 | import json 14 | import re 15 | import numpy as np 16 | import heapq 17 | import threading 18 | import Queue 19 | import psutil 20 | import multiprocessing 21 | import time 22 | import sys 23 | 24 | # 单个图片识别item 25 | class ImageRecognizerItem(object): 26 | # 有点类似其它高级语言的构造函数 27 | def __init__(self, recognizedText, rect): 28 | self.rect = rect 29 | self.recognizedText = recognizedText 30 | self.dealedText = "" 31 | 32 | # 身份证信息类 33 | class IDcardInfo(object): 34 | # 有点类似其它高级语言的构造函数 35 | def __init__(self): 36 | self.IDNumber = "" 37 | self.name = "" 38 | self.sex = "" 39 | self.birthDate = "" 40 | self.address = "" 41 | self.issueDate = "" 42 | self.expiryDate = "" 43 | self.authority = "" 44 | 45 | def toJSON(self): 46 | return json.dumps(self, default=lambda o: o.__dict__, 47 | sort_keys=True, indent=4) 48 | 49 | class ThreadRecognize(threading.Thread): 50 | def __init__(self, queue): 51 | threading.Thread.__init__(self) 52 | self.queue = queue 53 | 54 | def run(self): 55 | while True: 56 | # check available memory 57 | virtualMemoryInfo = psutil.virtual_memory() 58 | availableMemory = virtualMemoryInfo.available 59 | # print(str(availableMemory/1025/1024)+"M") 60 | if availableMemory > MEMORY_WARNING: 61 | args = self.queue.get() 62 | recognizeImage(*args) 63 | self.queue.task_done() 64 | # else: 65 | # print("memory warning!") 66 | 67 | #在所有的框中挑出三个最宽的矩形框 68 | def findIDcnt(countours): 69 | #保存所有框的宽度 70 | widths = [] 71 | for idx, cnt in enumerate(countours): 72 | x, y, width, height = cv2.boundingRect(cnt) 73 | widths.insert(idx, width) 74 | 75 | #挑出宽度前三的三个宽度 76 | IDList = heapq.nlargest(3, widths) 77 | #根据这三个宽度,找出对应的那三个矩形框 78 | IDcnts = [] 79 | for idx, item in enumerate(IDList): 80 | index = widths.index(item) 81 | IDcnts.insert(idx, countours[index]) 82 | # print IDcnts 83 | 84 | return IDcnts 85 | 86 | # 图片路径 87 | filePath = '2.jpg' 88 | MEMORY_WARNING = 400*1024*1024 # 200M 89 | CPU_COUNT = multiprocessing.cpu_count() # 线程数 90 | ENABLE_THREAD = True # 是否开启多线程模式 91 | 92 | IDrect = () 93 | recognizedItems = [] 94 | handledTexts = {} 95 | 96 | # 使用Tesseract进行文字识别 97 | def recognizeImage(results, cvimage ,rect, language, charWhiteList=None): 98 | 99 | global IDrect 100 | 101 | if IDrect == rect: 102 | return 103 | 104 | config = "-psm 7" # single line mode 105 | if charWhiteList is not None: 106 | config += " -c tessedit_char_whitelist=" + charWhiteList 107 | 108 | image = Image.fromarray(cvimage) 109 | 110 | result = pytesseract.image_to_string(image, lang=language, config=config) 111 | string = re.sub("[\s+\.\!\/_,$%^*(+\"\'{}〔〕『』{}【】〖〗《》「」〈〉()()a-zA-Z]+|[+——!,。?、~@#¥%……&*()“”=:-`′-]+".decode("utf8"), "".decode("utf8"), result) 112 | 113 | if language == "eng" and len(result) == 18: 114 | handledTexts["IDnumber"] = result 115 | IDrect = rect 116 | elif string != "": 117 | item = ImageRecognizerItem(string, rect) 118 | results.append(item) 119 | 120 | # 省市列表 121 | provinces = [ 122 | "北京", 123 | "广东", 124 | "山东", 125 | "江苏", 126 | "河南", 127 | "上海", 128 | "河北", 129 | "浙江", 130 | "香港", 131 | "陕西", 132 | "湖南", 133 | "重庆", 134 | "福建", 135 | "天津", 136 | "云南", 137 | "四川", 138 | "广西", 139 | "安徽", 140 | "海南", 141 | "江西", 142 | "湖北", 143 | "山西", 144 | "辽宁", 145 | "台湾", 146 | "黑龙江", 147 | "内蒙古", 148 | "澳门", 149 | "贵州", 150 | "甘肃", 151 | "青海", 152 | "新疆", 153 | "西藏", 154 | "吉林", 155 | "宁夏" 156 | ] 157 | 158 | def handlePersonalInfo(): 159 | for idx, item in enumerate(reversed(recognizedItems)): 160 | 161 | if item.recognizedText.startswith(u"姓名"): 162 | handledTexts["name"] = item.recognizedText[2:] 163 | elif item.recognizedText.isdigit() and int(item.recognizedText) > 10000000: 164 | recognizedItems.remove(item) 165 | elif item.recognizedText.startswith("19") or item.recognizedText.startswith("20"): 166 | handledTexts["birthDate"] = item.recognizedText 167 | elif item.recognizedText.startswith(u"出生"): 168 | handledTexts["birthDate"] = item.recognizedText[2:] 169 | elif item.recognizedText.startswith(u"性别"): 170 | handledTexts["gender"] = item.recognizedText[2:] 171 | elif item.recognizedText.startswith(u"民族"): 172 | handledTexts["ethnic"] = item.recognizedText[2:] 173 | else: 174 | if item.recognizedText.startswith(u"公民身份号码"): 175 | if not handledTexts.has_key("IDnumber"): 176 | handledTexts["IDnumber"] = item.recognizedText[6:] 177 | continue 178 | 179 | if item.recognizedText.startswith(u"住址"): 180 | handledTexts["address"] = item.recognizedText[2:] 181 | else: 182 | handledTexts["address"] += item.recognizedText[2:] 183 | 184 | def main(): 185 | 186 | handledTexts["name"] = "" 187 | handledTexts["birthDate"] = "" 188 | handledTexts["gender"] = "" 189 | handledTexts["ethnic"] = "" 190 | handledTexts["IDnumber"] = "" 191 | handledTexts["address"] = "" 192 | 193 | # parse command line options 194 | if len(sys.argv) != 2: 195 | #print 'Usage: python input_name output_name' 196 | returnData = {'code':1001, 'data':'无效参数'} 197 | print json.dumps(returnData) 198 | exit(1) 199 | filePath = sys.argv[1] 200 | 201 | start = time.time() 202 | 203 | #print "<----- processing %s ----->" % filePath 204 | 205 | #身份证号码识别,先对图片进行黑白处理,裁剪出身份证号,然后识别 206 | img = cv2.imread(filePath, 0) 207 | img = cv2.resize(img, (1200, 900)) 208 | 209 | # 图片亮度调节 210 | # imgArr = np.array(img) 211 | # imgMean = np.mean(img) 212 | # imgcopy = imgArr - imgMean 213 | # imgcopy = imgcopy * 2 + imgMean * 3 214 | # imgcopy = imgcopy / 255 215 | 216 | #二值 217 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) 218 | retval, binaryed = cv2.threshold(img, 110, 255, cv2.THRESH_BINARY); 219 | 220 | #显示处理后图片,调试用 221 | # cv2.imshow("Binary", binaryed) 222 | # k = cv2.waitKey(0) 223 | 224 | #闭运算 225 | # closed = cv2.morphologyEx(binaryed, cv2.MORPH_CLOSE, kernel) 226 | # cv2.imshow("Close",closed) 227 | # k = cv2.waitKey(0) 228 | 229 | 230 | #开运算 231 | # opened = cv2.morphologyEx(binaryed, cv2.MORPH_OPEN, kernel) 232 | # cv2.imshow("Open", opened) 233 | # k = cv2.waitKey(0) 234 | 235 | #腐蚀图像 236 | # dilated = cv2.dilate(binaryed, kernel) 237 | # cv2.imshow("dilate", dilated) 238 | # k = cv2.waitKey(0) 239 | 240 | #膨胀图像,使身份证号连成一整块,方便裁剪 241 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (65, 20)) 242 | eroded = cv2.erode(binaryed, kernel) 243 | 244 | # cv2.imshow("cannyed", eroded) 245 | # k = cv2.waitKey(0) 246 | 247 | #黑白反色,将字转为白色,为下一步框选做准备 248 | inverted = cv2.bitwise_not(eroded) 249 | 250 | # cv2.imshow("inverted", inverted) 251 | # k = cv2.waitKey(0) 252 | 253 | #框选出前景中,识别出的文本块 254 | contours, hierarchy = cv2.findContours(inverted, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE) 255 | 256 | #在所有文本框中挑出最长的三个框,身份证号应该在其中 257 | IDcnts = findIDcnt(contours) 258 | 259 | #画框 260 | # cv2.drawContours(img, IDcnts, -1, (255,0,0), 3) 261 | # cv2.imshow("img", img) 262 | # k = cv2.waitKey(0) 263 | 264 | queue = Queue.Queue() 265 | if ENABLE_THREAD: 266 | for i in range(CPU_COUNT): 267 | t = ThreadRecognize(queue) 268 | t.setDaemon(True) 269 | t.start() 270 | 271 | IDimgs = [] 272 | for idx, IDcnt in enumerate(IDcnts): 273 | x, y, w, h = cv2.boundingRect(IDcnt) 274 | rect = (x, y, w, h) 275 | #裁剪图片,并储存在IDimgs中 276 | IDimg = img[y: y + h, x: x + w] 277 | IDimgs.insert(idx, IDimg) 278 | 279 | if ENABLE_THREAD: 280 | args = (recognizedItems, IDimg, rect, "eng", "0123456789X",) 281 | queue.put(args) 282 | else: 283 | recognizeImage(recognizedItems, IDimg, rect, "eng", "0123456789X") 284 | # cv2.imshow("IDimg", IDimg) 285 | # k = cv2.waitKey(0) 286 | 287 | textImgs = [] 288 | for idx, IDcnt in enumerate(contours): 289 | x, y, w, h = cv2.boundingRect(IDcnt) 290 | rect = (x, y, w, h) 291 | if IDrect == rect: 292 | break 293 | 294 | #裁剪图片,并储存在textImg中 295 | textImg = binaryed[y: y + h, x: x + w] 296 | # textImgs.insert(idx, textImg) 297 | 298 | if ENABLE_THREAD: 299 | args = (recognizedItems, textImg, rect, "chi_sim",) 300 | queue.put(args) 301 | else: 302 | recognizeImage(recognizedItems, textImg, rect, "chi_sim") 303 | 304 | # cv2.imshow("textImg", textImg) 305 | # k = cv2.waitKey(0) 306 | 307 | queue.join() 308 | 309 | handlePersonalInfo() 310 | result = json.dumps(handledTexts, default=lambda o: o.__dict__, sort_keys=False, indent=4) 311 | print json.dumps({'code':1000, 'data':json.loads(result)}) 312 | #print result 313 | cv2.destroyAllWindows() 314 | #print "<----- %.1f seconds used ----->" % (time.time() - start) 315 | 316 | if __name__ == "__main__": 317 | main() 318 | 319 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 使用 pytesseract 识别 18 位身份证号 2 | 3 | 项目调用 OpenCV 对图片进行预处理,裁剪出包含身份证号码的部分,然后调用 pytesseract 识别出号码。 4 | 5 | ### 版本 6 | `Python 2.7.13` 7 | 8 | ### 依赖库 9 | ``` 10 | PIL 11 | pytesseract 12 | tesseract 3.05.01 13 | numpy 1.13.1 14 | ``` 15 | 16 | ### 运行 17 | 18 | 将身份证照片拷贝至项目文件夹下,执行: 19 | `python IDtesseract.py 1.jpg` 20 | -------------------------------------------------------------------------------- /canny.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding=utf-8 3 | 4 | from PIL import Image 5 | import pytesseract 6 | import cv2 7 | import os 8 | import string 9 | import re 10 | import numpy as np 11 | import sys 12 | 13 | def main(): 14 | # parse command line options 15 | if len(sys.argv) != 2: 16 | print 'Usage: python input_name output_name' 17 | exit(1) 18 | filePath = sys.argv[1] 19 | 20 | print "<----- processing %s ----->" % filePath 21 | 22 | #身份证号码识别,先对图片进行黑白处理,裁剪出身份证号,然后识别 23 | img = cv2.imread(filePath, 0) 24 | img = cv2.resize(img, (1200, 900)) 25 | 26 | # 图片亮度调节 27 | # imgArr = np.array(img) 28 | # imgMean = np.mean(img) 29 | # imgcopy = imgArr - imgMean 30 | # imgcopy = imgcopy * 2 + imgMean * 3 31 | # imgcopy = imgcopy / 255 32 | 33 | canny = cv2.Canny(img, 60, 300) 34 | inverted = cv2.bitwise_not(canny) 35 | cv2.imshow('Canny', inverted) 36 | 37 | test1 = Image.fromarray(canny) 38 | test2 = Image.fromarray(inverted) 39 | 40 | result = pytesseract.image_to_string(test1, lang="eng", config="-c tessedit_char_whitelist=0123456789X") 41 | print result 42 | print "-------" 43 | result = pytesseract.image_to_string(test2, lang="eng") 44 | print result 45 | 46 | k = cv2.waitKey(0) 47 | 48 | if __name__ == "__main__": 49 | main() -------------------------------------------------------------------------------- /passport.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding:utf-8 3 | 4 | import sys 5 | import threading 6 | import multiprocessing 7 | import Queue 8 | import re 9 | import json 10 | import cv2 11 | import numpy as np 12 | # import os 13 | # import subprocess 14 | import pytesseract 15 | import psutil 16 | # from matplotlib import pyplot as plt 17 | from PIL import Image, ExifTags 18 | from pypinyin import pinyin, lazy_pinyin 19 | import pypinyin 20 | import time 21 | 22 | reload(sys) 23 | sys.setdefaultencoding("utf-8") 24 | 25 | # 护照中出现的省市列表 26 | passportProvinces = [ 27 | "北京", 28 | "广东", 29 | "山东", 30 | "江苏", 31 | "河南", 32 | "上海", 33 | "河北", 34 | "浙江", 35 | "香港", 36 | "陕西", 37 | "湖南", 38 | "重庆", 39 | "福建", 40 | "天津", 41 | "云南", 42 | "四川", 43 | "广西", 44 | "安徽", 45 | "海南", 46 | "江西", 47 | "湖北", 48 | "山西", 49 | "辽宁", 50 | "台湾", 51 | "黑龙江", 52 | "内蒙古", 53 | "澳门", 54 | "贵州", 55 | "甘肃", 56 | "青海", 57 | "新疆", 58 | "西藏", 59 | "吉林", 60 | "宁夏" 61 | ] 62 | 63 | # 护照中出现的月份英文缩写列表 64 | passportMonthAbbrs = [ 65 | "JAN", 66 | "FEB", 67 | "MAR", 68 | "APR", 69 | "MAY", 70 | "JUN", 71 | "JUL", 72 | "AUG", 73 | "SEP", 74 | "OCT", 75 | "NOV", 76 | "DEC" 77 | ] 78 | 79 | # 根据月份英文缩写获取数字月份 80 | def getMonthNumberStringWithAddr(addr): 81 | if addr == "JAN": 82 | return "01" 83 | elif addr == "FEB": 84 | return "02" 85 | elif addr == "MAR": 86 | return "03" 87 | elif addr == "APR": 88 | return "04" 89 | elif addr == "MAY": 90 | return "05" 91 | elif addr == "JUN": 92 | return "06" 93 | elif addr == "JUL": 94 | return "07" 95 | elif addr == "AUG": 96 | return "08" 97 | elif addr == "SEP": 98 | return "09" 99 | elif addr == "OCT": 100 | return "10" 101 | elif addr == "NOV": 102 | return "11" 103 | elif addr == "DEC": 104 | return "12" 105 | return "" 106 | 107 | def getMidX(rect): 108 | x0 = rect[0] 109 | x1 = rect[0] + rect[2] 110 | return (x0 + x1) * 0.5 111 | 112 | def getMidY(rect): 113 | y0 = rect[1] 114 | y1 = rect[1] + rect[3] 115 | return (y0 + y1) * 0.5 116 | 117 | # 修正图片旋转 118 | def fixRotation(filePath): 119 | try: 120 | image = Image.open(filePath) 121 | for orientation in ExifTags.TAGS.keys(): 122 | if ExifTags.TAGS[orientation] == 'Orientation': 123 | break 124 | exif = dict(image._getexif().items()) 125 | 126 | if exif[orientation] == 3: 127 | image = image.rotate(180, expand=True) 128 | elif exif[orientation] == 6: 129 | image = image.rotate(270, expand=True) 130 | elif exif[orientation] == 8: 131 | image = image.rotate(90, expand=True) 132 | image.save(filePath) 133 | image.close() 134 | 135 | except (AttributeError, KeyError, IndexError): 136 | # cases: image don't have getexif 137 | pass 138 | 139 | # 获取暗色在图片中所占百分比 140 | def getDarkColorPercent(image): 141 | height = np.size(image, 0) 142 | width = np.size(image, 1) 143 | imgSize = width * height 144 | result = cv2.threshold(image, 100, -1, cv2.THRESH_TOZERO)[1] 145 | nonzero = cv2.countNonZero(result) 146 | if nonzero > 0: 147 | return (imgSize - nonzero) / float(imgSize) 148 | else: 149 | return 0 150 | 151 | # 在图片中画出框 152 | def drawRects(image, rects): 153 | for rect in rects: 154 | cv2.rectangle(image, (int(rect[0]), int(rect[1])), (int( 155 | rect[0] + rect[2]), int(rect[1] + rect[3])), (0, 255, 0), 15, 8, 0) 156 | 157 | # 处理成黑白图片以便进行文字识别 158 | def dealImage(image, thresh): 159 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (2, 1)) 160 | dilate = cv2.dilate(image, kernel) 161 | 162 | gray = cv2.cvtColor(dilate, cv2.COLOR_RGB2GRAY) 163 | return cv2.threshold(gray, thresh, 255, cv2.THRESH_BINARY)[1] 164 | 165 | # 等比缩放图片 166 | def scaleImage(image, scale): 167 | height = np.size(image, 0) 168 | width = np.size(image, 1) 169 | dstSize = (int(width * scale), int(height * scale)) 170 | 171 | return cv2.resize(image, dstSize, None, 0, 0, cv2.INTER_LINEAR) 172 | 173 | # 检测可能包含文字的区域 174 | def detectTextRects(image, imageScale): 175 | # letterBoxes 176 | gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 177 | threshold = cv2.threshold(gray, 80, 255, cv2.THRESH_BINARY)[1] 178 | 179 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (130, 20)) 180 | result = cv2.dilate((255 - threshold), kernel) 181 | 182 | # // 检索轮廓并返回检测到的轮廓的个数 183 | contours = cv2.findContours(result, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0] 184 | 185 | maxValue = 200 * imageScale 186 | minValue = 40 * imageScale 187 | 188 | boundRect = [] 189 | for points in contours: 190 | appRect = cv2.boundingRect(points) # x y w h 191 | 192 | if (appRect[3] > maxValue and appRect[2] > maxValue): 193 | continue 194 | 195 | if (appRect[3] < minValue or appRect[2] < minValue): 196 | continue 197 | appRect = list(appRect) 198 | appRect[2] += 60 * imageScale 199 | appRect[3] += 15 * imageScale 200 | appRect[0] -= 30 * imageScale 201 | appRect[1] -= 7.5 * imageScale 202 | boundRect.append(tuple(appRect)) 203 | return boundRect 204 | 205 | 206 | # 执行文字识别shell并返回结果 207 | # def image_to_string(img, cleanup=True, plus=''): 208 | # # cleanup为True则识别完成后删除生成的文本文件 209 | # # plus参数为给tesseract的附加高级参数 210 | # try: 211 | # subprocess.check_output('tesseract ' + img + ' ' + img + ' ' + plus, shell=True) # 生成同名txt文件 212 | # except subprocess.CalledProcessError as e: 213 | # return "" 214 | # text = '' 215 | # with open(img + '.txt', 'r') as f: 216 | # text = f.read().strip() 217 | # if cleanup: 218 | # os.remove(img + '.txt') 219 | # return text 220 | 221 | # 护照信息类 222 | class PassportInfo(object): 223 | # 有点类似其它高级语言的构造函数 224 | def __init__(self): 225 | self.passportNumber = "" 226 | self.name = "" 227 | self.namePinyin = "" 228 | self.sex = "" 229 | self.nationality = "" 230 | self.birthDate = "" 231 | self.birthPlace = "" 232 | self.issueDate = "" 233 | self.issuePlace = "" 234 | self.expiryDate = "" 235 | self.authority = "" 236 | self.authorityEnglish = "" 237 | self.bearerSignature = "" 238 | self.firstBooklet = "" 239 | self.secondBooklet = "" 240 | 241 | def toJSON(self): 242 | return json.dumps(self, default=lambda o: o.__dict__, 243 | sort_keys=True, indent=4) 244 | 245 | # 单个图片识别item 246 | class ImageRecognizerItem(object): 247 | # 有点类似其它高级语言的构造函数 248 | def __init__(self, recognizedText, rect): 249 | self.rect = rect 250 | self.recognizedText = recognizedText 251 | self.dealedText = "" 252 | 253 | class ThreadRecognize(threading.Thread): 254 | def __init__(self, queue): 255 | threading.Thread.__init__(self) 256 | self.queue = queue 257 | 258 | def run(self): 259 | while True: 260 | # check available memory 261 | virtualMemoryInfo = psutil.virtual_memory() 262 | availableMemory = virtualMemoryInfo.available 263 | # print(str(availableMemory/1025/1024)+"M") 264 | if availableMemory > MEMORY_WARNING: 265 | args = self.queue.get() 266 | recognizeImage(*args) 267 | self.queue.task_done() 268 | # else: 269 | # print("memory warning!") 270 | 271 | # 判断是否只有中文 272 | def isChinese(string): 273 | tempString = string.replace(" ", "") 274 | # tempString = re.sub(ur"[^\u4e00-\u9fa5]", "", unicode(tempString, "utf8")) 275 | # return len(re.sub(ur"^[\u4e00-\u9fa5]+$", "", unicode(tempString, "utf8"))) == 0 276 | return len(re.sub(ur"^[\u4e00-\u9fa5]+$", "", tempString)) == 0 277 | 278 | # def getChineseString(string): 279 | # tempString = string.replace(" ", "") 280 | # tempString = re.sub(ur"[^\u4e00-\u9fa5]", "", unicode(tempString, "utf8")) 281 | # return tempString.encode("ascii") 282 | 283 | # 是否包含数字 284 | def hasNumbers(inputString): 285 | return bool(re.search(r'\d', inputString)) 286 | 287 | # 根据正则表达式进行替换,返回替换后的文本是否为空 288 | def replaceWithRegexIsEmpty(regex, string): 289 | return len(re.sub(regex, "", string)) == 0 290 | 291 | # 使用Tesseract进行文字识别 292 | def recognizeImage(results, cvimage ,rect, language, charWhiteList=None): 293 | config = "-psm 7" # single line mode 294 | if charWhiteList is not None: 295 | config += " -c tessedit_char_whitelist=" + charWhiteList 296 | 297 | image = Image.fromarray(cvimage) 298 | 299 | result = pytesseract.image_to_string(image, lang=language, config=config) 300 | 301 | item = ImageRecognizerItem(result, rect) 302 | results.append(item) 303 | 304 | # 处理ImageRecognizerItem 305 | def handleRecognizedItem(recognizedItem, passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates): 306 | recognizedText = recognizedItem.recognizedText.replace("\n", "") 307 | if (len(recognizedText) > 0): 308 | dealText = recognizedText.replace("<", "") 309 | dealText = dealText.replace(" ", "") 310 | isValid = re.sub("\w", "", dealText) == "" 311 | string = recognizedText.replace(" ", "") 312 | 313 | # newString = re.sub(ur"^[a-zA-Z0-9\u4e00-\u9fa5,/< ]+$", "", unicode(recognizedText, "utf8")) 314 | newString = re.sub(ur"^[a-zA-Z0-9\u4e00-\u9fa5,/< ]+$", "", recognizedText) 315 | 316 | # 底部两行 317 | if (isValid and len(string) == 44 and len(newString) == 0): 318 | recognizedText = string 319 | checkDigit = recognizedText[9: 9 + 1] 320 | if len(re.sub("^[1-9]+$", "", checkDigit)) != 0: 321 | surname = recognizedText[5:5 + 39] 322 | arr = filter(None, surname.split("<")) 323 | if len(arr) == 2: 324 | handledTexts["familyName"] = arr[0] 325 | handledTexts["givenName"] = arr[1] 326 | 327 | passportInfo.firstBooklet = recognizedText 328 | else: 329 | passportNumber = recognizedText[0:0 + 9] 330 | nationality = recognizedText[10:10 + 3] 331 | birth = recognizedText[13:13 + 6] 332 | sex = recognizedText[20:20 + 1] 333 | expiration = recognizedText[21:21 + 6] 334 | # personalNumber = recognizedText[28:28+14] 335 | 336 | handledTexts["passportNumber"] = passportNumber 337 | handledTexts["nationality"] = nationality 338 | handledTexts["birth"] = birth 339 | handledTexts["sex"] = sex 340 | handledTexts["expiration"] = expiration 341 | 342 | passportInfo.passportNumber = passportNumber 343 | passportInfo.sex = sex 344 | passportInfo.nationality = nationality 345 | passportInfo.secondBooklet = recognizedText 346 | else: 347 | # detect province 348 | # 可能是省市:字符串中包含省市的中文或拼音 349 | for province in passportProvinces: 350 | provincePinyin = ''.join(lazy_pinyin(unicode(province, 'utf8'))) 351 | # provincePinyin = ''.join(lazy_pinyin(province)) 352 | provincePinyin = provincePinyin.upper() 353 | string = recognizedText.replace(" ", "") 354 | if (province in string or provincePinyin in string): 355 | recognizedItem.dealedText = province 356 | possibleProvinces.append(recognizedItem) 357 | 358 | # detect date 359 | # 可能是日期:字符串中包含月份缩写 360 | for monthAddr in passportMonthAbbrs: 361 | if (monthAddr in recognizedText and hasNumbers(recognizedText)): 362 | possibleDates.append(recognizedItem) 363 | 364 | # 可能是姓名:字符串全是中文 365 | if isChinese(recognizedText): 366 | # recognizedItem.dealedText = getChineseString(recognizedText).encode("utf8") 367 | possibleNames.append(recognizedItem) 368 | 369 | # 最后处理 370 | # 只针对现版因私普通护照,旧版或其他类型护照的信息位置可能会有所不同 371 | # 护照种类:外交护照、公务护照、普通护照(因公普通护照、因私普通护照) 372 | def handledTextsForPassport(passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates): 373 | # find name 374 | # 条件:字符串拼音是在booklet中检测出的中文姓名拼音 375 | if handledTexts.has_key("familyName") and handledTexts.has_key("givenName"): 376 | fullnamePinyin = handledTexts["familyName"] + handledTexts["givenName"] 377 | fullnamePinyin = fullnamePinyin.upper() 378 | for item in possibleNames: 379 | name = item.recognizedText.replace(" ", "") 380 | # namePinyin = ''.join(lazy_pinyin(unicode(name, 'utf-8'))) 381 | namePinyin = ''.join(lazy_pinyin(name)) 382 | namePinyin = namePinyin.upper() 383 | if namePinyin == fullnamePinyin: 384 | passportInfo.name = name 385 | passportInfo.namePinyin = namePinyin 386 | 387 | # handle province 388 | # 条件:因为只会出现两个省市,并且上面的是出生地点,下面是签发地点 389 | if len(possibleProvinces) == 2: 390 | item0 = possibleProvinces[0] 391 | item1 = possibleProvinces[1] 392 | if (getMidY(item0.rect) > getMidY(item1.rect)): 393 | passportInfo.issuePlace = item0.dealedText 394 | passportInfo.birthPlace = item1.dealedText 395 | 396 | else: 397 | passportInfo.issuePlace = item1.dealedText 398 | passportInfo.birthPlace = item0.dealedText 399 | 400 | # issue and expiry date 401 | # 条件:27MAY1993类型的日期只有一个(出生日期);122月FEB2016有两个,上面是签发日期,下面是有效期至,如果只检测到一个此类型日期,根据booklet中检测出的两位年的日期920527,与之匹配检测出是签发日期还是有效期至 402 | issueOrExpiry = [] 403 | births = [] 404 | for item in possibleDates: 405 | date = item.recognizedText 406 | date = date.replace(" ", "") 407 | date = date.replace("/", "") 408 | 409 | # 27MAY1993 410 | if replaceWithRegexIsEmpty("^\d{2}[A-Za-z]{3}\d{4}$", date): 411 | births.append(date) 412 | # 122月FEB2016 413 | elif replaceWithRegexIsEmpty(u"^\d{3,4}月{1}[A-Za-z]{3}\d{4}$", date): 414 | issueOrExpiry.append(date) 415 | 416 | if len(births) == 1 and handledTexts.has_key("birth"): 417 | date = births[0] 418 | bookletDate = handledTexts["birth"] 419 | 420 | birthYear = date[5:5 + 4] 421 | birthMonth = getMonthNumberStringWithAddr(date[2:2 + 3]) 422 | if birthMonth != "": 423 | birthDay = date[0:0 + 2] 424 | 425 | # 与booklet上的日期比对 426 | if birthDay == bookletDate[4:4 + 2] and birthMonth == bookletDate[2:2 + 2] and birthYear.endswith(bookletDate[0:0 + 2]): 427 | passportInfo.birthDate = birthYear + " " + birthMonth + " " + birthDay 428 | 429 | if len(issueOrExpiry) > 0 and handledTexts.has_key("expiration"): 430 | bookletDate = handledTexts["expiration"] 431 | 432 | date0 = issueOrExpiry[0] 433 | day0 = date0[0:0 + 2] 434 | year0 = date0[len(date0) - 4:len(date0) - 4 + 4] 435 | monthAddr0 = date0[len(date0) - 7:len(date0) - 7 + 3] 436 | month0 = getMonthNumberStringWithAddr(monthAddr0) 437 | 438 | existExpiryDate = False 439 | # 与booklet上的日期比对 440 | if day0 == bookletDate[4:4 + 2] and month0 == bookletDate[2:2 + 2] and year0.endswith(bookletDate[0:0 + 2]): 441 | passportInfo.expiryDate = year0 + " " + month0 + " " + day0 442 | existExpiryDate = True 443 | 444 | if len(issueOrExpiry) == 2: 445 | date1 = issueOrExpiry[1] 446 | day1 = date1[0:0 + 2] 447 | year1 = date1[len(date1) - 4:len(date1) - 4 + 4] 448 | monthAddr1 = date1[len(date1) - 7:len(date1) - 7 + 3] 449 | month1 = getMonthNumberStringWithAddr(monthAddr1) 450 | 451 | if not existExpiryDate: 452 | # 与booklet上的日期比对 453 | if day1[4:4 + 2] and month1[2:2 + 2] and year1.endswith(bookletDate[0:0 + 2]): 454 | passportInfo.expiryDate = year1 + " " + month1 + " " + day1 455 | passportInfo.issueDate = year0 + " " + month0 + " " + day0 456 | 457 | existExpiryDate = True 458 | else: 459 | passportInfo.issueDate = year1 + " " + month1 + " " + day1 460 | 461 | # 图片路径 462 | filePath = '666.jpg' 463 | IMAGE_SCALE = 0.7 464 | MEMORY_WARNING = 400*1024*1024 # 200M 465 | ENABLE_THREAD = True 466 | def main(): 467 | if len(sys.argv) != 2: 468 | print 'Usage: python aruba.py image_name' 469 | exit(1) 470 | filePath = sys.argv[1] 471 | 472 | passportInfo = PassportInfo() 473 | recognizedItems = [] 474 | handledTexts = {} 475 | possibleNames = [] 476 | possibleProvinces = [] 477 | possibleDates = [] 478 | 479 | threads = [] 480 | 481 | IMAGE_SCALE = 0.7 482 | MEMORY_WARNING = 400*1024*1024 # 200M 483 | CPU_COUNT = multiprocessing.cpu_count() # 线程数 484 | 485 | #print '------------'+str(CPU_COUNT)+'----------------' 486 | 487 | ENABLE_THREAD = True # 是否开启多线程模式 488 | 489 | # 修正图片旋转,有时候手机拍出的照片会出现旋转的情况 490 | fixRotation(filePath) 491 | 492 | # 读取图片 493 | img = cv2.imread(filePath, 1) 494 | height = np.size(img, 0) 495 | width = np.size(img, 1) 496 | scale = 4000.0 * IMAGE_SCALE / width * 1.0 497 | 498 | # 拉伸图片到宽度为4000*IMAGE_SCALE 499 | img = scaleImage(img, scale) 500 | 501 | # 处理图片,以便使用Tesseract进行识别 502 | dealedImg = dealImage(img, 95) 503 | 504 | # 获取可能包含文字的区域 505 | rects = detectTextRects(img, IMAGE_SCALE) 506 | 507 | # 测试,画出框 508 | drawRects(img, rects) 509 | cv2.imwrite('pas.jpg', img) 510 | 511 | start_time = time.time() 512 | 513 | 514 | queue = Queue.Queue() 515 | if ENABLE_THREAD: 516 | for i in range(CPU_COUNT): 517 | t = ThreadRecognize(queue) 518 | t.setDaemon(True) 519 | t.start() 520 | 521 | for rect in rects: 522 | x = int(rect[0]) 523 | y = int(rect[1]) 524 | w = int(rect[2]) 525 | h = int(rect[3]) 526 | 527 | # 根据长宽过滤不太可能包含文字的图片 528 | if (((w > 50 * IMAGE_SCALE and w < 2000 * IMAGE_SCALE) or w > 2500 * IMAGE_SCALE) and (w > h)): 529 | crop_img = dealedImg[y:y + h, x:x + w] 530 | 531 | darkColorPercent = getDarkColorPercent(crop_img) 532 | 533 | # 根据图片中包含的黑色百分比过滤不太可能包含文字的图片 534 | if (darkColorPercent > 0.04 and darkColorPercent < 0.35): 535 | 536 | # result = "" 537 | 538 | # 长度很长的很可能就是booklets 539 | if w > 2500 * IMAGE_SCALE: 540 | if ENABLE_THREAD: 541 | args = (recognizedItems, crop_img, rect, "eng", "0123456789ABCDEFGHIJKMLNOPQRSTUVWXYZ\<",) 542 | # thread = threading.Thread(target=recognizeImage, args=(queue, recognizedItems, crop_img, rect, "eng", "0123456789ABCDEFGHIJKMLNOPQRSTUVWXYZ\<",)) 543 | # threads.append(thread) 544 | queue.put(args) 545 | else: 546 | recognizeImage(recognizedItems, crop_img, rect, "eng", "0123456789ABCDEFGHIJKMLNOPQRSTUVWXYZ\<") 547 | else: 548 | if ENABLE_THREAD: 549 | args = (recognizedItems, crop_img, rect, "eng+chi_sim",) 550 | # thread = threading.Thread(target=recognizeImage, args=(queue, recognizedItems, crop_img, rect, "eng+chi_sim",)) 551 | # threads.append(thread) 552 | queue.put(args) 553 | else: 554 | recognizeImage(recognizedItems, crop_img, rect, "eng+chi_sim") 555 | 556 | 557 | # if ENABLE_THREAD: 558 | # for t in threads: 559 | # t.setDaemon(True) 560 | # t.start() 561 | # # t.join() 562 | # # for t in threads: 563 | # # t.join() 564 | # queue.join() 565 | 566 | queue.join() 567 | 568 | for item in recognizedItems: 569 | # 对每个识别出的文字进行处理 570 | handleRecognizedItem(item, passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates) 571 | 572 | # # 对收集到的信息进行最后处理 573 | handledTextsForPassport(passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates) 574 | 575 | print("--- %s seconds ---" % (time.time() - start_time)) 576 | 577 | result = passportInfo.toJSON() 578 | #print(json.dumps(result)) 579 | print(result) 580 | 581 | if __name__ == "__main__": 582 | main() -------------------------------------------------------------------------------- /passportRecognizeNew.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding:utf-8 3 | 4 | import sys 5 | import threading 6 | import multiprocessing 7 | import Queue 8 | import re 9 | import json 10 | import cv2 11 | import numpy as np 12 | # import os 13 | # import subprocess 14 | import pytesseract 15 | import psutil 16 | # from matplotlib import pyplot as plt 17 | from PIL import Image, ExifTags 18 | from pypinyin import pinyin, lazy_pinyin 19 | import pypinyin 20 | import time 21 | 22 | reload(sys) 23 | sys.setdefaultencoding("utf-8") 24 | 25 | # 护照中出现的省市列表 26 | passportProvinces = [ 27 | "北京", 28 | "广东", 29 | "山东", 30 | "江苏", 31 | "河南", 32 | "上海", 33 | "河北", 34 | "浙江", 35 | "香港", 36 | "陕西", 37 | "湖南", 38 | "重庆", 39 | "福建", 40 | "天津", 41 | "云南", 42 | "四川", 43 | "广西", 44 | "安徽", 45 | "海南", 46 | "江西", 47 | "湖北", 48 | "山西", 49 | "辽宁", 50 | "台湾", 51 | "黑龙江", 52 | "内蒙古", 53 | "澳门", 54 | "贵州", 55 | "甘肃", 56 | "青海", 57 | "新疆", 58 | "西藏", 59 | "吉林", 60 | "宁夏" 61 | ] 62 | 63 | # 护照中出现的月份英文缩写列表 64 | passportMonthAbbrs = [ 65 | "JAN", 66 | "FEB", 67 | "MAR", 68 | "APR", 69 | "MAY", 70 | "JUN", 71 | "JUL", 72 | "AUG", 73 | "SEP", 74 | "OCT", 75 | "NOV", 76 | "DEC" 77 | ] 78 | 79 | # 根据月份英文缩写获取数字月份 80 | def getMonthNumberStringWithAddr(addr): 81 | if addr == "JAN": 82 | return "01" 83 | elif addr == "FEB": 84 | return "02" 85 | elif addr == "MAR": 86 | return "03" 87 | elif addr == "APR": 88 | return "04" 89 | elif addr == "MAY": 90 | return "05" 91 | elif addr == "JUN": 92 | return "06" 93 | elif addr == "JUL": 94 | return "07" 95 | elif addr == "AUG": 96 | return "08" 97 | elif addr == "SEP": 98 | return "09" 99 | elif addr == "OCT": 100 | return "10" 101 | elif addr == "NOV": 102 | return "11" 103 | elif addr == "DEC": 104 | return "12" 105 | return "" 106 | 107 | def getMidX(rect): 108 | x0 = rect[0] 109 | x1 = rect[0] + rect[2] 110 | return (x0 + x1) * 0.5 111 | 112 | def getMidY(rect): 113 | y0 = rect[1] 114 | y1 = rect[1] + rect[3] 115 | return (y0 + y1) * 0.5 116 | 117 | # 修正图片旋转 118 | def fixRotation(filePath): 119 | try: 120 | image = Image.open(filePath) 121 | for orientation in ExifTags.TAGS.keys(): 122 | if ExifTags.TAGS[orientation] == 'Orientation': 123 | break 124 | exif = dict(image._getexif().items()) 125 | 126 | if exif[orientation] == 3: 127 | image = image.rotate(180, expand=True) 128 | elif exif[orientation] == 6: 129 | image = image.rotate(270, expand=True) 130 | elif exif[orientation] == 8: 131 | image = image.rotate(90, expand=True) 132 | image.save(filePath) 133 | image.close() 134 | 135 | except (AttributeError, KeyError, IndexError): 136 | # cases: image don't have getexif 137 | pass 138 | 139 | # 获取暗色在图片中所占百分比 140 | def getDarkColorPercent(image): 141 | height = np.size(image, 0) 142 | width = np.size(image, 1) 143 | imgSize = width * height 144 | result = cv2.threshold(image, 100, -1, cv2.THRESH_TOZERO)[1] 145 | nonzero = cv2.countNonZero(result) 146 | if nonzero > 0: 147 | return (imgSize - nonzero) / float(imgSize) 148 | else: 149 | return 0 150 | 151 | # 在图片中画出框 152 | def drawRects(image, rects): 153 | for rect in rects: 154 | cv2.rectangle(image, (int(rect[0]), int(rect[1])), (int( 155 | rect[0] + rect[2]), int(rect[1] + rect[3])), (0, 255, 0), 15, 8, 0) 156 | 157 | # 处理成黑白图片以便进行文字识别 158 | def dealImage(image, thresh): 159 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (2, 1)) 160 | dilate = cv2.dilate(image, kernel) 161 | 162 | gray = cv2.cvtColor(dilate, cv2.COLOR_RGB2GRAY) 163 | return cv2.threshold(gray, thresh, 255, cv2.THRESH_BINARY)[1] 164 | 165 | # 等比缩放图片 166 | def scaleImage(image, scale): 167 | height = np.size(image, 0) 168 | width = np.size(image, 1) 169 | dstSize = (int(width * scale), int(height * scale)) 170 | 171 | return cv2.resize(image, dstSize, None, 0, 0, cv2.INTER_LINEAR) 172 | 173 | # 检测可能包含文字的区域 174 | def detectTextRects(image, imageScale): 175 | # letterBoxes 176 | gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 177 | threshold = cv2.threshold(gray, 80, 255, cv2.THRESH_BINARY)[1] 178 | 179 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (130, 20)) 180 | result = cv2.dilate((255 - threshold), kernel) 181 | 182 | # // 检索轮廓并返回检测到的轮廓的个数 183 | contours = cv2.findContours(result, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0] 184 | 185 | maxValue = 200 * imageScale 186 | minValue = 40 * imageScale 187 | 188 | boundRect = [] 189 | for points in contours: 190 | appRect = cv2.boundingRect(points) # x y w h 191 | 192 | if (appRect[3] > maxValue and appRect[2] > maxValue): 193 | continue 194 | 195 | if (appRect[3] < minValue or appRect[2] < minValue): 196 | continue 197 | appRect = list(appRect) 198 | appRect[2] += 60 * imageScale 199 | appRect[3] += 15 * imageScale 200 | appRect[0] -= 30 * imageScale 201 | appRect[1] -= 7.5 * imageScale 202 | boundRect.append(tuple(appRect)) 203 | return boundRect 204 | 205 | 206 | # 执行文字识别shell并返回结果 207 | # def image_to_string(img, cleanup=True, plus=''): 208 | # # cleanup为True则识别完成后删除生成的文本文件 209 | # # plus参数为给tesseract的附加高级参数 210 | # try: 211 | # subprocess.check_output('tesseract ' + img + ' ' + img + ' ' + plus, shell=True) # 生成同名txt文件 212 | # except subprocess.CalledProcessError as e: 213 | # return "" 214 | # text = '' 215 | # with open(img + '.txt', 'r') as f: 216 | # text = f.read().strip() 217 | # if cleanup: 218 | # os.remove(img + '.txt') 219 | # return text 220 | 221 | # 护照信息类 222 | class PassportInfo(object): 223 | # 有点类似其它高级语言的构造函数 224 | def __init__(self): 225 | self.passportNumber = "" 226 | self.name = "" 227 | self.namePinyin = "" 228 | self.sex = "" 229 | self.nationality = "" 230 | self.birthDate = "" 231 | self.birthPlace = "" 232 | self.issueDate = "" 233 | self.issuePlace = "" 234 | self.expiryDate = "" 235 | self.authority = "" 236 | self.authorityEnglish = "" 237 | self.bearerSignature = "" 238 | self.firstBooklet = "" 239 | self.secondBooklet = "" 240 | 241 | def toJSON(self): 242 | return json.dumps(self, default=lambda o: o.__dict__, 243 | sort_keys=True, indent=4) 244 | 245 | # 单个图片识别item 246 | class ImageRecognizerItem(object): 247 | # 有点类似其它高级语言的构造函数 248 | def __init__(self, recognizedText, rect): 249 | self.rect = rect 250 | self.recognizedText = recognizedText 251 | self.dealedText = "" 252 | 253 | class ThreadRecognize(threading.Thread): 254 | def __init__(self, queue): 255 | threading.Thread.__init__(self) 256 | self.queue = queue 257 | 258 | def run(self): 259 | while True: 260 | # check available memory 261 | virtualMemoryInfo = psutil.virtual_memory() 262 | availableMemory = virtualMemoryInfo.available 263 | # print(str(availableMemory/1025/1024)+"M") 264 | if availableMemory > MEMORY_WARNING: 265 | args = self.queue.get() 266 | recognizeImage(*args) 267 | self.queue.task_done() 268 | # else: 269 | # print("memory warning!") 270 | 271 | # 判断是否只有中文 272 | def isChinese(string): 273 | tempString = string.replace(" ", "") 274 | # tempString = re.sub(ur"[^\u4e00-\u9fa5]", "", unicode(tempString, "utf8")) 275 | # return len(re.sub(ur"^[\u4e00-\u9fa5]+$", "", unicode(tempString, "utf8"))) == 0 276 | return len(re.sub(ur"^[\u4e00-\u9fa5]+$", "", tempString)) == 0 277 | 278 | # def getChineseString(string): 279 | # tempString = string.replace(" ", "") 280 | # tempString = re.sub(ur"[^\u4e00-\u9fa5]", "", unicode(tempString, "utf8")) 281 | # return tempString.encode("ascii") 282 | 283 | # 是否包含数字 284 | def hasNumbers(inputString): 285 | return bool(re.search(r'\d', inputString)) 286 | 287 | # 根据正则表达式进行替换,返回替换后的文本是否为空 288 | def replaceWithRegexIsEmpty(regex, string): 289 | return len(re.sub(regex, "", string)) == 0 290 | 291 | # 使用Tesseract进行文字识别 292 | def recognizeImage(results, cvimage ,rect, language, charWhiteList=None): 293 | config = "-psm 7" # single line mode 294 | if charWhiteList is not None: 295 | config += " -c tessedit_char_whitelist=" + charWhiteList 296 | 297 | image = Image.fromarray(cvimage) 298 | 299 | result = pytesseract.image_to_string(image, lang=language, config=config) 300 | 301 | item = ImageRecognizerItem(result, rect) 302 | results.append(item) 303 | 304 | # 处理ImageRecognizerItem 305 | def handleRecognizedItem(recognizedItem, passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates): 306 | recognizedText = recognizedItem.recognizedText.replace("\n", "") 307 | if (len(recognizedText) > 0): 308 | dealText = recognizedText.replace("<", "") 309 | dealText = dealText.replace(" ", "") 310 | isValid = re.sub("\w", "", dealText) == "" 311 | string = recognizedText.replace(" ", "") 312 | 313 | # newString = re.sub(ur"^[a-zA-Z0-9\u4e00-\u9fa5,/< ]+$", "", unicode(recognizedText, "utf8")) 314 | newString = re.sub(ur"^[a-zA-Z0-9\u4e00-\u9fa5,/< ]+$", "", recognizedText) 315 | 316 | # 底部两行 317 | if (isValid and len(string) == 44 and len(newString) == 0): 318 | recognizedText = string 319 | checkDigit = recognizedText[9: 9 + 1] 320 | if len(re.sub("^[1-9]+$", "", checkDigit)) != 0: 321 | surname = recognizedText[5:5 + 39] 322 | arr = filter(None, surname.split("<")) 323 | if len(arr) == 2: 324 | handledTexts["familyName"] = arr[0] 325 | handledTexts["givenName"] = arr[1] 326 | 327 | passportInfo.firstBooklet = recognizedText 328 | else: 329 | passportNumber = recognizedText[0:0 + 9] 330 | nationality = recognizedText[10:10 + 3] 331 | birth = recognizedText[13:13 + 6] 332 | sex = recognizedText[20:20 + 1] 333 | expiration = recognizedText[21:21 + 6] 334 | # personalNumber = recognizedText[28:28+14] 335 | 336 | handledTexts["passportNumber"] = passportNumber 337 | handledTexts["nationality"] = nationality 338 | handledTexts["birth"] = birth 339 | handledTexts["sex"] = sex 340 | handledTexts["expiration"] = expiration 341 | 342 | passportInfo.passportNumber = passportNumber 343 | passportInfo.sex = sex 344 | passportInfo.nationality = nationality 345 | passportInfo.secondBooklet = recognizedText 346 | else: 347 | # detect province 348 | # 可能是省市:字符串中包含省市的中文或拼音 349 | for province in passportProvinces: 350 | provincePinyin = ''.join(lazy_pinyin(unicode(province, 'utf8'))) 351 | # provincePinyin = ''.join(lazy_pinyin(province)) 352 | provincePinyin = provincePinyin.upper() 353 | string = recognizedText.replace(" ", "") 354 | if (province in string or provincePinyin in string): 355 | recognizedItem.dealedText = province 356 | possibleProvinces.append(recognizedItem) 357 | 358 | # detect date 359 | # 可能是日期:字符串中包含月份缩写 360 | for monthAddr in passportMonthAbbrs: 361 | if (monthAddr in recognizedText and hasNumbers(recognizedText)): 362 | possibleDates.append(recognizedItem) 363 | 364 | # 可能是姓名:字符串全是中文 365 | if isChinese(recognizedText): 366 | # recognizedItem.dealedText = getChineseString(recognizedText).encode("utf8") 367 | possibleNames.append(recognizedItem) 368 | 369 | # 最后处理 370 | # 只针对现版因私普通护照,旧版或其他类型护照的信息位置可能会有所不同 371 | # 护照种类:外交护照、公务护照、普通护照(因公普通护照、因私普通护照) 372 | def handledTextsForPassport(passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates): 373 | # find name 374 | # 条件:字符串拼音是在booklet中检测出的中文姓名拼音 375 | if handledTexts.has_key("familyName") and handledTexts.has_key("givenName"): 376 | fullnamePinyin = handledTexts["familyName"] + handledTexts["givenName"] 377 | fullnamePinyin = fullnamePinyin.upper() 378 | for item in possibleNames: 379 | name = item.recognizedText.replace(" ", "") 380 | # namePinyin = ''.join(lazy_pinyin(unicode(name, 'utf-8'))) 381 | namePinyin = ''.join(lazy_pinyin(name)) 382 | namePinyin = namePinyin.upper() 383 | if namePinyin == fullnamePinyin: 384 | passportInfo.name = name 385 | passportInfo.namePinyin = namePinyin 386 | 387 | # handle province 388 | # 条件:因为只会出现两个省市,并且上面的是出生地点,下面是签发地点 389 | if len(possibleProvinces) == 2: 390 | item0 = possibleProvinces[0] 391 | item1 = possibleProvinces[1] 392 | if (getMidY(item0.rect) > getMidY(item1.rect)): 393 | passportInfo.issuePlace = item0.dealedText 394 | passportInfo.birthPlace = item1.dealedText 395 | 396 | else: 397 | passportInfo.issuePlace = item1.dealedText 398 | passportInfo.birthPlace = item0.dealedText 399 | 400 | # issue and expiry date 401 | # 条件:27MAY1993类型的日期只有一个(出生日期);122月FEB2016有两个,上面是签发日期,下面是有效期至,如果只检测到一个此类型日期,根据booklet中检测出的两位年的日期920527,与之匹配检测出是签发日期还是有效期至 402 | issueOrExpiry = [] 403 | births = [] 404 | for item in possibleDates: 405 | date = item.recognizedText 406 | date = date.replace(" ", "") 407 | date = date.replace("/", "") 408 | 409 | # 27MAY1993 410 | if replaceWithRegexIsEmpty("^\d{2}[A-Za-z]{3}\d{4}$", date): 411 | births.append(date) 412 | # 122月FEB2016 413 | elif replaceWithRegexIsEmpty(u"^\d{3,4}月{1}[A-Za-z]{3}\d{4}$", date): 414 | issueOrExpiry.append(date) 415 | 416 | if len(births) == 1 and handledTexts.has_key("birth"): 417 | date = births[0] 418 | bookletDate = handledTexts["birth"] 419 | 420 | birthYear = date[5:5 + 4] 421 | birthMonth = getMonthNumberStringWithAddr(date[2:2 + 3]) 422 | if birthMonth != "": 423 | birthDay = date[0:0 + 2] 424 | 425 | # 与booklet上的日期比对 426 | if birthDay == bookletDate[4:4 + 2] and birthMonth == bookletDate[2:2 + 2] and birthYear.endswith(bookletDate[0:0 + 2]): 427 | passportInfo.birthDate = birthYear + " " + birthMonth + " " + birthDay 428 | 429 | if len(issueOrExpiry) > 0 and handledTexts.has_key("expiration"): 430 | bookletDate = handledTexts["expiration"] 431 | 432 | date0 = issueOrExpiry[0] 433 | day0 = date0[0:0 + 2] 434 | year0 = date0[len(date0) - 4:len(date0) - 4 + 4] 435 | monthAddr0 = date0[len(date0) - 7:len(date0) - 7 + 3] 436 | month0 = getMonthNumberStringWithAddr(monthAddr0) 437 | 438 | existExpiryDate = False 439 | # 与booklet上的日期比对 440 | if day0 == bookletDate[4:4 + 2] and month0 == bookletDate[2:2 + 2] and year0.endswith(bookletDate[0:0 + 2]): 441 | passportInfo.expiryDate = year0 + " " + month0 + " " + day0 442 | existExpiryDate = True 443 | 444 | if len(issueOrExpiry) == 2: 445 | date1 = issueOrExpiry[1] 446 | day1 = date1[0:0 + 2] 447 | year1 = date1[len(date1) - 4:len(date1) - 4 + 4] 448 | monthAddr1 = date1[len(date1) - 7:len(date1) - 7 + 3] 449 | month1 = getMonthNumberStringWithAddr(monthAddr1) 450 | 451 | if not existExpiryDate: 452 | # 与booklet上的日期比对 453 | if day1[4:4 + 2] and month1[2:2 + 2] and year1.endswith(bookletDate[0:0 + 2]): 454 | passportInfo.expiryDate = year1 + " " + month1 + " " + day1 455 | passportInfo.issueDate = year0 + " " + month0 + " " + day0 456 | 457 | existExpiryDate = True 458 | else: 459 | passportInfo.issueDate = year1 + " " + month1 + " " + day1 460 | 461 | 462 | passportInfo = PassportInfo() 463 | recognizedItems = [] 464 | handledTexts = {} 465 | possibleNames = [] 466 | possibleProvinces = [] 467 | possibleDates = [] 468 | 469 | threads = [] 470 | 471 | # 图片路径 472 | filePath = 'pas.JPG' 473 | IMAGE_SCALE = 0.7 474 | MEMORY_WARNING = 400*1024*1024 # 200M 475 | CPU_COUNT = multiprocessing.cpu_count() # 线程数 476 | ENABLE_THREAD = True # 是否开启多线程模式 477 | 478 | # 修正图片旋转,有时候手机拍出的照片会出现旋转的情况 479 | fixRotation(filePath) 480 | 481 | # 读取图片 482 | img = cv2.imread(filePath, 1) 483 | height = np.size(img, 0) 484 | width = np.size(img, 1) 485 | scale = 4000.0 * IMAGE_SCALE / width * 1.0 486 | 487 | # 拉伸图片到宽度为4000*IMAGE_SCALE 488 | img = scaleImage(img, scale) 489 | 490 | # 处理图片,以便使用Tesseract进行识别 491 | dealedImg = dealImage(img, 95) 492 | 493 | # 获取可能包含文字的区域 494 | rects = detectTextRects(img, IMAGE_SCALE) 495 | 496 | # 测试,画出框 497 | drawRects(img, rects) 498 | 499 | start_time = time.time() 500 | 501 | 502 | queue = Queue.Queue() 503 | if ENABLE_THREAD: 504 | for i in range(CPU_COUNT): 505 | t = ThreadRecognize(queue) 506 | t.setDaemon(True) 507 | t.start() 508 | 509 | for rect in rects: 510 | x = int(rect[0]) 511 | y = int(rect[1]) 512 | w = int(rect[2]) 513 | h = int(rect[3]) 514 | 515 | # 根据长宽过滤不太可能包含文字的图片 516 | if (((w > 50 * IMAGE_SCALE and w < 2000 * IMAGE_SCALE) or w > 2500 * IMAGE_SCALE) and (w > h)): 517 | crop_img = dealedImg[y:y + h, x:x + w] 518 | 519 | darkColorPercent = getDarkColorPercent(crop_img) 520 | 521 | # 根据图片中包含的黑色百分比过滤不太可能包含文字的图片 522 | if (darkColorPercent > 0.04 and darkColorPercent < 0.35): 523 | 524 | # result = "" 525 | 526 | # 长度很长的很可能就是booklets 527 | if w > 2500 * IMAGE_SCALE: 528 | if ENABLE_THREAD: 529 | args = (recognizedItems, crop_img, rect, "eng", "0123456789ABCDEFGHIJKMLNOPQRSTUVWXYZ\<",) 530 | # thread = threading.Thread(target=recognizeImage, args=(queue, recognizedItems, crop_img, rect, "eng", "0123456789ABCDEFGHIJKMLNOPQRSTUVWXYZ\<",)) 531 | # threads.append(thread) 532 | queue.put(args) 533 | else: 534 | recognizeImage(recognizedItems, crop_img, rect, "eng", "0123456789ABCDEFGHIJKMLNOPQRSTUVWXYZ\<") 535 | else: 536 | if ENABLE_THREAD: 537 | args = (recognizedItems, crop_img, rect, "eng+chi_sim",) 538 | # thread = threading.Thread(target=recognizeImage, args=(queue, recognizedItems, crop_img, rect, "eng+chi_sim",)) 539 | # threads.append(thread) 540 | queue.put(args) 541 | else: 542 | recognizeImage(recognizedItems, crop_img, rect, "eng+chi_sim") 543 | 544 | 545 | # if ENABLE_THREAD: 546 | # for t in threads: 547 | # t.setDaemon(True) 548 | # t.start() 549 | # # t.join() 550 | # # for t in threads: 551 | # # t.join() 552 | # queue.join() 553 | 554 | queue.join() 555 | 556 | for item in recognizedItems: 557 | # 对每个识别出的文字进行处理 558 | handleRecognizedItem(item, passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates) 559 | 560 | # # 对收集到的信息进行最后处理 561 | handledTextsForPassport(passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates) 562 | 563 | print("--- %s seconds ---" % (time.time() - start_time)) 564 | 565 | result = passportInfo.toJSON() 566 | print(json.dumps(result)) 567 | 568 | -------------------------------------------------------------------------------- /single_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | 多线程测试 3 | """ 4 | #coding=utf-8 5 | from time import ctime, sleep 6 | import threading 7 | 8 | def music(name): 9 | for i in range(2): 10 | print "I'm listening %s at %s" % (name, ctime()) 11 | sleep(1) 12 | print "music done %s" % ctime() 13 | 14 | def movie(name): 15 | for i in range(2): 16 | print "I'm watching %s at %s" % (name, ctime()) 17 | sleep(5) 18 | print "movie done %s" % ctime() 19 | 20 | threads = [] 21 | t1 = threading.Thread(target = music, args = (u"成都",)) 22 | threads.append(t1) 23 | t2 = threading.Thread(target = movie, args = (u"阿凡达",)) 24 | threads.append(t2) 25 | 26 | if __name__ == "__main__": 27 | for t in threads: 28 | t.setDaemon(True) 29 | t.start() 30 | t.join() 31 | print "It's over! %s" % ctime() --------------------------------------------------------------------------------