├── .DS_Store
├── IDtesseract.py
├── README.md
├── canny.py
├── passport.py
├── passportRecognizeNew.py
└── single_test.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iChenwin/pytesseractID/5ded3b644c0b1ca34515134a6ea13286bfbe686d/.DS_Store


--------------------------------------------------------------------------------
/IDtesseract.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #coding=utf-8
  3 | 
  4 | """ Use pytesseract(Google OCR library) recognize ID number.
  5 |     使用 pytesseract 识别 18 位身份证号。
  6 | """
  7 | 
  8 | from PIL import Image
  9 | import pytesseract
 10 | import cv2
 11 | import os
 12 | import string
 13 | import json
 14 | import re
 15 | import numpy as np
 16 | import heapq
 17 | import threading
 18 | import Queue
 19 | import psutil
 20 | import multiprocessing
 21 | import time
 22 | import sys
 23 | 
 24 | # 单个图片识别item
 25 | class ImageRecognizerItem(object):
 26 |     # 有点类似其它高级语言的构造函数
 27 |     def __init__(self, recognizedText, rect):
 28 |         self.rect = rect
 29 |         self.recognizedText = recognizedText
 30 |         self.dealedText = ""
 31 | 
 32 | # 身份证信息类
 33 | class IDcardInfo(object):
 34 |     # 有点类似其它高级语言的构造函数
 35 |     def __init__(self):
 36 |         self.IDNumber = ""
 37 |         self.name = ""
 38 |         self.sex = ""
 39 |         self.birthDate = ""
 40 |         self.address = ""
 41 |         self.issueDate = ""
 42 |         self.expiryDate = ""
 43 |         self.authority = ""
 44 | 
 45 |     def toJSON(self):
 46 |         return json.dumps(self, default=lambda o: o.__dict__,
 47 |                           sort_keys=True, indent=4)
 48 | 
 49 | class ThreadRecognize(threading.Thread):
 50 |     def __init__(self, queue):
 51 |         threading.Thread.__init__(self)
 52 |         self.queue = queue
 53 | 
 54 |     def run(self):
 55 |         while True:
 56 |             # check available memory
 57 |             virtualMemoryInfo = psutil.virtual_memory()
 58 |             availableMemory = virtualMemoryInfo.available
 59 |             # print(str(availableMemory/1025/1024)+"M")
 60 |             if availableMemory > MEMORY_WARNING:
 61 |                 args = self.queue.get()
 62 |                 recognizeImage(*args)
 63 |                 self.queue.task_done()
 64 |             # else:
 65 |             #     print("memory warning!")
 66 | 
 67 | #在所有的框中挑出三个最宽的矩形框
 68 | def findIDcnt(countours):
 69 |     #保存所有框的宽度
 70 |     widths = []
 71 |     for idx, cnt in enumerate(countours):
 72 |         x, y, width, height = cv2.boundingRect(cnt)
 73 |         widths.insert(idx, width)
 74 |     
 75 |     #挑出宽度前三的三个宽度
 76 |     IDList = heapq.nlargest(3, widths)
 77 |     #根据这三个宽度，找出对应的那三个矩形框
 78 |     IDcnts = []
 79 |     for idx, item in enumerate(IDList):
 80 |         index = widths.index(item)
 81 |         IDcnts.insert(idx, countours[index])
 82 |     # print IDcnts
 83 | 
 84 |     return IDcnts
 85 | 
 86 | # Default image path; main() assigns its own local filePath from sys.argv.
 87 | filePath = '2.jpg'
 88 | MEMORY_WARNING = 400*1024*1024  # 400 MiB: workers take a task only while more than this is available
 89 | CPU_COUNT = multiprocessing.cpu_count() # number of worker threads main() starts
 90 | ENABLE_THREAD = True   # whether recognition runs on worker threads
 91 | 
 92 | IDrect = ()  # rect of the crop recognized as the ID number; set by recognizeImage
 93 | recognizedItems = []  # ImageRecognizerItem results collected by recognizeImage
 94 | handledTexts = {}  # extracted fields, keyed by field name
 95 | 
 96 | # 使用Tesseract进行文字识别
 97 | def recognizeImage(results, cvimage ,rect, language, charWhiteList=None):
 98 | 
 99 |     global IDrect
100 | 
101 |     if IDrect == rect:
102 |         return
103 | 
104 |     config = "-psm 7"   # single line mode
105 |     if charWhiteList is not None:
106 |         config += " -c tessedit_char_whitelist=" + charWhiteList
107 | 
108 |     image = Image.fromarray(cvimage)
109 | 
110 |     result = pytesseract.image_to_string(image, lang=language, config=config)
111 |     string = re.sub("[\s+\.\!\/_,$%^*(+\"\'{}〔〕『』｛｝【】〖〗《》「」〈〉（）()a-zA-Z]+|[+——！，。？、~@#￥%……&*（）“”=:-`′-]+".decode("utf8"), "".decode("utf8"), result)
112 | 
113 |     if language == "eng" and len(result) == 18:
114 |         handledTexts["IDnumber"] = result
115 |         IDrect = rect
116 |     elif string != "":
117 |         item = ImageRecognizerItem(string, rect)
118 |         results.append(item)
119 | 
120 | # Province and municipality names. Not referenced by the other code in this file.
121 | provinces = [
122 |     "北京",
123 |     "广东",
124 |     "山东",
125 |     "江苏",
126 |     "河南",
127 |     "上海",
128 |     "河北",
129 |     "浙江",
130 |     "香港",
131 |     "陕西",
132 |     "湖南",
133 |     "重庆",
134 |     "福建",
135 |     "天津",
136 |     "云南",
137 |     "四川",
138 |     "广西",
139 |     "安徽",
140 |     "海南",
141 |     "江西",
142 |     "湖北",
143 |     "山西",
144 |     "辽宁",
145 |     "台湾",
146 |     "黑龙江",
147 |     "内蒙古",
148 |     "澳门",
149 |     "贵州",
150 |     "甘肃",
151 |     "青海",
152 |     "新疆",
153 |     "西藏",
154 |     "吉林",
155 |     "宁夏"
156 | ]
157 | 
158 | def handlePersonalInfo():
159 |     for idx, item in enumerate(reversed(recognizedItems)):
160 | 
161 |         if item.recognizedText.startswith(u"姓名"):
162 |             handledTexts["name"] = item.recognizedText[2:]
163 |         elif item.recognizedText.isdigit() and int(item.recognizedText) > 10000000:
164 |             recognizedItems.remove(item)
165 |         elif item.recognizedText.startswith("19") or item.recognizedText.startswith("20"):
166 |             handledTexts["birthDate"] = item.recognizedText
167 |         elif item.recognizedText.startswith(u"出生"):
168 |             handledTexts["birthDate"] = item.recognizedText[2:]
169 |         elif item.recognizedText.startswith(u"性别"):
170 |             handledTexts["gender"] = item.recognizedText[2:]
171 |         elif item.recognizedText.startswith(u"民族"):
172 |             handledTexts["ethnic"] = item.recognizedText[2:]
173 |         else:
174 |             if item.recognizedText.startswith(u"公民身份号码"):
175 |                 if not handledTexts.has_key("IDnumber"):
176 |                     handledTexts["IDnumber"] = item.recognizedText[6:]
177 |                 continue
178 |                 
179 |             if item.recognizedText.startswith(u"住址"):
180 |                 handledTexts["address"] = item.recognizedText[2:]
181 |             else:
182 |                 handledTexts["address"] += item.recognizedText[2:]
183 | 
184 | def main():
185 | 
186 |     handledTexts["name"] = ""
187 |     handledTexts["birthDate"] = ""
188 |     handledTexts["gender"] = ""
189 |     handledTexts["ethnic"] = ""
190 |     handledTexts["IDnumber"] = ""
191 |     handledTexts["address"] = ""
192 | 
193 |     # parse command line options
194 |     if len(sys.argv) != 2:
195 |         #print 'Usage: python input_name output_name'
196 |         returnData = {'code':1001, 'data':'无效参数'}
197 |         print json.dumps(returnData)
198 |         exit(1)
199 |     filePath = sys.argv[1]
200 | 
201 |     start = time.time()
202 | 
203 |     #print "<----- processing %s ----->" % filePath
204 | 
205 |     #身份证号码识别，先对图片进行黑白处理，裁剪出身份证号，然后识别
206 |     img = cv2.imread(filePath, 0)
207 |     img = cv2.resize(img, (1200, 900)) 
208 | 
209 |     # 图片亮度调节
210 |     # imgArr = np.array(img)
211 |     # imgMean = np.mean(img)
212 |     # imgcopy = imgArr - imgMean
213 |     # imgcopy = imgcopy * 2 + imgMean * 3
214 |     # imgcopy = imgcopy / 255
215 | 
216 |     #二值
217 |     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
218 |     retval, binaryed = cv2.threshold(img, 110, 255, cv2.THRESH_BINARY);  
219 | 
220 |     #显示处理后图片，调试用
221 |     # cv2.imshow("Binary", binaryed)
222 |     # k = cv2.waitKey(0)
223 | 
224 |     #闭运算  
225 |     # closed = cv2.morphologyEx(binaryed, cv2.MORPH_CLOSE, kernel)  
226 |     # cv2.imshow("Close",closed)
227 |     # k = cv2.waitKey(0)
228 | 
229 | 
230 |     #开运算  
231 |     # opened = cv2.morphologyEx(binaryed, cv2.MORPH_OPEN, kernel)  
232 |     # cv2.imshow("Open", opened)
233 |     # k = cv2.waitKey(0)
234 | 
235 |     #腐蚀图像
236 |     # dilated = cv2.dilate(binaryed, kernel) 
237 |     # cv2.imshow("dilate", dilated)
238 |     # k = cv2.waitKey(0)
239 | 
240 |     #膨胀图像，使身份证号连成一整块，方便裁剪
241 |     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (65, 20)) 
242 |     eroded = cv2.erode(binaryed, kernel) 
243 | 
244 |     # cv2.imshow("cannyed", eroded)
245 |     # k = cv2.waitKey(0)
246 | 
247 |     #黑白反色，将字转为白色，为下一步框选做准备
248 |     inverted = cv2.bitwise_not(eroded)
249 | 
250 |     # cv2.imshow("inverted", inverted)
251 |     # k = cv2.waitKey(0)
252 | 
253 |     #框选出前景中，识别出的文本块
254 |     contours, hierarchy = cv2.findContours(inverted, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)  
255 | 
256 |     #在所有文本框中挑出最长的三个框，身份证号应该在其中
257 |     IDcnts = findIDcnt(contours)
258 | 
259 |     #画框
260 |     # cv2.drawContours(img, IDcnts, -1, (255,0,0), 3)
261 |     # cv2.imshow("img", img)
262 |     # k = cv2.waitKey(0)
263 | 
264 |     queue = Queue.Queue()
265 |     if ENABLE_THREAD:
266 |         for i in range(CPU_COUNT):
267 |             t = ThreadRecognize(queue)
268 |             t.setDaemon(True)
269 |             t.start()
270 | 
271 |     IDimgs = []
272 |     for idx, IDcnt in enumerate(IDcnts):
273 |         x, y, w, h = cv2.boundingRect(IDcnt)
274 |         rect = (x, y, w, h)
275 |         #裁剪图片，并储存在IDimgs中
276 |         IDimg = img[y: y + h, x: x + w]
277 |         IDimgs.insert(idx, IDimg)
278 | 
279 |         if ENABLE_THREAD:
280 |             args = (recognizedItems, IDimg, rect, "eng", "0123456789X",)
281 |             queue.put(args)
282 |         else:
283 |             recognizeImage(recognizedItems, IDimg, rect, "eng", "0123456789X")
284 |         # cv2.imshow("IDimg", IDimg)
285 |         # k = cv2.waitKey(0)
286 | 
287 |     textImgs = []
288 |     for idx, IDcnt in enumerate(contours):
289 |         x, y, w, h = cv2.boundingRect(IDcnt)
290 |         rect = (x, y, w, h)
291 |         if IDrect == rect:
292 |             break
293 | 
294 |         #裁剪图片，并储存在textImg中
295 |         textImg = binaryed[y: y + h, x: x + w]
296 |         # textImgs.insert(idx, textImg)
297 | 
298 |         if ENABLE_THREAD:
299 |             args = (recognizedItems, textImg, rect, "chi_sim",)
300 |             queue.put(args)
301 |         else:
302 |             recognizeImage(recognizedItems, textImg, rect, "chi_sim")
303 | 
304 |         # cv2.imshow("textImg", textImg)
305 |         # k = cv2.waitKey(0)
306 | 
307 |     queue.join()
308 | 
309 |     handlePersonalInfo()
310 |     result = json.dumps(handledTexts, default=lambda o: o.__dict__, sort_keys=False, indent=4)
311 |     print json.dumps({'code':1000, 'data':json.loads(result)})
312 |     #print result
313 |     cv2.destroyAllWindows()
314 |     #print "<----- %.1f seconds used ----->" % (time.time() - start)
315 | 
316 | if __name__ == "__main__":
317 |     main()
318 |     
319 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## 使用 pytesseract 识别 18 位身份证号
 2 | 
 3 | 项目调用 OpenCV 对图片进行预处理，裁剪出包含身份证号码的部分，然后调用 pytesseract 识别出号码。
 4 | 
 5 | ### 版本
 6 | `Python 2.7.13`  
 7 | 
 8 | ### 依赖库
 9 | ```
10 | PIL
11 | pytesseract
12 | tesseract    3.05.01
13 | numpy        1.13.1
14 | ```
15 | 
16 | ### 运行
17 | 
18 | 将身份证照片拷贝至项目文件夹下，执行：
19 | `python IDtesseract.py 1.jpg`
20 | 


--------------------------------------------------------------------------------
/canny.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #coding=utf-8
 3 | 
 4 | from PIL import Image
 5 | import pytesseract
 6 | import cv2
 7 | import os
 8 | import string
 9 | import re
10 | import numpy as np
11 | import sys
12 | 
13 | def main():
14 |     # parse command line options
15 |     if len(sys.argv) != 2:
16 |         print 'Usage: python input_name output_name'
17 |         exit(1)
18 |     filePath = sys.argv[1]
19 | 
20 |     print "<----- processing %s ----->" % filePath
21 | 
22 |     #身份证号码识别，先对图片进行黑白处理，裁剪出身份证号，然后识别
23 |     img = cv2.imread(filePath, 0)
24 |     img = cv2.resize(img, (1200, 900)) 
25 | 
26 |     # 图片亮度调节
27 |     # imgArr = np.array(img)
28 |     # imgMean = np.mean(img)
29 |     # imgcopy = imgArr - imgMean
30 |     # imgcopy = imgcopy * 2 + imgMean * 3
31 |     # imgcopy = imgcopy / 255
32 | 
33 |     canny = cv2.Canny(img, 60, 300)  
34 |     inverted = cv2.bitwise_not(canny)
35 |     cv2.imshow('Canny', inverted)
36 | 
37 |     test1 = Image.fromarray(canny)
38 |     test2 = Image.fromarray(inverted)
39 | 
40 |     result = pytesseract.image_to_string(test1, lang="eng", config="-c tessedit_char_whitelist=0123456789X")
41 |     print result
42 |     print "-------"
43 |     result = pytesseract.image_to_string(test2, lang="eng")
44 |     print result
45 | 
46 |     k = cv2.waitKey(0)
47 | 
48 | if __name__ == "__main__":
49 |     main()


--------------------------------------------------------------------------------
/passport.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # coding:utf-8
  3 | 
  4 | import sys
  5 | import threading
  6 | import multiprocessing
  7 | import Queue
  8 | import re
  9 | import json
 10 | import cv2
 11 | import numpy as np
 12 | # import os
 13 | # import subprocess
 14 | import pytesseract
 15 | import psutil
 16 | # from matplotlib import pyplot as plt
 17 | from PIL import Image, ExifTags
 18 | from pypinyin import pinyin, lazy_pinyin
 19 | import pypinyin
 20 | import time
 21 | 
 22 | reload(sys)  # Python 2 only (reload is not a builtin on Python 3)
 23 | sys.setdefaultencoding("utf-8")  # implicit str/unicode conversions use UTF-8 from here on
 24 | 
 25 | # Province and municipality names that can appear in a passport.
 26 | passportProvinces = [
 27 |     "北京",
 28 |     "广东",
 29 |     "山东",
 30 |     "江苏",
 31 |     "河南",
 32 |     "上海",
 33 |     "河北",
 34 |     "浙江",
 35 |     "香港",
 36 |     "陕西",
 37 |     "湖南",
 38 |     "重庆",
 39 |     "福建",
 40 |     "天津",
 41 |     "云南",
 42 |     "四川",
 43 |     "广西",
 44 |     "安徽",
 45 |     "海南",
 46 |     "江西",
 47 |     "湖北",
 48 |     "山西",
 49 |     "辽宁",
 50 |     "台湾",
 51 |     "黑龙江",
 52 |     "内蒙古",
 53 |     "澳门",
 54 |     "贵州",
 55 |     "甘肃",
 56 |     "青海",
 57 |     "新疆",
 58 |     "西藏",
 59 |     "吉林",
 60 |     "宁夏"
 61 | ]
 62 | 
 63 | # English month abbreviations that can appear in a passport.
 64 | passportMonthAbbrs = [
 65 |     "JAN",
 66 |     "FEB",
 67 |     "MAR",
 68 |     "APR",
 69 |     "MAY",
 70 |     "JUN",
 71 |     "JUL",
 72 |     "AUG",
 73 |     "SEP",
 74 |     "OCT",
 75 |     "NOV",
 76 |     "DEC"
 77 | ]
 78 | 
 79 | # 根据月份英文缩写获取数字月份
 80 | def getMonthNumberStringWithAddr(addr):
 81 |     if addr == "JAN":
 82 |         return "01"
 83 |     elif addr == "FEB":
 84 |         return "02"
 85 |     elif addr == "MAR":
 86 |         return "03"
 87 |     elif addr == "APR":
 88 |         return "04"
 89 |     elif addr == "MAY":
 90 |         return "05"
 91 |     elif addr == "JUN":
 92 |         return "06"
 93 |     elif addr == "JUL":
 94 |         return "07"
 95 |     elif addr == "AUG":
 96 |         return "08"
 97 |     elif addr == "SEP":
 98 |         return "09"
 99 |     elif addr == "OCT":
100 |         return "10"
101 |     elif addr == "NOV":
102 |         return "11"
103 |     elif addr == "DEC":
104 |         return "12"
105 |     return ""
106 | 
107 | def getMidX(rect):
108 |     x0 = rect[0]
109 |     x1 = rect[0] + rect[2]
110 |     return (x0 + x1) * 0.5
111 | 
112 | def getMidY(rect):
113 |     y0 = rect[1]
114 |     y1 = rect[1] + rect[3]
115 |     return (y0 + y1) * 0.5
116 | 
117 | # 修正图片旋转
118 | def fixRotation(filePath):
119 |     try:
120 |         image = Image.open(filePath)
121 |         for orientation in ExifTags.TAGS.keys():
122 |             if ExifTags.TAGS[orientation] == 'Orientation':
123 |                 break
124 |         exif = dict(image._getexif().items())
125 | 
126 |         if exif[orientation] == 3:
127 |             image = image.rotate(180, expand=True)
128 |         elif exif[orientation] == 6:
129 |             image = image.rotate(270, expand=True)
130 |         elif exif[orientation] == 8:
131 |             image = image.rotate(90, expand=True)
132 |         image.save(filePath)
133 |         image.close()
134 | 
135 |     except (AttributeError, KeyError, IndexError):
136 |         # cases: image don't have getexif
137 |         pass
138 | 
139 | # 获取暗色在图片中所占百分比
140 | def getDarkColorPercent(image):
141 |     height = np.size(image, 0)
142 |     width = np.size(image, 1)
143 |     imgSize = width * height
144 |     result = cv2.threshold(image, 100, -1, cv2.THRESH_TOZERO)[1]
145 |     nonzero = cv2.countNonZero(result)
146 |     if nonzero > 0:
147 |         return (imgSize - nonzero) / float(imgSize)
148 |     else:
149 |         return 0
150 | 
151 | # 在图片中画出框
152 | def drawRects(image, rects):
153 |     for rect in rects:
154 |         cv2.rectangle(image, (int(rect[0]), int(rect[1])), (int(
155 |             rect[0] + rect[2]), int(rect[1] + rect[3])), (0, 255, 0), 15, 8, 0)
156 | 
157 | # 处理成黑白图片以便进行文字识别
158 | def dealImage(image, thresh):
159 |     kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (2, 1))
160 |     dilate = cv2.dilate(image, kernel)
161 | 
162 |     gray = cv2.cvtColor(dilate, cv2.COLOR_RGB2GRAY)
163 |     return cv2.threshold(gray, thresh, 255, cv2.THRESH_BINARY)[1]
164 | 
165 | # 等比缩放图片
166 | def scaleImage(image, scale):
167 |     height = np.size(image, 0)
168 |     width = np.size(image, 1)
169 |     dstSize = (int(width * scale), int(height * scale))
170 | 
171 |     return cv2.resize(image, dstSize, None, 0, 0, cv2.INTER_LINEAR)
172 | 
173 | # 检测可能包含文字的区域
174 | def detectTextRects(image, imageScale):
175 |     # letterBoxes
176 |     gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
177 |     threshold = cv2.threshold(gray, 80, 255, cv2.THRESH_BINARY)[1]
178 | 
179 |     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (130, 20))
180 |     result = cv2.dilate((255 - threshold), kernel)
181 | 
182 |     # // 检索轮廓并返回检测到的轮廓的个数
183 |     contours = cv2.findContours(result, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0]
184 | 
185 |     maxValue = 200 * imageScale
186 |     minValue = 40 * imageScale
187 | 
188 |     boundRect = []
189 |     for points in contours:
190 |         appRect = cv2.boundingRect(points)  # x y w h
191 | 
192 |         if (appRect[3] > maxValue and appRect[2] > maxValue):
193 |             continue
194 | 
195 |         if (appRect[3] < minValue or appRect[2] < minValue):
196 |             continue
197 |         appRect = list(appRect)
198 |         appRect[2] += 60 * imageScale
199 |         appRect[3] += 15 * imageScale
200 |         appRect[0] -= 30 * imageScale
201 |         appRect[1] -= 7.5 * imageScale
202 |         boundRect.append(tuple(appRect))
203 |     return boundRect
204 | 
205 | 
206 | # 执行文字识别shell并返回结果
207 | # def image_to_string(img, cleanup=True, plus=''):
208 | #     # cleanup为True则识别完成后删除生成的文本文件
209 | #     # plus参数为给tesseract的附加高级参数
210 | #     try:
211 | #         subprocess.check_output('tesseract ' + img + ' ' + img + ' ' + plus, shell=True)  # 生成同名txt文件
212 | #     except subprocess.CalledProcessError as e:
213 | #         return ""
214 | #     text = ''
215 | #     with open(img + '.txt', 'r') as f:
216 | #         text = f.read().strip()
217 | #     if cleanup:
218 | #         os.remove(img + '.txt')
219 | #     return text
220 | 
221 | # 护照信息类
222 | class PassportInfo(object):
223 |     # 有点类似其它高级语言的构造函数
224 |     def __init__(self):
225 |         self.passportNumber = ""
226 |         self.name = ""
227 |         self.namePinyin = ""
228 |         self.sex = ""
229 |         self.nationality = ""
230 |         self.birthDate = ""
231 |         self.birthPlace = ""
232 |         self.issueDate = ""
233 |         self.issuePlace = ""
234 |         self.expiryDate = ""
235 |         self.authority = ""
236 |         self.authorityEnglish = ""
237 |         self.bearerSignature = ""
238 |         self.firstBooklet = ""
239 |         self.secondBooklet = ""
240 | 
241 |     def toJSON(self):
242 |         return json.dumps(self, default=lambda o: o.__dict__,
243 |                           sort_keys=True, indent=4)
244 | 
245 | # 单个图片识别item
246 | class ImageRecognizerItem(object):
247 |     # 有点类似其它高级语言的构造函数
248 |     def __init__(self, recognizedText, rect):
249 |         self.rect = rect
250 |         self.recognizedText = recognizedText
251 |         self.dealedText = ""
252 | 
253 | class ThreadRecognize(threading.Thread):
254 |     def __init__(self, queue):
255 |         threading.Thread.__init__(self)
256 |         self.queue = queue
257 | 
258 |     def run(self):
259 |         while True:
260 |             # check available memory
261 |             virtualMemoryInfo = psutil.virtual_memory()
262 |             availableMemory = virtualMemoryInfo.available
263 |             # print(str(availableMemory/1025/1024)+"M")
264 |             if availableMemory > MEMORY_WARNING:
265 |                 args = self.queue.get()
266 |                 recognizeImage(*args)
267 |                 self.queue.task_done()
268 |             # else:
269 |             #     print("memory warning!")
270 | 
271 | # 判断是否只有中文
272 | def isChinese(string):
273 |     tempString = string.replace(" ", "")
274 |     # tempString = re.sub(ur"[^\u4e00-\u9fa5]", "", unicode(tempString, "utf8"))
275 |     # return len(re.sub(ur"^[\u4e00-\u9fa5]+$", "", unicode(tempString, "utf8"))) == 0
276 |     return len(re.sub(ur"^[\u4e00-\u9fa5]+$", "", tempString)) == 0
277 | 
278 | # def getChineseString(string):
279 | #     tempString = string.replace(" ", "")
280 | #     tempString = re.sub(ur"[^\u4e00-\u9fa5]", "", unicode(tempString, "utf8"))
281 | #     return tempString.encode("ascii")
282 | 
283 | # 是否包含数字
284 | def hasNumbers(inputString):
285 |     return bool(re.search(r'\d', inputString))
286 | 
287 | # 根据正则表达式进行替换，返回替换后的文本是否为空
288 | def replaceWithRegexIsEmpty(regex, string):
289 |     return len(re.sub(regex, "", string)) == 0
290 | 
291 | # 使用Tesseract进行文字识别
292 | def recognizeImage(results, cvimage ,rect, language, charWhiteList=None):
293 |     config = "-psm 7"   # single line mode
294 |     if charWhiteList is not None:
295 |         config += " -c tessedit_char_whitelist=" + charWhiteList
296 | 
297 |     image = Image.fromarray(cvimage)
298 | 
299 |     result = pytesseract.image_to_string(image, lang=language, config=config)
300 | 
301 |     item = ImageRecognizerItem(result, rect)
302 |     results.append(item)
303 | 
304 | # 处理ImageRecognizerItem
305 | def handleRecognizedItem(recognizedItem, passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates):
306 |     recognizedText = recognizedItem.recognizedText.replace("\n", "")
307 |     if (len(recognizedText) > 0):
308 |         dealText = recognizedText.replace("<", "")
309 |         dealText = dealText.replace(" ", "")
310 |         isValid = re.sub("\w", "", dealText) == ""
311 |         string = recognizedText.replace(" ", "")
312 | 
313 |         # newString = re.sub(ur"^[a-zA-Z0-9\u4e00-\u9fa5,/< ]+$", "", unicode(recognizedText, "utf8"))
314 |         newString = re.sub(ur"^[a-zA-Z0-9\u4e00-\u9fa5,/< ]+$", "", recognizedText)
315 | 
316 |         # 底部两行
317 |         if (isValid and len(string) == 44 and len(newString) == 0):
318 |             recognizedText = string
319 |             checkDigit = recognizedText[9: 9 + 1]
320 |             if len(re.sub("^[1-9]+$", "", checkDigit)) != 0:
321 |                 surname = recognizedText[5:5 + 39]
322 |                 arr = filter(None, surname.split("<"))
323 |                 if len(arr) == 2:
324 |                     handledTexts["familyName"] = arr[0]
325 |                     handledTexts["givenName"] = arr[1]
326 | 
327 |                 passportInfo.firstBooklet = recognizedText
328 |             else:
329 |                 passportNumber = recognizedText[0:0 + 9]
330 |                 nationality = recognizedText[10:10 + 3]
331 |                 birth = recognizedText[13:13 + 6]
332 |                 sex = recognizedText[20:20 + 1]
333 |                 expiration = recognizedText[21:21 + 6]
334 |                 # personalNumber = recognizedText[28:28+14]
335 | 
336 |                 handledTexts["passportNumber"] = passportNumber
337 |                 handledTexts["nationality"] = nationality
338 |                 handledTexts["birth"] = birth
339 |                 handledTexts["sex"] = sex
340 |                 handledTexts["expiration"] = expiration
341 | 
342 |                 passportInfo.passportNumber = passportNumber
343 |                 passportInfo.sex = sex
344 |                 passportInfo.nationality = nationality
345 |                 passportInfo.secondBooklet = recognizedText
346 |         else:
347 |             # detect province
348 |             # 可能是省市：字符串中包含省市的中文或拼音
349 |             for province in passportProvinces:
350 |                 provincePinyin = ''.join(lazy_pinyin(unicode(province, 'utf8')))
351 |                 # provincePinyin = ''.join(lazy_pinyin(province))
352 |                 provincePinyin = provincePinyin.upper()
353 |                 string = recognizedText.replace(" ", "")
354 |                 if (province in string or provincePinyin in string):
355 |                     recognizedItem.dealedText = province
356 |                     possibleProvinces.append(recognizedItem)
357 | 
358 |             # detect date
359 |             # 可能是日期：字符串中包含月份缩写
360 |             for monthAddr in passportMonthAbbrs:
361 |                 if (monthAddr in recognizedText and hasNumbers(recognizedText)):
362 |                     possibleDates.append(recognizedItem)
363 | 
364 |             # 可能是姓名：字符串全是中文
365 |             if isChinese(recognizedText):
366 |                 # recognizedItem.dealedText = getChineseString(recognizedText).encode("utf8")
367 |                 possibleNames.append(recognizedItem)
368 | 
369 | # 最后处理
370 | # 只针对现版因私普通护照，旧版或其他类型护照的信息位置可能会有所不同
371 | # 护照种类：外交护照、公务护照、普通护照（因公普通护照、因私普通护照）
372 | def handledTextsForPassport(passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates):
373 |     # find name
374 |     # 条件：字符串拼音是在booklet中检测出的中文姓名拼音
375 |     if handledTexts.has_key("familyName") and handledTexts.has_key("givenName"):
376 |         fullnamePinyin = handledTexts["familyName"] + handledTexts["givenName"]
377 |         fullnamePinyin = fullnamePinyin.upper()
378 |         for item in possibleNames:
379 |             name = item.recognizedText.replace(" ", "")
380 |             # namePinyin = ''.join(lazy_pinyin(unicode(name, 'utf-8')))
381 |             namePinyin = ''.join(lazy_pinyin(name))
382 |             namePinyin = namePinyin.upper()
383 |             if namePinyin == fullnamePinyin:
384 |                 passportInfo.name = name
385 |                 passportInfo.namePinyin = namePinyin
386 | 
387 |     # handle province
388 |     # 条件：因为只会出现两个省市，并且上面的是出生地点，下面是签发地点
389 |     if len(possibleProvinces) == 2:
390 |         item0 = possibleProvinces[0]
391 |         item1 = possibleProvinces[1]
392 |         if (getMidY(item0.rect) > getMidY(item1.rect)):
393 |             passportInfo.issuePlace = item0.dealedText
394 |             passportInfo.birthPlace = item1.dealedText
395 | 
396 |         else:
397 |             passportInfo.issuePlace = item1.dealedText
398 |             passportInfo.birthPlace = item0.dealedText
399 | 
400 |     # issue and expiry date
401 |     # 条件：27MAY1993类型的日期只有一个（出生日期）；122月FEB2016有两个，上面是签发日期，下面是有效期至，如果只检测到一个此类型日期，根据booklet中检测出的两位年的日期920527，与之匹配检测出是签发日期还是有效期至
402 |     issueOrExpiry = []
403 |     births = []
404 |     for item in possibleDates:
405 |         date = item.recognizedText
406 |         date = date.replace(" ", "")
407 |         date = date.replace("/", "")
408 | 
409 |         # 27MAY1993
410 |         if replaceWithRegexIsEmpty("^\d{2}[A-Za-z]{3}\d{4}$", date):
411 |             births.append(date)
412 |         # 122月FEB2016
413 |         elif replaceWithRegexIsEmpty(u"^\d{3,4}月{1}[A-Za-z]{3}\d{4}$", date):
414 |             issueOrExpiry.append(date)
415 | 
416 |     if len(births) == 1 and handledTexts.has_key("birth"):
417 |         date = births[0]
418 |         bookletDate = handledTexts["birth"]
419 | 
420 |         birthYear = date[5:5 + 4]
421 |         birthMonth = getMonthNumberStringWithAddr(date[2:2 + 3])
422 |         if birthMonth != "":
423 |             birthDay = date[0:0 + 2]
424 | 
425 |             # 与booklet上的日期比对
426 |             if birthDay == bookletDate[4:4 + 2] and birthMonth == bookletDate[2:2 + 2] and birthYear.endswith(bookletDate[0:0 + 2]):
427 |                 passportInfo.birthDate = birthYear + " " + birthMonth + " " + birthDay
428 | 
429 |     if len(issueOrExpiry) > 0 and handledTexts.has_key("expiration"):
430 |         bookletDate = handledTexts["expiration"]
431 | 
432 |         date0 = issueOrExpiry[0]
433 |         day0 = date0[0:0 + 2]
434 |         year0 = date0[len(date0) - 4:len(date0) - 4 + 4]
435 |         monthAddr0 = date0[len(date0) - 7:len(date0) - 7 + 3]
436 |         month0 = getMonthNumberStringWithAddr(monthAddr0)
437 | 
438 |         existExpiryDate = False
439 |         # 与booklet上的日期比对
440 |         if day0 == bookletDate[4:4 + 2] and month0 == bookletDate[2:2 + 2] and year0.endswith(bookletDate[0:0 + 2]):
441 |             passportInfo.expiryDate = year0 + " " + month0 + " " + day0
442 |             existExpiryDate = True
443 | 
444 |         if len(issueOrExpiry) == 2:
445 |             date1 = issueOrExpiry[1]
446 |             day1 = date1[0:0 + 2]
447 |             year1 = date1[len(date1) - 4:len(date1) - 4 + 4]
448 |             monthAddr1 = date1[len(date1) - 7:len(date1) - 7 + 3]
449 |             month1 = getMonthNumberStringWithAddr(monthAddr1)
450 | 
451 |             if not existExpiryDate:
452 |                 # 与booklet上的日期比对
453 |                 if day1[4:4 + 2] and month1[2:2 + 2] and year1.endswith(bookletDate[0:0 + 2]):
454 |                     passportInfo.expiryDate = year1 + " " + month1 + " " + day1
455 |                     passportInfo.issueDate = year0 + " " + month0 + " " + day0
456 | 
457 |                     existExpiryDate = True
458 |             else:
459 |                 passportInfo.issueDate = year1 + " " + month1 + " " + day1
460 | 
461 | # Default image path; main() assigns its own local filePath from sys.argv.
462 | filePath = '666.jpg'
463 | IMAGE_SCALE = 0.7
464 | MEMORY_WARNING = 400*1024*1024  # 400 MiB: workers take a task only while more than this is available
465 | ENABLE_THREAD = True 
def main():
    """Recognize the passport image named on the command line and print JSON.

    Steps: fix the EXIF rotation of the file in place, scale the image to a
    width of 4000 * IMAGE_SCALE pixels, binarize it, detect candidate text
    boxes, run Tesseract on every plausible box (on worker threads when
    ENABLE_THREAD is set), then interpret the recognized strings and print
    the resulting PassportInfo as JSON.

    Exits with status 1 when the argument count is wrong or the image cannot
    be decoded. As a debugging aid the detected boxes are drawn and written
    to 'pas.jpg' in the current directory.
    """
    if len(sys.argv) != 2:
        print('Usage: python %s image_name' % sys.argv[0])
        sys.exit(1)
    filePath = sys.argv[1]

    passportInfo = PassportInfo()
    recognizedItems = []
    handledTexts = {}
    possibleNames = []
    possibleProvinces = []
    possibleDates = []

    # One worker thread per CPU core
    cpuCount = multiprocessing.cpu_count()

    # Characters allowed on the long machine-readable lines (same value as
    # the original "...\<" literal, written with an explicit backslash)
    bookletWhiteList = "0123456789ABCDEFGHIJKMLNOPQRSTUVWXYZ\\<"

    # Phone cameras often store the picture rotated; normalize it first
    fixRotation(filePath)

    img = cv2.imread(filePath, 1)
    if img is None:
        # cv2.imread signals an unreadable file by returning None
        print('Cannot read image: %s' % filePath)
        sys.exit(1)
    width = np.size(img, 1)
    scale = 4000.0 * IMAGE_SCALE / width

    # Stretch the image to a width of 4000 * IMAGE_SCALE
    img = scaleImage(img, scale)

    # Black-and-white version that is fed to Tesseract
    dealedImg = dealImage(img, 95)

    # Regions that may contain text
    rects = detectTextRects(img, IMAGE_SCALE)

    # Debug output: draw the detected boxes
    drawRects(img, rects)
    cv2.imwrite('pas.jpg', img)

    start_time = time.time()

    queue = Queue.Queue()
    if ENABLE_THREAD:
        for _ in range(cpuCount):
            worker = ThreadRecognize(queue)
            worker.daemon = True
            worker.start()

    for rect in rects:
        # The boxes are padded by detectTextRects, so the origin may lie
        # outside the image; a negative start would make the slice wrap.
        x = max(int(rect[0]), 0)
        y = max(int(rect[1]), 0)
        w = int(rect[2])
        h = int(rect[3])

        # Filter by size: very long boxes are most likely the booklet lines
        isBooklet = w > 2500 * IMAGE_SCALE
        isTextSized = 50 * IMAGE_SCALE < w < 2000 * IMAGE_SCALE
        if not ((isTextSized or isBooklet) and w > h):
            continue

        crop_img = dealedImg[y:y + h, x:x + w]
        if crop_img.size == 0:
            continue

        # Filter by the share of dark pixels in the crop
        darkColorPercent = getDarkColorPercent(crop_img)
        if not (0.04 < darkColorPercent < 0.35):
            continue

        if isBooklet:
            args = (recognizedItems, crop_img, rect, "eng", bookletWhiteList)
        else:
            args = (recognizedItems, crop_img, rect, "eng+chi_sim")

        if ENABLE_THREAD:
            queue.put(args)
        else:
            recognizeImage(*args)

    # Wait until the workers have processed every queued crop
    queue.join()

    for item in recognizedItems:
        # Classify each recognized string
        handleRecognizedItem(item, passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates)

    # Final pass over the collected candidates
    handledTextsForPassport(passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates)

    print("--- %s seconds ---" % (time.time() - start_time))

    result = passportInfo.toJSON()
    print(result)
580 | 
# Run the recognizer only when the file is executed as a script
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/passportRecognizeNew.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # coding:utf-8
  3 | 
  4 | import sys
  5 | import threading
  6 | import multiprocessing
  7 | import Queue
  8 | import re
  9 | import json
 10 | import cv2
 11 | import numpy as np
 12 | # import os
 13 | # import subprocess
 14 | import pytesseract
 15 | import psutil
 16 | # from matplotlib import pyplot as plt
 17 | from PIL import Image, ExifTags
 18 | from pypinyin import pinyin, lazy_pinyin
 19 | import pypinyin
 20 | import time
 21 | 
# Python 2 only: site.py deletes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back. The default encoding is then switched
# to UTF-8, which affects implicit str/unicode conversions process-wide.
reload(sys)
sys.setdefaultencoding("utf-8")
 24 | 
# Province / municipality / region names that can appear on a passport.
# handleRecognizedItem() searches recognized text for each name and for its
# upper-cased pinyin.
passportProvinces = [
    "北京",
    "广东",
    "山东",
    "江苏",
    "河南",
    "上海",
    "河北",
    "浙江",
    "香港",
    "陕西",
    "湖南",
    "重庆",
    "福建",
    "天津",
    "云南",
    "四川",
    "广西",
    "安徽",
    "海南",
    "江西",
    "湖北",
    "山西",
    "辽宁",
    "台湾",
    "黑龙江",
    "内蒙古",
    "澳门",
    "贵州",
    "甘肃",
    "青海",
    "新疆",
    "西藏",
    "吉林",
    "宁夏"
]
 62 | 
# English month abbreviations as printed in passport dates; used to spot
# recognized strings that may be dates.
passportMonthAbbrs = [
    "JAN",
    "FEB",
    "MAR",
    "APR",
    "MAY",
    "JUN",
    "JUL",
    "AUG",
    "SEP",
    "OCT",
    "NOV",
    "DEC"
]
 78 | 
 79 | # 根据月份英文缩写获取数字月份
 80 | def getMonthNumberStringWithAddr(addr):
 81 |     if addr == "JAN":
 82 |         return "01"
 83 |     elif addr == "FEB":
 84 |         return "02"
 85 |     elif addr == "MAR":
 86 |         return "03"
 87 |     elif addr == "APR":
 88 |         return "04"
 89 |     elif addr == "MAY":
 90 |         return "05"
 91 |     elif addr == "JUN":
 92 |         return "06"
 93 |     elif addr == "JUL":
 94 |         return "07"
 95 |     elif addr == "AUG":
 96 |         return "08"
 97 |     elif addr == "SEP":
 98 |         return "09"
 99 |     elif addr == "OCT":
100 |         return "10"
101 |     elif addr == "NOV":
102 |         return "11"
103 |     elif addr == "DEC":
104 |         return "12"
105 |     return ""
106 | 
def getMidX(rect):
    """Return the horizontal centre of rect, indexed as (x, y, width, height)."""
    left = rect[0]
    right = left + rect[2]
    return (left + right) * 0.5
111 | 
def getMidY(rect):
    """Return the vertical centre of rect, indexed as (x, y, width, height)."""
    top = rect[1]
    bottom = top + rect[3]
    return (top + bottom) * 0.5
116 | 
def fixRotation(filePath):
    """Rotate the image file upright according to its EXIF orientation.

    The file is rewritten in place only when the EXIF orientation is 3, 6 or
    8; in every other case (no EXIF data, no orientation tag, any other
    value) the file is left untouched, which avoids a needless lossy
    re-encode. The opened image is always closed.

    Args:
        filePath: path of the image file to fix.

    Raises:
        Whatever Image.open raises when the file cannot be opened.
    """
    # EXIF orientation value -> angle handed to Image.rotate
    rotationByOrientation = {3: 180, 6: 270, 8: 90}

    image = Image.open(filePath)
    try:
        # Not every image class provides _getexif
        readExif = getattr(image, "_getexif", None)
        if readExif is None:
            return
        try:
            exif = readExif()
        except (AttributeError, KeyError, IndexError):
            # Unreadable EXIF data is treated like missing EXIF data
            return
        if not exif:
            return

        # Look up the numeric id of the 'Orientation' tag
        orientationTag = None
        for tag, tagName in ExifTags.TAGS.items():
            if tagName == 'Orientation':
                orientationTag = tag
                break

        angle = rotationByOrientation.get(exif.get(orientationTag))
        if angle is None:
            return

        rotated = image.rotate(angle, expand=True)
        try:
            rotated.save(filePath)
        finally:
            rotated.close()
    finally:
        image.close()
138 | 
def getDarkColorPercent(image):
    """Return the fraction of pixels in image that are dark.

    cv2.threshold is called with THRESH_TOZERO and a threshold of 100, and
    the pixels that remain non-zero are counted as bright; the rest are dark.

    Returns:
        (total - bright) / total as a float, or 0 when no bright pixel is
        found (this includes a completely dark image).
    """
    rowCount = np.size(image, 0)
    columnCount = np.size(image, 1)
    totalPixels = columnCount * rowCount

    _, brightOnly = cv2.threshold(image, 100, -1, cv2.THRESH_TOZERO)
    brightPixels = cv2.countNonZero(brightOnly)

    if brightPixels <= 0:
        return 0
    return (totalPixels - brightPixels) / float(totalPixels)
150 | 
def drawRects(image, rects):
    """Draw every rect, indexed as (x, y, width, height), onto image.

    The outlines use colour (0, 255, 0) and a thickness of 15; image is
    modified in place and nothing is returned.
    """
    outlineColor = (0, 255, 0)
    for rect in rects:
        topLeft = (int(rect[0]), int(rect[1]))
        bottomRight = (int(rect[0] + rect[2]), int(rect[1] + rect[3]))
        cv2.rectangle(image, topLeft, bottomRight, outlineColor, 15, 8, 0)
156 | 
def dealImage(image, thresh):
    """Turn a colour image into a black-and-white image for text recognition.

    The image is dilated with a 2x1 cross-shaped kernel, converted to
    grayscale and binarized: cv2.threshold is applied with THRESH_BINARY,
    thresh as the threshold and 255 as the maximum value.

    Returns:
        The thresholded single-channel image.
    """
    crossKernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (2, 1))
    dilated = cv2.dilate(image, crossKernel)
    grayscale = cv2.cvtColor(dilated, cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(grayscale, thresh, 255, cv2.THRESH_BINARY)
    return binary
164 | 
def scaleImage(image, scale):
    """Resize image by the same factor in both directions.

    Args:
        image: image array; axis 0 is treated as height, axis 1 as width.
        scale: multiplier applied to width and height (the results are
            truncated to int).

    Returns:
        The resized image, produced with linear interpolation.
    """
    scaledWidth = int(np.size(image, 1) * scale)
    scaledHeight = int(np.size(image, 0) * scale)
    return cv2.resize(image, (scaledWidth, scaledHeight), None, 0, 0, cv2.INTER_LINEAR)
172 | 
def detectTextRects(image, imageScale):
    """Find rectangles in image that may contain a line of text.

    The image is binarized, inverted and dilated with a wide 130x20 kernel so
    that neighbouring characters merge into one blob; the bounding box of
    every outer contour is then filtered by size and padded.

    Args:
        image: colour image to search.
        imageScale: factor applied to the size limits and to the padding.

    Returns:
        A list of (x, y, width, height) tuples. Each box is grown by
        60 * imageScale horizontally and 15 * imageScale vertically, and its
        origin is kept inside the image (never negative).
    """
    # NOTE(review): callers pass an image loaded with cv2.imread, which is in
    # BGR order, while RGB2GRAY is used here - confirm whether that is meant.
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    threshold = cv2.threshold(gray, 80, 255, cv2.THRESH_BINARY)[1]

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (130, 20))
    result = cv2.dilate((255 - threshold), kernel)

    # findContours returns (contours, hierarchy) on OpenCV 2.4/4.x but
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list on
    # all of them, whereas index 0 is the image on 3.x.
    contours = cv2.findContours(result, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

    maxValue = 200 * imageScale
    minValue = 40 * imageScale

    boundRect = []
    for points in contours:
        x, y, w, h = cv2.boundingRect(points)

        # Too large in both directions: not a text line
        if h > maxValue and w > maxValue:
            continue

        # Too small in either direction
        if h < minValue or w < minValue:
            continue

        # Pad the box; clamp the origin so later array slicing with it does
        # not wrap around through a negative index.
        boundRect.append((
            max(x - 30 * imageScale, 0),
            max(y - 7.5 * imageScale, 0),
            w + 60 * imageScale,
            h + 15 * imageScale,
        ))
    return boundRect
204 | 
205 | 
206 | # 执行文字识别shell并返回结果
207 | # def image_to_string(img, cleanup=True, plus=''):
208 | #     # cleanup为True则识别完成后删除生成的文本文件
209 | #     # plus参数为给tesseract的附加高级参数
210 | #     try:
211 | #         subprocess.check_output('tesseract ' + img + ' ' + img + ' ' + plus, shell=True)  # 生成同名txt文件
212 | #     except subprocess.CalledProcessError as e:
213 | #         return ""
214 | #     text = ''
215 | #     with open(img + '.txt', 'r') as f:
216 | #         text = f.read().strip()
217 | #     if cleanup:
218 | #         os.remove(img + '.txt')
219 | #     return text
220 | 
class PassportInfo(object):
    """Container for the fields read from a passport; all start as ""."""

    # Names of the instance attributes created by __init__
    _FIELD_NAMES = (
        "passportNumber",
        "name",
        "namePinyin",
        "sex",
        "nationality",
        "birthDate",
        "birthPlace",
        "issueDate",
        "issuePlace",
        "expiryDate",
        "authority",
        "authorityEnglish",
        "bearerSignature",
        "firstBooklet",
        "secondBooklet",
    )

    def __init__(self):
        for fieldName in self._FIELD_NAMES:
            setattr(self, fieldName, "")

    def toJSON(self):
        """Return the instance attributes as JSON (sorted keys, indent 4)."""
        def attributesOf(obj):
            # Objects json cannot encode are replaced by their attribute dict
            return obj.__dict__

        return json.dumps(self, default=attributesOf, sort_keys=True, indent=4)
244 | 
class ImageRecognizerItem(object):
    """Recognition result for a single cropped image region."""

    def __init__(self, recognizedText, rect):
        # Text returned by the recognizer for this region
        self.recognizedText = recognizedText
        # Region the text was read from
        self.rect = rect
        # Normalized text, filled in later during post-processing
        self.dealedText = ""
252 | 
class ThreadRecognize(threading.Thread):
    """Worker thread that runs recognizeImage() on argument tuples from a queue.

    A job is taken from the queue only while the available system memory is
    above MEMORY_WARNING. Every job taken is acknowledged with task_done(),
    even when recognition fails, so queue.join() cannot block forever.
    """

    # Seconds to wait before re-checking memory when it is too low
    LOW_MEMORY_RETRY_DELAY = 0.1

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process queued jobs forever; meant to run as a daemon thread."""
        import traceback

        while True:
            # Check available memory before taking more work
            availableMemory = psutil.virtual_memory().available
            if availableMemory <= MEMORY_WARNING:
                # Back off instead of spinning at full CPU
                time.sleep(self.LOW_MEMORY_RETRY_DELAY)
                continue

            args = self.queue.get()
            try:
                recognizeImage(*args)
            except Exception:
                # Report the failure but keep the worker alive for later jobs
                traceback.print_exc()
            finally:
                self.queue.task_done()
270 | 
def isChinese(string):
    """Return True if string consists only of Chinese characters.

    Spaces are ignored. Characters in the range U+4E00..U+9FA5 count as
    Chinese. An empty or all-space string contains no Chinese character and
    therefore yields False.
    """
    tempString = string.replace(" ", "")
    # u"" instead of ur"" so the literal also parses on Python 3; \Z anchors
    # at the very end of the string.
    return re.match(u"[\u4e00-\u9fa5]+\\Z", tempString) is not None
277 | 
278 | # def getChineseString(string):
279 | #     tempString = string.replace(" ", "")
280 | #     tempString = re.sub(ur"[^\u4e00-\u9fa5]", "", unicode(tempString, "utf8"))
281 | #     return tempString.encode("ascii")
282 | 
def hasNumbers(inputString):
    """Return True if inputString contains at least one digit."""
    firstDigit = re.search(r'\d', inputString)
    return firstDigit is not None
286 | 
def replaceWithRegexIsEmpty(regex, string):
    """Return True if nothing is left of string after removing regex matches."""
    remainder = re.sub(regex, "", string)
    return not remainder
290 | 
def recognizeImage(results, cvimage ,rect, language, charWhiteList=None):
    """Recognize the text in cvimage with Tesseract and record the result.

    Args:
        results: list that receives a new ImageRecognizerItem.
        cvimage: image array of the cropped region.
        rect: region the crop came from; stored on the item.
        language: Tesseract language string, e.g. "eng" or "eng+chi_sim".
        charWhiteList: optional characters to restrict recognition to.
    """
    # "-psm 7" selects single-line mode
    # NOTE(review): newer Tesseract releases spell this option "--psm" - confirm
    # against the installed version.
    options = ["-psm 7"]
    if charWhiteList is not None:
        options.append("-c tessedit_char_whitelist=" + charWhiteList)

    pilImage = Image.fromarray(cvimage)
    text = pytesseract.image_to_string(pilImage, lang=language, config=" ".join(options))

    results.append(ImageRecognizerItem(text, rect))
303 | 
def handleRecognizedItem(recognizedItem, passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates):
    """Classify one recognized string and store what can be read from it.

    A 44-character string made only of word characters and "<" is treated as
    one of the two bottom lines of the passport (called "booklets" in this
    file):
      * when the character at index 9 is not 1-9, it is the name line;
        the family and given name are stored in handledTexts and the line in
        passportInfo.firstBooklet;
      * otherwise it is the data line; passport number, nationality, birth
        date, sex and expiry date are sliced out at fixed offsets.
    Any other string is added, at most once per list, to possibleProvinces,
    possibleDates and/or possibleNames for the final pass.

    Args:
        recognizedItem: ImageRecognizerItem to inspect; dealedText is set
            when a province matches.
        passportInfo: PassportInfo that receives the decoded fields.
        handledTexts: dict that receives the raw decoded values.
        possibleNames, possibleProvinces, possibleDates: candidate lists.
    """
    recognizedText = recognizedItem.recognizedText.replace("\n", "")
    if not recognizedText:
        return

    compactText = recognizedText.replace(" ", "")
    # Only word characters may remain once the "<" fillers are dropped
    isValid = re.sub(r"\w", "", compactText.replace("<", "")) == ""
    # Empty when the text holds nothing but letters, digits, Chinese
    # characters and ",/< "
    newString = re.sub(u"^[a-zA-Z0-9\u4e00-\u9fa5,/< ]+$", "", recognizedText)

    # The two bottom lines
    if isValid and len(compactText) == 44 and len(newString) == 0:
        line = compactText
        checkDigit = line[9:10]
        # NOTE(review): "0" is not accepted as a check digit here, so a data
        # line whose digit at index 9 is 0 is read as the name line - confirm
        # whether that is deliberate (for example to tolerate O/0 confusion).
        if len(re.sub("^[1-9]+$", "", checkDigit)) != 0:
            nameParts = [part for part in line[5:44].split("<") if part]
            if len(nameParts) == 2:
                handledTexts["familyName"] = nameParts[0]
                handledTexts["givenName"] = nameParts[1]

            passportInfo.firstBooklet = line
        else:
            passportNumber = line[0:9]
            nationality = line[10:13]
            birth = line[13:19]
            sex = line[20:21]
            expiration = line[21:27]

            handledTexts["passportNumber"] = passportNumber
            handledTexts["nationality"] = nationality
            handledTexts["birth"] = birth
            handledTexts["sex"] = sex
            handledTexts["expiration"] = expiration

            passportInfo.passportNumber = passportNumber
            passportInfo.sex = sex
            passportInfo.nationality = nationality
            passportInfo.secondBooklet = line
        return

    # Possibly a province: the text contains its Chinese name or its pinyin.
    # Several entries can match the same text (two entries share the pinyin
    # "SHANXI"); the last match wins and the item is appended only once.
    matchedProvince = None
    for province in passportProvinces:
        # On Python 2 the entries are UTF-8 byte strings
        provinceText = province.decode("utf8") if isinstance(province, bytes) else province
        provincePinyin = ''.join(lazy_pinyin(provinceText)).upper()
        if province in compactText or provincePinyin in compactText:
            matchedProvince = province
    if matchedProvince is not None:
        recognizedItem.dealedText = matchedProvince
        possibleProvinces.append(recognizedItem)

    # Possibly a date: the text contains a month abbreviation and a digit
    containsMonth = any(monthAddr in recognizedText for monthAddr in passportMonthAbbrs)
    if containsMonth and hasNumbers(recognizedText):
        possibleDates.append(recognizedItem)

    # Possibly a name: the text is entirely Chinese
    if isChinese(recognizedText):
        possibleNames.append(recognizedItem)
368 | 
369 | # 最后处理
370 | # 只针对现版因私普通护照，旧版或其他类型护照的信息位置可能会有所不同
371 | # 护照种类：外交护照、公务护照、普通护照（因公普通护照、因私普通护照）
372 | def handledTextsForPassport(passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates):
373 |     # find name
374 |     # 条件：字符串拼音是在booklet中检测出的中文姓名拼音
375 |     if handledTexts.has_key("familyName") and handledTexts.has_key("givenName"):
376 |         fullnamePinyin = handledTexts["familyName"] + handledTexts["givenName"]
377 |         fullnamePinyin = fullnamePinyin.upper()
378 |         for item in possibleNames:
379 |             name = item.recognizedText.replace(" ", "")
380 |             # namePinyin = ''.join(lazy_pinyin(unicode(name, 'utf-8')))
381 |             namePinyin = ''.join(lazy_pinyin(name))
382 |             namePinyin = namePinyin.upper()
383 |             if namePinyin == fullnamePinyin:
384 |                 passportInfo.name = name
385 |                 passportInfo.namePinyin = namePinyin
386 | 
387 |     # handle province
388 |     # 条件：因为只会出现两个省市，并且上面的是出生地点，下面是签发地点
389 |     if len(possibleProvinces) == 2:
390 |         item0 = possibleProvinces[0]
391 |         item1 = possibleProvinces[1]
392 |         if (getMidY(item0.rect) > getMidY(item1.rect)):
393 |             passportInfo.issuePlace = item0.dealedText
394 |             passportInfo.birthPlace = item1.dealedText
395 | 
396 |         else:
397 |             passportInfo.issuePlace = item1.dealedText
398 |             passportInfo.birthPlace = item0.dealedText
399 | 
400 |     # issue and expiry date
401 |     # 条件：27MAY1993类型的日期只有一个（出生日期）；122月FEB2016有两个，上面是签发日期，下面是有效期至，如果只检测到一个此类型日期，根据booklet中检测出的两位年的日期920527，与之匹配检测出是签发日期还是有效期至
402 |     issueOrExpiry = []
403 |     births = []
404 |     for item in possibleDates:
405 |         date = item.recognizedText
406 |         date = date.replace(" ", "")
407 |         date = date.replace("/", "")
408 | 
409 |         # 27MAY1993
410 |         if replaceWithRegexIsEmpty("^\d{2}[A-Za-z]{3}\d{4}$", date):
411 |             births.append(date)
412 |         # 122月FEB2016
413 |         elif replaceWithRegexIsEmpty(u"^\d{3,4}月{1}[A-Za-z]{3}\d{4}$", date):
414 |             issueOrExpiry.append(date)
415 | 
416 |     if len(births) == 1 and handledTexts.has_key("birth"):
417 |         date = births[0]
418 |         bookletDate = handledTexts["birth"]
419 | 
420 |         birthYear = date[5:5 + 4]
421 |         birthMonth = getMonthNumberStringWithAddr(date[2:2 + 3])
422 |         if birthMonth != "":
423 |             birthDay = date[0:0 + 2]
424 | 
425 |             # 与booklet上的日期比对
426 |             if birthDay == bookletDate[4:4 + 2] and birthMonth == bookletDate[2:2 + 2] and birthYear.endswith(bookletDate[0:0 + 2]):
427 |                 passportInfo.birthDate = birthYear + " " + birthMonth + " " + birthDay
428 | 
429 |     if len(issueOrExpiry) > 0 and handledTexts.has_key("expiration"):
430 |         bookletDate = handledTexts["expiration"]
431 | 
432 |         date0 = issueOrExpiry[0]
433 |         day0 = date0[0:0 + 2]
434 |         year0 = date0[len(date0) - 4:len(date0) - 4 + 4]
435 |         monthAddr0 = date0[len(date0) - 7:len(date0) - 7 + 3]
436 |         month0 = getMonthNumberStringWithAddr(monthAddr0)
437 | 
438 |         existExpiryDate = False
439 |         # 与booklet上的日期比对
440 |         if day0 == bookletDate[4:4 + 2] and month0 == bookletDate[2:2 + 2] and year0.endswith(bookletDate[0:0 + 2]):
441 |             passportInfo.expiryDate = year0 + " " + month0 + " " + day0
442 |             existExpiryDate = True
443 | 
444 |         if len(issueOrExpiry) == 2:
445 |             date1 = issueOrExpiry[1]
446 |             day1 = date1[0:0 + 2]
447 |             year1 = date1[len(date1) - 4:len(date1) - 4 + 4]
448 |             monthAddr1 = date1[len(date1) - 7:len(date1) - 7 + 3]
449 |             month1 = getMonthNumberStringWithAddr(monthAddr1)
450 | 
451 |             if not existExpiryDate:
452 |                 # 与booklet上的日期比对
453 |                 if day1[4:4 + 2] and month1[2:2 + 2] and year1.endswith(bookletDate[0:0 + 2]):
454 |                     passportInfo.expiryDate = year1 + " " + month1 + " " + day1
455 |                     passportInfo.issueDate = year0 + " " + month0 + " " + day0
456 | 
457 |                     existExpiryDate = True
458 |             else:
459 |                 passportInfo.issueDate = year1 + " " + month1 + " " + day1
460 | 
461 | 
# Results and candidate lists filled in while the image is processed
passportInfo = PassportInfo()
recognizedItems = []
handledTexts = {}
possibleNames = []
possibleProvinces = []
possibleDates = []

# Image path
filePath = 'pas.JPG'
# Extra factor applied to the 4000 px target width
IMAGE_SCALE = 0.7
MEMORY_WARNING = 400*1024*1024  # 400 MiB, in bytes; read by ThreadRecognize
CPU_COUNT = multiprocessing.cpu_count()  # number of worker threads
ENABLE_THREAD = True   # whether recognition runs on worker threads

# Characters allowed on the long machine-readable lines (same value as the
# original "...\<" literal, written with an explicit backslash)
BOOKLET_WHITELIST = "0123456789ABCDEFGHIJKMLNOPQRSTUVWXYZ\\<"

# Phone cameras often store the picture rotated; normalize it first
fixRotation(filePath)

# Read the image
img = cv2.imread(filePath, 1)
if img is None:
    # cv2.imread signals an unreadable file by returning None
    raise IOError('Cannot read image: %s' % filePath)
width = np.size(img, 1)
scale = 4000.0 * IMAGE_SCALE / width

# Stretch the image to a width of 4000 * IMAGE_SCALE
img = scaleImage(img, scale)

# Black-and-white version that is fed to Tesseract
dealedImg = dealImage(img, 95)

# Regions that may contain text
rects = detectTextRects(img, IMAGE_SCALE)

# Debug aid: draw the detected boxes
drawRects(img, rects)

start_time = time.time()

queue = Queue.Queue()
if ENABLE_THREAD:
    for i in range(CPU_COUNT):
        t = ThreadRecognize(queue)
        t.daemon = True
        t.start()

for rect in rects:
    # The boxes are padded by detectTextRects, so the origin may lie outside
    # the image; a negative start would make the slice wrap around.
    x = max(int(rect[0]), 0)
    y = max(int(rect[1]), 0)
    w = int(rect[2])
    h = int(rect[3])

    # Filter by size: very long boxes are most likely the booklet lines
    isBooklet = w > 2500 * IMAGE_SCALE
    isTextSized = 50 * IMAGE_SCALE < w < 2000 * IMAGE_SCALE
    if not ((isTextSized or isBooklet) and w > h):
        continue

    crop_img = dealedImg[y:y + h, x:x + w]
    if crop_img.size == 0:
        continue

    # Filter by the share of dark pixels in the crop
    darkColorPercent = getDarkColorPercent(crop_img)
    if not (0.04 < darkColorPercent < 0.35):
        continue

    if isBooklet:
        args = (recognizedItems, crop_img, rect, "eng", BOOKLET_WHITELIST)
    else:
        args = (recognizedItems, crop_img, rect, "eng+chi_sim")

    if ENABLE_THREAD:
        queue.put(args)
    else:
        recognizeImage(*args)

# Wait until the workers have processed every queued crop
queue.join()

for item in recognizedItems:
    # Classify each recognized string
    handleRecognizedItem(item, passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates)

# Final pass over the collected candidates
handledTextsForPassport(passportInfo, handledTexts, possibleNames, possibleProvinces, possibleDates)

print("--- %s seconds ---" % (time.time() - start_time))

# toJSON() already returns JSON text; dumping it again would print one
# quoted, escaped string instead of the object.
result = passportInfo.toJSON()
print(result)
567 | 
568 | 


--------------------------------------------------------------------------------
/single_test.py:
--------------------------------------------------------------------------------
#coding=utf-8
# The encoding declaration only takes effect on the first or second line of
# a file, so it has to come before the docstring.
"""
    Multithreading test
"""
 5 | from time import ctime, sleep
 6 | import threading
 7 | 
 8 | def music(name):
 9 |     for i in range(2):
10 |         print "I'm listening %s at %s" % (name, ctime())
11 |         sleep(1)
12 |         print "music done %s" % ctime()
13 | 
def movie(name):
    """Print two "watching" messages for name, pausing five seconds after each."""
    for i in range(2):
        # Parenthesized single-argument print works on Python 2 and 3
        print("I'm watching %s at %s" % (name, ctime()))
        sleep(5)
        print("movie done %s" % ctime())
19 | 
# One thread per demo function
threads = []
t1 = threading.Thread(target = music, args = (u"成都",))
threads.append(t1)
t2 = threading.Thread(target = movie, args = (u"阿凡达",))
threads.append(t2)

if __name__ == "__main__":
    for t in threads:
        t.daemon = True
        t.start()
    # Wait for every thread; joining only the last one started would let the
    # program end while another daemon thread is still running.
    for t in threads:
        t.join()
    print("It's over! %s" % ctime())


--------------------------------------------------------------------------------