├── .gitignore
├── fuck_sjtu_captcha.py
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
1 | !*.py


--------------------------------------------------------------------------------
/fuck_sjtu_captcha.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | 
  3 | """
  4 | 验证码处理步骤:
  5 | 
  6 | 1. 二值化
  7 | 2. 去噪点(由于sjtu验证码没有噪点，不需要这步)
  8 | 3. 字符切割
  9 | 4. 单个字符图片旋转到合适角度:旋转卡壳算法（投影至x轴长度最小）(效果不好，sjtu的验证码都没什么旋转，暂时不用后续再加)
 10 | 5. 缩放到相同大小
 11 | 6. 持久化，string hickle
 12 | """
 13 | from PIL import Image
 14 | import numpy as np
 15 | import os
 16 | import pickle
 17 | from itertools import groupby
 18 | 
 19 | from utils import (
 20 | 	COLOR_RGB_BLACK, COLOR_RGB_WHITE, COLOR_RGBA_BLACK, COLOR_RGBA_WHITE,
 21 | 	BORDER_LEFT, BORDER_TOP, BORDER_RIGHT, BORDER_BOTTOM,
 22 | 	RAW_DATA_DIR, PROCESSED_DATA_DIR, LABELS_DIR,
 23 | 	NORM_SIZE,
 24 | )
 25 | 
 26 | # 存放处理后的图片数据
 27 | if not os.path.exists(PROCESSED_DATA_DIR):
 28 | 	os.mkdir(PROCESSED_DATA_DIR)
 29 | 
 30 | class SJTUCaptcha(object):
 31 | 	def __init__(self, image):
 32 | 		"""
 33 | 		初始化
 34 | 		:param image: 验证码图片文件 Image Object
 35 | 		:param manual: 是否人工验证, 默认为False, 采用机器验证
 36 | 		"""
 37 | 		if isinstance(image, str) or isinstance(image, unicode):
 38 | 			self.name = image.split('/')[-1].split('.')[0]
 39 | 		if isinstance(image, file) or isinstance(image, str) or isinstance(image, unicode):
 40 | 			self._image = Image.open(image)
 41 | 		elif isinstance(image, JpegImageFile):
 42 | 			self._image = image
 43 | 		else:
 44 | 			raise Exception('captcha image file is unavailable')
 45 | 
 46 | 	def preprocess(self):
 47 | 		# 获取验证码预处理结果: 返回二维list，一行表示一个child image
 48 | 		res = []
 49 | 
 50 | 		store_path = PROCESSED_DATA_DIR + self.name.split('.')[0]
 51 | 		if not os.path.exists(store_path):
 52 | 			os.mkdir(store_path)
 53 | 
 54 | 		self._binaryzation()
 55 | 		
 56 | 		child_images = self._cut_images()
 57 | 		for i in range(len(child_images)):
 58 | 
 59 | 			normalized_image = self._resize_to_norm(child_images[i])
 60 | 			# normalized_image.show()
 61 | 			# normalized_image.save(store_path + '/%d.jpg' % i)
 62 | 			# normalized_image.show()
 63 | 
 64 | 			# self._captcha_to_string(normalized_image, save_as = '%d'%i)
 65 | 			res.append(self._captcha_to_list(normalized_image))
 66 | 		# 如果当前处理的验证码只有四位，再加一个20*20的全零list
 67 | 		if len(res) == 4:
 68 | 			res.append([0]*400)
 69 | 
 70 | 		assert len(res) == 5
 71 | 		return res
 72 | 
 73 | 	def _binaryzation(self):
 74 | 		"""
 75 | 		将图片进行二值化
 76 | 		"""
 77 | 		#有很多种算法，这里选择rgb加权平均值算法
 78 | 		width, height = self._image.size
 79 | 		for y in xrange(height):
 80 | 			for x in xrange(width):
 81 | 				r, g, b = self._image.getpixel((x, y))
 82 | 				value = 0.299 * r + 0.587 * g + 0.114 * b
 83 | 				#value就是灰度值，这里使用127作为阀值
 84 | 				#小于127的就认为是黑色也就是0 大于等于127的就是白色，也就是255
 85 | 				if value < 170:
 86 | 					self._image.putpixel((x, y), COLOR_RGB_BLACK)
 87 | 				else:
 88 | 					self._image.putpixel((x, y), COLOR_RGB_WHITE)
 89 | 	
 90 | 	# 图片到x轴或y轴的投影，如果有数据（黑色像素点）值为1，否则为0
 91 | 	def _get_projection_x(self): # axis = 0: x轴, axis = 1: y轴
 92 | 		# 初始化投影标记list
 93 | 		p_x = [0 for _ in xrange(self._image.size[0])]
 94 | 		width, height = self._image.size
 95 | 		
 96 | 		for x in xrange(width):
 97 | 			for y in xrange(height):
 98 | 				if self._image.getpixel((x, y)) == COLOR_RGB_BLACK:
 99 | 					p_x[x] = 1
100 | 					break
101 | 		return p_x
102 | 
103 | 	# 获取切割后的x轴坐标点，返回值为[初始位置，长度]的列表
104 | 	def _get_split_seq(self, projection_x):
105 | 		split_seq = []
106 | 		start_x = 0
107 | 		length = 0
108 | 		for pos_x, val in enumerate(projection_x):
109 | 			if val == 0 and length == 0:
110 | 				continue
111 | 			elif val == 0 and length != 0:
112 | 				split_seq.append([start_x, length])
113 | 				length = 0
114 | 			elif val == 1:
115 | 				if length == 0:
116 | 					start_x = pos_x
117 | 				length += 1
118 | 			else:
119 | 				raise Exception('generating split sequence occurs error')
120 | 		# 循环结束时如果length不为0，说明还有一部分需要append
121 | 		if length != 0:
122 | 			split_seq.append([start_x, length])
123 | 		return split_seq
124 | 
125 | 	def _is_joint(self, split_len):
126 | 		"""
127 | 		以字符宽度统计值判断当前split_len是否是两个字符的长度
128 | 		返回True需要进一步进行滴水算法分割
129 | 		"""
130 | 		return True if split_len >= 18 else False
131 | 
132 | 	def _is_black(self, rgb):
133 | 		"""
134 | 		: param rgb: tuple (r, g, b) 
135 | 		"""
136 | 		return True if rgb == COLOR_RGB_BLACK else False
137 | 
138 | 	def _drop_fall(self, image):
139 | 		"""
140 | 		对粘连两个字符的图片进行drop fall算法分割
141 | 		"""
142 | 		# 1. 竖直投影统计
143 | 		width, height = image.size
144 | 		print "当前待切割图片的 width: %d, height: %d" % (width, height)
145 | 		hist_width = [0]*width
146 | 		for x in xrange(width):
147 | 			for y in xrange(height):
148 | 				if self._is_black(image.getpixel((x, y))):
149 | 					hist_width[x] += 1 
150 | 
151 | 		print "当前的hist_width: %s" % str(hist_width)
152 | 		
153 | 		# 2. 找到极小值点
154 | 		start_x = self._get_start_x(hist_width)
155 | 		print "当前的起始点是: %d" % start_x
156 | 
157 | 		# 3. 以这个极小值点作为起始滴落点,实施滴水算法
158 | 		start_route = []
159 | 		for y in range(height):
160 | 			start_route.append((0, y))
161 | 
162 | 		end_route = self._get_end_route(image, start_x, height)
163 | 		filter_end_route = [max(list(k)) for _, k in groupby(end_route, lambda x: x[1])]
164 | 		# 两个字符的图片，首先得到的是左边那个字符
165 | 		img1 = self._do_split(image, start_route, filter_end_route)
166 | 		img1 = img1.crop((self._get_black_border(img1)))
167 | 
168 | 		# 再得到最右边字符
169 | 		start_route = map(lambda x: (x[0] + 1, x[1]), filter_end_route)
170 | 		end_route = []
171 | 		for y in range(height):
172 | 			end_route.append((width - 1, y))
173 | 		img2 = self._do_split(image, start_route, end_route)
174 | 		img2 = img2.crop((self._get_black_border(img2)))
175 | 
176 | 		return [img1, img2]
177 | 
178 | 	def _get_start_x(self, hist_width):
179 | 		"""
180 | 		根据待切割的图片的竖直投影统计hist_width，找到合适的滴水起始点
181 | 		hist_width的中间值，前后再取4个值，在这个范围内找最小值
182 | 		"""
183 | 		mid = len(hist_width)/2
184 | 		# 共9个值
185 | 		return mid - 4 + np.argmin(hist_width[mid - 4:mid + 5])
186 | 
187 | 	def _get_end_route(self, image, start_x, height):
188 | 		"""
189 | 		获得滴水的路径
190 | 		: param start_x: 滴水的起始x位置
191 | 		"""
192 | 		left_limit = 0
193 | 		right_limit = image.size[0] - 1
194 | 
195 | 		end_route = []
196 | 		print "当前的start_x: %d" % start_x
197 | 		cur_p = (start_x, 0)
198 | 		last_p = cur_p
199 | 		end_route.append(cur_p)
200 | 
201 | 		while cur_p[1] < (height - 1):
202 | 			sum_n = 0
203 | 			maxW = 0 # max Z_j*W_j
204 | 			nextX = cur_p[0]
205 | 			nextY = cur_p[1]
206 | 			for i in range(1, 6):
207 | 				curW = self._get_nearby_pixel_val(image, cur_p[0], cur_p[1], i) * (6 - i)
208 | 				sum_n += curW
209 | 				if maxW < curW:
210 | 					maxW = curW
211 | 			
212 | 			# 如果全黑，需要看惯性
213 | 			if sum_n == 0:
214 | 				maxW = 4
215 | 
216 | 			# 如果全白，则默认垂直下落
217 | 			if sum_n == 15:
218 | 				maxW = 6
219 | 
220 | 			if maxW == 1:
221 | 				nextX = cur_p[0] - 1
222 | 				nextY = cur_p[1]
223 | 			elif maxW == 2:
224 | 				nextX = cur_p[0] + 1
225 | 				nextY = cur_p[1]
226 | 			elif maxW == 3:
227 | 				nextX = cur_p[0] + 1
228 | 				nextY = cur_p[1] + 1
229 | 			elif maxW == 5:
230 | 				nextX = cur_p[0] - 1
231 | 				nextY = cur_p[1] + 1
232 | 			elif maxW == 6:
233 | 				nextX = cur_p[0]
234 | 				nextY = cur_p[1] + 1
235 | 			elif maxW == 4:
236 | 				if nextX > cur_p[0]: # 具有向右的惯性
237 | 					nextX = cur_p[0] + 1
238 | 					nextY = cur_p[1] + 1
239 | 
240 | 				if nextX < cur_p[0]:
241 | 					nextX = cur_p[0]
242 | 					nextY = cur_p[1] + 1
243 | 
244 | 				if sum_n == 0:
245 | 					nextX = cur_p[0]
246 | 					nextY = cur_p[1] + 1
247 | 			else:
248 | 				raise Exception("get a wrong maxW, pls check")
249 | 
250 | 			# 如果出现重复运动
251 | 			if last_p[0] == nextX and last_p[1] == nextY:
252 | 				if nextX < cur_p[0]:
253 | 					maxW = 5
254 | 					nextX = cur_p[0] + 1
255 | 					nextY = cur_p[1] + 1
256 | 				else:
257 | 					maxW = 3
258 | 					nextX = cur_p[0] - 1
259 | 					nextY = cur_p[1] + 1
260 | 
261 | 			last_p = cur_p
262 | 
263 | 			if nextX > right_limit:
264 | 				nextX = right_limit
265 | 				nextY = cur_p[1] + 1
266 | 
267 | 			if nextX < left_limit:
268 | 				nextX = left_limit
269 | 				nextY = cur_p[1] + 1
270 | 
271 | 			cur_p = (nextX, nextY)
272 | 			end_route.append(cur_p)
273 | 
274 | 		# 返回分割路径
275 | 		return end_route
276 | 
277 | 	def _get_nearby_pixel_val(self, image, cx, cy, j):
278 | 		if j == 1:
279 | 			return 0 if self._is_black(image.getpixel((cx - 1, cy + 1))) else 1
280 | 		elif j == 2:
281 | 			return 0 if self._is_black(image.getpixel((cx, cy + 1))) else 1
282 | 		elif j == 3:
283 | 			return 0 if self._is_black(image.getpixel((cx + 1, cy + 1))) else 1
284 | 		elif j == 4:
285 | 			return 0 if self._is_black(image.getpixel((cx + 1, cy))) else 1
286 | 		elif j == 5:
287 | 			return 0 if self._is_black(image.getpixel((cx - 1, cy))) else 1
288 | 		else:
289 | 			raise Exception("what you request is out of nearby range")
290 | 
291 | 	def _do_split(self, source_image, starts, filter_ends):
292 | 		"""
293 | 		具体实行切割 
294 | 		: param starts: 每一行的起始点 tuple of list
295 | 		: param ends: 每一行的终止点
296 | 		"""
297 | 		left = starts[0][0]
298 | 		top = starts[0][1]
299 | 		right = filter_ends[0][0]
300 | 		bottom = filter_ends[0][1]
301 | 
302 | 		for i in range(len(starts)):
303 | 			left = min(starts[i][0], left)
304 | 			top = min(starts[i][1], top)
305 | 			right = max(filter_ends[i][0], right)
306 | 			bottom = max(filter_ends[i][1], bottom)
307 | 
308 | 		width = right - left + 1
309 | 		height = bottom - top + 1
310 | 
311 | 		image = Image.new('RGB', (width, height), COLOR_RGB_WHITE)
312 | 
313 | 		for i in range(height):
314 | 			start = starts[i]
315 | 			end = filter_ends[i]
316 | 			for x in range(start[0], end[0]+1):
317 | 				if self._is_black(source_image.getpixel((x, start[1]))):
318 | 					image.putpixel((x - left, start[1] - top), COLOR_RGB_BLACK)
319 | 
320 | 		return image
321 | 
322 | 	def _cut_images(self):
323 | 		"""
324 | 		切割图像为单个字符块
325 | 		:return: list对象, 每个元素为一个单独字符的Image Object
326 | 		"""
327 | 		# _image.size返回的是(width, height)
328 | 		split_seq = self._get_split_seq(self._get_projection_x())
329 | 		print split_seq
330 | 
331 | 		# 切割图片
332 | 		croped_images = []
333 | 		height = self._image.size[1]
334 | 
335 | 		for start_x, width in split_seq:
336 | 			# 同时去掉y轴上下多余的空白
337 | 			begin_row = 0
338 | 			end_row = height - 1
339 | 			for row in range(height):
340 | 				flag = True
341 | 				for col in range(start_x, start_x + width):
342 | 					if self._image.getpixel((col, row)) == COLOR_RGB_BLACK:
343 | 						flag = False
344 | 						break
345 | 				if not flag: # 如果在当前行找到了黑色像素点，就是起始行
346 | 				    begin_row = row
347 | 				    break
348 | 			for row in reversed(range(height)):
349 | 				flag = True
350 | 				for col in range(start_x, start_x + width):
351 | 					if self._image.getpixel((col, row)) == COLOR_RGB_BLACK:
352 | 						flag = False
353 | 						break
354 | 				if not flag:
355 | 					end_row = row
356 | 					break
357 | 			croped_images.append(self._image.crop((start_x, begin_row, start_x + width, end_row + 1)))
358 | 		
359 | 		# 没考虑一个source image出现多个粘连图片的情况
360 | 		need_drop_fall = False
361 | 		for idx, split_info in enumerate(split_seq):
362 | 			# split_info: (start_x, length)
363 | 			if self._is_joint(split_info[1]):
364 | 				need_drop_fall = True
365 | 				print "找到一张粘连图片: %d" % idx
366 | 				split_images = self._drop_fall(croped_images[idx])
367 | 				break
368 | 		if need_drop_fall:
369 | 			del croped_images[idx]
370 | 			croped_images.insert(idx, split_images[0])
371 | 			croped_images.insert(idx + 1, split_images[1])
372 | 
373 | 		return croped_images
374 | 	
375 | 	def _get_black_border(self, image):
376 | 		"""
377 | 		获取指定图像的内容边界坐标
378 | 		:param image: 图像 Image Object
379 | 		:return: 图像内容边界坐标tuple (left, top, right, bottom)
380 | 		"""
381 | 		width, height = image.size
382 | 		max_x = max_y = 0
383 | 		min_x = width - 1
384 | 		min_y = height - 1
385 | 		for y in range(height):
386 | 			for x in range(width):
387 | 				if image.getpixel((x, y)) == COLOR_RGB_BLACK:
388 | 					min_x = min(min_x, x)
389 | 					max_x = max(max_x, x) 
390 | 					min_y = min(min_y, y)
391 | 					max_y = max(max_y, y)
392 | 		return min_x, min_y, max_x + 1, max_y + 1
393 | 
394 | 	def _rotate_image(self, image):
395 | 		"""
396 | 		将单个字符图片旋转到合适角度 (投影至X轴长度最小)
397 | 		:return: 旋转后的图像 (RGB)
398 | 		"""
399 | 		image = image.convert('RGBA')
400 | 		optimisim_image = image
401 | 		for angle in range(-30, 31):
402 | 			image_copy = image.rotate(angle, expand=True)
403 | 			fff = Image.new('RGBA', image_copy.size, (255, )*4)
404 | 			out = Image.composite(image_copy, fff, image_copy)
405 | 
406 | 			border_out = self._get_black_border(out)
407 | 			border_optimisim = self._get_black_border(optimisim_image)
408 | 			if border_out[BORDER_RIGHT] - border_out[BORDER_LEFT] + 1 < border_optimisim[BORDER_RIGHT] - border_optimisim[BORDER_LEFT] + 1:
409 | 				optimisim_image = out
410 | 
411 | 		border = self._get_black_border(optimisim_image)
412 | 		optimisim_image = optimisim_image.crop((
413 | 		    border[BORDER_LEFT],
414 | 		    border[BORDER_TOP],
415 | 		    border[BORDER_RIGHT],
416 | 		    border[BORDER_BOTTOM]
417 | 		))
418 | 		optimisim_image = optimisim_image.convert('RGB')
419 | 		return optimisim_image
420 | 
421 | 	def _resize_to_norm(self, image):
422 | 		"""
423 | 		将单个图像缩放至32x32像素标准图像
424 | 		:param image: 图像 (RGB)
425 | 		:return: 缩放后的Image Object
426 | 		"""
427 | 		if image.size[0] > NORM_SIZE or image.size[1] > NORM_SIZE:
428 | 			image = image.resize((NORM_SIZE, NORM_SIZE))
429 | 		width, height = image.size
430 | 		new_image = Image.new('RGB', (NORM_SIZE, NORM_SIZE), COLOR_RGB_WHITE)
431 | 		offset = ((NORM_SIZE - width) / 2, (NORM_SIZE - height) / 2)
432 | 		new_image.paste(image, offset)
433 | 		return new_image
434 | 
435 | 	def _captcha_to_list(self, image):
436 | 		"""
437 | 		将验证码转换为数字编码
438 | 		:param image: 图像
439 | 		:return: 数字编码字符串
440 | 		"""
441 | 		if image.size != (NORM_SIZE, NORM_SIZE):
442 | 			raise Exception("Image needs to normalize before to string")
443 |         
444 | 		# 将pixel写到列表中
445 | 		data = [0]*(NORM_SIZE*NORM_SIZE)
446 | 		for y in range(0, NORM_SIZE):
447 | 			for x in range(0, NORM_SIZE):
448 | 				data[y*NORM_SIZE + x] = 1 if image.getpixel((x, y)) == COLOR_RGB_BLACK else 0
449 | 
450 | 		return data
451 | 
452 | 	def _captcha_to_string(self, image, save_as):
453 | 		data = self._captcha_to_list(image)
454 | 		# 写到文件: data的数据类型必须是str(map转换)
455 | 		with open(save_as, 'w') as outfile:
456 | 			for row in xrange(NORM_SIZE):
457 | 				outfile.write(''.join(map(str, data[row*NORM_SIZE:(row+1)*NORM_SIZE])) + '\n')
458 | 
459 | 
460 | 
461 | 
462 | 
def load_labels():
    """Merge the label text files into one array and pickle it.

    Reads the fixed list of files below from ``LABELS_DIR`` in order,
    strips every line, stores the resulting numpy array in
    ``data/labels.pkl``, prints its shape and finally calls :func:`test`.
    """
    # Listed explicitly so the labels stay in numeric order.
    files = ['0-499.txt', '500-999.txt', '1000-1499.txt', '1500-1999.txt', '2000-2499.txt', '2500-3999.txt',
             '4000-5499.txt', '5500-6999.txt', '7000-8499.txt', '8500-9999.txt']

    labels = []
    for file_name in files:
        print(file_name)
        with open(os.path.join(LABELS_DIR, file_name), 'r') as input_file:
            labels.extend(line.strip() for line in input_file)

    label_array = np.asarray(labels)
    with open('data/labels.pkl', 'wb') as output_file:
        pickle.dump(label_array, output_file)
    print(label_array.shape)

    print("==== test ====")
    test()
480 | 
481 | 
482 | 
483 | 
def test():
    """Load ``data/labels.pkl`` and print its shape and entries 500-509.

    The pickle is one this script wrote itself; do not point this at
    files from an untrusted source, since unpickling can run code.
    """
    with open('data/labels.pkl', 'rb') as f:
        a = pickle.load(f)
    print(a.shape)
    print(a[500:510])
489 | 
def main():
    """Preprocess captchas 0..9999 and pickle the result.

    Reads ``<RAW_DATA_DIR>/<i>.jpg`` for every index, collects what
    ``SJTUCaptcha.preprocess`` returns, stores the numpy array in
    ``images.plk``, prints its shape and finally calls :func:`test`
    (which inspects ``data/labels.pkl``, not the file written here).
    """
    train_data = []
    for i in range(10000):
        myCaptcha = SJTUCaptcha(os.path.join(RAW_DATA_DIR, '%d.jpg' % i))
        print("图片为:%d" % i)
        train_data.append(myCaptcha.preprocess())

    train_array = np.asarray(train_data)
    # NOTE(review): '.plk' looks like a typo for '.pkl'; kept because other
    # code may already read this file name.
    with open('images.plk', 'wb') as f:
        pickle.dump(train_array, f)
    print(train_array.shape)

    print("==== test ====")
    test()
507 | 
508 | 
509 | 
# Script entry point: only the label export runs. The image preprocessing
# entry point main() is left disabled below.
if __name__ == '__main__':
	# main()
	load_labels()
513 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | #coding: utf-8
 2 | 
 3 | COLOR_RGB_BLACK = (0, 0, 0)
 4 | COLOR_RGB_WHITE = (255, 255, 255)
 5 | COLOR_RGBA_BLACK = (0, 0, 0, 255)
 6 | COLOR_RGBA_WHITE = (255, 255, 255, 255)
 7 | 
 8 | BORDER_LEFT = 0
 9 | BORDER_TOP = 1
10 | BORDER_RIGHT = 2
11 | BORDER_BOTTOM = 3
12 | 
13 | RAW_DATA_DIR = 'captcha/'
14 | PROCESSED_DATA_DIR = 'processed/'
15 | LABELS_DIR = 'labels/'
16 | 
17 | NORM_SIZE = 20


--------------------------------------------------------------------------------