├── .gitignore ├── fuck_sjtu_captcha.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | !*.py -------------------------------------------------------------------------------- /fuck_sjtu_captcha.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 验证码处理步骤: 5 | 6 | 1. 二值化 7 | 2. 去噪点(由于sjtu验证码没有噪点,不需要这步) 8 | 3. 字符切割 9 | 4. 单个字符图片旋转到合适角度:旋转卡壳算法(投影至x轴长度最小)(效果不好,sjtu的验证码都没什么旋转,暂时不用后续再加) 10 | 5. 缩放到相同大小 11 | 6. 持久化,string hickle 12 | """ 13 | from PIL import Image 14 | import numpy as np 15 | import os 16 | import pickle 17 | from itertools import groupby 18 | 19 | from utils import ( 20 | COLOR_RGB_BLACK, COLOR_RGB_WHITE, COLOR_RGBA_BLACK, COLOR_RGBA_WHITE, 21 | BORDER_LEFT, BORDER_TOP, BORDER_RIGHT, BORDER_BOTTOM, 22 | RAW_DATA_DIR, PROCESSED_DATA_DIR, LABELS_DIR, 23 | NORM_SIZE, 24 | ) 25 | 26 | # 存放处理后的图片数据 27 | if not os.path.exists(PROCESSED_DATA_DIR): 28 | os.mkdir(PROCESSED_DATA_DIR) 29 | 30 | class SJTUCaptcha(object): 31 | def __init__(self, image): 32 | """ 33 | 初始化 34 | :param image: 验证码图片文件 Image Object 35 | :param manual: 是否人工验证, 默认为False, 采用机器验证 36 | """ 37 | if isinstance(image, str) or isinstance(image, unicode): 38 | self.name = image.split('/')[-1].split('.')[0] 39 | if isinstance(image, file) or isinstance(image, str) or isinstance(image, unicode): 40 | self._image = Image.open(image) 41 | elif isinstance(image, JpegImageFile): 42 | self._image = image 43 | else: 44 | raise Exception('captcha image file is unavailable') 45 | 46 | def preprocess(self): 47 | # 获取验证码预处理结果: 返回二维list,一行表示一个child image 48 | res = [] 49 | 50 | store_path = PROCESSED_DATA_DIR + self.name.split('.')[0] 51 | if not os.path.exists(store_path): 52 | os.mkdir(store_path) 53 | 54 | self._binaryzation() 55 | 56 | child_images = self._cut_images() 57 | for i in range(len(child_images)): 58 | 59 | normalized_image = self._resize_to_norm(child_images[i]) 60 | # normalized_image.show() 61 | # normalized_image.save(store_path + '/%d.jpg' % i) 62 | # normalized_image.show() 63 | 64 | # self._captcha_to_string(normalized_image, save_as = '%d'%i) 65 | res.append(self._captcha_to_list(normalized_image)) 66 | # 如果当前处理的验证码只有四位,再加一个20*20的全零list 67 | if len(res) == 4: 68 | res.append([0]*400) 69 | 70 | assert len(res) == 5 71 | return res 72 | 73 | def _binaryzation(self): 74 | """ 75 | 将图片进行二值化 76 | """ 77 | #有很多种算法,这里选择rgb加权平均值算法 78 | width, height = self._image.size 79 | for y in xrange(height): 80 | for x in xrange(width): 81 | r, g, b = self._image.getpixel((x, y)) 82 | value = 0.299 * r + 0.587 * g + 0.114 * b 83 | #value就是灰度值,这里使用127作为阀值 84 | #小于127的就认为是黑色也就是0 大于等于127的就是白色,也就是255 85 | if value < 170: 86 | self._image.putpixel((x, y), COLOR_RGB_BLACK) 87 | else: 88 | self._image.putpixel((x, y), COLOR_RGB_WHITE) 89 | 90 | # 图片到x轴或y轴的投影,如果有数据(黑色像素点)值为1,否则为0 91 | def _get_projection_x(self): # axis = 0: x轴, axis = 1: y轴 92 | # 初始化投影标记list 93 | p_x = [0 for _ in xrange(self._image.size[0])] 94 | width, height = self._image.size 95 | 96 | for x in xrange(width): 97 | for y in xrange(height): 98 | if self._image.getpixel((x, y)) == COLOR_RGB_BLACK: 99 | p_x[x] = 1 100 | break 101 | return p_x 102 | 103 | # 获取切割后的x轴坐标点,返回值为[初始位置,长度]的列表 104 | def _get_split_seq(self, projection_x): 105 | split_seq = [] 106 | start_x = 0 107 | length = 0 108 | for pos_x, val in enumerate(projection_x): 109 | if val == 0 and length == 0: 110 | continue 111 | elif val == 0 and length != 0: 112 | split_seq.append([start_x, length]) 113 | length = 0 114 | elif val == 1: 115 | if length == 0: 116 | start_x = pos_x 117 | length += 1 118 | else: 119 | raise Exception('generating split sequence occurs error') 120 | # 循环结束时如果length不为0,说明还有一部分需要append 121 | if length != 0: 122 | split_seq.append([start_x, length]) 123 | return split_seq 124 | 125 | def _is_joint(self, split_len): 126 | """ 127 | 以字符宽度统计值判断当前split_len是否是两个字符的长度 128 | 返回True需要进一步进行滴水算法分割 129 | """ 130 | return True if split_len >= 18 else False 131 | 132 | def _is_black(self, rgb): 133 | """ 134 | : param rgb: tuple (r, g, b) 135 | """ 136 | return True if rgb == COLOR_RGB_BLACK else False 137 | 138 | def _drop_fall(self, image): 139 | """ 140 | 对粘连两个字符的图片进行drop fall算法分割 141 | """ 142 | # 1. 竖直投影统计 143 | width, height = image.size 144 | print "当前待切割图片的 width: %d, height: %d" % (width, height) 145 | hist_width = [0]*width 146 | for x in xrange(width): 147 | for y in xrange(height): 148 | if self._is_black(image.getpixel((x, y))): 149 | hist_width[x] += 1 150 | 151 | print "当前的hist_width: %s" % str(hist_width) 152 | 153 | # 2. 找到极小值点 154 | start_x = self._get_start_x(hist_width) 155 | print "当前的起始点是: %d" % start_x 156 | 157 | # 3. 以这个极小值点作为起始滴落点,实施滴水算法 158 | start_route = [] 159 | for y in range(height): 160 | start_route.append((0, y)) 161 | 162 | end_route = self._get_end_route(image, start_x, height) 163 | filter_end_route = [max(list(k)) for _, k in groupby(end_route, lambda x: x[1])] 164 | # 两个字符的图片,首先得到的是左边那个字符 165 | img1 = self._do_split(image, start_route, filter_end_route) 166 | img1 = img1.crop((self._get_black_border(img1))) 167 | 168 | # 再得到最右边字符 169 | start_route = map(lambda x: (x[0] + 1, x[1]), filter_end_route) 170 | end_route = [] 171 | for y in range(height): 172 | end_route.append((width - 1, y)) 173 | img2 = self._do_split(image, start_route, end_route) 174 | img2 = img2.crop((self._get_black_border(img2))) 175 | 176 | return [img1, img2] 177 | 178 | def _get_start_x(self, hist_width): 179 | """ 180 | 根据待切割的图片的竖直投影统计hist_width,找到合适的滴水起始点 181 | hist_width的中间值,前后再取4个值,在这个范围内找最小值 182 | """ 183 | mid = len(hist_width)/2 184 | # 共9个值 185 | return mid - 4 + np.argmin(hist_width[mid - 4:mid + 5]) 186 | 187 | def _get_end_route(self, image, start_x, height): 188 | """ 189 | 获得滴水的路径 190 | : param start_x: 滴水的起始x位置 191 | """ 192 | left_limit = 0 193 | right_limit = image.size[0] - 1 194 | 195 | end_route = [] 196 | print "当前的start_x: %d" % start_x 197 | cur_p = (start_x, 0) 198 | last_p = cur_p 199 | end_route.append(cur_p) 200 | 201 | while cur_p[1] < (height - 1): 202 | sum_n = 0 203 | maxW = 0 # max Z_j*W_j 204 | nextX = cur_p[0] 205 | nextY = cur_p[1] 206 | for i in range(1, 6): 207 | curW = self._get_nearby_pixel_val(image, cur_p[0], cur_p[1], i) * (6 - i) 208 | sum_n += curW 209 | if maxW < curW: 210 | maxW = curW 211 | 212 | # 如果全黑,需要看惯性 213 | if sum_n == 0: 214 | maxW = 4 215 | 216 | # 如果全白,则默认垂直下落 217 | if sum_n == 15: 218 | maxW = 6 219 | 220 | if maxW == 1: 221 | nextX = cur_p[0] - 1 222 | nextY = cur_p[1] 223 | elif maxW == 2: 224 | nextX = cur_p[0] + 1 225 | nextY = cur_p[1] 226 | elif maxW == 3: 227 | nextX = cur_p[0] + 1 228 | nextY = cur_p[1] + 1 229 | elif maxW == 5: 230 | nextX = cur_p[0] - 1 231 | nextY = cur_p[1] + 1 232 | elif maxW == 6: 233 | nextX = cur_p[0] 234 | nextY = cur_p[1] + 1 235 | elif maxW == 4: 236 | if nextX > cur_p[0]: # 具有向右的惯性 237 | nextX = cur_p[0] + 1 238 | nextY = cur_p[1] + 1 239 | 240 | if nextX < cur_p[0]: 241 | nextX = cur_p[0] 242 | nextY = cur_p[1] + 1 243 | 244 | if sum_n == 0: 245 | nextX = cur_p[0] 246 | nextY = cur_p[1] + 1 247 | else: 248 | raise Exception("get a wrong maxW, pls check") 249 | 250 | # 如果出现重复运动 251 | if last_p[0] == nextX and last_p[1] == nextY: 252 | if nextX < cur_p[0]: 253 | maxW = 5 254 | nextX = cur_p[0] + 1 255 | nextY = cur_p[1] + 1 256 | else: 257 | maxW = 3 258 | nextX = cur_p[0] - 1 259 | nextY = cur_p[1] + 1 260 | 261 | last_p = cur_p 262 | 263 | if nextX > right_limit: 264 | nextX = right_limit 265 | nextY = cur_p[1] + 1 266 | 267 | if nextX < left_limit: 268 | nextX = left_limit 269 | nextY = cur_p[1] + 1 270 | 271 | cur_p = (nextX, nextY) 272 | end_route.append(cur_p) 273 | 274 | # 返回分割路径 275 | return end_route 276 | 277 | def _get_nearby_pixel_val(self, image, cx, cy, j): 278 | if j == 1: 279 | return 0 if self._is_black(image.getpixel((cx - 1, cy + 1))) else 1 280 | elif j == 2: 281 | return 0 if self._is_black(image.getpixel((cx, cy + 1))) else 1 282 | elif j == 3: 283 | return 0 if self._is_black(image.getpixel((cx + 1, cy + 1))) else 1 284 | elif j == 4: 285 | return 0 if self._is_black(image.getpixel((cx + 1, cy))) else 1 286 | elif j == 5: 287 | return 0 if self._is_black(image.getpixel((cx - 1, cy))) else 1 288 | else: 289 | raise Exception("what you request is out of nearby range") 290 | 291 | def _do_split(self, source_image, starts, filter_ends): 292 | """ 293 | 具体实行切割 294 | : param starts: 每一行的起始点 tuple of list 295 | : param ends: 每一行的终止点 296 | """ 297 | left = starts[0][0] 298 | top = starts[0][1] 299 | right = filter_ends[0][0] 300 | bottom = filter_ends[0][1] 301 | 302 | for i in range(len(starts)): 303 | left = min(starts[i][0], left) 304 | top = min(starts[i][1], top) 305 | right = max(filter_ends[i][0], right) 306 | bottom = max(filter_ends[i][1], bottom) 307 | 308 | width = right - left + 1 309 | height = bottom - top + 1 310 | 311 | image = Image.new('RGB', (width, height), COLOR_RGB_WHITE) 312 | 313 | for i in range(height): 314 | start = starts[i] 315 | end = filter_ends[i] 316 | for x in range(start[0], end[0]+1): 317 | if self._is_black(source_image.getpixel((x, start[1]))): 318 | image.putpixel((x - left, start[1] - top), COLOR_RGB_BLACK) 319 | 320 | return image 321 | 322 | def _cut_images(self): 323 | """ 324 | 切割图像为单个字符块 325 | :return: list对象, 每个元素为一个单独字符的Image Object 326 | """ 327 | # _image.size返回的是(width, height) 328 | split_seq = self._get_split_seq(self._get_projection_x()) 329 | print split_seq 330 | 331 | # 切割图片 332 | croped_images = [] 333 | height = self._image.size[1] 334 | 335 | for start_x, width in split_seq: 336 | # 同时去掉y轴上下多余的空白 337 | begin_row = 0 338 | end_row = height - 1 339 | for row in range(height): 340 | flag = True 341 | for col in range(start_x, start_x + width): 342 | if self._image.getpixel((col, row)) == COLOR_RGB_BLACK: 343 | flag = False 344 | break 345 | if not flag: # 如果在当前行找到了黑色像素点,就是起始行 346 | begin_row = row 347 | break 348 | for row in reversed(range(height)): 349 | flag = True 350 | for col in range(start_x, start_x + width): 351 | if self._image.getpixel((col, row)) == COLOR_RGB_BLACK: 352 | flag = False 353 | break 354 | if not flag: 355 | end_row = row 356 | break 357 | croped_images.append(self._image.crop((start_x, begin_row, start_x + width, end_row + 1))) 358 | 359 | # 没考虑一个source image出现多个粘连图片的情况 360 | need_drop_fall = False 361 | for idx, split_info in enumerate(split_seq): 362 | # split_info: (start_x, length) 363 | if self._is_joint(split_info[1]): 364 | need_drop_fall = True 365 | print "找到一张粘连图片: %d" % idx 366 | split_images = self._drop_fall(croped_images[idx]) 367 | break 368 | if need_drop_fall: 369 | del croped_images[idx] 370 | croped_images.insert(idx, split_images[0]) 371 | croped_images.insert(idx + 1, split_images[1]) 372 | 373 | return croped_images 374 | 375 | def _get_black_border(self, image): 376 | """ 377 | 获取指定图像的内容边界坐标 378 | :param image: 图像 Image Object 379 | :return: 图像内容边界坐标tuple (left, top, right, bottom) 380 | """ 381 | width, height = image.size 382 | max_x = max_y = 0 383 | min_x = width - 1 384 | min_y = height - 1 385 | for y in range(height): 386 | for x in range(width): 387 | if image.getpixel((x, y)) == COLOR_RGB_BLACK: 388 | min_x = min(min_x, x) 389 | max_x = max(max_x, x) 390 | min_y = min(min_y, y) 391 | max_y = max(max_y, y) 392 | return min_x, min_y, max_x + 1, max_y + 1 393 | 394 | def _rotate_image(self, image): 395 | """ 396 | 将单个字符图片旋转到合适角度 (投影至X轴长度最小) 397 | :return: 旋转后的图像 (RGB) 398 | """ 399 | image = image.convert('RGBA') 400 | optimisim_image = image 401 | for angle in range(-30, 31): 402 | image_copy = image.rotate(angle, expand=True) 403 | fff = Image.new('RGBA', image_copy.size, (255, )*4) 404 | out = Image.composite(image_copy, fff, image_copy) 405 | 406 | border_out = self._get_black_border(out) 407 | border_optimisim = self._get_black_border(optimisim_image) 408 | if border_out[BORDER_RIGHT] - border_out[BORDER_LEFT] + 1 < border_optimisim[BORDER_RIGHT] - border_optimisim[BORDER_LEFT] + 1: 409 | optimisim_image = out 410 | 411 | border = self._get_black_border(optimisim_image) 412 | optimisim_image = optimisim_image.crop(( 413 | border[BORDER_LEFT], 414 | border[BORDER_TOP], 415 | border[BORDER_RIGHT], 416 | border[BORDER_BOTTOM] 417 | )) 418 | optimisim_image = optimisim_image.convert('RGB') 419 | return optimisim_image 420 | 421 | def _resize_to_norm(self, image): 422 | """ 423 | 将单个图像缩放至32x32像素标准图像 424 | :param image: 图像 (RGB) 425 | :return: 缩放后的Image Object 426 | """ 427 | if image.size[0] > NORM_SIZE or image.size[1] > NORM_SIZE: 428 | image = image.resize((NORM_SIZE, NORM_SIZE)) 429 | width, height = image.size 430 | new_image = Image.new('RGB', (NORM_SIZE, NORM_SIZE), COLOR_RGB_WHITE) 431 | offset = ((NORM_SIZE - width) / 2, (NORM_SIZE - height) / 2) 432 | new_image.paste(image, offset) 433 | return new_image 434 | 435 | def _captcha_to_list(self, image): 436 | """ 437 | 将验证码转换为数字编码 438 | :param image: 图像 439 | :return: 数字编码字符串 440 | """ 441 | if image.size != (NORM_SIZE, NORM_SIZE): 442 | raise Exception("Image needs to normalize before to string") 443 | 444 | # 将pixel写到列表中 445 | data = [0]*(NORM_SIZE*NORM_SIZE) 446 | for y in range(0, NORM_SIZE): 447 | for x in range(0, NORM_SIZE): 448 | data[y*NORM_SIZE + x] = 1 if image.getpixel((x, y)) == COLOR_RGB_BLACK else 0 449 | 450 | return data 451 | 452 | def _captcha_to_string(self, image, save_as): 453 | data = self._captcha_to_list(image) 454 | # 写到文件: data的数据类型必须是str(map转换) 455 | with open(save_as, 'w') as outfile: 456 | for row in xrange(NORM_SIZE): 457 | outfile.write(''.join(map(str, data[row*NORM_SIZE:(row+1)*NORM_SIZE])) + '\n') 458 | 459 | 460 | 461 | 462 | 463 | def load_labels(): 464 | labels = [] 465 | # files = [f for f in os.listdir(LABELS_DIR) if os.path.isfile(os.path.join(LABELS_DIR,f))] 466 | # sorted(files) 467 | files = ['0-499.txt', '500-999.txt', '1000-1499.txt', '1500-1999.txt', '2000-2499.txt', '2500-3999.txt', 468 | '4000-5499.txt', '5500-6999.txt', '7000-8499.txt', '8500-9999.txt'] 469 | for f in files: 470 | print f 471 | with open(os.path.join(LABELS_DIR,f), 'r') as input_file: 472 | labels.extend(map(str.strip, input_file.readlines())) 473 | 474 | with open('data/labels.pkl', 'wb') as f: 475 | pickle.dump(np.asarray(labels), f) 476 | print np.asarray(labels).shape 477 | 478 | print "==== test ====" 479 | test() 480 | 481 | 482 | 483 | 484 | def test(): 485 | with open('data/labels.pkl', 'rb') as f: 486 | a = pickle.load(f) 487 | print a.shape 488 | print a[500:510] 489 | 490 | def main(): 491 | # train_data = [82, 190, 260, 279, 309, 339, 352, 360, 450]#np.zeros(shape = (1500, NORM_SIZE*NORM_SIZE)) 492 | train_data = [] 493 | for i in xrange(10000): # xrange(10000) 494 | myCaptcha = SJTUCaptcha(os.path.join(RAW_DATA_DIR, '%d.jpg'%i)) 495 | print "图片为:%d" %i 496 | s = myCaptcha.preprocess() 497 | # print s 498 | # break 499 | train_data.append(s) 500 | 501 | with open('images.plk', 'wb') as f: 502 | pickle.dump(np.asarray(train_data), f) 503 | print np.asarray(train_data).shape 504 | 505 | print "==== test ====" 506 | test() 507 | 508 | 509 | 510 | if __name__ == '__main__': 511 | # main() 512 | load_labels() 513 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | COLOR_RGB_BLACK = (0, 0, 0) 4 | COLOR_RGB_WHITE = (255, 255, 255) 5 | COLOR_RGBA_BLACK = (0, 0, 0, 255) 6 | COLOR_RGBA_WHITE = (255, 255, 255, 255) 7 | 8 | BORDER_LEFT = 0 9 | BORDER_TOP = 1 10 | BORDER_RIGHT = 2 11 | BORDER_BOTTOM = 3 12 | 13 | RAW_DATA_DIR = 'captcha/' 14 | PROCESSED_DATA_DIR = 'processed/' 15 | LABELS_DIR = 'labels/' 16 | 17 | NORM_SIZE = 20 --------------------------------------------------------------------------------