├── CNN_Model_predict.py
├── CNN_Model_training.py
├── README.md
├── VerifyCode.rar
├── data.csv
├── model.png
└── 测试集上的准确率曲线.png


/CNN_Model_predict.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import os
  4 | import cv2
  5 | import numpy as np
  6 | 
  7 | def split_picture(imagepath):
  8 | 
  9 |     # 以灰度模式读取图片
 10 |     gray = cv2.imread(imagepath, 0)
 11 | 
 12 |     # 将图片的边缘变为白色
 13 |     height, width = gray.shape
 14 |     for i in range(width):
 15 |         gray[0, i] = 255
 16 |         gray[height-1, i] = 255
 17 |     for j in range(height):
 18 |         gray[j, 0] = 255
 19 |         gray[j, width-1] = 255
 20 | 
 21 |     # 中值滤波
 22 |     blur = cv2.medianBlur(gray, 3) #模板大小3*3
 23 | 
 24 |     # 二值化
 25 |     ret,thresh1 = cv2.threshold(blur, 200, 255, cv2.THRESH_BINARY)
 26 | 
 27 |     # 提取单个字符
 28 |     chars_list = []
 29 |     image, contours, hierarchy = cv2.findContours(thresh1, 2, 2)
 30 |     for cnt in contours:
 31 |         # 最小的外接矩形
 32 |         x, y, w, h = cv2.boundingRect(cnt)
 33 |         if x != 0 and y != 0 and w*h >= 100:
 34 |             chars_list.append((x,y,w,h))
 35 | 
 36 |     sorted_chars_list = sorted(chars_list, key=lambda x:x[0])
 37 |     for i,item in enumerate(sorted_chars_list):
 38 |         x, y, w, h = item
 39 |         cv2.imwrite('F://test_verifycode/chars/%d.jpg'%(i+1), thresh1[y:y+h, x:x+w])
 40 | 
 41 | def remove_edge_picture(imagepath):
 42 | 
 43 |     image = cv2.imread(imagepath, 0)
 44 |     height, width = image.shape
 45 |     corner_list = [image[0,0] < 127,
 46 |                    image[height-1, 0] < 127,
 47 |                    image[0, width-1]<127,
 48 |                    image[ height-1, width-1] < 127
 49 |                    ]
 50 |     if sum(corner_list) >= 3:
 51 |         os.remove(imagepath)
 52 | 
 53 | def resplit_with_parts(imagepath, parts):
 54 |     image = cv2.imread(imagepath, 0)
 55 |     os.remove(imagepath)
 56 |     height, width = image.shape
 57 | 
 58 |     file_name = imagepath.split('/')[-1].split(r'.')[0]
 59 |     # 将图片重新分裂成parts部分
 60 |     step = width//parts     # 步长
 61 |     start = 0             # 起始位置
 62 |     for i in range(parts):
 63 |         cv2.imwrite('F://test_verifycode/chars/%s.jpg'%(file_name+'-'+str(i)), \
 64 |                     image[:, start:start+step])
 65 |         start += step
 66 | 
 67 | def resplit(imagepath):
 68 | 
 69 |     image = cv2.imread(imagepath, 0)
 70 |     height, width = image.shape
 71 | 
 72 |     if width >= 64:
 73 |         resplit_with_parts(imagepath, 4)
 74 |     elif width >= 48:
 75 |         resplit_with_parts(imagepath, 3)
 76 |     elif width >= 26:
 77 |         resplit_with_parts(imagepath, 2)
 78 | 
 79 | # rename and convert to 16*20 size
 80 | def convert(dir, file):
 81 | 
 82 |     imagepath = dir+'/'+file
 83 |     # 读取图片
 84 |     image = cv2.imread(imagepath, 0)
 85 |     # 二值化
 86 |     ret, thresh = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
 87 |     img = cv2.resize(thresh, (16, 20), interpolation=cv2.INTER_AREA)
 88 |     # 保存图片
 89 |     cv2.imwrite('%s/%s' % (dir, file), img)
 90 | 
 91 | # 读取图片的数据，并转化为0-1值
 92 | def Read_Data(dir, file):
 93 | 
 94 |     imagepath = dir+'/'+file
 95 |     # 读取图片
 96 |     image = cv2.imread(imagepath, 0)
 97 |     # 二值化
 98 |     ret, thresh = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
 99 |     # 显示图片
100 |     bin_values = [1 if pixel==255 else 0 for pixel in thresh.ravel()]
101 | 
102 |     return bin_values
103 | 
# Class index -> character; the position in this list is the class index.
_LABELS = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'N',
           'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z']

# Loaded Keras models keyed by file path, so the model file is read from
# disk once per process instead of once per predicted captcha.
_MODEL_CACHE = {}


def _load_cnn_model(model_path):
    """Return the Keras model stored at ``model_path``, loading it only once."""
    if model_path not in _MODEL_CACHE:
        # Imported lazily so importing this module does not require Keras.
        from keras.models import load_model
        _MODEL_CACHE[model_path] = load_model(model_path)
    return _MODEL_CACHE[model_path]


def _char_order(file_name):
    """Sort key placing crops such as '2.jpg', '3-0.jpg', '3-1.jpg', '10.jpg' in numeric order."""
    stem = file_name.split('.')[0]
    try:
        return (0, tuple(int(part) for part in stem.split('-')), file_name)
    except ValueError:
        # Unexpected, non-numeric names sort after all numbered crops.
        return (1, (), file_name)


def predict(VerifyCodePath):
    """Recognise the characters of one captcha image.

    The working folder ``F://test_verifycode/chars`` is emptied, the captcha
    is split into character crops, noise crops are removed, wide crops are
    split again, every crop is normalised to 16x20, and the crops are
    classified by the Keras model stored at
    ``F://verifycode_data/verifycode_Keras.h5``.

    Args:
        VerifyCodePath: Path of the captcha image.

    Returns:
        The predicted text, one character per crop in left-to-right order,
        or ``None`` (after printing a message) when no crop is available.
    """
    chars_dir = 'F://test_verifycode/chars'

    # Remove crops left over from a previous run.
    for name in os.listdir(chars_dir):
        os.remove(chars_dir + '/' + name)

    split_picture(VerifyCodePath)

    names = os.listdir(chars_dir)
    if not names:
        print('查看的文件夹为空！')
        return None

    # Drop noise crops.
    for name in names:
        remove_edge_picture(chars_dir + '/' + name)

    # Split crops that contain several touching characters.
    for name in os.listdir(chars_dir):
        resplit(chars_dir + '/' + name)

    # Normalise every crop to 16x20.
    for name in os.listdir(chars_dir):
        convert(chars_dir, name)

    # Order crops by their numeric index, not by the first character of the
    # file name, so '10.jpg' follows '9.jpg' and re-split pieces keep their
    # left-to-right order regardless of how the OS lists the directory.
    names = sorted(os.listdir(chars_dir), key=_char_order)
    if not names:
        # Every crop was discarded as noise; there is nothing to classify.
        print('查看的文件夹为空！')
        return None

    # One 20x16x1 tensor per crop.
    table = np.array([Read_Data(chars_dir, name) for name in names]).reshape(-1, 20, 16, 1)

    # Load the model (cached) and predict the class of every crop.
    mp = 'F://verifycode_data/verifycode_Keras.h5'
    cnn = _load_cnn_model(mp)
    y_pred = cnn.predict(table)
    predictions = np.argmax(y_pred, axis=1)

    return ''.join(_LABELS[pred] for pred in predictions)
153 | 
def main():
    """Run the recogniser on every captcha in ``F://VerifyCode/`` and report accuracy.

    The expected text of each captcha is the part of its file name before
    the first dot. One line is printed per image, followed by a summary
    with the total count, the number of correct predictions and the
    accuracy in percent.
    """
    dir = 'F://VerifyCode/'
    # List the directory once so the loop and the summary agree on the total.
    files = os.listdir(dir)

    correct = 0
    for i, file in enumerate(files):
        true_label = file.split('.')[0]
        VerifyCodePath = dir + file
        pred = predict(VerifyCodePath)

        if true_label == pred:
            correct += 1
        print(i+1, (true_label, pred), true_label == pred, correct)

    total = len(files)
    # An empty directory would otherwise divide by zero.
    accuracy = correct * 100.0 / total if total else 0.0
    print('\n总共图片：%d张\n识别正确：%d张\n识别准确率:%.2f%%.'
          % (total, correct, accuracy))


# Only run the evaluation when executed as a script, not when imported.
if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/CNN_Model_training.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np
 3 | import pandas as pd
 4 | from sklearn.model_selection import train_test_split
 5 | from matplotlib import pyplot as plt
 6 | 
 7 | from keras.utils import np_utils, plot_model
 8 | from keras.models import Sequential
 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten
10 | from keras.callbacks import EarlyStopping
11 | from keras.layers import Conv2D, MaxPooling2D
12 | 
# Load the dataset: columns v1..v320 hold the pixel values, 'label' the character.
df = pd.read_csv('F://verifycode_data/data.csv')

# Map every character label to its class index (its position in `keys`).
keys = ['1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','J','K','L','N','P','Q','R','S','T','U','V','X','Y','Z']
vals = range(len(keys))
label_dict = dict(zip(keys, vals))

x_data = df[['v'+str(i+1) for i in range(320)]]
y_data = pd.DataFrame({'label':df['label']})
y_data['class'] = y_data['label'].apply(lambda x: label_dict[x])

# Split into training (70%) and test (30%) sets.
X_train, X_test, Y_train, Y_test = train_test_split(x_data, y_data['class'], test_size=0.3, random_state=42)
# Let numpy infer the sample count (-1) instead of hard-coding 1167/501,
# so the script keeps working when data.csv gains or loses rows.
x_train = np.array(X_train).reshape((-1, 20, 16, 1))
x_test = np.array(X_test).reshape((-1, 20, 16, 1))

# One-hot encode the class indices.
n_classes = len(keys)
y_train = np_utils.to_categorical(Y_train, n_classes)
y_val = np_utils.to_categorical(Y_test, n_classes)

input_shape = x_train[0].shape
36 | 
# CNN model: three convolutional stages followed by a dense classifier.
model = Sequential()

# Each stage is conv -> relu -> conv -> relu -> max-pool -> dropout(0.25);
# only the very first convolution declares the input shape.
for stage_index, n_filters in enumerate((32, 64, 128)):
    first_conv_kwargs = {'input_shape': input_shape} if stage_index == 0 else {}
    model.add(Conv2D(n_filters, kernel_size=(3, 3), padding='same', **first_conv_kwargs))
    model.add(Activation('relu'))
    model.add(Conv2D(n_filters, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
    model.add(Dropout(0.25))

model.add(Flatten())

# Fully connected layers ending in a softmax over the classes.
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
75 | 
# Render the network architecture, including layer shapes, to ./model.png.
plot_model(model, to_file=r'./model.png', show_shapes=True)

# Train with early stopping on the validation accuracy (patience of 5 epochs).
batch_size = 64
n_epochs = 100
callbacks = [EarlyStopping(monitor='val_acc', patience=5, verbose=1)]
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=n_epochs,
    verbose=1,
    validation_data=(x_test, y_val),
    callbacks=callbacks
)

# Persist the trained model.
mp = 'F://verifycode_data/verifycode_Keras.h5'
model.save(mp)
88 | 
# Plot the validation accuracy per epoch.
# Keras < 2.3 records the 'accuracy' metric as 'val_acc'; Keras >= 2.3 keeps
# the name passed to compile() and records 'val_accuracy'. Accept either so
# the plot does not fail with a KeyError after training has finished.
recorded = history.history
val_acc = recorded['val_acc'] if 'val_acc' in recorded else recorded['val_accuracy']
plt.plot(range(len(val_acc)), val_acc, label='CNN model')
plt.title('Validation accuracy on verifycode dataset')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CNN_4_Verifycode
2 | 使用Keras搭建CNN模型，破解简单的网页验证码
3 | 


--------------------------------------------------------------------------------
/VerifyCode.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/percent4/CNN_4_Verifycode/93eed5b99335898a79591dcd36a5f8152cc9bab1/VerifyCode.rar


--------------------------------------------------------------------------------
/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/percent4/CNN_4_Verifycode/93eed5b99335898a79591dcd36a5f8152cc9bab1/model.png


--------------------------------------------------------------------------------
/测试集上的准确率曲线.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/percent4/CNN_4_Verifycode/93eed5b99335898a79591dcd36a5f8152cc9bab1/测试集上的准确率曲线.png


--------------------------------------------------------------------------------