# Repository layout:
# ├── CNN_Model_predict.py
# ├── CNN_Model_training.py
# ├── README.md
# ├── VerifyCode.rar
# ├── data.csv
# ├── model.png
# └── 测试集上的准确率曲线.png

# --------------------------------------------------------------------------------
# /CNN_Model_predict.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Segment a verification-code image into single characters and classify each
# character with the CNN saved by CNN_Model_training.py.

import functools
import os

import cv2
import numpy as np

# Scratch directory that holds the per-character crops of the current image.
CHARS_DIR = 'F://test_verifycode/chars'
# Location of the trained Keras model.
MODEL_PATH = 'F://verifycode_data/verifycode_Keras.h5'
# Directory of labelled verification-code images; the file stem is the label.
VERIFYCODE_DIR = 'F://VerifyCode/'

# Character crops are normalised to 16 pixels wide by 20 pixels high.
IMG_WIDTH = 16
IMG_HEIGHT = 20

# Class index -> character; must stay in the same order as in the training
# script.
LABELS = '123456789ABCDEFGHJKLNPQRSTUVXYZ'


def _read_gray(imagepath):
    """Load *imagepath* as a single-channel grayscale array.

    ``cv2.imread`` reports failure by returning ``None`` instead of raising,
    which would otherwise surface later as an unrelated error.

    Raises:
        ValueError: if the file cannot be read as an image.
    """
    image = cv2.imread(imagepath, 0)
    if image is None:
        raise ValueError('cannot read image: %s' % imagepath)
    return image


def split_picture(imagepath):
    """Cut the image at *imagepath* into character crops.

    The crops are taken from the binarised image and written to
    ``CHARS_DIR`` as ``1.jpg``, ``2.jpg``, ... in left-to-right order.
    """
    gray = _read_gray(imagepath)

    # Paint the one-pixel border white.
    gray[0, :] = 255
    gray[-1, :] = 255
    gray[:, 0] = 255
    gray[:, -1] = 255

    # Median filter with a 3x3 kernel, then binarise at 200.
    blur = cv2.medianBlur(gray, 3)
    _, thresh1 = cv2.threshold(blur, 200, 255, cv2.THRESH_BINARY)

    # findContours returns (image, contours, hierarchy) on OpenCV 3.x but
    # (contours, hierarchy) on 2.x/4.x; the contours are always second from
    # the end.
    contours = cv2.findContours(
        thresh1, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]

    boxes = []
    for cnt in contours:
        # Upright bounding rectangle of the contour.
        x, y, w, h = cv2.boundingRect(cnt)
        # Drop boxes that touch the left or top edge (presumably the contour
        # of the whole frame) and boxes smaller than 100 pixels.
        if x != 0 and y != 0 and w * h >= 100:
            boxes.append((x, y, w, h))

    boxes.sort(key=lambda box: box[0])
    for i, (x, y, w, h) in enumerate(boxes, start=1):
        cv2.imwrite('%s/%d.jpg' % (CHARS_DIR, i), thresh1[y:y + h, x:x + w])


def remove_edge_picture(imagepath):
    """Delete the crop at *imagepath* if it looks like noise.

    A crop is deleted when at least three of its four corner pixels are
    darker than 127.
    """
    image = _read_gray(imagepath)
    height, width = image.shape
    corners = (
        image[0, 0],
        image[height - 1, 0],
        image[0, width - 1],
        image[height - 1, width - 1],
    )
    if sum(1 for corner in corners if corner < 127) >= 3:
        os.remove(imagepath)


def resplit_with_parts(imagepath, parts):
    """Replace the crop at *imagepath* with *parts* equal-width vertical slices.

    The original file is removed and the slices are written to ``CHARS_DIR``
    as ``<stem>-0.jpg``, ``<stem>-1.jpg``, ...  Columns left over when the
    width is not a multiple of *parts* are discarded.
    """
    image = _read_gray(imagepath)
    os.remove(imagepath)
    width = image.shape[1]

    file_name = imagepath.split('/')[-1].split('.')[0]
    step = width // parts  # width of each slice
    for i in range(parts):
        start = i * step
        cv2.imwrite('%s/%s-%d.jpg' % (CHARS_DIR, file_name, i),
                    image[:, start:start + step])


def resplit(imagepath):
    """Split a crop that is too wide to be a single character.

    Width >= 64 is cut into 4 slices, >= 48 into 3 and >= 26 into 2;
    narrower crops are left untouched.
    """
    width = _read_gray(imagepath).shape[1]

    if width >= 64:
        parts = 4
    elif width >= 48:
        parts = 3
    elif width >= 26:
        parts = 2
    else:
        return
    resplit_with_parts(imagepath, parts)


def convert(dir, file):
    """Binarise ``dir/file`` at 127 and resize it in place to 16x20 pixels."""
    imagepath = dir + '/' + file
    image = _read_gray(imagepath)
    _, thresh = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(thresh, (IMG_WIDTH, IMG_HEIGHT),
                     interpolation=cv2.INTER_AREA)
    cv2.imwrite(imagepath, img)


def Read_Data(dir, file):
    """Return the pixels of ``dir/file`` as a flat list of 0/1 values.

    The image is binarised at 127; white pixels (255) map to 1, everything
    else to 0.
    """
    image = _read_gray(dir + '/' + file)
    _, thresh = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    return (thresh.ravel() == 255).astype(int).tolist()


def _char_order(file_name):
    """Sort key that restores the left-to-right order of the crops.

    Crops are named ``<n>.jpg`` or, after re-splitting, ``<n>-<k>.jpg``.
    Comparing the numbers as integers keeps ``2-0`` before ``2-1`` and ``2``
    before ``10``, which a comparison of the first character alone does not.
    """
    stem = file_name.split('.')[0]
    return tuple(int(part) for part in stem.split('-'))


@functools.lru_cache(maxsize=1)
def _load_cnn(model_path):
    """Load the Keras model once and reuse it for every later prediction."""
    # Imported here so that Keras is only loaded when a prediction is made.
    from keras.models import load_model
    return load_model(model_path)


def predict(VerifyCodePath):
    """Recognise the characters in the image at *VerifyCodePath*.

    Returns:
        The predicted string, or ``None`` (after printing a message) when no
        character crop is available to classify.
    """
    # Remove the crops left over from the previous image.
    for name in os.listdir(CHARS_DIR):
        os.remove(CHARS_DIR + '/' + name)

    split_picture(VerifyCodePath)

    names = os.listdir(CHARS_DIR)
    if not names:
        print('查看的文件夹为空!')
        return None

    # Remove noise crops.
    for name in names:
        remove_edge_picture(CHARS_DIR + '/' + name)

    # Re-split crops that contain several touching characters.
    for name in os.listdir(CHARS_DIR):
        resplit(CHARS_DIR + '/' + name)

    # Bring every crop to the common 16x20 size.
    for name in os.listdir(CHARS_DIR):
        convert(CHARS_DIR, name)

    names = sorted(os.listdir(CHARS_DIR), key=_char_order)
    if not names:
        # Every crop was rejected as noise, so there is nothing to classify.
        print('查看的文件夹为空!')
        return None

    # One (20, 16, 1) array of 0/1 pixels per character.
    table = np.array([Read_Data(CHARS_DIR, name) for name in names])
    table = table.reshape(-1, IMG_HEIGHT, IMG_WIDTH, 1)

    y_pred = _load_cnn(MODEL_PATH).predict(table)
    predictions = np.argmax(y_pred, axis=1)

    return ''.join(LABELS[index] for index in predictions)


def main():
    """Run the recogniser over ``VERIFYCODE_DIR`` and print the accuracy."""
    names = os.listdir(VERIFYCODE_DIR)
    correct = 0
    for i, name in enumerate(names, start=1):
        true_label = name.split('.')[0]
        pred = predict(VERIFYCODE_DIR + name)

        if true_label == pred:
            correct += 1
        print(i, (true_label, pred), true_label == pred, correct)

    total = len(names)
    # An empty directory counts as 0% instead of dividing by zero.
    accuracy = correct * 100 / total if total else 0.0
    print('\n总共图片:%d张\n识别正确:%d张\n识别准确率:%.2f%%.'
          % (total, correct, accuracy))


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /CNN_Model_training.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Train the character-classification CNN on data.csv and save it for
# CNN_Model_predict.py.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

from keras.utils import to_categorical, plot_model
from keras.models import Sequential
from keras.layers import Activation, Conv2D, Dense, Dropout, Flatten, MaxPooling2D
from keras.callbacks import EarlyStopping

DATA_PATH = 'F://verifycode_data/data.csv'
MODEL_PATH = 'F://verifycode_data/verifycode_Keras.h5'
MODEL_PLOT_PATH = r'./model.png'

# Each sample is a 20-row by 16-column image stored in columns v1..v320.
IMG_HEIGHT = 20
IMG_WIDTH = 16

# Character -> class index; must stay in the same order as in the prediction
# script.
LABELS = '123456789ABCDEFGHJKLNPQRSTUVXYZ'
LABEL_TO_INDEX = {char: index for index, char in enumerate(LABELS)}
N_CLASSES = len(LABELS)


def load_dataset(csv_path):
    """Read the CSV and return ``(x_train, y_train, x_test, y_val)``.

    Images are reshaped to ``(n, 20, 16, 1)`` and labels are one-hot encoded.
    70% of the rows are used for training and 30% for validation, with a
    fixed random seed.

    Raises:
        ValueError: if the ``label`` column holds a character that is not in
            ``LABELS``.
    """
    df = pd.read_csv(csv_path)

    pixel_columns = ['v%d' % (i + 1) for i in range(IMG_HEIGHT * IMG_WIDTH)]
    x_data = df[pixel_columns]

    # astype(str) guards against the label column being parsed as numbers.
    classes = df['label'].astype(str).map(LABEL_TO_INDEX)
    if classes.isna().any():
        unknown = sorted(set(df['label'][classes.isna()].astype(str)))
        raise ValueError('unknown labels in %s: %s' % (csv_path, unknown))
    classes = classes.astype(int)

    X_train, X_test, Y_train, Y_test = train_test_split(
        x_data, classes, test_size=0.3, random_state=42)

    # -1 lets the sample count follow the data instead of hard-coding the
    # row counts of one particular CSV.
    x_train = np.array(X_train).reshape((-1, IMG_HEIGHT, IMG_WIDTH, 1))
    x_test = np.array(X_test).reshape((-1, IMG_HEIGHT, IMG_WIDTH, 1))

    # One-hot encode the class indices.
    y_train = to_categorical(Y_train, N_CLASSES)
    y_val = to_categorical(Y_test, N_CLASSES)

    return x_train, y_train, x_test, y_val


def build_model(input_shape, n_classes):
    """Build and compile the CNN.

    Three stages of (Conv-ReLU-Conv-ReLU-MaxPool-Dropout) with 32, 64 and 128
    filters, followed by Dense(256), Dropout(0.5), Dense(128) and a softmax
    output with *n_classes* units.
    """
    model = Sequential()

    # Convolution and pooling stages.
    for stage, filters in enumerate((32, 64, 128)):
        # Only the very first layer declares the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if stage == 0 else {}
        model.add(Conv2D(filters, kernel_size=(3, 3), padding='same',
                         **first_layer_kwargs))
        model.add(Activation('relu'))
        model.add(Conv2D(filters, kernel_size=(3, 3), padding='same'))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
        model.add(Dropout(0.25))

    model.add(Flatten())

    # Fully connected layers.
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))

    # Keras >= 2.3 reports a metric under the exact name passed here, and
    # older versions always report accuracy as 'acc'; asking for 'acc' keeps
    # the 'val_acc' key used below valid on both.
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['acc'])
    return model


def plot_validation_accuracy(val_acc):
    """Plot the per-epoch validation accuracy."""
    plt.plot(range(len(val_acc)), val_acc, label='CNN model')
    plt.title('Validation accuracy on verifycode dataset')
    plt.xlabel('epochs')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()


def main():
    """Train the CNN, save it to ``MODEL_PATH`` and plot validation accuracy."""
    x_train, y_train, x_test, y_val = load_dataset(DATA_PATH)

    model = build_model(x_train[0].shape, N_CLASSES)

    # Save a diagram of the network.
    plot_model(model, to_file=MODEL_PLOT_PATH, show_shapes=True)

    # Train, stopping after 5 epochs without improvement in validation
    # accuracy.
    callbacks = [EarlyStopping(monitor='val_acc', patience=5, verbose=1)]
    batch_size = 64
    n_epochs = 100
    history = model.fit(x_train, y_train, batch_size=batch_size,
                        epochs=n_epochs, verbose=1,
                        validation_data=(x_test, y_val), callbacks=callbacks)

    model.save(MODEL_PATH)

    plot_validation_accuracy(history.history['val_acc'])


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # CNN_4_Verifycode
# 使用Keras搭建CNN模型,破解简单的网页验证码
# (Build a CNN model with Keras to crack simple web verification codes.)

# --------------------------------------------------------------------------------
# /VerifyCode.rar:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/percent4/CNN_4_Verifycode/93eed5b99335898a79591dcd36a5f8152cc9bab1/VerifyCode.rar

# --------------------------------------------------------------------------------
# /model.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/percent4/CNN_4_Verifycode/93eed5b99335898a79591dcd36a5f8152cc9bab1/model.png

# --------------------------------------------------------------------------------
# /测试集上的准确率曲线.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/percent4/CNN_4_Verifycode/93eed5b99335898a79591dcd36a5f8152cc9bab1/测试集上的准确率曲线.png