├── 1.jpg
├── LICENSE
├── README.md
├── get_face.py
├── haarcascade_frontalface_alt.xml
├── handle_image.py
├── image_show.py
├── image_spider.py
└── image_train.py

/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dapaopao66/recognition_gender/1cb226edd2c8bb2c678fc022847312fe36c75194/1.jpg
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 StevenKe8080

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# recognition_gender
The complete pipeline, from data collection to training the model.

Requires numpy, sklearn, keras, opencv, and tensorflow. If you have a decent GPU, tensorflow-gpu is recommended; it is much faster than running on the CPU.

### 1. If you have no data
Run image_spider.py to download images. Crawler code adapted from [BaiduImageSpider](https://github.com/kong36088/BaiduImageSpider).

### 2. Extract face crops from the downloaded photos
Run get_face.py

### 3. Train
Run image_train.py

### 4. View the training results
Run image_show.py


Accuracy is not very high, probably because the training sample set is small; it can be improved by enlarging the sample set and increasing the number of training epochs.
--------------------------------------------------------------------------------
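For orientation, here is a minimal sketch (not part of the repo) that drives the first three steps from a single script. The destination gender_image/nv is an assumption, chosen so that image_train.py can pick the crops up directly; get_face.py's own __main__ writes to get_face/ instead.

    # pipeline_sketch.py -- hypothetical driver for steps 1-3.
    import os
    from image_spider import Crawler
    from get_face import readPicSaveFace
    from image_train import DataSet, Model

    # 1. Scrape one page (60 images) into download_img/nv.
    Crawler(0.05, 'nv').start('download_img', '亚洲女', spider_page_num=1)

    # 2. Crop the detected faces into gender_image/nv (assumed layout:
    #    one subdirectory per class under gender_image/).
    readPicSaveFace('download_img/nv', 'gender_image/nv', '.jpg', '.JPG', '.png', '.PNG')

    # 3. Train on gender_image/<class>/*.jpg and save store/model.h5.
    dataset = DataSet()
    dataset.extract_data('gender_image')
    model = Model()
    model.build_model(dataset)
    model.train(dataset)
    os.makedirs('store', exist_ok=True)  # Model.save() does not create the directory
    model.save()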
/get_face.py:
--------------------------------------------------------------------------------
#-*-coding:utf8-*-

import os
import cv2
import time


# Given the absolute path of a directory, read every file in it that ends with
# one of the given suffixes into a list; the list's first element is the
# directory's own name.
def readAllImg(path, *suffix):
    try:
        s = os.listdir(path)
        resultArray = []
        fileName = os.path.basename(path)
        resultArray.append(fileName)

        for i in s:
            if i.endswith(suffix):  # str.endswith accepts a tuple of suffixes
                document = os.path.join(path, i)
                img = cv2.imread(document)
                resultArray.append(img)

    except IOError:
        print("Error")

    else:
        print("Read successfully")
        return resultArray


# Read all images from the source path into a list, check them one by one,
# crop out the faces, and save the crops to the target path.
def readPicSaveFace(sourcePath, objectPath, *suffix):
    try:
        # Read the photos; note that the first element is the directory name.
        resultArray = readAllImg(sourcePath, *suffix)

        # Check each image in the list, find its faces, and write them to the
        # target directory.
        os.makedirs(objectPath, exist_ok=True)
        count = 1
        face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_alt.xml')
        for i in resultArray:
            if type(i) != str:
                gray = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
                faces = face_cascade.detectMultiScale(gray, 1.3, 5)
                for (x, y, w, h) in faces:
                    # Name the file by timestamp plus a running counter.
                    listStr = [str(int(time.time())), str(count)]
                    fileName = ''.join(listStr)

                    f = cv2.resize(gray[y:(y + h), x:(x + w)], (200, 200))
                    cv2.imwrite(objectPath + os.sep + '%s.jpg' % fileName, f)
                    count += 1

    except IOError as e:
        print(e)

    else:
        print('Already read ' + str(count - 1) + ' Faces to Destination ' + objectPath)


if __name__ == '__main__':
    readPicSaveFace('download_img/nv', 'get_face', '.jpg', '.JPG', '.png', '.PNG')
--------------------------------------------------------------------------------
/handle_image.py:
--------------------------------------------------------------------------------

import os
import numpy as np
import cv2


# Walk the class subdirectories under path, read each .jpg, resize it to
# 128x128, convert it to grayscale, and collect the images and integer labels.
def get_file(path):
    IMAGE_SIZE = 128
    images_list = []
    labels_list = []
    counter = 0
    for child_dir in os.listdir(path):
        child_path = os.path.join(path, child_dir)
        for dir_image in os.listdir(child_path):
            if dir_image.endswith('.jpg'):
                img = cv2.imread(os.path.join(child_path, dir_image))
                resized_img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE))
                gray_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2GRAY)  # convert to grayscale
                images_list.append(gray_img)
                labels_list.append(counter)

        counter += 1

    images_list = np.array(images_list)

    return images_list, labels_list, counter


# List the class subdirectory names under path, in the same order in which
# get_file assigns labels.
def get_file_name(path):
    name_list = []
    for child_dir in os.listdir(path):
        name_list.append(child_dir)
    return name_list


if __name__ == '__main__':
    # img_list, label_list, counter = get_file('images')
    # print(counter)
    # print(label_list)
    # print(len(img_list))
    print(get_file_name('images'))
--------------------------------------------------------------------------------
/image_show.py:
--------------------------------------------------------------------------------
import cv2
from image_train import Model
from handle_image import get_file_name

if __name__ == '__main__':
    IMAGE_SIZE = 128
    name_list = get_file_name('gender_image')
    print(name_list)
    face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_alt.xml')
    model = Model()
    model.load()
    frame = cv2.imread('1.jpg')
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    print(len(faces))
    for (x, y, w, h) in faces:
        ROI = gray[y:y + h, x:x + w]  # NumPy indexes rows (y) first
        ROI = cv2.resize(ROI, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LINEAR)
        label, prob = model.predict(ROI)
        print(prob)
        if prob > 0.5:
            show_name = name_list[label]
            if show_name == '0':
                show_name = "female"
            else:
                show_name = "male"
            print(name_list[label])
        else:
            show_name = 'unknown'
        cv2.putText(frame, show_name, (x, y - 20), cv2.FONT_HERSHEY_SIMPLEX, 1, 255, 2)  # draw the label text
        frame = cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)  # draw a box around the face
    cv2.imshow("frame", frame)
    cv2.waitKey(0)

    cv2.destroyAllWindows()
--------------------------------------------------------------------------------
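One detail worth calling out in image_show.py: detectMultiScale returns boxes as (x, y, w, h), but NumPy images are indexed [row, column], i.e. y first, so the crop is gray[y:y + h, x:x + w]. A tiny self-contained check of that indexing, with made-up numbers:

    import numpy as np

    gray = np.zeros((480, 640), dtype=np.uint8)  # hypothetical 640x480 grayscale frame
    x, y, w, h = 100, 50, 200, 150               # hypothetical detection box
    roi = gray[y:y + h, x:x + w]                 # rows (y) first, then columns (x)
    assert roi.shape == (h, w)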
/image_spider.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

import os
import re
import urllib
import json
import socket
import urllib.request
import urllib.parse
import urllib.error
import time

# Set the default socket timeout.
timeout = 5
socket.setdefaulttimeout(timeout)


class Crawler:
    # Sleep interval between downloads.
    __time_sleep = 0.1
    __amount = 0
    __start_amount = 0
    __counter = 0
    __dirname = ""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'}

    # Fetches image URLs and their contents.
    # t: interval between image downloads.
    def __init__(self, t=0.1, dirname=""):
        self.time_sleep = t
        self.__dirname = dirname

    # Save the images.
    def __save_image(self, rsp_data, dir, word):
        os.makedirs("./" + dir + "/" + self.__dirname, exist_ok=True)
        # Count the existing files so new names do not collide.
        self.__counter = len(os.listdir('./' + dir + "/" + self.__dirname)) + 1
        for image_info in rsp_data['imgs']:
            try:
                time.sleep(self.time_sleep)
                fix = self.__get_suffix(image_info['objURL'])
                urllib.request.urlretrieve(image_info['objURL'], './' + dir + "/" + self.__dirname + '/' + str(self.__counter) + str(fix))
            except urllib.error.HTTPError as urllib_err:
                print(urllib_err)
                continue
            except Exception as err:
                time.sleep(1)
                print(err)
                print("Unknown error; skipping this image")
                continue
            else:
                print("Image saved; " + str(self.__counter) + " images so far")
                self.__counter += 1
        return

    # Get the file suffix.
    @staticmethod
    def __get_suffix(name):
        m = re.search(r'\.[^\.]*$', name)
        if m.group(0) and len(m.group(0)) <= 5:
            return m.group(0)
        else:
            return '.jpeg'

    # Get the file prefix.
    @staticmethod
    def __get_prefix(name):
        return name[:name.find('.')]

    # Start fetching.
    def __get_images(self, dir, word):
        search = urllib.parse.quote(word)
        # pn: image offset, 60 images per page.
        pn = self.__start_amount
        while pn < self.__amount:
            url = 'http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=' + search + '&cg=girl&pn=' + str(
                pn - 120) + '&rn=60&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1&gsm=1e0000001e'
            print(url)
            # Set a header to avoid being banned.
            try:
                time.sleep(self.time_sleep)
                req = urllib.request.Request(url=url, headers=self.headers)
                page = urllib.request.urlopen(req)
                rsp = page.read().decode('unicode_escape')
            except UnicodeDecodeError as e:
                print(e)
                print('-----UnicodeDecodeError url:', url)
            except urllib.error.URLError as e:
                print(e)
                print("-----urlError url:", url)
            except socket.timeout as e:
                print(e)
                print("-----socket timeout:", url)
            else:
                # Parse the JSON response.
                rsp_data = json.loads(rsp)
                self.__save_image(rsp_data, dir, word)
                # Move on to the next page.
                print("Downloading the next page")
                pn += 60
                # page is only bound when urlopen succeeded, so close it here
                # rather than in a finally block.
                page.close()
        print("Download finished")
        return

    def start(self, dir, word, spider_page_num=1, start_page=1):
        """
        Crawler entry point.
        :param word: keyword to scrape for
        :param spider_page_num: number of pages to scrape; total images fetched is pages x 60
        :param start_page: page to start from
        :return:
        """
        self.__start_amount = (start_page - 1) * 60
        self.__amount = spider_page_num * 60 + self.__start_amount
        self.__get_images(dir, word)


if __name__ == '__main__':

    name_arr = ["亚洲女"]  # search keyword ("Asian woman")
    dir_name = ['nv']

    count = 0
    for name in name_arr:
        print(dir_name[count], name_arr[count])
        crawler = Crawler(0.05, dir_name[count])
        crawler.start("download_img", name_arr[count], 10, 3)
        count += 1
--------------------------------------------------------------------------------
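The paging arithmetic in Crawler.start() is easy to misread: with 60 images per page, start_page and spider_page_num translate into a range of pn offsets. A small sketch that reproduces the arithmetic (without calling the crawler) for the values used in __main__ above:

    # Reproduces the offset arithmetic from Crawler.start() / __get_images().
    def pn_offsets(spider_page_num=1, start_page=1):
        start_amount = (start_page - 1) * 60
        amount = spider_page_num * 60 + start_amount
        return list(range(start_amount, amount, 60))

    print(pn_offsets(10, 3))  # [120, 180, ..., 660]: ten pages, starting at page 3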
crawler.start("download_img",name_arr[count], 10, 3) 131 | count += 1 -------------------------------------------------------------------------------- /image_train.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from handle_image import get_file 4 | from sklearn.model_selection import train_test_split 5 | from keras.utils import np_utils 6 | from keras.models import Sequential,load_model 7 | from keras.layers import Dense,Activation,Convolution2D,MaxPooling2D,Flatten,Dropout 8 | from keras.optimizers import Adam 9 | 10 | #建立数据 11 | class DataSet(object): 12 | def __init__(self): 13 | self.nb_classes = None 14 | self.X_train = None 15 | self.X_test = None 16 | self.Y_train = None 17 | self.Y_test = None 18 | self.img_size = 128 19 | 20 | def extract_data(self,train_path): 21 | imgs, labels, counter = get_file(train_path) 22 | print(labels) 23 | # 避免过拟合,采用交叉验证,验证集占训练集30%,固定随机种子(random_state) 24 | X_train, X_test, y_train, y_test = train_test_split(imgs, labels, test_size=0.3, 25 | random_state=random.randint(0, 100)) 26 | 27 | #数据预处理 keras backend 用的TensorFlow 黑白图片 channel 1 28 | X_train = X_train.reshape(X_train.shape[0], 1, self.img_size, self.img_size) / 255. 29 | X_test = X_test.reshape(X_test.shape[0], 1, self.img_size, self.img_size) / 255. 30 | 31 | #label 转为 one-hot 数据 32 | Y_train = np_utils.to_categorical(y_train, num_classes=counter) 33 | Y_test = np_utils.to_categorical(y_test, num_classes=counter) 34 | 35 | self.X_train = X_train 36 | self.X_test = X_test 37 | self.Y_train = Y_train 38 | self.Y_test = Y_test 39 | self.nb_classes = counter 40 | 41 | 42 | #建立model 使用CNN(卷积神经网络) 43 | class Model(object): 44 | FILE_PATH = "store/model.h5" 45 | IMAGE_SIZE = 128 46 | def __init__(self): 47 | self.model = None 48 | 49 | def build_model(self,dataset): 50 | self.model = Sequential() 51 | #进行一层卷积 输出 shape (32,128,128) 52 | self.model.add(Convolution2D(filters=32,kernel_size=5,strides=1, padding='same',data_format='channels_first', input_shape=dataset.X_train.shape[1:])) 53 | #使用relu激励函数 54 | self.model.add(Activation('relu')) 55 | #池化,输出为shape (32,64,64) 56 | self.model.add(MaxPooling2D(pool_size=2,strides=2,padding='same',data_format='channels_first')) 57 | #dropout 防止过拟合 58 | self.model.add(Dropout(0.25)) 59 | 60 | #进行一层卷积 输出为shape (64,32,32) 61 | self.model.add(Convolution2D(64, 5, strides=1, padding='same', data_format='channels_first')) 62 | # 使用relu激励函数 63 | self.model.add(Activation('relu')) 64 | # 池化,输出为原来的一半 shape (64,32,32) 65 | self.model.add(MaxPooling2D(2, 2, 'same', data_format='channels_first')) 66 | # dropout 防止过拟合 67 | self.model.add(Dropout(0.25)) 68 | 69 | #全连接层 70 | self.model.add(Flatten()) 71 | self.model.add(Dense(512)) 72 | self.model.add(Activation('relu')) 73 | self.model.add(Dropout(0.5)) 74 | self.model.add(Dense(dataset.nb_classes)) 75 | self.model.add(Activation('softmax')) 76 | 77 | self.model.summary() 78 | 79 | def train(self,dataset): 80 | adam = Adam(lr=1e-4) 81 | self.model.compile(optimizer=adam, 82 | loss='categorical_crossentropy', 83 | metrics=['accuracy']) 84 | 85 | # epochs 循环次数 batch_size 批处理大小 86 | self.model.fit(dataset.X_train, dataset.Y_train, epochs=25, batch_size=32, ) 87 | 88 | def save(self, file_path=FILE_PATH): 89 | print('Model 保存.') 90 | self.model.save(file_path) 91 | 92 | def load(self, file_path=FILE_PATH): 93 | print('Model 读取.') 94 | self.model = load_model(file_path) 95 | 96 | #预测 97 | def predict(self,img): 98 | img = img.reshape((1, 1, 
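Finally, a minimal sketch of reusing the trained model outside image_show.py. It assumes store/model.h5 already exists and that face.jpg is a hypothetical, pre-cropped face image:

    import cv2
    from image_train import Model

    model = Model()
    model.load()  # reads store/model.h5 by default

    img = cv2.imread('face.jpg', cv2.IMREAD_GRAYSCALE)  # hypothetical face crop
    img = cv2.resize(img, (Model.IMAGE_SIZE, Model.IMAGE_SIZE))
    label, prob = model.predict(img)
    print(label, prob)  # most probable class index and its probability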