├── 1.jpg
├── LICENSE
├── README.md
├── get_face.py
├── haarcascade_frontalface_alt.xml
├── handle_image.py
├── image_show.py
├── image_spider.py
└── image_train.py


/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dapaopao66/recognition_gender/1cb226edd2c8bb2c678fc022847312fe36c75194/1.jpg


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 StevenKe8080
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # recognition_gender
 2 | 从数据收集到，训练模型，全套流程。
 3 | 
 4 | 需要安装numpy,sklearn,keras,opencv,tensorflow，如果你有好的gpu推荐安装tensorflow-gpu,速度比cpu快很多
 5 | 
 6 | ### 1.如果你没有数据
 7 | 执行image_spider.py 下载图片 代码来源 [百度爬虫](https://github.com/kong36088/BaiduImageSpider )
 8 | 
 9 | ### 2.从下载的人图中，获取脸部图片
10 | 执行get_face.py
11 | 
12 | ### 3.进行训练
13 | 执行image_train.py
14 | 
15 | ### 4.查看训练结果
16 | 执行 image_show.py
17 | 
18 | 
19 | 可能是训练数据样本集不多的原因，正确率不是很高，可通过增加样本集，增加训练次数来提高正确率
20 | 


--------------------------------------------------------------------------------
/get_face.py:
--------------------------------------------------------------------------------
 1 | #-*-coding:utf8-*-
 2 | 
 3 | import os
 4 | import cv2
 5 | import time
 6 | 
 7 | 
 8 | #根据输入的文件夹绝对路径，将该文件夹下的所有指定suffix的文件读取存入一个list,该list的第一个元素是该文件夹的名字
 9 | def readAllImg(path,*suffix):
10 |     try:
11 | 
12 |         s = os.listdir(path)
13 |         resultArray = []
14 |         fileName = os.path.basename(path)
15 |         resultArray.append(fileName)
16 | 
17 |         for i in s:
18 |             if i.endswith(suffix):
19 |                 document = os.path.join(path, i)
20 |                 img = cv2.imread(document)
21 |                 resultArray.append(img)
22 | 
23 | 
24 |     except IOError:
25 |         print ("Error")
26 | 
27 |     else:
28 |         print ("读取成功")
29 |         return resultArray
30 | 
31 | #从源路径中读取所有图片放入一个list，然后逐一进行检查，把其中的脸扣下来，存储到目标路径中
32 | def readPicSaveFace(sourcePath,objectPath,*suffix):
33 |     try:
34 |         #读取照片,注意第一个元素是文件名
35 |         resultArray=readAllImg(sourcePath,*suffix)
36 | 
37 |         #对list中图片逐一进行检查,找出其中的人脸然后写到目标文件夹下
38 | 
39 |         count = 1
40 |         face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_alt.xml')
41 |         for i in resultArray:
42 |             if type(i) != str:
43 | 
44 |               gray = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
45 |               faces = face_cascade.detectMultiScale(gray, 1.3, 5)
46 |               for (x, y, w, h) in faces:
47 | 
48 |                 listStr = [str(int(time.time())), str(count)]  #以时间戳和读取的排序作为文件名称
49 |                 fileName = ''.join(listStr)
50 | 
51 |                 f = cv2.resize(gray[y:(y + h), x:(x + w)], (200, 200))
52 |                 cv2.imwrite(objectPath+os.sep+'%s.jpg' % fileName, f)
53 |                 count += 1
54 | 
55 | 
56 |     except IOError:
57 |         print(IOError)
58 | 
59 |     else:
60 |         print ('Already read '+str(count-1)+' Faces to Destination '+objectPath)
61 | 
62 | if __name__ == '__main__':
63 |      readPicSaveFace('download_img/nv','get_face','.jpg','.JPG','png','PNG')
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/handle_image.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | import numpy as np
 4 | import cv2
 5 | 
 6 | 
 7 | # 循环读取图片，添加到一个列表，每个图片为128 *128
 8 | def get_file(path):
 9 |     IMAGE_SIZE = 128
10 |     images_list = []
11 |     labels_list = []
12 |     counter = 0
13 |     for child_dir in os.listdir(path):
14 |         child_path = os.path.join(path, child_dir)
15 |         # print(child_path)
16 |         for dir_image in os.listdir(child_path):
17 |             # print(dir_image)
18 |             if dir_image.endswith('.jpg'):
19 |                 img = cv2.imread(os.path.join(child_path, dir_image))
20 |                 resized_img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE))
21 |                 colored_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2GRAY) #转为灰度图
22 |                 images_list.append(colored_img)
23 |                 labels_list.append(counter)
24 | 
25 |         counter += 1
26 | 
27 |     images_list = np.array(images_list)
28 | 
29 |     return images_list,labels_list,counter
30 | 
31 | def get_file_name(path):
32 |     name_list = []
33 |     for child_dir in os.listdir(path):
34 |         name_list.append(child_dir)
35 |     return name_list
36 | 
37 | 
38 | if __name__ == '__main__':
39 |     # img_list, label_lsit, counter = get_file('images')
40 |     # print(counter)
41 |     # print(label_lsit)
42 |     # print(len(img_list))
43 |     print(get_file_name('images'))


--------------------------------------------------------------------------------
/image_show.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | import os
 3 | from image_train import Model
 4 | from handle_image import get_file_name
 5 | 
 6 | if __name__ == '__main__':
 7 |     IMAGE_SIZE = 128
 8 |     name_list = get_file_name('gender_image')
 9 |     print(name_list)
10 |     face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_alt.xml')
11 |     model = Model()
12 |     model.load()
13 |     frame = cv2.imread('1.jpg')
14 |     gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
15 |     faces = face_cascade.detectMultiScale(gray, 1.3, 5)
16 |     print(len(faces))
17 |     for (x, y, w, h) in faces:
18 |         ROI = gray[x:x + w, y:y + h]
19 |         ROI = cv2.resize(ROI, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LINEAR)
20 |         label, prob = model.predict(ROI)
21 |         print(prob)
22 |         if prob > 0.5:
23 |             show_name = name_list[label]
24 |             if (show_name == '0'):
25 |                 show_name = "female"
26 |             else:
27 |                 show_name = "male"
28 |             print(name_list[label])
29 |         else:
30 |             show_name = 'unknow'
31 |         cv2.putText(frame, show_name, (x, y - 20), cv2.FONT_HERSHEY_SIMPLEX, 1, 255, 2)  # 显示名字
32 |         frame = cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)  # 在人脸区域画一个正方形出来
33 |     cv2.imshow("frame", frame)
34 |     cv2.waitKey(0)
35 | 
36 |     cv2.destroyAllWindows()


--------------------------------------------------------------------------------
/image_spider.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | 
  3 | import os
  4 | import re
  5 | import urllib
  6 | import json
  7 | import socket
  8 | import urllib.request
  9 | import urllib.parse
 10 | import urllib.error
 11 | # 设置超时
 12 | import time
 13 | 
 14 | timeout = 5
 15 | socket.setdefaulttimeout(timeout)
 16 | 
 17 | 
 18 | class Crawler:
 19 |     # 睡眠时长
 20 |     __time_sleep = 0.1
 21 |     __amount = 0
 22 |     __start_amount = 0
 23 |     __counter = 0
 24 |     __dirname = ""
 25 |     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'}
 26 | 
 27 |     # 获取图片url内容等
 28 |     # t 下载图片时间间隔
 29 |     def __init__(self, t=0.1,dirname=""):
 30 |         self.time_sleep = t
 31 |         self.__dirname = dirname
 32 | 
 33 |     # 保存图片
 34 |     def __save_image(self, rsp_data,dir, word):
 35 |         os.makedirs("./"+dir+"/" + self.__dirname, exist_ok=True)
 36 |         # 判断名字是否重复，获取图片长度
 37 |         self.__counter = len(os.listdir('./' +dir+"/"+ self.__dirname)) + 1
 38 |         for image_info in rsp_data['imgs']:
 39 |             try:
 40 |                 time.sleep(self.time_sleep)
 41 |                 fix = self.__get_suffix(image_info['objURL'])
 42 |                 urllib.request.urlretrieve(image_info['objURL'], './'+dir+"/" + self.__dirname + '/' + str(self.__counter) + str(fix))
 43 |             except urllib.error.HTTPError as urllib_err:
 44 |                 print(urllib_err)
 45 |                 continue
 46 |             except Exception as err:
 47 |                 time.sleep(1)
 48 |                 print(err)
 49 |                 print("产生未知错误，放弃保存")
 50 |                 continue
 51 |             else:
 52 |                 print("图+1,已有" + str(self.__counter) + "张图")
 53 |                 self.__counter += 1
 54 |         return
 55 | 
 56 |     # 获取后缀名
 57 |     @staticmethod
 58 |     def __get_suffix(name):
 59 |         m = re.search(r'\.[^\.]*$', name)
 60 |         if m.group(0) and len(m.group(0)) <= 5:
 61 |             return m.group(0)
 62 |         else:
 63 |             return '.jpeg'
 64 | 
 65 |     # 获取前缀
 66 |     @staticmethod
 67 |     def __get_prefix(name):
 68 |         return name[:name.find('.')]
 69 | 
 70 |     # 开始获取
 71 |     def __get_images(self,dir, word):
 72 |         search = urllib.parse.quote(word)
 73 |         # pn int 图片数
 74 |         pn = self.__start_amount
 75 |         while pn < self.__amount:
 76 | 
 77 |             url = 'http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=' + search + '&cg=girl&pn=' + str(
 78 |                 pn - 120) + '&rn=60&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1&gsm=1e0000001e'
 79 |             print(url)
 80 |             # 设置header防ban
 81 |             try:
 82 |                 time.sleep(self.time_sleep)
 83 |                 req = urllib.request.Request(url=url, headers=self.headers)
 84 |                 page = urllib.request.urlopen(req)
 85 |                 rsp = page.read().decode('unicode_escape')
 86 |             except UnicodeDecodeError as e:
 87 |                 print(e)
 88 |                 print('-----UnicodeDecodeErrorurl:', url)
 89 |             except urllib.error.URLError as e:
 90 |                 print(e)
 91 |                 print("-----urlErrorurl:", url)
 92 |             except socket.timeout as e:
 93 |                 print(e)
 94 |                 print("-----socket timout:", url)
 95 |             else:
 96 |                 # 解析json
 97 |                 rsp_data = json.loads(rsp)
 98 |                 self.__save_image(rsp_data,dir, word)
 99 |                 # 读取下一页
100 |                 print("下载下一页")
101 |                 pn += 60
102 |             finally:
103 |                 page.close()
104 |         print("下载任务结束")
105 |         return
106 | 
107 |     def start(self,dir, word, spider_page_num=1, start_page=1):
108 |         """
109 |         爬虫入口
110 |         :param word: 抓取的关键词
111 |         :param spider_page_num: 需要抓取数据页数 总抓取图片数量为 页数x60
112 |         :param start_page:起始页数
113 |         :return:
114 |         """
115 |         self.__start_amount = (start_page - 1) * 60
116 |         self.__amount = spider_page_num * 60 + self.__start_amount
117 |         self.__get_images(dir,word)
118 | 
119 | 
120 | if __name__ == '__main__':
121 | 
122 | 
123 |     name_arr = ["亚洲女"]
124 |     dir_name = ['nv']
125 | 
126 |     count = 0
127 |     for name in name_arr:
128 |         print(dir_name[count],name_arr[count])
129 |         crawler = Crawler(0.05, dir_name[count])
130 |         crawler.start("download_img",name_arr[count], 10, 3)
131 |         count += 1


--------------------------------------------------------------------------------
/image_train.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import numpy as np
  3 | from handle_image import get_file
  4 | from sklearn.model_selection import train_test_split
  5 | from keras.utils import np_utils
  6 | from keras.models import Sequential,load_model
  7 | from keras.layers import Dense,Activation,Convolution2D,MaxPooling2D,Flatten,Dropout
  8 | from keras.optimizers import Adam
  9 | 
 10 | #建立数据
 11 | class DataSet(object):
 12 |     def __init__(self):
 13 |         self.nb_classes = None
 14 |         self.X_train = None
 15 |         self.X_test = None
 16 |         self.Y_train = None
 17 |         self.Y_test = None
 18 |         self.img_size = 128
 19 | 
 20 |     def extract_data(self,train_path):
 21 |         imgs, labels, counter = get_file(train_path)
 22 |         print(labels)
 23 |         # 避免过拟合，采用交叉验证，验证集占训练集30%，固定随机种子（random_state)
 24 |         X_train, X_test, y_train, y_test = train_test_split(imgs, labels, test_size=0.3,
 25 |                                                             random_state=random.randint(0, 100))
 26 | 
 27 |         #数据预处理 keras backend 用的TensorFlow 黑白图片 channel 1
 28 |         X_train = X_train.reshape(X_train.shape[0], 1, self.img_size, self.img_size) / 255.
 29 |         X_test = X_test.reshape(X_test.shape[0], 1, self.img_size, self.img_size) / 255.
 30 | 
 31 |         #label 转为 one-hot 数据
 32 |         Y_train = np_utils.to_categorical(y_train, num_classes=counter)
 33 |         Y_test = np_utils.to_categorical(y_test, num_classes=counter)
 34 | 
 35 |         self.X_train = X_train
 36 |         self.X_test = X_test
 37 |         self.Y_train = Y_train
 38 |         self.Y_test = Y_test
 39 |         self.nb_classes = counter
 40 | 
 41 | 
 42 | #建立model  使用CNN（卷积神经网络）
 43 | class Model(object):
 44 |     FILE_PATH = "store/model.h5"
 45 |     IMAGE_SIZE = 128
 46 |     def __init__(self):
 47 |         self.model = None
 48 | 
 49 |     def build_model(self,dataset):
 50 |         self.model = Sequential()
 51 |         #进行一层卷积 输出 shape (32,128,128)
 52 |         self.model.add(Convolution2D(filters=32,kernel_size=5,strides=1, padding='same',data_format='channels_first', input_shape=dataset.X_train.shape[1:]))
 53 |         #使用relu激励函数
 54 |         self.model.add(Activation('relu'))
 55 |         #池化，输出为shape (32,64,64)
 56 |         self.model.add(MaxPooling2D(pool_size=2,strides=2,padding='same',data_format='channels_first'))
 57 |         #dropout 防止过拟合
 58 |         self.model.add(Dropout(0.25))
 59 | 
 60 |         #进行一层卷积 输出为shape (64,32,32)
 61 |         self.model.add(Convolution2D(64, 5, strides=1, padding='same', data_format='channels_first'))
 62 |         # 使用relu激励函数
 63 |         self.model.add(Activation('relu'))
 64 |         # 池化，输出为原来的一半 shape (64,32,32)
 65 |         self.model.add(MaxPooling2D(2, 2, 'same', data_format='channels_first'))
 66 |         # dropout 防止过拟合
 67 |         self.model.add(Dropout(0.25))
 68 | 
 69 |         #全连接层
 70 |         self.model.add(Flatten())
 71 |         self.model.add(Dense(512))
 72 |         self.model.add(Activation('relu'))
 73 |         self.model.add(Dropout(0.5))
 74 |         self.model.add(Dense(dataset.nb_classes))
 75 |         self.model.add(Activation('softmax'))
 76 | 
 77 |         self.model.summary()
 78 | 
 79 |     def train(self,dataset):
 80 |         adam = Adam(lr=1e-4)
 81 |         self.model.compile(optimizer=adam,
 82 |                       loss='categorical_crossentropy',
 83 |                       metrics=['accuracy'])
 84 | 
 85 |         # epochs 循环次数  batch_size 批处理大小
 86 |         self.model.fit(dataset.X_train, dataset.Y_train, epochs=25, batch_size=32, )
 87 | 
 88 |     def save(self, file_path=FILE_PATH):
 89 |         print('Model 保存.')
 90 |         self.model.save(file_path)
 91 | 
 92 |     def load(self, file_path=FILE_PATH):
 93 |         print('Model 读取.')
 94 |         self.model = load_model(file_path)
 95 | 
 96 |     #预测
 97 |     def predict(self,img):
 98 |         img = img.reshape((1, 1, self.IMAGE_SIZE, self.IMAGE_SIZE))
 99 |         img = img.astype('float32')
100 |         img = img/255.0
101 | 
102 |         result = self.model.predict_proba(img)  #预测图像结果
103 |         max_index = np.argmax(result)   #取平局值最大
104 |         print("begin")
105 |         print(result)
106 |         print(max_index)
107 |         print(result[0][max_index])
108 |         print("end")
109 |         return max_index,result[0][max_index]  #第一个参数为概率最高的label的index,第二个参数为对应概率
110 | 
111 |     def evaluate(self, dataset):
112 |         loss,score = self.model.evaluate(dataset.X_test, dataset.Y_test, verbose=0)
113 |         # print("%s: %.2f%%" % (self.model.metrics_names[1], score[1] * 100))
114 |         print('\ntest loss: ', loss)
115 |         print('\ntest accuracy: ', score)
116 | 
117 | if __name__ == '__main__':
118 |     dataset = DataSet()
119 |     dataset.extract_data('gender_image')
120 | 
121 |     model = Model()
122 |     model.build_model(dataset)
123 |     model.train(dataset)
124 |     model.save()
125 | 
126 |     model = Model()
127 |     model.load()
128 |     model.evaluate(dataset)
129 | 


--------------------------------------------------------------------------------