├── FaceFinder.py
├── README.md
├── demo.jpg
├── demo.py
├── demo_result.png
├── face_ds.py
├── face_model
└── tfac.py

/FaceFinder.py:
--------------------------------------------------------------------------------
"""
Module for face detection.
Trains a small convolutional neural network for binary classification of an image as a face/non-face.
Uses a simple sliding window approach with variable-sized windows to localize faces.
See demo.py for usage with the pre-trained model "face_model".

Author: Prithvijit Chakrabarty (prithvichakra@gmail.com)
"""

import cv2
import tensorflow as tf
from tensorflow import nn
import tfac
import face_ds
import numpy as np

#Localization parameters
DET_SIZE = (300,300)  #Run all localization at a standard size
BLUR_DIM = (50,50)    #Dimension for blurring the face location mask
CONF_THRESH = 0.99    #Confidence threshold to mark a window as a face

X_STEP = 10      #Horizontal stride for the sliding window
Y_STEP = 10      #Vertical stride for the sliding window
WIN_MIN = 40     #Minimum sliding window size
WIN_MAX = 100    #Maximum sliding window size (exclusive)
WIN_STRIDE = 10  #Step by which the sliding window size grows

#Build the net in the session
def build_net(sess):
    in_len = 32
    in_dep = 1

    x_hold = tf.placeholder(tf.float32,shape=[None,in_dep*in_len*in_len])
    y_hold = tf.placeholder(tf.float32,shape=[None,2])
    keep_prob = tf.placeholder(tf.float32)

    xt = tf.reshape(x_hold,[-1,in_len,in_len,in_dep])

    #Layer 1 - 5x5 convolution, stride 2: 32x32 -> 14x14, 4 feature maps
    w1 = tfac.weight([5,5,in_dep,4])
    b1 = tfac.bias([4])
    c1 = nn.relu(nn.conv2d(xt,w1,strides=[1,2,2,1],padding='VALID')+b1)
    o1 = c1

    #Layer 2 - 3x3 convolution, stride 2: 14x14 -> 6x6, 16 feature maps
    w2 = tfac.weight([3,3,4,16])
    b2 = tfac.bias([16])
    c2 = nn.relu(nn.conv2d(o1,w2,strides=[1,2,2,1],padding='VALID')+b2)
    o2 = c2

    #Layer 3 - 3x3 convolution, stride 1: 6x6 -> 4x4, 32 feature maps
    w3 = tfac.weight([3,3,16,32])
    b3 = tfac.bias([32])
    c3 = nn.relu(nn.conv2d(o2,w3,strides=[1,1,1,1],padding='VALID')+b3)
    o3 = c3

    #Layer 3 outputs 32 feature maps of size 4x4
    dim = 32 * 4*4

    #Fully connected layer - 600 units
    of = tf.reshape(o3,[-1,dim])
    w4 = tfac.weight([dim,600])
    b4 = tfac.bias([600])
    o4 = nn.relu(tf.matmul(of,w4)+b4)

    o4 = nn.dropout(o4, keep_prob)

    #Output softmax layer - 2 units
    w5 = tfac.weight([600,2])
    b5 = tfac.bias([2])
    y = nn.softmax(tf.matmul(o4,w5)+b5)

    sess.run(tf.initialize_all_variables())

    return y,x_hold,y_hold,keep_prob

#Method to run the training
def train_net(model_path):
    train,val,test = face_ds.load_find_ds()
    sess = tfac.start_sess()
    y,x_hold,y_hold,keep_prob = build_net(sess)
    acc = tfac.train(sess,
                     y,
                     x_hold,
                     y_hold,
                     keep_prob,
                     train[0],train[1],
                     test[0],test[1],
                     lrate=1e-4,
                     epsilon=1e-16,
                     n_epoch=8,
                     batch_size=100,
                     print_epoch=1,
                     save_path=model_path)
    print "Accuracy:",acc
    sess.close()

#Basic sliding window detector to find faces
#Returns an image showing only the faces along with the sliding window mask (before blurring)
def localize(img,model_path):
    sess = tfac.start_sess()
    y,x_hold,y_hold,keep_prob = build_net(sess)
    saver = tf.train.Saver()
    saver.restore(sess,model_path)

    #Run all detection at a fixed size
    img = cv2.resize(img,DET_SIZE)
    mask = np.zeros(img.shape)
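    #Voting scheme used by the loops below: every window whose crop the net
    #classifies as a face with P(face) >= CONF_THRESH increments the mask
    #over that region, so pixels covered by many confident windows
    #accumulate the most votes.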
    #Run sliding windows of different sizes
    for bx in range(WIN_MIN,WIN_MAX,WIN_STRIDE):
        by = bx
        #i indexes rows (y axis), j indexes columns (x axis)
        for i in xrange(0, img.shape[0]-by, Y_STEP):
            for j in xrange(0, img.shape[1]-bx, X_STEP):
                sub_img = cv2.resize(img[i:i+by,j:j+bx],face_ds.IN_SIZE)
                X = sub_img.reshape((1,tfac.dim_prod(face_ds.IN_SIZE)))
                out = y.eval(session=sess,feed_dict={x_hold:X,keep_prob:1})[0]
                if out[0] >= CONF_THRESH:
                    mask[i:i+by,j:j+bx] = mask[i:i+by,j:j+bx]+1

    sess.close()
    mask = np.uint8(255*mask/max(np.max(mask),1)) #Guard against an all-zero mask
    faces = img*(cv2.threshold(cv2.blur(mask,BLUR_DIM),0,255,cv2.THRESH_OTSU)[1]/255)
    return (faces,mask)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FaceDetect

This is a module for face detection with convolutional neural networks (CNNs). It uses a small CNN as a binary classifier to distinguish between faces and non-faces. A simple sliding window (with multiple windows of varying size) is used to localize the faces in the image.

**Requirements**

1. TensorFlow
2. OpenCV for Python

**Network topology**

The network consists of 3 convolution layers followed by a fully connected layer and a softmax output:

    Input: 32x32 black and white image
    1. Layer 1: 5x5 convolutions, 4 feature maps
    2. Layer 2: 3x3 convolutions, 16 feature maps
    3. Layer 3: 3x3 convolutions, 32 feature maps
       (Layer 3 outputs 32 4x4 feature maps)
    4. Layer 4: Fully connected layer, 600 units
    5. Layer 5: Softmax layer, 2 units

**Training parameters**

The network was trained with TensorFlow's AdamOptimizer:

    lrate: 1e-4
    epsilon: 1e-16
    mini-batch size: 100
    number of epochs: 8

The validation accuracy was 98.762% and the final accuracy on the test set was 98.554%.

**Dataset**

Positive samples (images of faces) for the classification were taken from 2 sources:

1. Cropped Labeled Faces in the Wild (http://conradsanderson.id.au/lfwcrop/)
2. MIT CBCL face recognition database (http://cbcl.mit.edu/software-datasets/heisele/facerecognition-database.html)

The horizontal mirror images of these images were included in the dataset.

Negative samples (non-faces) were taken from 4 sources:

1. Fifteen scene categories (http://www-cvr.ai.uiuc.edu/ponce_grp/data/)
2. Texture database (http://www-cvr.ai.uiuc.edu/ponce_grp/data/)
3. Caltech cars (Rear) background dataset (http://www.robots.ox.ac.uk/~vgg/data3.html)
4. Caltech houses dataset (http://www.robots.ox.ac.uk/~vgg/data3.html)

Random snapshots were generated from these images by taking sub-images of random lengths at random positions, and were mixed into the dataset along with the complete images.

The final dataset consists of 32,000 images from each class (positive and negative). Training, validation and test sets were generated from this with a 0.6 split for training, 0.2 for validation and 0.2 for test. Each of these sets has 50% positive and 50% negative samples.

**Localization**

The module uses a simple sliding window localizer. The input image is resized to (300,300). Square windows of side lengths 40, 50, ..., 90 are slid along the image. Each sub-image seen through the window is resized to (32,32) and fed to the network. If the sub-image is classified as a face with a minimum confidence of 0.99, the window is marked in the mask. After running all the different sized windows on the image, the final mask is blurred with a 50x50 mean filter and binarized with Otsu's threshold. This final binarized mask is used to extract only the faces from the image. The localizer returns two images: an image with only the faces and the raw mask (before blurring and binarization).
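For a rough sense of the cost of this scheme, the sketch below (not part of the module; it just mirrors the loop bounds in FaceFinder.py) counts how many windows, and hence forward passes through the network, a single image generates with the default parameters:

```python
# Count sliding-window positions for DET_SIZE=(300,300),
# window sides 40..90 and a step of 10 along each axis.
total = 0
for side in range(40, 100, 10):
    steps = len(range(0, 300 - side, 10))  # positions along one axis
    total += steps * steps
print total  # 3331 windows (forward passes) per image
```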
**Demo**

The repo includes a pre-trained model: face_model. This can be used directly for localization. Sample usage of this model with FaceFinder.py can be seen in demo.py. Running the demo should display the result of running the localizer on demo.jpg.
Demos with other images can be seen here: https://youtu.be/N4GIGVnyNBo

Output of demo.py:
![Alt text](demo_result.png?raw=true)
--------------------------------------------------------------------------------
/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PCJohn/FaceDetect/d4b624b95244b32e1241a4f3532fd8e386381519/demo.jpg
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
"""
Demo for face detection. Runs the sliding window detector on demo.jpg.
Displays the input image, the localized faces and the sliding window mask.

Author: Prithvijit Chakrabarty (prithvichakra@gmail.com)
"""

import FaceFinder
import cv2

model_path = 'face_model'
img = cv2.imread("demo.jpg",0)
faces,mask = FaceFinder.localize(img,model_path)
cv2.imshow("faces",faces)
cv2.imshow("sliding window mask",mask)
cv2.imshow("input image",img)
cv2.waitKey(0)
cv2.destroyAllWindows()
--------------------------------------------------------------------------------
/demo_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PCJohn/FaceDetect/d4b624b95244b32e1241a4f3532fd8e386381519/demo_result.png
--------------------------------------------------------------------------------
/face_ds.py:
--------------------------------------------------------------------------------
from __future__ import division
import os
import numpy as np
import cv2
import random

FACE_PATH = "/home/prithvi/dsets/Faces/positive/"
NON_FACE_PATH = "/home/prithvi/dsets/Faces/negative/"
IN_SIZE = (32,32)  #Input dimensions of image for the network
SNAP_COUNT = 5     #Number of random snapshots per non-face image
MIN_LEN = 10       #Minimum length for the random snapshots of non-faces
GOOD = [1,0]       #Vector output for faces
BAD = [0,1]        #Vector output for non-faces

FACE_COUNT = 36000 #Number of images of each class (positive and negative) in the dataset
TRAIN_SPLIT = int(0.6*FACE_COUNT)
VAL_SPLIT = int(0.2*FACE_COUNT) + TRAIN_SPLIT
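#With FACE_COUNT = 36000 (the constant above) these splits give, per class:
#  train: 0.6*36000 = 21600 images (indices 0..21600)
#  val:   0.2*36000 =  7200 images (indices 21600..28800)
#  test:  remaining    7200 images (indices 28800..36000)
#Each set is then doubled by adding the same number of samples of the other class.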

#Method to generate multiple snapshots from an image
def rand_snap(img):
    r = []
    x = img.shape[0]
    y = img.shape[1]
    #Generate SNAP_COUNT snapshots of different sizes
    for i in range(SNAP_COUNT):
        snap_size = max([MIN_LEN,int(random.random()*200)])
        snap_size = min([snap_size,x,y]) #Keep the snapshot inside the image
        fx = int(random.random()*(x-snap_size))
        fy = int(random.random()*(y-snap_size))
        snap = img[fx:fx+snap_size,fy:fy+snap_size]
        r.append(cv2.resize(snap,IN_SIZE))
    return r

#Load the dataset for face/non-face classification
def load_find_ds():
    ds = []
    #Load faces (positive samples)
    for n in os.listdir(FACE_PATH):
        name = FACE_PATH+n
        for img_path in os.listdir(name):
            t_img = cv2.resize(cv2.imread(name+"/"+img_path,0),IN_SIZE)
            ds.append((t_img,GOOD))
            ds.append((cv2.flip(t_img,1),GOOD)) #Use the horizontal mirror image
    random.shuffle(ds)
    ds = ds[:FACE_COUNT]
    #Load non-faces (negative samples) from dataset
    nface_ds = []
    for n in os.listdir(NON_FACE_PATH):
        name = NON_FACE_PATH+n
        for img_path in os.listdir(name):
            t_img = cv2.imread(name+"/"+img_path,0)
            nface_ds.extend([(r,BAD) for r in rand_snap(t_img)])
            nface_ds.append((cv2.resize(t_img,IN_SIZE),BAD))
    random.shuffle(nface_ds)
    nface_ds = nface_ds[:FACE_COUNT]

    #Make the train, val and test sets: ensure 50% of each class in every set
    train = ds[:TRAIN_SPLIT]
    train.extend(nface_ds[:TRAIN_SPLIT])
    random.shuffle(train)
    val = ds[TRAIN_SPLIT:VAL_SPLIT]
    val.extend(nface_ds[TRAIN_SPLIT:VAL_SPLIT])
    random.shuffle(val)
    test = ds[VAL_SPLIT:] #Start after VAL_SPLIT so the test set does not overlap the validation set
    test.extend(nface_ds[VAL_SPLIT:])
    random.shuffle(test)

    trainX,trainY = map(np.array,zip(*train))
    valX,valY = map(np.array,zip(*val))
    testX,testY = map(np.array,zip(*test))

    return ((trainX,trainY),(valX,valY),(testX,testY))
--------------------------------------------------------------------------------
/face_model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PCJohn/FaceDetect/d4b624b95244b32e1241a4f3532fd8e386381519/face_model
--------------------------------------------------------------------------------
/tfac.py:
--------------------------------------------------------------------------------
"""
Accessory methods for using TensorFlow -- mostly taken from the TensorFlow tutorials!

Author: Prithvijit Chakrabarty (prithvichakra@gmail.com)
"""

import tensorflow as tf
import numpy as np
import random

#Make weight and bias variables -- from the TensorFlow tutorial
def weight(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

#Product of the entries of a dimension tuple, giving the total flattened length
def dim_prod(dim_arr):
    return np.prod([d for d in dim_arr if d is not None])

#Start a TensorFlow session
def start_sess():
    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC'
    sess = tf.Session(config=config)
    return sess

#Train the model
def train(sess, y, x_hold, y_hold, keep_prob, X, Y, valX, valY, lrate=0.5, epsilon=1e-8, n_epoch=100, batch_size=10, print_epoch=100, save_path=None):
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_hold*tf.log(y+1e-10), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(learning_rate=lrate,epsilon=epsilon).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(y_hold,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    #Flatten the input images for the placeholder
    flat_len = dim_prod(x_hold.get_shape().as_list())
    X = X.reshape((X.shape[0],flat_len))

    print 'Starting training session...'
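    #Note on the loop below: batch accuracy is measured (with keep_prob=1)
    #*before* the gradient step on that batch, so the printed per-epoch
    #average slightly lags the post-update accuracy and epoch 0 starts near
    #chance. Dropout (keep_prob=0.5) is applied only during the update.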

    sess.run(tf.initialize_all_variables())
    batches = batchify(X,Y,batch_size)
    print 'Number of batches:',len(batches)
    val_accuracy = None #Stays None if no validation set is given
    for i in range(n_epoch):
        avg_acc = 0
        random.shuffle(batches)
        for batchX,batchY in batches:
            avg_acc = avg_acc + accuracy.eval(session=sess, feed_dict={x_hold:batchX, y_hold:batchY, keep_prob:1})
            train_step.run(session=sess,feed_dict={x_hold:batchX, y_hold:batchY, keep_prob:0.5})
        print 'Epoch '+str(i)+': '+str(avg_acc/len(batches))
        if (valX is not None) and (valY is not None):
            #Validation
            valX = valX.reshape((valX.shape[0],flat_len))
            val_accuracy = accuracy.eval(session=sess,feed_dict={x_hold:valX, y_hold:valY, keep_prob:1})
            print 'Val acc:',val_accuracy

    if save_path is not None:
        saver = tf.train.Saver(tf.all_variables())
        saver.save(sess,save_path)
        writer = tf.train.SummaryWriter(save_path+'_graph',sess.graph)
        writer.flush()
        writer.close()
        print 'Model saved'
    return val_accuracy

#Test a saved model: y, x_hold, y_hold and keep_prob come from the network builder, as in train()
def test(sess, y, x_hold, y_hold, keep_prob, X, Y, model_path):
    correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_hold,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    saver.restore(sess,model_path)
    X = X.reshape((X.shape[0],X.shape[1]*X.shape[2]))
    test_accuracy = accuracy.eval(session=sess,feed_dict={x_hold:X,y_hold:Y,keep_prob:1})
    return test_accuracy

#Split the data into mini-batches
def batchify(X, Y, batch_size):
    batches = [(X[i:i+batch_size],Y[i:i+batch_size]) for i in xrange(0,X.shape[0],batch_size)]
    random.shuffle(batches)
    return batches
--------------------------------------------------------------------------------