├── requirements.in ├── reference.pdf ├── facenet_code ├── face.py ├── encoder.py ├── detection.py ├── align_dataset_mtcnn.py ├── facenet.py └── detect_face.py ├── requirements.txt ├── detect_blur.py └── README.md /requirements.in: -------------------------------------------------------------------------------- 1 | # requirements.in 2 | 3 | imutils 4 | opencv-python 5 | tensorflow 6 | numpy 7 | scipy 8 | -------------------------------------------------------------------------------- /reference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/giovanadelucca/Blurry-Faces-Detection-in-Videos/HEAD/reference.pdf -------------------------------------------------------------------------------- /facenet_code/face.py: -------------------------------------------------------------------------------- 1 | class Face: 2 | def __init__(self): 3 | self.name = None 4 | self.bounding_box = None 5 | self.image = None 6 | self.container_image = None 7 | self.embedding = None 8 | self.confidence = None 9 | self.class_probabilities = None -------------------------------------------------------------------------------- /facenet_code/encoder.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | 4 | from facenet_code import facenet 5 | 6 | tf.logging.set_verbosity(tf.logging.ERROR) 7 | # tf.logging.set_verbosity(tf.logging.INFO) 8 | # tf.logging.set_verbosity(tf.logging.WARN) 9 | # tf.logging.set_verbosity(tf.logging.DEBUG) 10 | # tf.logging.set_verbosity(tf.logging.FATAL) 11 | 12 | 13 | PATH_ENCODE_EMBEDDED = "facenet_code/weights/20180402-114759.pb" 14 | 15 | 16 | class Encoder: 17 | def __init__(self): 18 | self.sess = tf.Session() 19 | with self.sess.as_default(): 20 | facenet.load_model(PATH_ENCODE_EMBEDDED) 21 | 22 | def generate_embedding(self, face): 23 | # Get input and output tensors 24 | images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0") 25 | embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0") 26 | phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0") 27 | 28 | prewhiten_face = facenet.prewhiten(face.image) 29 | 30 | # Run forward pass to calculate embeddings 31 | feed_dict = {images_placeholder: [prewhiten_face], phase_train_placeholder: False} 32 | return self.sess.run(embeddings, feed_dict=feed_dict)[0] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile 6 | # 7 | absl-py==0.9.0 # via tensorboard, tensorflow 8 | astor==0.8.1 # via tensorflow 9 | gast==0.2.2 # via tensorflow 10 | google-pasta==0.1.8 # via tensorflow 11 | grpcio==1.27.2 # via tensorboard, tensorflow 12 | h5py==2.10.0 # via keras-applications 13 | imutils==0.5.3 # via -r requirements.in 14 | keras-applications==1.0.8 # via tensorflow 15 | keras-preprocessing==1.1.0 # via tensorflow 16 | markdown==3.2.1 # via tensorboard 17 | numpy==1.18.1 # via -r requirements.in, h5py, keras-applications, keras-preprocessing, opencv-python, opt-einsum, scipy, tensorboard, tensorflow 18 | opencv-python==4.2.0.32 # via -r requirements.in 19 | opt-einsum==3.2.0 # via tensorflow 20 | protobuf==3.11.3 # via tensorboard, tensorflow 21 | scipy==1.4.1 # via -r requirements.in 22 | six==1.14.0 # via absl-py, 
google-pasta, grpcio, h5py, keras-preprocessing, protobuf, tensorboard, tensorflow 23 | tensorboard==1.15.2 # via tensorflow 24 | tensorflow-estimator==1.15.2 # via tensorflow 25 | tensorflow==1.15.2 # via -r requirements.in 26 | termcolor==1.1.0 # via tensorflow 27 | werkzeug==1.0.0 # via tensorboard 28 | wheel==0.34.2 # via tensorboard, tensorflow 29 | wrapt==1.12.0 # via tensorflow 30 | 31 | # The following packages are considered to be unsafe in a requirements file: 32 | # setuptools 33 | -------------------------------------------------------------------------------- /facenet_code/detection.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | import os 5 | 6 | from facenet_code.face import Face 7 | from facenet_code import detect_face 8 | 9 | tf.logging.set_verbosity(tf.logging.ERROR) 10 | # tf.logging.set_verbosity(tf.logging.INFO) 11 | # tf.logging.set_verbosity(tf.logging.WARN) 12 | # tf.logging.set_verbosity(tf.logging.DEBUG) 13 | # tf.logging.set_verbosity(tf.logging.FATAL) 14 | 15 | gpu_memory_fraction = 0.3 16 | 17 | class Detection: 18 | # face detection parameters 19 | minsize = 20 # minimum size of face 20 | threshold = [0.6, 0.7, 0.7] # three steps's threshold 21 | factor = 0.709 # scale factor 22 | 23 | def __init__(self, face_crop_size=160, face_crop_margin=32): 24 | self.pnet, self.rnet, self.onet = self._setup_mtcnn() 25 | self.face_crop_size = face_crop_size 26 | self.face_crop_margin = face_crop_margin 27 | 28 | def _setup_mtcnn(self): 29 | with tf.Graph().as_default(): 30 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_memory_fraction) 31 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)) 32 | with sess.as_default(): 33 | return detect_face.create_mtcnn(sess, None) 34 | 35 | def find_faces(self, image): 36 | faces = [] 37 | 38 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 39 | 40 | bounding_boxes, _ = detect_face.detect_face(image, self.minsize, 41 | self.pnet, self.rnet, self.onet, 42 | self.threshold, self.factor) 43 | for bb in bounding_boxes: 44 | face = Face() 45 | face.container_image = image 46 | face.bounding_box = np.zeros(4, dtype=np.int32) 47 | 48 | img_size = np.asarray(image.shape)[0:2] 49 | face.bounding_box[0] = np.maximum(bb[0] - self.face_crop_margin / 2, 0) 50 | face.bounding_box[1] = np.maximum(bb[1] - self.face_crop_margin / 2, 0) 51 | face.bounding_box[2] = np.minimum(bb[2] + self.face_crop_margin / 2, img_size[1]) 52 | face.bounding_box[3] = np.minimum(bb[3] + self.face_crop_margin / 2, img_size[0]) 53 | cropped = image[face.bounding_box[1]:face.bounding_box[3], face.bounding_box[0]:face.bounding_box[2], :] 54 | face.image = cv2.resize(cropped, (self.face_crop_size, self.face_crop_size), interpolation=cv2.INTER_LINEAR) 55 | # face.image = misc.imresize(cropped, (self.face_crop_size, self.face_crop_size), interp='bilinear') 56 | face.confidence = bb[4] 57 | faces.append(face) 58 | 59 | return faces -------------------------------------------------------------------------------- /detect_blur.py: -------------------------------------------------------------------------------- 1 | from facenet_code.detection import Detection 2 | from facenet_code.encoder import Encoder 3 | from scipy.linalg import svd 4 | from imutils import paths 5 | import numpy as np 6 | import argparse 7 | import cv2 8 | import os 9 | 10 | class DetectBlur(object): 11 | def __init__(self, video, threshold=0.8): 12 
| self.video = video 13 | self.threshold = threshold 14 | print(self.threshold) 15 | self.video_frames = [] 16 | 17 | self.detect = Detection() 18 | 19 | self.process() 20 | 21 | def process(self): 22 | self.create_output_folder() 23 | self.get_video_frames() 24 | self.detect_blur() 25 | 26 | def create_output_folder(self): 27 | if not os.path.isdir('output'): 28 | os.mkdir('output') 29 | video_name = self.video.split('.')[0] 30 | if not os.path.isdir('output/'+video_name): 31 | os.mkdir('output/'+video_name) 32 | if not os.path.isdir('output/'+video_name+'/'+'frames'): 33 | os.mkdir('output/'+video_name+'/'+'frames') 34 | 35 | def get_blur_degree(self, img, sv_num=10): 36 | gray_img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) 37 | u, s, v = np.linalg.svd(gray_img) 38 | top_sv = np.sum(s[0:sv_num]) 39 | total_sv = np.sum(s) 40 | return top_sv/total_sv 41 | 42 | # def get_blur_map(self, img, win_size=10, sv_num=3): 43 | # gray_img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) 44 | # new_img = np.zeros((gray_img.shape[0]+win_size*2, gray_img.shape[1]+win_size*2)) 45 | # for i in range(new_img.shape[0]): 46 | # for j in range(new_img.shape[1]): 47 | # if i<win_size: 48 | # p = win_size-i 49 | # elif i>gray_img.shape[0]+win_size-1: 50 | # p = gray_img.shape[0]*2-i 51 | # else: 52 | # p = i-win_size 53 | # if j<win_size: 54 | # q = win_size-j 55 | # elif j>gray_img.shape[1]+win_size-1: 56 | # q = gray_img.shape[1]*2-j 57 | # else: 58 | # q = j-win_size 59 | # new_img[i,j] = img[p,q] 60 | # blur_map = np.zeros((gray_img.shape[0], gray_img.shape[1])) 61 | # max_sv = 0 62 | # min_sv = 1 63 | # for i in range(gray_img.shape[0]): 64 | # for j in range(gray_img.shape[1]): 65 | # block = new_img[i:i+win_size*2, j:j+win_size*2] 66 | # u, s, v = np.linalg.svd(block) 67 | # top_sv = np.sum(s[0:sv_num]) 68 | # total_sv = np.sum(s) 69 | # sv_degree = top_sv/total_sv 70 | # if max_sv < sv_degree: 71 | # max_sv = sv_degree 72 | # if min_sv > sv_degree: 73 | # min_sv = sv_degree 74 | # blur_map[i, j] = sv_degree 75 | # blur_map = (blur_map-min_sv)/(max_sv-min_sv) 76 | # return blur_map 77 | 78 | def get_video_frames(self): 79 | vidcap = cv2.VideoCapture(self.video) 80 | success, image = vidcap.read() 81 | count = 0 82 | while success: 83 | self.video_frames.append(image) 84 | success, image = vidcap.read() 85 | 86 | def print_box(self, frame, name, blur_degree, face_bb, color): 87 | left, top, right, bottom = face_bb 88 | width = right - left 89 | height = bottom - top 90 | 91 | if height > width: 92 | tam = int(height/4) 93 | else: 94 | tam = int(width/4) 95 | 96 | cv2.putText(frame, name, (right + 15, top + 30), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2) 97 | cv2.putText(frame, blur_degree, (right + 15, top + 60), cv2.FONT_HERSHEY_SIMPLEX, 0.75, color, 2) 98 | 99 | cv2.rectangle(frame, (face_bb[0], face_bb[1]), (face_bb[2], face_bb[3]), color, 1) 100 | 101 | cv2.line(frame, (left, top), (left+tam, top), color, 3) 102 | cv2.line(frame, (left, top), (left, top+tam), color, 3) 103 | 104 | cv2.line(frame, (left, bottom), (left, bottom-tam), color, 3) 105 | cv2.line(frame, (left, bottom), (left+tam, bottom), color, 3) 106 | 107 | cv2.line(frame, (right, top), (right-tam, top), color, 3) 108 | cv2.line(frame, (right, top), (right, top+tam), color, 3) 109 | 110 | cv2.line(frame, (right, bottom), (right-tam, bottom), color, 3) 111 | cv2.line(frame, (right, bottom), (right, bottom-tam), color, 3) 112 | 113 | def detect_blur(self): 114 | output_video = None 115 | if output_video is None: 116 | video_name = self.video.split('.')[0] 117 | size = (self.video_frames[0].shape[1], self.video_frames[0].shape[0]) 118 | fourcc =
cv2.VideoWriter_fourcc(*'MJPG') 119 | output_video = cv2.VideoWriter('output/'+video_name+'/'+video_name+'.avi',fourcc, 5, size, True) 120 | for i, frame in enumerate(self.video_frames): 121 | print('[INFO] detecting blur in image '+str(i+1)+'/'+str(len(self.video_frames))) 122 | faces = self.detect.find_faces(frame) 123 | if len(faces) > 0: 124 | for face in faces: 125 | if face.confidence > 0.9: 126 | text = "Not Blurry" 127 | boxes = face.bounding_box.astype(int) 128 | left, top, right, bottom = boxes 129 | face_image = frame[top:bottom, left:right] 130 | blur_degree = self.get_blur_degree(face_image) 131 | if blur_degree > self.threshold: 132 | text = "Blurry" 133 | self.print_box(frame, text, "{:.2f}".format(blur_degree), boxes, (255,255,255)) 134 | if output_video is not None: 135 | output_video.write(frame) 136 | cv2.imwrite('output/'+video_name+'/'+'frames/frame_'+str(i+1)+'.jpg', frame) 137 | if output_video is not None: 138 | output_video.release() 139 | 140 | 141 | if __name__ == "__main__": 142 | ap = argparse.ArgumentParser() 143 | ap.add_argument('video', type=str, help='the video input to detect blurry faces') 144 | ap.add_argument('--threshold', default=0.8, type=float, help='the threshold of blur degree to classify if some face is blurry or not') 145 | args = vars(ap.parse_args()) 146 | 147 | DetectBlur(video=args['video'], threshold=args['threshold']) 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blurry Faces Detection in Videos 2 | 3 | Many digital images contain blurred regions caused by incorrect focus, object motion, hand shaking and so on. In many cases, automatic detection of blurred image regions is useful for understanding image content, and it can be applied in several multimedia analysis applications such as image segmentation, depth recovery, image retrieval and face recognition. For the machine learning task known as face recognition, blur detection is important to avoid wrong predictions caused by people in motion. The main objective of this experiment is to detect blur on face pictures in order to improve the results of the face recognition process. The experiment is based on the paper entitled "Blurred Image Region Detection and Classification", which can be found in the `reference.pdf` file. 4 | 5 | # 6 | 7 | ## FaceNet Project 8 | 9 | The results of this experiment were achieved using a face recognition project named FaceNet. This project was implemented by David Sandberg and is available on his GitHub account in the [facenet](https://github.com/davidsandberg/facenet) repository. The code is open source under the MIT [license](https://github.com/davidsandberg/facenet/blob/master/LICENSE.md) and was developed in Python, using the TensorFlow library for the machine learning process and the multiplatform OpenCV library for image processing. 10 | 11 | David Sandberg states in the repository documentation that the code was heavily inspired by the [OpenFace](https://github.com/cmusatyalab/openface) implementation and uses ideas from the paper ["Deep Face Recognition"](http://www.robots.ox.ac.uk/~vgg/publications/2015/Parkhi15/parkhi15.pdf) from the [Visual Geometry Group](http://www.robots.ox.ac.uk/~vgg/) at Oxford. The FaceNet implementation was tested with TensorFlow r1.7 under Ubuntu 14.04 with Python 2.7 and Python 3.5. The test cases and their results can be found in the repository, as reported in the documentation.
12 | 13 | Besides the tests, two pre-trained models are available in the repository for download. The model named [20180408-102900](https://drive.google.com/file/d/1R77HmFADxe87GmoLwzfgMu_HY0IhcyBz/view) achieved an accuracy of 0.9905, trained on the CASIA-WebFace dataset with the [Inception ResNet v1](https://github.com/davidsandberg/facenet/blob/master/src/models/inception_resnet_v1.py) architecture. The other model, named [20180402-114759](https://drive.google.com/file/d/1EXPBSXwTaqrSC0OhUdXNmKSh9qJUQ55-/view), achieved a slightly better accuracy of 0.9965 using the VGGFace2 training dataset and the same architecture. Further information, such as updates, details of the training data, pre-processing and performance, is available in the FaceNet repository. 14 | 15 | # 16 | 17 | ## Face Detection Process 18 | 19 | The FaceNet project, detailed in the previous section, was used in this experiment for face detection. As mentioned before, the project provides two pre-trained models capable of detecting faces in a picture. The picture is given as input and the implementation detects where the faces are located in it. The result is a vector of faces, where each face is represented by another vector of five numbers: the first four are the bounding-box coordinates, in pixels, of the detected face and the fifth is the confidence of the detection result. The FaceNet code used here is available in the `facenet_code` folder. 20 | 21 | # 22 | 23 | ## Blur Detection Process 24 | 25 | Singular Value Decomposition (SVD) is one of the most useful techniques in linear algebra and has been applied to many areas of computer science. The blur detection process uses the SVD factorization to calculate a blur degree and, based on a stipulated threshold, classifies a picture as "Blurred" or "Not blurred". In general, blurred picture regions have a higher blur degree than clear regions with no blur. Based on tests over different images, the reference paper suggests a threshold of 0.75, with which an accuracy of 88.78% is achieved. The step-by-step calculation of the blur degree is described in detail in the paper. An implementation following the paper can be found in the [blur_detection](https://github.com/fled/blur_detection) repository, on the GitHub account of one of the authors. A minimal sketch combining face detection and the blur degree calculation is shown in the appendix at the end of this README. 26 | 27 | # 28 | 29 | ## System Requirements 30 | 31 | ### `Warning: The documentation assumes Ubuntu 18+ as the operating system, but by following it you can check all the requirements and project dependencies needed to reproduce the configuration on other operating systems.` 32 | 33 | - **python3-venv** 34 | >$ sudo apt install python3-venv 35 | - **pip3** 36 | >$ sudo apt install python3-pip 37 | 38 | # 39 | 40 | ## Virtual Environment 41 | It's advisable to create a virtual environment to manage the project dependencies without library conflicts. To create, activate and deactivate a virtual environment, follow the instructions below. 42 | 43 | From the project root directory: 44 | 45 | - **Create** a new virtual environment: 46 | >$ python3 -m venv env 47 | - **Activate** a virtual environment: 48 | >$ source env/bin/activate 49 | - **Deactivate** a virtual environment: 50 | >$ deactivate 51 | 52 | # 53 | 54 | ## Project Dependencies 55 | Follow the instructions below to install all project dependencies in a virtual environment. Note that all required libraries are listed in `requirements.in`.
56 | 57 | From the project root directory: 58 | 59 | - **Create** a new virtual environment: 60 | >$ python3 -m venv env 61 | - **Activate** the virtual environment: 62 | >$ source env/bin/activate 63 | - Install **pip-tools**: 64 | >$ pip3 install pip-tools 65 | - **Compile** all the requirements: 66 | >$ pip-compile 67 | - **Synchronize** all the requirements: 68 | >$ pip-sync 69 | 70 | To learn more about **pip-tools**, please refer to its [documentation](https://pypi.org/project/pip-tools/). 71 | 72 | After running these instructions the `requirements.txt` file will be generated and all the dependencies will be installed. 73 | 74 | # 75 | 76 | ## Download Weights 77 | 78 | As mentioned before, two pre-trained models are available in the FaceNet repository for download. These models are the key part of the algorithm. The one used in this experiment is available at [weights](https://drive.google.com/drive/folders/1Thfg7WguOLfjZ3iAtdQqQNev6uoaXwsy?usp=sharing) and must be downloaded. It is essential to put the downloaded `weights` folder inside the `facenet_code` folder for the code to work. 79 | 80 | # 81 | 82 | ## Run Blurry Faces Detection Process 83 | 84 | The main file with the whole blurry faces detection implementation is `detect_blur.py`. Executing this file runs the process automatically. At most two parameters can be passed on the command line. The first is the video in which blurry faces will be detected; this parameter is required and must include the path and extension of the file. The second parameter is optional and refers to the blur degree threshold. The default threshold is 0.8; to change it, add the `--threshold` label before the float threshold value on the command line. 85 | 86 | Two examples of running the blur detection process: 87 | >$ python detect_blur.py video.mp4 88 | 89 | >$ python detect_blur.py video.mp4 --threshold 0.75 90 | 91 | All the results of a code execution will be available in the `output` folder. Inside it, another folder named after the input video is created to hold the respective results. One of the results is the same video with three annotations on each frame: the bounding box of each detected face, the blur degree value for that bounding box, and the classification, based on the defined threshold, of whether the face is blurry or not. Besides the video, each individual video frame with the same annotations will also be available. 92 | 93 | # 94 | 95 | ## License 96 | 97 | The FaceNet project is open source with an MIT [license](https://github.com/davidsandberg/facenet/blob/master/LICENSE.md). 98 | 99 | Regarding the code developed by the paper authors, everyone is permitted to copy and distribute verbatim copies of the [license document](https://github.com/fled/blur_detection/blob/master/LICENSE), but changing it is not allowed. 100 | 101 | Regarding this project, just consider the other two licenses. Use this information wisely. 102 | 103 | # 104 | 105 | ## Final Considerations 106 | 107 | Although this project basically merges two existing GitHub repositories, it was made specifically to detect blurry faces in videos. As mentioned before, this work is very useful for facial recognition projects and other multimedia analysis applications. The code is small and has all the documentation needed to adapt it to your own implementation. 108 | 109 | # 110 | 111 | ## Thank you for reading and enjoy it!
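#

## Appendix: Minimal Usage Sketch

The sketch below shows, at a glance, how the pieces described in the "Face Detection Process" and "Blur Detection Process" sections fit together: the MTCNN-based `Detection` class from `facenet_code` finds the faces, and the blur degree is the ratio of the top singular values of the grayscale face crop. It assumes the `weights` folder has already been placed inside `facenet_code` as described above; `some_frame.jpg` and the 0.8 threshold are just illustrative placeholders, and `detect_blur.py` remains the reference implementation for processing whole videos.

```python
import cv2
import numpy as np

from facenet_code.detection import Detection


def blur_degree(face_image, sv_num=10):
    """Ratio of the top singular values to their total (higher means blurrier)."""
    gray = cv2.cvtColor(face_image, cv2.COLOR_BGR2GRAY)
    singular_values = np.linalg.svd(gray, compute_uv=False)
    return np.sum(singular_values[:sv_num]) / np.sum(singular_values)


detector = Detection()                 # loads the MTCNN face detector
frame = cv2.imread('some_frame.jpg')   # any BGR image, e.g. a single video frame
for face in detector.find_faces(frame):
    if face.confidence > 0.9:          # same confidence cut-off as detect_blur.py
        left, top, right, bottom = face.bounding_box.astype(int)
        degree = blur_degree(frame[top:bottom, left:right])
        label = 'Blurry' if degree > 0.8 else 'Not Blurry'
        print('face ({}, {}, {}, {}): blur degree {:.2f} -> {}'.format(
            left, top, right, bottom, degree, label))
```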
112 | 113 | -------------------------------------------------------------------------------- /facenet_code/align_dataset_mtcnn.py: -------------------------------------------------------------------------------- 1 | """Performs face alignment and stores face thumbnails in the output directory.""" 2 | # MIT License 3 | # 4 | # Copyright (c) 2016 David Sandberg 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | 24 | from __future__ import absolute_import 25 | from __future__ import division 26 | from __future__ import print_function 27 | 28 | from scipy import misc 29 | import sys 30 | import os 31 | import argparse 32 | import tensorflow as tf 33 | import numpy as np 34 | 35 | import random 36 | from time import sleep 37 | from facenet import facenet 38 | from facenet import detect_face 39 | 40 | tf.logging.set_verbosity(tf.logging.ERROR) 41 | # tf.logging.set_verbosity(tf.logging.INFO) 42 | # tf.logging.set_verbosity(tf.logging.WARN) 43 | # tf.logging.set_verbosity(tf.logging.DEBUG) 44 | # tf.logging.set_verbosity(tf.logging.FATAL) 45 | 46 | def main(args): 47 | sleep(random.random()) 48 | output_dir = os.path.expanduser(args.output_dir) 49 | if not os.path.exists(output_dir): 50 | os.makedirs(output_dir) 51 | # Store some git revision info in a text file in the log directory 52 | src_path,_ = os.path.split(os.path.realpath(__file__)) 53 | facenet.store_revision_info(src_path, output_dir, ' '.join(sys.argv)) 54 | dataset = facenet.get_dataset(args.input_dir) 55 | 56 | print('Creating networks and loading parameters') 57 | 58 | with tf.Graph().as_default(): 59 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction) 60 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)) 61 | with sess.as_default(): 62 | pnet, rnet, onet = detect_face.create_mtcnn(sess, None) 63 | 64 | minsize = 20 # minimum size of face 65 | threshold = [ 0.6, 0.7, 0.7 ] # three steps's threshold 66 | factor = 0.709 # scale factor 67 | 68 | # Add a random key to the filename to allow alignment using multiple processes 69 | random_key = np.random.randint(0, high=99999) 70 | bounding_boxes_filename = os.path.join(output_dir, 'bounding_boxes_%05d.txt' % random_key) 71 | 72 | with open(bounding_boxes_filename, "w") as text_file: 73 | nrof_images_total = 0 74 | nrof_successfully_aligned = 0 75 | if args.random_order: 76 | random.shuffle(dataset) 77 | for cls in dataset: 78 | 
output_class_dir = os.path.join(output_dir, cls.name) 79 | if not os.path.exists(output_class_dir): 80 | os.makedirs(output_class_dir) 81 | if args.random_order: 82 | random.shuffle(cls.image_paths) 83 | for image_path in cls.image_paths: 84 | nrof_images_total += 1 85 | filename = os.path.splitext(os.path.split(image_path)[1])[0] 86 | output_filename = os.path.join(output_class_dir, filename+'.png') 87 | print(image_path) 88 | if not os.path.exists(output_filename): 89 | try: 90 | img = misc.imread(image_path) 91 | except (IOError, ValueError, IndexError) as e: 92 | errorMessage = '{}: {}'.format(image_path, e) 93 | print(errorMessage) 94 | else: 95 | if img.ndim<2: 96 | print('Unable to align "%s"' % image_path) 97 | text_file.write('%s\n' % (output_filename)) 98 | continue 99 | if img.ndim == 2: 100 | img = facenet.to_rgb(img) 101 | img = img[:,:,0:3] 102 | 103 | bounding_boxes, _ = detect_face.detect_face(img, minsize, pnet, rnet, onet, threshold, factor) 104 | nrof_faces = bounding_boxes.shape[0] 105 | if nrof_faces>0: 106 | det = bounding_boxes[:,0:4] 107 | det_arr = [] 108 | img_size = np.asarray(img.shape)[0:2] 109 | if nrof_faces>1: 110 | if args.detect_multiple_faces: 111 | for i in range(nrof_faces): 112 | det_arr.append(np.squeeze(det[i])) 113 | else: 114 | bounding_box_size = (det[:,2]-det[:,0])*(det[:,3]-det[:,1]) 115 | img_center = img_size / 2 116 | offsets = np.vstack([ (det[:,0]+det[:,2])/2-img_center[1], (det[:,1]+det[:,3])/2-img_center[0] ]) 117 | offset_dist_squared = np.sum(np.power(offsets,2.0),0) 118 | index = np.argmax(bounding_box_size-offset_dist_squared*2.0) # some extra weight on the centering 119 | det_arr.append(det[index,:]) 120 | else: 121 | det_arr.append(np.squeeze(det)) 122 | 123 | for i, det in enumerate(det_arr): 124 | det = np.squeeze(det) 125 | bb = np.zeros(4, dtype=np.int32) 126 | bb[0] = np.maximum(det[0]-args.margin/2, 0) 127 | bb[1] = np.maximum(det[1]-args.margin/2, 0) 128 | bb[2] = np.minimum(det[2]+args.margin/2, img_size[1]) 129 | bb[3] = np.minimum(det[3]+args.margin/2, img_size[0]) 130 | cropped = img[bb[1]:bb[3],bb[0]:bb[2],:] 131 | scaled = misc.imresize(cropped, (args.image_size, args.image_size), interp='bilinear') 132 | nrof_successfully_aligned += 1 133 | filename_base, file_extension = os.path.splitext(output_filename) 134 | if args.detect_multiple_faces: 135 | output_filename_n = "{}_{}{}".format(filename_base, i, file_extension) 136 | else: 137 | output_filename_n = "{}{}".format(filename_base, file_extension) 138 | misc.imsave(output_filename_n, scaled) 139 | text_file.write('%s %d %d %d %d\n' % (output_filename_n, bb[0], bb[1], bb[2], bb[3])) 140 | else: 141 | print('Unable to align "%s"' % image_path) 142 | text_file.write('%s\n' % (output_filename)) 143 | 144 | print('Total number of images: %d' % nrof_images_total) 145 | print('Number of successfully aligned images: %d' % nrof_successfully_aligned) 146 | 147 | 148 | def parse_arguments(argv): 149 | parser = argparse.ArgumentParser() 150 | 151 | parser.add_argument('input_dir', type=str, help='Directory with unaligned images.') 152 | parser.add_argument('output_dir', type=str, help='Directory with aligned face thumbnails.') 153 | parser.add_argument('--image_size', type=int, 154 | help='Image size (height, width) in pixels.', default=182) 155 | parser.add_argument('--margin', type=int, 156 | help='Margin for the crop around the bounding box (height, width) in pixels.', default=44) 157 | parser.add_argument('--random_order', 158 | help='Shuffles the order of images to enable 
alignment using multiple processes.', action='store_true') 159 | parser.add_argument('--gpu_memory_fraction', type=float, 160 | help='Upper bound on the amount of GPU memory that will be used by the process.', default=1.0) 161 | parser.add_argument('--detect_multiple_faces', type=bool, 162 | help='Detect and align multiple faces per image.', default=False) 163 | return parser.parse_args(argv) 164 | 165 | if __name__ == '__main__': 166 | main(parse_arguments(sys.argv[1:])) 167 | -------------------------------------------------------------------------------- /facenet_code/facenet.py: -------------------------------------------------------------------------------- 1 | """Functions for building the face recognition network. 2 | """ 3 | # MIT License 4 | # 5 | # Copyright (c) 2016 David Sandberg 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 
24 | 25 | # pylint: disable=missing-docstring 26 | from __future__ import absolute_import 27 | from __future__ import division 28 | from __future__ import print_function 29 | 30 | import os 31 | from subprocess import Popen, PIPE 32 | import tensorflow as tf 33 | import numpy as np 34 | from scipy import misc 35 | from tensorflow.python.training import training 36 | import re 37 | from tensorflow.python.platform import gfile 38 | 39 | tf.logging.set_verbosity(tf.logging.ERROR) 40 | # tf.logging.set_verbosity(tf.logging.INFO) 41 | # tf.logging.set_verbosity(tf.logging.WARN) 42 | # tf.logging.set_verbosity(tf.logging.DEBUG) 43 | # tf.logging.set_verbosity(tf.logging.FATAL) 44 | 45 | 46 | # 1: Random rotate 2: Random crop 4: Random flip 8: Fixed image standardization 16: Flip 47 | RANDOM_ROTATE = 1 48 | RANDOM_CROP = 2 49 | RANDOM_FLIP = 4 50 | FIXED_STANDARDIZATION = 8 51 | FLIP = 16 52 | 53 | def get_image_paths_and_labels(dataset): 54 | image_paths_flat = [] 55 | labels_flat = [] 56 | for i in range(len(dataset)): 57 | image_paths_flat += dataset[i].image_paths 58 | labels_flat += [i] * len(dataset[i].image_paths) 59 | return image_paths_flat, labels_flat 60 | 61 | def prewhiten(x): 62 | mean = np.mean(x) 63 | std = np.std(x) 64 | std_adj = np.maximum(std, 1.0/np.sqrt(x.size)) 65 | y = np.multiply(np.subtract(x, mean), 1/std_adj) 66 | return y 67 | 68 | def crop(image, random_crop, image_size): 69 | if image.shape[1]>image_size: 70 | sz1 = int(image.shape[1]//2) 71 | sz2 = int(image_size//2) 72 | if random_crop: 73 | diff = sz1-sz2 74 | (h, v) = (np.random.randint(-diff, diff+1), np.random.randint(-diff, diff+1)) 75 | else: 76 | (h, v) = (0,0) 77 | image = image[(sz1-sz2+v):(sz1+sz2+v),(sz1-sz2+h):(sz1+sz2+h),:] 78 | return image 79 | 80 | def flip(image, random_flip): 81 | if random_flip and np.random.choice([True, False]): 82 | image = np.fliplr(image) 83 | return image 84 | 85 | def to_rgb(img): 86 | w, h = img.shape 87 | ret = np.empty((w, h, 3), dtype=np.uint8) 88 | ret[:, :, 0] = ret[:, :, 1] = ret[:, :, 2] = img 89 | return ret 90 | 91 | def load_data(image_paths, do_random_crop, do_random_flip, image_size, do_prewhiten=True): 92 | nrof_samples = len(image_paths) 93 | images = np.zeros((nrof_samples, image_size, image_size, 3)) 94 | for i in range(nrof_samples): 95 | img = misc.imread(image_paths[i]) 96 | if img.ndim == 2: 97 | img = to_rgb(img) 98 | if do_prewhiten: 99 | img = prewhiten(img) 100 | img = crop(img, do_random_crop, image_size) 101 | img = flip(img, do_random_flip) 102 | images[i,:,:,:] = img 103 | return images 104 | 105 | 106 | class ImageClass(): 107 | "Stores the paths to images for a given class" 108 | def __init__(self, name, image_paths): 109 | self.name = name 110 | self.image_paths = image_paths 111 | 112 | def __str__(self): 113 | return self.name + ', ' + str(len(self.image_paths)) + ' images' 114 | 115 | def __len__(self): 116 | return len(self.image_paths) 117 | 118 | def get_dataset(path, has_class_directories=True): 119 | dataset = [] 120 | path_exp = os.path.expanduser(path) 121 | classes = [path for path in os.listdir(path_exp) \ 122 | if os.path.isdir(os.path.join(path_exp, path))] 123 | classes.sort() 124 | nrof_classes = len(classes) 125 | for i in range(nrof_classes): 126 | class_name = classes[i] 127 | facedir = os.path.join(path_exp, class_name) 128 | image_paths = get_image_paths(facedir) 129 | dataset.append(ImageClass(class_name, image_paths)) 130 | 131 | return dataset 132 | 133 | def get_image_paths(facedir): 134 | image_paths = [] 135 | if 
os.path.isdir(facedir): 136 | images = os.listdir(facedir) 137 | image_paths = [os.path.join(facedir,img) for img in images] 138 | return image_paths 139 | 140 | def load_model(model, input_map=None): 141 | # Check if the sample-weight is a sample-weight directory (containing a metagraph and a checkpoint file) 142 | # or if it is a protobuf file with a frozen graph 143 | model_exp = os.path.expanduser(model) 144 | if (os.path.isfile(model_exp)): 145 | with gfile.FastGFile(model_exp,'rb') as f: 146 | graph_def = tf.GraphDef() 147 | graph_def.ParseFromString(f.read()) 148 | tf.import_graph_def(graph_def, input_map=input_map, name='') 149 | else: 150 | print('Model directory: %s' % model_exp) 151 | meta_file, ckpt_file = get_model_filenames(model_exp) 152 | 153 | print('Metagraph file: %s' % meta_file) 154 | print('Checkpoint file: %s' % ckpt_file) 155 | 156 | saver = tf.train.import_meta_graph(os.path.join(model_exp, meta_file), input_map=input_map) 157 | saver.restore(tf.get_default_session(), os.path.join(model_exp, ckpt_file)) 158 | 159 | def get_model_filenames(model_dir): 160 | files = os.listdir(model_dir) 161 | meta_files = [s for s in files if s.endswith('.meta')] 162 | if len(meta_files)==0: 163 | raise ValueError('No meta file found in the sample-weight directory (%s)' % model_dir) 164 | elif len(meta_files)>1: 165 | raise ValueError('There should not be more than one meta file in the sample-weight directory (%s)' % model_dir) 166 | meta_file = meta_files[0] 167 | ckpt = tf.train.get_checkpoint_state(model_dir) 168 | if ckpt and ckpt.model_checkpoint_path: 169 | ckpt_file = os.path.basename(ckpt.model_checkpoint_path) 170 | return meta_file, ckpt_file 171 | 172 | meta_files = [s for s in files if '.ckpt' in s] 173 | max_step = -1 174 | for f in files: 175 | step_str = re.match(r'(^sample-weight-[\w\- ]+.ckpt-(\d+))', f) 176 | if step_str is not None and len(step_str.groups())>=2: 177 | step = int(step_str.groups()[1]) 178 | if step > max_step: 179 | max_step = step 180 | ckpt_file = step_str.groups()[0] 181 | return meta_file, ckpt_file 182 | 183 | 184 | def store_revision_info(src_path, output_dir, arg_string): 185 | try: 186 | # Get git hash 187 | cmd = ['git', 'rev-parse', 'HEAD'] 188 | gitproc = Popen(cmd, stdout = PIPE, cwd=src_path) 189 | (stdout, _) = gitproc.communicate() 190 | git_hash = stdout.strip() 191 | except OSError as e: 192 | git_hash = ' '.join(cmd) + ': ' + e.strerror 193 | 194 | try: 195 | # Get local changes 196 | cmd = ['git', 'diff', 'HEAD'] 197 | gitproc = Popen(cmd, stdout = PIPE, cwd=src_path) 198 | (stdout, _) = gitproc.communicate() 199 | git_diff = stdout.strip() 200 | except OSError as e: 201 | git_diff = ' '.join(cmd) + ': ' + e.strerror 202 | 203 | # Store a text file in the log directory 204 | rev_info_filename = os.path.join(output_dir, 'revision_info.txt') 205 | with open(rev_info_filename, "w") as text_file: 206 | text_file.write('arguments: %s\n--------------------\n' % arg_string) 207 | text_file.write('tensorflow version: %s\n--------------------\n' % tf.__version__) # @UndefinedVariable 208 | text_file.write('git hash: %s\n--------------------\n' % git_hash) 209 | text_file.write('%s' % git_diff) 210 | 211 | def list_variables(filename): 212 | reader = training.NewCheckpointReader(filename) 213 | variable_map = reader.get_variable_to_shape_map() 214 | names = sorted(variable_map.keys()) 215 | return names 216 | 217 | def put_images_on_grid(images, shape=(16,8)): 218 | nrof_images = images.shape[0] 219 | img_size = images.shape[1] 220 | bw = 
3 221 | img = np.zeros((shape[1]*(img_size+bw)+bw, shape[0]*(img_size+bw)+bw, 3), np.float32) 222 | for i in range(shape[1]): 223 | x_start = i*(img_size+bw)+bw 224 | for j in range(shape[0]): 225 | img_index = i*shape[0]+j 226 | if img_index>=nrof_images: 227 | break 228 | y_start = j*(img_size+bw)+bw 229 | img[x_start:x_start+img_size, y_start:y_start+img_size, :] = images[img_index, :, :, :] 230 | if img_index>=nrof_images: 231 | break 232 | return img 233 | 234 | 235 | -------------------------------------------------------------------------------- /facenet_code/detect_face.py: -------------------------------------------------------------------------------- 1 | """ Tensorflow implementation of the face detection / alignment algorithm found at 2 | https://github.com/kpzhang93/MTCNN_face_detection_alignment 3 | """ 4 | # MIT License 5 | # 6 | # Copyright (c) 2016 David Sandberg 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | 26 | from __future__ import absolute_import 27 | from __future__ import division 28 | from __future__ import print_function 29 | from six import string_types, iteritems 30 | 31 | import numpy as np 32 | import tensorflow as tf 33 | import cv2 34 | import os 35 | 36 | tf.logging.set_verbosity(tf.logging.ERROR) 37 | # tf.logging.set_verbosity(tf.logging.INFO) 38 | # tf.logging.set_verbosity(tf.logging.WARN) 39 | # tf.logging.set_verbosity(tf.logging.DEBUG) 40 | # tf.logging.set_verbosity(tf.logging.FATAL) 41 | 42 | 43 | def layer(op): 44 | """Decorator for composable network layers.""" 45 | 46 | def layer_decorated(self, *args, **kwargs): 47 | # Automatically set a name if not provided. 48 | name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) 49 | # Figure out the layer inputs. 50 | if len(self.terminals) == 0: 51 | raise RuntimeError('No input variables found for layer %s.' % name) 52 | elif len(self.terminals) == 1: 53 | layer_input = self.terminals[0] 54 | else: 55 | layer_input = list(self.terminals) 56 | # Perform the operation and get the output. 57 | layer_output = op(self, layer_input, *args, **kwargs) 58 | # Add to layer LUT. 59 | self.layers[name] = layer_output 60 | # This output is now the input for the next layer. 61 | self.feed(layer_output) 62 | # Return self for chained calls. 
63 | return self 64 | 65 | return layer_decorated 66 | 67 | class Network(object): 68 | 69 | def __init__(self, inputs, trainable=True): 70 | # The input nodes for this network 71 | self.inputs = inputs 72 | # The current list of terminal nodes 73 | self.terminals = [] 74 | # Mapping from layer names to layers 75 | self.layers = dict(inputs) 76 | # If true, the resulting variables are set as trainable 77 | self.trainable = trainable 78 | 79 | self.setup() 80 | 81 | def setup(self): 82 | """Construct the network. """ 83 | raise NotImplementedError('Must be implemented by the subclass.') 84 | 85 | def load(self, data_path, session, ignore_missing=False): 86 | """Load network weights. 87 | data_path: The path to the numpy-serialized network weights 88 | session: The current TensorFlow session 89 | ignore_missing: If true, serialized weights for missing layers are ignored. 90 | """ 91 | data_dict = np.load(data_path, encoding='latin1', allow_pickle=True).item() #pylint: disable=no-member 92 | 93 | for op_name in data_dict: 94 | with tf.variable_scope(op_name, reuse=True): 95 | for param_name, data in iteritems(data_dict[op_name]): 96 | try: 97 | var = tf.get_variable(param_name) 98 | session.run(var.assign(data)) 99 | except ValueError: 100 | if not ignore_missing: 101 | raise 102 | 103 | def feed(self, *args): 104 | """Set the input(s) for the next operation by replacing the terminal nodes. 105 | The arguments can be either layer names or the actual layers. 106 | """ 107 | assert len(args) != 0 108 | self.terminals = [] 109 | for fed_layer in args: 110 | if isinstance(fed_layer, string_types): 111 | try: 112 | fed_layer = self.layers[fed_layer] 113 | except KeyError: 114 | raise KeyError('Unknown layer name fed: %s' % fed_layer) 115 | self.terminals.append(fed_layer) 116 | return self 117 | 118 | def get_output(self): 119 | """Returns the current network output.""" 120 | return self.terminals[-1] 121 | 122 | def get_unique_name(self, prefix): 123 | """Returns an index-suffixed unique name for the given prefix. 124 | This is used for auto-generating layer names based on the type-prefix. 125 | """ 126 | ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1 127 | return '%s_%d' % (prefix, ident) 128 | 129 | def make_var(self, name, shape): 130 | """Creates a new TensorFlow variable.""" 131 | return tf.get_variable(name, shape, trainable=self.trainable) 132 | 133 | def validate_padding(self, padding): 134 | """Verifies that the padding is one of the supported ones.""" 135 | assert padding in ('SAME', 'VALID') 136 | 137 | @layer 138 | def conv(self, 139 | inp, 140 | k_h, 141 | k_w, 142 | c_o, 143 | s_h, 144 | s_w, 145 | name, 146 | relu=True, 147 | padding='SAME', 148 | group=1, 149 | biased=True): 150 | # Verify that the padding is acceptable 151 | self.validate_padding(padding) 152 | # Get the number of channels in the input 153 | c_i = int(inp.get_shape()[-1]) 154 | # Verify that the grouping parameter is valid 155 | assert c_i % group == 0 156 | assert c_o % group == 0 157 | # Convolution for a given input and kernel 158 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 159 | with tf.variable_scope(name) as scope: 160 | kernel = self.make_var('weights', shape=[k_h, k_w, c_i // group, c_o]) 161 | # This is the common-case. Convolve the input without any further complications. 
162 | output = convolve(inp, kernel) 163 | # Add the biases 164 | if biased: 165 | biases = self.make_var('biases', [c_o]) 166 | output = tf.nn.bias_add(output, biases) 167 | if relu: 168 | # ReLU non-linearity 169 | output = tf.nn.relu(output, name=scope.name) 170 | return output 171 | 172 | @layer 173 | def prelu(self, inp, name): 174 | with tf.variable_scope(name): 175 | i = int(inp.get_shape()[-1]) 176 | alpha = self.make_var('alpha', shape=(i,)) 177 | output = tf.nn.relu(inp) + tf.multiply(alpha, -tf.nn.relu(-inp)) 178 | return output 179 | 180 | @layer 181 | def max_pool(self, inp, k_h, k_w, s_h, s_w, name, padding='SAME'): 182 | self.validate_padding(padding) 183 | return tf.nn.max_pool(inp, 184 | ksize=[1, k_h, k_w, 1], 185 | strides=[1, s_h, s_w, 1], 186 | padding=padding, 187 | name=name) 188 | 189 | @layer 190 | def fc(self, inp, num_out, name, relu=True): 191 | with tf.variable_scope(name): 192 | input_shape = inp.get_shape() 193 | if input_shape.ndims == 4: 194 | # The input is spatial. Vectorize it first. 195 | dim = 1 196 | for d in input_shape[1:].as_list(): 197 | dim *= int(d) 198 | feed_in = tf.reshape(inp, [-1, dim]) 199 | else: 200 | feed_in, dim = (inp, input_shape[-1].value) 201 | weights = self.make_var('weights', shape=[dim, num_out]) 202 | biases = self.make_var('biases', [num_out]) 203 | op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b 204 | fc = op(feed_in, weights, biases, name=name) 205 | return fc 206 | 207 | 208 | """ 209 | Multi dimensional softmax, 210 | refer to https://github.com/tensorflow/tensorflow/issues/210 211 | compute softmax along the dimension of target 212 | the native softmax only supports batch_size x dimension 213 | """ 214 | @layer 215 | def softmax(self, target, axis, name=None): 216 | max_axis = tf.reduce_max(target, axis, keepdims=True) 217 | target_exp = tf.exp(target-max_axis) 218 | normalize = tf.reduce_sum(target_exp, axis, keepdims=True) 219 | softmax = tf.div(target_exp, normalize, name) 220 | return softmax 221 | 222 | class PNet(Network): 223 | def setup(self): 224 | (self.feed('data') #pylint: disable=no-value-for-parameter, no-member 225 | .conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1') 226 | .prelu(name='PReLU1') 227 | .max_pool(2, 2, 2, 2, name='pool1') 228 | .conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2') 229 | .prelu(name='PReLU2') 230 | .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3') 231 | .prelu(name='PReLU3') 232 | .conv(1, 1, 2, 1, 1, relu=False, name='conv4-1') 233 | .softmax(3,name='prob1')) 234 | 235 | (self.feed('PReLU3') #pylint: disable=no-value-for-parameter 236 | .conv(1, 1, 4, 1, 1, relu=False, name='conv4-2')) 237 | 238 | class RNet(Network): 239 | def setup(self): 240 | (self.feed('data') #pylint: disable=no-value-for-parameter, no-member 241 | .conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1') 242 | .prelu(name='prelu1') 243 | .max_pool(3, 3, 2, 2, name='pool1') 244 | .conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2') 245 | .prelu(name='prelu2') 246 | .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') 247 | .conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3') 248 | .prelu(name='prelu3') 249 | .fc(128, relu=False, name='conv4') 250 | .prelu(name='prelu4') 251 | .fc(2, relu=False, name='conv5-1') 252 | .softmax(1,name='prob1')) 253 | 254 | (self.feed('prelu4') #pylint: disable=no-value-for-parameter 255 | .fc(4, relu=False, name='conv5-2')) 256 | 257 | class ONet(Network): 258 | def setup(self): 259 | 
(self.feed('data') #pylint: disable=no-value-for-parameter, no-member 260 | .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1') 261 | .prelu(name='prelu1') 262 | .max_pool(3, 3, 2, 2, name='pool1') 263 | .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2') 264 | .prelu(name='prelu2') 265 | .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') 266 | .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3') 267 | .prelu(name='prelu3') 268 | .max_pool(2, 2, 2, 2, name='pool3') 269 | .conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4') 270 | .prelu(name='prelu4') 271 | .fc(256, relu=False, name='conv5') 272 | .prelu(name='prelu5') 273 | .fc(2, relu=False, name='conv6-1') 274 | .softmax(1, name='prob1')) 275 | 276 | (self.feed('prelu5') #pylint: disable=no-value-for-parameter 277 | .fc(4, relu=False, name='conv6-2')) 278 | 279 | (self.feed('prelu5') #pylint: disable=no-value-for-parameter 280 | .fc(10, relu=False, name='conv6-3')) 281 | 282 | def create_mtcnn(sess, model_path): 283 | if not model_path: 284 | model_path,_ = os.path.split(os.path.realpath(__file__)) 285 | 286 | with tf.variable_scope('pnet'): 287 | data = tf.placeholder(tf.float32, (None,None,None,3), 'input') 288 | pnet = PNet({'data':data}) 289 | pnet.load(os.path.join(model_path, 'weights/det1.npy'), sess) 290 | with tf.variable_scope('rnet'): 291 | data = tf.placeholder(tf.float32, (None,24,24,3), 'input') 292 | rnet = RNet({'data':data}) 293 | rnet.load(os.path.join(model_path, 'weights/det2.npy'), sess) 294 | with tf.variable_scope('onet'): 295 | data = tf.placeholder(tf.float32, (None,48,48,3), 'input') 296 | onet = ONet({'data':data}) 297 | onet.load(os.path.join(model_path, 'weights/det3.npy'), sess) 298 | 299 | pnet_fun = lambda img : sess.run(('pnet/conv4-2/BiasAdd:0', 'pnet/prob1:0'), feed_dict={'pnet/input:0':img}) 300 | rnet_fun = lambda img : sess.run(('rnet/conv5-2/conv5-2:0', 'rnet/prob1:0'), feed_dict={'rnet/input:0':img}) 301 | onet_fun = lambda img : sess.run(('onet/conv6-2/conv6-2:0', 'onet/conv6-3/conv6-3:0', 'onet/prob1:0'), feed_dict={'onet/input:0':img}) 302 | return pnet_fun, rnet_fun, onet_fun 303 | 304 | def detect_face(img, minsize, pnet, rnet, onet, threshold, factor): 305 | """Detects faces in an image, and returns bounding boxes and points for them. 306 | img: input image 307 | minsize: minimum faces' size 308 | pnet, rnet, onet: caffemodel 309 | threshold: threshold=[th1, th2, th3], th1-3 are three steps's threshold 310 | factor: the factor used to create a scaling pyramid of face sizes to detect in the image. 
311 | """ 312 | factor_count=0 313 | total_boxes=np.empty((0,9)) 314 | points=np.empty(0) 315 | h=img.shape[0] 316 | w=img.shape[1] 317 | minl=np.amin([h, w]) 318 | m=12.0/minsize 319 | minl=minl*m 320 | # create scale pyramid 321 | scales=[] 322 | while minl>=12: 323 | scales += [m*np.power(factor, factor_count)] 324 | minl = minl*factor 325 | factor_count += 1 326 | 327 | # first stage 328 | for scale in scales: 329 | hs=int(np.ceil(h*scale)) 330 | ws=int(np.ceil(w*scale)) 331 | im_data = imresample(img, (hs, ws)) 332 | im_data = (im_data-127.5)*0.0078125 333 | img_x = np.expand_dims(im_data, 0) 334 | img_y = np.transpose(img_x, (0,2,1,3)) 335 | out = pnet(img_y) 336 | out0 = np.transpose(out[0], (0,2,1,3)) 337 | out1 = np.transpose(out[1], (0,2,1,3)) 338 | 339 | boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0]) 340 | 341 | # inter-scale nms 342 | pick = nms(boxes.copy(), 0.5, 'Union') 343 | if boxes.size>0 and pick.size>0: 344 | boxes = boxes[pick,:] 345 | total_boxes = np.append(total_boxes, boxes, axis=0) 346 | 347 | numbox = total_boxes.shape[0] 348 | if numbox>0: 349 | pick = nms(total_boxes.copy(), 0.7, 'Union') 350 | total_boxes = total_boxes[pick,:] 351 | regw = total_boxes[:,2]-total_boxes[:,0] 352 | regh = total_boxes[:,3]-total_boxes[:,1] 353 | qq1 = total_boxes[:,0]+total_boxes[:,5]*regw 354 | qq2 = total_boxes[:,1]+total_boxes[:,6]*regh 355 | qq3 = total_boxes[:,2]+total_boxes[:,7]*regw 356 | qq4 = total_boxes[:,3]+total_boxes[:,8]*regh 357 | total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:,4]])) 358 | total_boxes = rerec(total_boxes.copy()) 359 | total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32) 360 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) 361 | 362 | numbox = total_boxes.shape[0] 363 | if numbox>0: 364 | # second stage 365 | tempimg = np.zeros((24,24,3,numbox)) 366 | for k in range(0,numbox): 367 | tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) 368 | tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] 369 | if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: 370 | tempimg[:,:,:,k] = imresample(tmp, (24, 24)) 371 | else: 372 | return np.empty() 373 | tempimg = (tempimg-127.5)*0.0078125 374 | tempimg1 = np.transpose(tempimg, (3,1,0,2)) 375 | out = rnet(tempimg1) 376 | out0 = np.transpose(out[0]) 377 | out1 = np.transpose(out[1]) 378 | score = out1[1,:] 379 | ipass = np.where(score>threshold[1]) 380 | total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) 381 | mv = out0[:,ipass[0]] 382 | if total_boxes.shape[0]>0: 383 | pick = nms(total_boxes, 0.7, 'Union') 384 | total_boxes = total_boxes[pick,:] 385 | total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick])) 386 | total_boxes = rerec(total_boxes.copy()) 387 | 388 | numbox = total_boxes.shape[0] 389 | if numbox>0: 390 | # third stage 391 | total_boxes = np.fix(total_boxes).astype(np.int32) 392 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) 393 | tempimg = np.zeros((48,48,3,numbox)) 394 | for k in range(0,numbox): 395 | tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) 396 | tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] 397 | if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: 398 | tempimg[:,:,:,k] = imresample(tmp, (48, 48)) 399 | else: 400 | return np.empty() 401 | tempimg = (tempimg-127.5)*0.0078125 402 | tempimg1 = 
np.transpose(tempimg, (3,1,0,2)) 403 | out = onet(tempimg1) 404 | out0 = np.transpose(out[0]) 405 | out1 = np.transpose(out[1]) 406 | out2 = np.transpose(out[2]) 407 | score = out2[1,:] 408 | points = out1 409 | ipass = np.where(score>threshold[2]) 410 | points = points[:,ipass[0]] 411 | total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) 412 | mv = out0[:,ipass[0]] 413 | 414 | w = total_boxes[:,2]-total_boxes[:,0]+1 415 | h = total_boxes[:,3]-total_boxes[:,1]+1 416 | points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1 417 | points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1 418 | if total_boxes.shape[0]>0: 419 | total_boxes = bbreg(total_boxes.copy(), np.transpose(mv)) 420 | pick = nms(total_boxes.copy(), 0.7, 'Min') 421 | total_boxes = total_boxes[pick,:] 422 | points = points[:,pick] 423 | 424 | return total_boxes, points 425 | 426 | 427 | def bulk_detect_face(images, detection_window_size_ratio, pnet, rnet, onet, threshold, factor): 428 | """Detects faces in a list of images 429 | images: list containing input images 430 | detection_window_size_ratio: ratio of minimum face size to smallest image dimension 431 | pnet, rnet, onet: caffemodel 432 | threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold [0-1] 433 | factor: the factor used to create a scaling pyramid of face sizes to detect in the image. 434 | """ 435 | all_scales = [None] * len(images) 436 | images_with_boxes = [None] * len(images) 437 | 438 | for i in range(len(images)): 439 | images_with_boxes[i] = {'total_boxes': np.empty((0, 9))} 440 | 441 | # create scale pyramid 442 | for index, img in enumerate(images): 443 | all_scales[index] = [] 444 | h = img.shape[0] 445 | w = img.shape[1] 446 | minsize = int(detection_window_size_ratio * np.minimum(w, h)) 447 | factor_count = 0 448 | minl = np.amin([h, w]) 449 | if minsize <= 12: 450 | minsize = 12 451 | 452 | m = 12.0 / minsize 453 | minl = minl * m 454 | while minl >= 12: 455 | all_scales[index].append(m * np.power(factor, factor_count)) 456 | minl = minl * factor 457 | factor_count += 1 458 | 459 | # # # # # # # # # # # # # 460 | # first stage - fast proposal network (pnet) to obtain face candidates 461 | # # # # # # # # # # # # # 462 | 463 | images_obj_per_resolution = {} 464 | 465 | # TODO: use some type of rounding to number module 8 to increase probability that pyramid images will have the same resolution across input images 466 | 467 | for index, scales in enumerate(all_scales): 468 | h = images[index].shape[0] 469 | w = images[index].shape[1] 470 | 471 | for scale in scales: 472 | hs = int(np.ceil(h * scale)) 473 | ws = int(np.ceil(w * scale)) 474 | 475 | if (ws, hs) not in images_obj_per_resolution: 476 | images_obj_per_resolution[(ws, hs)] = [] 477 | 478 | im_data = imresample(images[index], (hs, ws)) 479 | im_data = (im_data - 127.5) * 0.0078125 480 | img_y = np.transpose(im_data, (1, 0, 2)) # caffe uses different dimensions ordering 481 | images_obj_per_resolution[(ws, hs)].append({'scale': scale, 'image': img_y, 'index': index}) 482 | 483 | for resolution in images_obj_per_resolution: 484 | images_per_resolution = [i['image'] for i in images_obj_per_resolution[resolution]] 485 | outs = pnet(images_per_resolution) 486 | 487 | for index in range(len(outs[0])): 488 | scale = images_obj_per_resolution[resolution][index]['scale'] 489 | image_index = images_obj_per_resolution[resolution][index]['index'] 490 | out0 = 
491 |             out1 = np.transpose(outs[1][index], (1, 0, 2))
492 | 
493 |             boxes, _ = generateBoundingBox(out1[:, :, 1].copy(), out0[:, :, :].copy(), scale, threshold[0])
494 | 
495 |             # inter-scale nms
496 |             pick = nms(boxes.copy(), 0.5, 'Union')
497 |             if boxes.size > 0 and pick.size > 0:
498 |                 boxes = boxes[pick, :]
499 |                 images_with_boxes[image_index]['total_boxes'] = np.append(images_with_boxes[image_index]['total_boxes'],
500 |                                                                           boxes,
501 |                                                                           axis=0)
502 | 
503 |     for index, image_obj in enumerate(images_with_boxes):
504 |         numbox = image_obj['total_boxes'].shape[0]
505 |         if numbox > 0:
506 |             h = images[index].shape[0]
507 |             w = images[index].shape[1]
508 |             pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Union')
509 |             image_obj['total_boxes'] = image_obj['total_boxes'][pick, :]
510 |             regw = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0]
511 |             regh = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1]
512 |             qq1 = image_obj['total_boxes'][:, 0] + image_obj['total_boxes'][:, 5] * regw
513 |             qq2 = image_obj['total_boxes'][:, 1] + image_obj['total_boxes'][:, 6] * regh
514 |             qq3 = image_obj['total_boxes'][:, 2] + image_obj['total_boxes'][:, 7] * regw
515 |             qq4 = image_obj['total_boxes'][:, 3] + image_obj['total_boxes'][:, 8] * regh
516 |             image_obj['total_boxes'] = np.transpose(np.vstack([qq1, qq2, qq3, qq4, image_obj['total_boxes'][:, 4]]))
517 |             image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy())
518 |             image_obj['total_boxes'][:, 0:4] = np.fix(image_obj['total_boxes'][:, 0:4]).astype(np.int32)
519 |             dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h)
520 | 
521 |             numbox = image_obj['total_boxes'].shape[0]
522 |             tempimg = np.zeros((24, 24, 3, numbox))
523 | 
524 |             if numbox > 0:
525 |                 for k in range(0, numbox):
526 |                     tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
527 |                     tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :]
528 |                     if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
529 |                         tempimg[:, :, :, k] = imresample(tmp, (24, 24))
530 |                     else:
531 |                         return np.empty(0)  # invalid crop: bail out (np.empty() without a shape raises TypeError)
532 | 
533 |                 tempimg = (tempimg - 127.5) * 0.0078125
534 |                 image_obj['rnet_input'] = np.transpose(tempimg, (3, 1, 0, 2))
535 | 
536 |     # # # # # # # # # # # # #
537 |     # second stage - refinement of face candidates with rnet
538 |     # # # # # # # # # # # # #
539 | 
540 |     bulk_rnet_input = np.empty((0, 24, 24, 3))
541 |     for index, image_obj in enumerate(images_with_boxes):
542 |         if 'rnet_input' in image_obj:
543 |             bulk_rnet_input = np.append(bulk_rnet_input, image_obj['rnet_input'], axis=0)
544 | 
545 |     out = rnet(bulk_rnet_input)
546 |     out0 = np.transpose(out[0])
547 |     out1 = np.transpose(out[1])
548 |     score = out1[1, :]
549 | 
550 |     i = 0
551 |     for index, image_obj in enumerate(images_with_boxes):
552 |         if 'rnet_input' not in image_obj:
553 |             continue
554 | 
555 |         rnet_input_count = image_obj['rnet_input'].shape[0]
556 |         score_per_image = score[i:i + rnet_input_count]
557 |         out0_per_image = out0[:, i:i + rnet_input_count]
558 | 
559 |         ipass = np.where(score_per_image > threshold[1])
560 |         image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(),
561 |                                               np.expand_dims(score_per_image[ipass].copy(), 1)])
562 | 
563 |         mv = out0_per_image[:, ipass[0]]
564 | 
565 |         if image_obj['total_boxes'].shape[0] > 0:
566 |             h = images[index].shape[0]
567 |             w = images[index].shape[1]
568 |             pick = nms(image_obj['total_boxes'], 0.7, 'Union')
569 |             image_obj['total_boxes'] = image_obj['total_boxes'][pick, :]
570 |             image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv[:, pick]))
571 |             image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy())
572 | 
573 |             numbox = image_obj['total_boxes'].shape[0]
574 | 
575 |             if numbox > 0:
576 |                 tempimg = np.zeros((48, 48, 3, numbox))
577 |                 image_obj['total_boxes'] = np.fix(image_obj['total_boxes']).astype(np.int32)
578 |                 dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h)
579 | 
580 |                 for k in range(0, numbox):
581 |                     tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
582 |                     tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :]
583 |                     if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
584 |                         tempimg[:, :, :, k] = imresample(tmp, (48, 48))
585 |                     else:
586 |                         return np.empty(0)  # invalid crop: bail out (np.empty() without a shape raises TypeError)
587 |                 tempimg = (tempimg - 127.5) * 0.0078125
588 |                 image_obj['onet_input'] = np.transpose(tempimg, (3, 1, 0, 2))
589 | 
590 |         i += rnet_input_count
591 | 
592 |     # # # # # # # # # # # # #
593 |     # third stage - further refinement and facial landmarks positions with onet
594 |     # # # # # # # # # # # # #
595 | 
596 |     bulk_onet_input = np.empty((0, 48, 48, 3))
597 |     for index, image_obj in enumerate(images_with_boxes):
598 |         if 'onet_input' in image_obj:
599 |             bulk_onet_input = np.append(bulk_onet_input, image_obj['onet_input'], axis=0)
600 | 
601 |     out = onet(bulk_onet_input)
602 | 
603 |     out0 = np.transpose(out[0])
604 |     out1 = np.transpose(out[1])
605 |     out2 = np.transpose(out[2])
606 |     score = out2[1, :]
607 |     points = out1
608 | 
609 |     i = 0
610 |     ret = []
611 |     for index, image_obj in enumerate(images_with_boxes):
612 |         if 'onet_input' not in image_obj:
613 |             ret.append(None)
614 |             continue
615 | 
616 |         onet_input_count = image_obj['onet_input'].shape[0]
617 | 
618 |         out0_per_image = out0[:, i:i + onet_input_count]
619 |         score_per_image = score[i:i + onet_input_count]
620 |         points_per_image = points[:, i:i + onet_input_count]
621 | 
622 |         ipass = np.where(score_per_image > threshold[2])
623 |         points_per_image = points_per_image[:, ipass[0]]
624 | 
625 |         image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(),
626 |                                               np.expand_dims(score_per_image[ipass].copy(), 1)])
627 |         mv = out0_per_image[:, ipass[0]]
628 | 
629 |         w = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] + 1
630 |         h = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] + 1
631 |         points_per_image[0:5, :] = np.tile(w, (5, 1)) * points_per_image[0:5, :] + np.tile(
632 |             image_obj['total_boxes'][:, 0], (5, 1)) - 1
633 |         points_per_image[5:10, :] = np.tile(h, (5, 1)) * points_per_image[5:10, :] + np.tile(
634 |             image_obj['total_boxes'][:, 1], (5, 1)) - 1
635 | 
636 |         if image_obj['total_boxes'].shape[0] > 0:
637 |             image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv))
638 |             pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Min')
639 |             image_obj['total_boxes'] = image_obj['total_boxes'][pick, :]
640 |             points_per_image = points_per_image[:, pick]
641 | 
642 |             ret.append((image_obj['total_boxes'], points_per_image))
643 |         else:
644 |             ret.append(None)
645 | 
646 |         i += onet_input_count
647 | 
648 |     return ret
649 | 
650 | 
651 | # function [boundingbox] = bbreg(boundingbox,reg)
652 | def bbreg(boundingbox,reg):
653 |     """Calibrate bounding boxes"""
654 |     if reg.shape[1]==1:
655 |         reg = np.reshape(reg, (reg.shape[2], reg.shape[3]))
656 | 
657 |     w = boundingbox[:,2]-boundingbox[:,0]+1
658 |     h = boundingbox[:,3]-boundingbox[:,1]+1
659 |     b1 = boundingbox[:,0]+reg[:,0]*w
660 |     b2 = boundingbox[:,1]+reg[:,1]*h
661 |     b3 = boundingbox[:,2]+reg[:,2]*w
662 |     b4 = boundingbox[:,3]+reg[:,3]*h
663 |     boundingbox[:,0:4] = np.transpose(np.vstack([b1, b2, b3, b4 ]))
664 |     return boundingbox
665 | 
666 | def generateBoundingBox(imap, reg, scale, t):
667 |     """Use heatmap to generate bounding boxes"""
668 |     stride=2
669 |     cellsize=12
670 | 
671 |     imap = np.transpose(imap)
672 |     dx1 = np.transpose(reg[:,:,0])
673 |     dy1 = np.transpose(reg[:,:,1])
674 |     dx2 = np.transpose(reg[:,:,2])
675 |     dy2 = np.transpose(reg[:,:,3])
676 |     y, x = np.where(imap >= t)
677 |     if y.shape[0]==1:
678 |         dx1 = np.flipud(dx1)
679 |         dy1 = np.flipud(dy1)
680 |         dx2 = np.flipud(dx2)
681 |         dy2 = np.flipud(dy2)
682 |     score = imap[(y,x)]
683 |     reg = np.transpose(np.vstack([ dx1[(y,x)], dy1[(y,x)], dx2[(y,x)], dy2[(y,x)] ]))
684 |     if reg.size==0:
685 |         reg = np.empty((0,4))  # keep four regression columns so the hstack below stays (N, 9) even when empty
686 |     bb = np.transpose(np.vstack([y,x]))
687 |     q1 = np.fix((stride*bb+1)/scale)
688 |     q2 = np.fix((stride*bb+cellsize-1+1)/scale)
689 |     boundingbox = np.hstack([q1, q2, np.expand_dims(score,1), reg])
690 |     return boundingbox, reg
691 | 
692 | # function pick = nms(boxes,threshold,type)
693 | def nms(boxes, threshold, method):
694 |     if boxes.size==0:
695 |         return np.empty((0,3))
696 |     x1 = boxes[:,0]
697 |     y1 = boxes[:,1]
698 |     x2 = boxes[:,2]
699 |     y2 = boxes[:,3]
700 |     s = boxes[:,4]
701 |     area = (x2-x1+1) * (y2-y1+1)
702 |     I = np.argsort(s)
703 |     pick = np.zeros_like(s, dtype=np.int16)
704 |     counter = 0
705 |     while I.size>0:
706 |         i = I[-1]
707 |         pick[counter] = i
708 |         counter += 1
709 |         idx = I[0:-1]
710 |         xx1 = np.maximum(x1[i], x1[idx])
711 |         yy1 = np.maximum(y1[i], y1[idx])
712 |         xx2 = np.minimum(x2[i], x2[idx])
713 |         yy2 = np.minimum(y2[i], y2[idx])
714 |         w = np.maximum(0.0, xx2-xx1+1)
715 |         h = np.maximum(0.0, yy2-yy1+1)
716 |         inter = w * h
717 |         if method == 'Min':  # compare strings with '==', not the identity operator 'is'
718 |             o = inter / np.minimum(area[i], area[idx])
719 |         else:
720 |             o = inter / (area[i] + area[idx] - inter)
721 |         I = I[np.where(o<=threshold)]
722 |     pick = pick[0:counter]
723 |     return pick
724 | 
725 | # function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h)
726 | def pad(total_boxes, w, h):
727 |     """Compute the padding coordinates (pad the bounding boxes to square)"""
728 |     tmpw = (total_boxes[:,2]-total_boxes[:,0]+1).astype(np.int32)
729 |     tmph = (total_boxes[:,3]-total_boxes[:,1]+1).astype(np.int32)
730 |     numbox = total_boxes.shape[0]
731 | 
732 |     dx = np.ones((numbox), dtype=np.int32)
733 |     dy = np.ones((numbox), dtype=np.int32)
734 |     edx = tmpw.copy().astype(np.int32)
735 |     edy = tmph.copy().astype(np.int32)
736 | 
737 |     x = total_boxes[:,0].copy().astype(np.int32)
738 |     y = total_boxes[:,1].copy().astype(np.int32)
739 |     ex = total_boxes[:,2].copy().astype(np.int32)
740 |     ey = total_boxes[:,3].copy().astype(np.int32)
741 | 
742 |     tmp = np.where(ex>w)
743 |     edx.flat[tmp] = np.expand_dims(-ex[tmp]+w+tmpw[tmp],1)
744 |     ex[tmp] = w
745 | 
746 |     tmp = np.where(ey>h)
747 |     edy.flat[tmp] = np.expand_dims(-ey[tmp]+h+tmph[tmp],1)
748 |     ey[tmp] = h
749 | 
750 |     tmp = np.where(x<1)
751 |     dx.flat[tmp] = np.expand_dims(2-x[tmp],1)
752 |     x[tmp] = 1
753 | 
754 |     tmp = np.where(y<1)
755 |     dy.flat[tmp] = np.expand_dims(2-y[tmp],1)
756 |     y[tmp] = 1
757 | 
758 |     return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph
759 | 
760 | # function [bboxA] = rerec(bboxA)
761 | def rerec(bboxA):
762 |     """Convert bboxA to square."""
763 |     h = bboxA[:,3]-bboxA[:,1]
764 |     w = bboxA[:,2]-bboxA[:,0]
765 |     l = np.maximum(w, h)
766 |     bboxA[:,0] = bboxA[:,0]+w*0.5-l*0.5
767 |     bboxA[:,1] = bboxA[:,1]+h*0.5-l*0.5
768 |     bboxA[:,2:4] = bboxA[:,0:2] + np.transpose(np.tile(l,(2,1)))
769 |     return bboxA
770 | 
771 | def imresample(img, sz):
772 |     im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_AREA)  #@UndefinedVariable
773 |     return im_data
774 | 
775 | # This method is kept for debugging purposes
776 | # h=img.shape[0]
777 | # w=img.shape[1]
778 | # hs, ws = sz
779 | # dx = float(w) / ws
780 | # dy = float(h) / hs
781 | # im_data = np.zeros((hs,ws,3))
782 | # for a1 in range(0,hs):
783 | #     for a2 in range(0,ws):
784 | #         for a3 in range(0,3):
785 | #             im_data[a1,a2,a3] = img[int(floor(a1*dy)),int(floor(a2*dx)),a3]
786 | # return im_data
787 | 
788 | 
--------------------------------------------------------------------------------
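
As a standalone illustration (this snippet is not part of the repository's files), the `nms` helper defined in detect_face.py can be exercised on its own. The sketch below assumes the package is importable as `facenet_code.detect_face`; it builds three hypothetical proposals in the `[x1, y1, x2, y2, score]` layout used throughout the file and keeps only the non-redundant ones under the 'Union' (IoU) criterion.

import numpy as np
from facenet_code.detect_face import nms

# Three candidate boxes in [x1, y1, x2, y2, score] order; the first two overlap heavily.
boxes = np.array([
    [10.0, 10.0, 60.0, 60.0, 0.95],
    [12.0, 12.0, 62.0, 62.0, 0.80],      # largely redundant with the first box
    [200.0, 200.0, 250.0, 250.0, 0.70],
])

keep = nms(boxes, 0.5, 'Union')   # indices of the proposals that survive suppression
print(keep)                       # expected: [0 2] -- the 0.80 box is suppressed

The same helper is applied throughout detect_face and bulk_detect_face: the 'Union' (IoU) criterion prunes candidates between stages, and the stricter 'Min' criterion is used once after the final O-Net stage.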