├── .gitignore ├── LICENSE ├── README.md ├── align ├── __init__.py ├── det1.npy ├── det2.npy ├── det3.npy └── detect_face.py ├── lib ├── __init__.py ├── face_utils.py └── utils.py ├── project_root_dir.py ├── requirements.txt ├── src ├── __init__.py ├── data_association.py ├── kalman_tracker.py └── sort.py ├── start.py └── videos ├── 1_Hours Of _Harassment' In NYC!.mp4 └── 2_Obama.mp4 /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | logs 3 | facepics 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *$py.class -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Linzaer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Face Detection & Tracking & Extraction 2 | 3 | ![GitHub](https://img.shields.io/github/license/mashape/apistatus.svg) 4 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Django.svg) 5 | 6 | This project can **detect**, **track**, and **extract** the **optimal** face from the multiple faces in a video (side faces are excluded and the best frontal crop is selected). 7 | 8 | ## Introduction 9 | * **Dependencies:** 10 | * Python 3.5+ 11 | * Tensorflow 12 | * [**MTCNN**](https://github.com/davidsandberg/facenet/tree/master/src/align) 13 | * Scikit-learn 14 | * Numpy 15 | * Numba 16 | * Opencv-python 17 | * Filterpy 18 | 19 | ## Run 20 | * To run the Python version of the code: 21 | ```sh 22 | python3 start.py 23 | ``` 24 | * The extracted faces are stored in the folder **./facepics**. 25 | * If you want to draw the 5 face landmarks on the extracted faces, just add the argument **--face_landmarks**: 26 | ```sh 27 | python3 start.py --face_landmarks 28 | ``` 29 | ## What can this project do? 30 | 31 | * You can run it to extract the optimal face of each person from a batch of videos and use the crops as a training set for **CNN training** (a minimal loading sketch follows below). 32 | * You can also send the extracted faces to a backend for **face recognition**. 
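For example, here is a minimal sketch of loading the extracted crops as a labelled training set. It assumes the default **./facepics** output layout (one sub-folder per input video, `.jpg` crops inside) and is not part of this project:

```python
import os

import cv2
import numpy as np


def load_face_dataset(root="facepics", size=(160, 160)):
    """Load every extracted face crop under `root`, labelled by its sub-folder (video) name."""
    images, labels = [], []
    for video_name in sorted(os.listdir(root)):
        video_dir = os.path.join(root, video_name)
        if not os.path.isdir(video_dir):
            continue
        for fname in os.listdir(video_dir):
            if not fname.endswith(".jpg"):
                continue
            img = cv2.imread(os.path.join(video_dir, fname))
            if img is None:
                continue
            # resize every crop to a common shape so the stack below is rectangular
            images.append(cv2.resize(img, size))
            labels.append(video_name)
    return np.array(images), np.array(labels)
```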
33 | 34 | 35 | 36 | ## Results 37 | ![alt text](https://raw.githubusercontent.com/wiki/Linzaer/Face-Track-Detect-Extract/pic4.gif "scene 1") 38 | ![alt text](https://raw.githubusercontent.com/wiki/Linzaer/Face-Track-Detect-Extract/pic5.jpg "faces extracted") 39 | 40 | ## Special Thanks to: 41 | * [**experimenting-with-sort**](https://github.com/ZidanMusk/experimenting-with-sort) 42 | 43 | ## License 44 | MIT LICENSE 45 | 46 | -------------------------------------------------------------------------------- /align/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Linzaer/Face-Track-Detect-Extract/384bdb3a9f88127baf96a41dfd21cb3e56d56b6e/align/__init__.py -------------------------------------------------------------------------------- /align/det1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Linzaer/Face-Track-Detect-Extract/384bdb3a9f88127baf96a41dfd21cb3e56d56b6e/align/det1.npy -------------------------------------------------------------------------------- /align/det2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Linzaer/Face-Track-Detect-Extract/384bdb3a9f88127baf96a41dfd21cb3e56d56b6e/align/det2.npy -------------------------------------------------------------------------------- /align/det3.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Linzaer/Face-Track-Detect-Extract/384bdb3a9f88127baf96a41dfd21cb3e56d56b6e/align/det3.npy -------------------------------------------------------------------------------- /align/detect_face.py: -------------------------------------------------------------------------------- 1 | """ Tensorflow implementation of the face detection / alignment algorithm found at 2 | https://github.com/kpzhang93/MTCNN_face_detection_alignment 3 | """ 4 | # MIT License 5 | # 6 | # Copyright (c) 2016 David Sandberg 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
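# Structure of this file: PNet, RNet and ONet below form the standard three-stage
# MTCNN cascade. PNet is a fully convolutional proposal network run over an image
# pyramid (12x12 cells, stride 2), RNet re-scores and refines the surviving candidates
# on 24x24 crops, and ONet produces the final boxes plus 5 facial landmarks from
# 48x48 crops. Each stage applies bounding-box regression (bbreg) and non-maximum
# suppression (nms); detect_face() runs the cascade on a single image and
# bulk_detect_face() batches it over several images.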
25 | 26 | from __future__ import absolute_import 27 | from __future__ import division 28 | from __future__ import print_function 29 | 30 | import os 31 | 32 | # from math import floor 33 | import cv2 34 | import numpy as np 35 | import tensorflow as tf 36 | from six import string_types, iteritems 37 | 38 | 39 | def layer(op): 40 | '''Decorator for composable network layers.''' 41 | 42 | def layer_decorated(self, *args, **kwargs): 43 | # Automatically set a name if not provided. 44 | name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) 45 | # Figure out the layer inputs. 46 | if len(self.terminals) == 0: 47 | raise RuntimeError('No input variables found for layer %s.' % name) 48 | elif len(self.terminals) == 1: 49 | layer_input = self.terminals[0] 50 | else: 51 | layer_input = list(self.terminals) 52 | # Perform the operation and get the output. 53 | layer_output = op(self, layer_input, *args, **kwargs) 54 | # Add to layer LUT. 55 | self.layers[name] = layer_output 56 | # This output is now the input for the next layer. 57 | self.feed(layer_output) 58 | # Return self for chained calls. 59 | return self 60 | 61 | return layer_decorated 62 | 63 | 64 | class Network(object): 65 | def __init__(self, inputs, trainable=True): 66 | # The input nodes for this network 67 | self.inputs = inputs 68 | # The current list of terminal nodes 69 | self.terminals = [] 70 | # Mapping from layer names to layers 71 | self.layers = dict(inputs) 72 | # If true, the resulting variables are set as trainable 73 | self.trainable = trainable 74 | 75 | self.setup() 76 | 77 | def setup(self): 78 | '''Construct the network. ''' 79 | raise NotImplementedError('Must be implemented by the subclass.') 80 | 81 | def load(self, data_path, session, ignore_missing=False): 82 | '''Load network weights. 83 | data_path: The path to the numpy-serialized network weights 84 | session: The current TensorFlow session 85 | ignore_missing: If true, serialized weights for missing layers are ignored. 86 | ''' 87 | data_dict = np.load(data_path, encoding='latin1').item() # pylint: disable=no-member 88 | 89 | for op_name in data_dict: 90 | with tf.variable_scope(op_name, reuse=True): 91 | for param_name, data in iteritems(data_dict[op_name]): 92 | try: 93 | var = tf.get_variable(param_name) 94 | session.run(var.assign(data)) 95 | except ValueError: 96 | if not ignore_missing: 97 | raise 98 | 99 | def feed(self, *args): 100 | '''Set the input(s) for the next operation by replacing the terminal nodes. 101 | The arguments can be either layer names or the actual layers. 102 | ''' 103 | assert len(args) != 0 104 | self.terminals = [] 105 | for fed_layer in args: 106 | if isinstance(fed_layer, string_types): 107 | try: 108 | fed_layer = self.layers[fed_layer] 109 | except KeyError: 110 | raise KeyError('Unknown layer name fed: %s' % fed_layer) 111 | self.terminals.append(fed_layer) 112 | return self 113 | 114 | def get_output(self): 115 | '''Returns the current network output.''' 116 | return self.terminals[-1] 117 | 118 | def get_unique_name(self, prefix): 119 | '''Returns an index-suffixed unique name for the given prefix. 120 | This is used for auto-generating layer names based on the type-prefix. 
121 | ''' 122 | ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1 123 | return '%s_%d' % (prefix, ident) 124 | 125 | def make_var(self, name, shape): 126 | '''Creates a new TensorFlow variable.''' 127 | return tf.get_variable(name, shape, trainable=self.trainable) 128 | 129 | def validate_padding(self, padding): 130 | '''Verifies that the padding is one of the supported ones.''' 131 | assert padding in ('SAME', 'VALID') 132 | 133 | @layer 134 | def conv(self, 135 | inp, 136 | k_h, 137 | k_w, 138 | c_o, 139 | s_h, 140 | s_w, 141 | name, 142 | relu=True, 143 | padding='SAME', 144 | group=1, 145 | biased=True): 146 | # Verify that the padding is acceptable 147 | self.validate_padding(padding) 148 | # Get the number of channels in the input 149 | c_i = int(inp.get_shape()[-1]) 150 | # Verify that the grouping parameter is valid 151 | assert c_i % group == 0 152 | assert c_o % group == 0 153 | # Convolution for a given input and kernel 154 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 155 | with tf.variable_scope(name) as scope: 156 | kernel = self.make_var('weights', shape=[k_h, k_w, c_i // group, c_o]) 157 | # This is the common-case. Convolve the input without any further complications. 158 | output = convolve(inp, kernel) 159 | # Add the biases 160 | if biased: 161 | biases = self.make_var('biases', [c_o]) 162 | output = tf.nn.bias_add(output, biases) 163 | if relu: 164 | # ReLU non-linearity 165 | output = tf.nn.relu(output, name=scope.name) 166 | return output 167 | 168 | @layer 169 | def prelu(self, inp, name): 170 | with tf.variable_scope(name): 171 | i = int(inp.get_shape()[-1]) 172 | alpha = self.make_var('alpha', shape=(i,)) 173 | output = tf.nn.relu(inp) + tf.multiply(alpha, -tf.nn.relu(-inp)) 174 | return output 175 | 176 | @layer 177 | def max_pool(self, inp, k_h, k_w, s_h, s_w, name, padding='SAME'): 178 | self.validate_padding(padding) 179 | return tf.nn.max_pool(inp, 180 | ksize=[1, k_h, k_w, 1], 181 | strides=[1, s_h, s_w, 1], 182 | padding=padding, 183 | name=name) 184 | 185 | @layer 186 | def fc(self, inp, num_out, name, relu=True): 187 | with tf.variable_scope(name): 188 | input_shape = inp.get_shape() 189 | if input_shape.ndims == 4: 190 | # The input is spatial. Vectorize it first. 
191 | dim = 1 192 | for d in input_shape[1:].as_list(): 193 | dim *= int(d) 194 | feed_in = tf.reshape(inp, [-1, dim]) 195 | else: 196 | feed_in, dim = (inp, input_shape[-1].value) 197 | weights = self.make_var('weights', shape=[dim, num_out]) 198 | biases = self.make_var('biases', [num_out]) 199 | op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b 200 | fc = op(feed_in, weights, biases, name=name) 201 | return fc 202 | 203 | """ 204 | Multi dimensional softmax, 205 | refer to https://github.com/tensorflow/tensorflow/issues/210 206 | compute softmax along the dimension of target 207 | the native softmax only supports batch_size x dimension 208 | """ 209 | 210 | @layer 211 | def softmax(self, target, axis, name=None): 212 | max_axis = tf.reduce_max(target, axis, keep_dims=True) 213 | target_exp = tf.exp(target - max_axis) 214 | normalize = tf.reduce_sum(target_exp, axis, keep_dims=True) 215 | softmax = tf.div(target_exp, normalize, name) 216 | return softmax 217 | 218 | 219 | class PNet(Network): 220 | def setup(self): 221 | (self.feed('data') # pylint: disable=no-value-for-parameter, no-member 222 | .conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1') 223 | .prelu(name='PReLU1') 224 | .max_pool(2, 2, 2, 2, name='pool1') 225 | .conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2') 226 | .prelu(name='PReLU2') 227 | .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3') 228 | .prelu(name='PReLU3') 229 | .conv(1, 1, 2, 1, 1, relu=False, name='conv4-1') 230 | .softmax(3, name='prob1')) 231 | 232 | (self.feed('PReLU3') # pylint: disable=no-value-for-parameter 233 | .conv(1, 1, 4, 1, 1, relu=False, name='conv4-2')) 234 | 235 | 236 | class RNet(Network): 237 | def setup(self): 238 | (self.feed('data') # pylint: disable=no-value-for-parameter, no-member 239 | .conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1') 240 | .prelu(name='prelu1') 241 | .max_pool(3, 3, 2, 2, name='pool1') 242 | .conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2') 243 | .prelu(name='prelu2') 244 | .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') 245 | .conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3') 246 | .prelu(name='prelu3') 247 | .fc(128, relu=False, name='conv4') 248 | .prelu(name='prelu4') 249 | .fc(2, relu=False, name='conv5-1') 250 | .softmax(1, name='prob1')) 251 | 252 | (self.feed('prelu4') # pylint: disable=no-value-for-parameter 253 | .fc(4, relu=False, name='conv5-2')) 254 | 255 | 256 | class ONet(Network): 257 | def setup(self): 258 | (self.feed('data') # pylint: disable=no-value-for-parameter, no-member 259 | .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1') 260 | .prelu(name='prelu1') 261 | .max_pool(3, 3, 2, 2, name='pool1') 262 | .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2') 263 | .prelu(name='prelu2') 264 | .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') 265 | .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3') 266 | .prelu(name='prelu3') 267 | .max_pool(2, 2, 2, 2, name='pool3') 268 | .conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4') 269 | .prelu(name='prelu4') 270 | .fc(256, relu=False, name='conv5') 271 | .prelu(name='prelu5') 272 | .fc(2, relu=False, name='conv6-1') 273 | .softmax(1, name='prob1')) 274 | 275 | (self.feed('prelu5') # pylint: disable=no-value-for-parameter 276 | .fc(4, relu=False, name='conv6-2')) 277 | 278 | (self.feed('prelu5') # pylint: disable=no-value-for-parameter 279 | .fc(10, relu=False, name='conv6-3')) 280 | 281 | 282 | def 
create_mtcnn(sess, model_path): 283 | if not model_path: 284 | model_path, _ = os.path.split(os.path.realpath(__file__)) 285 | 286 | with tf.variable_scope('pnet'): 287 | data = tf.placeholder(tf.float32, (None, None, None, 3), 'input') 288 | pnet = PNet({'data': data}) 289 | pnet.load(os.path.join(model_path, 'det1.npy'), sess) 290 | with tf.variable_scope('rnet'): 291 | data = tf.placeholder(tf.float32, (None, 24, 24, 3), 'input') 292 | rnet = RNet({'data': data}) 293 | rnet.load(os.path.join(model_path, 'det2.npy'), sess) 294 | with tf.variable_scope('onet'): 295 | data = tf.placeholder(tf.float32, (None, 48, 48, 3), 'input') 296 | onet = ONet({'data': data}) 297 | onet.load(os.path.join(model_path, 'det3.npy'), sess) 298 | 299 | pnet_fun = lambda img: sess.run(('pnet/conv4-2/BiasAdd:0', 'pnet/prob1:0'), feed_dict={'pnet/input:0': img}) 300 | rnet_fun = lambda img: sess.run(('rnet/conv5-2/conv5-2:0', 'rnet/prob1:0'), feed_dict={'rnet/input:0': img}) 301 | onet_fun = lambda img: sess.run(('onet/conv6-2/conv6-2:0', 'onet/conv6-3/conv6-3:0', 'onet/prob1:0'), 302 | feed_dict={'onet/input:0': img}) 303 | return pnet_fun, rnet_fun, onet_fun 304 | 305 | 306 | def detect_face(img, minsize, pnet, rnet, onet, threshold, factor): 307 | # im: input image 308 | # minsize: minimum of faces' size 309 | # pnet, rnet, onet: caffemodel 310 | # threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold 311 | # fastresize: resize img from last scale (using in high-resolution images) if fastresize==true 312 | factor_count = 0 313 | total_boxes = np.empty((0, 9)) 314 | points = np.empty(0) 315 | h = img.shape[0] 316 | w = img.shape[1] 317 | minl = np.amin([h, w]) 318 | m = 12.0 / minsize 319 | minl = minl * m 320 | # creat scale pyramid 321 | scales = [] 322 | while minl >= 12: 323 | scales += [m * np.power(factor, factor_count)] 324 | minl = minl * factor 325 | factor_count += 1 326 | 327 | # first stage 328 | for j in range(len(scales)): 329 | scale = scales[j] 330 | hs = int(np.ceil(h * scale)) 331 | ws = int(np.ceil(w * scale)) 332 | im_data = imresample(img, (hs, ws)) 333 | im_data = (im_data - 127.5) * 0.0078125 334 | img_x = np.expand_dims(im_data, 0) 335 | img_y = np.transpose(img_x, (0, 2, 1, 3)) 336 | out = pnet(img_y) 337 | out0 = np.transpose(out[0], (0, 2, 1, 3)) 338 | out1 = np.transpose(out[1], (0, 2, 1, 3)) 339 | 340 | boxes, _ = generateBoundingBox(out1[0, :, :, 1].copy(), out0[0, :, :, :].copy(), scale, threshold[0]) 341 | 342 | # inter-scale nms 343 | pick = nms(boxes.copy(), 0.5, 'Union') 344 | if boxes.size > 0 and pick.size > 0: 345 | boxes = boxes[pick, :] 346 | total_boxes = np.append(total_boxes, boxes, axis=0) 347 | 348 | numbox = total_boxes.shape[0] 349 | if numbox > 0: 350 | pick = nms(total_boxes.copy(), 0.7, 'Union') 351 | total_boxes = total_boxes[pick, :] 352 | regw = total_boxes[:, 2] - total_boxes[:, 0] 353 | regh = total_boxes[:, 3] - total_boxes[:, 1] 354 | qq1 = total_boxes[:, 0] + total_boxes[:, 5] * regw 355 | qq2 = total_boxes[:, 1] + total_boxes[:, 6] * regh 356 | qq3 = total_boxes[:, 2] + total_boxes[:, 7] * regw 357 | qq4 = total_boxes[:, 3] + total_boxes[:, 8] * regh 358 | total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:, 4]])) 359 | total_boxes = rerec(total_boxes.copy()) 360 | total_boxes[:, 0:4] = np.fix(total_boxes[:, 0:4]).astype(np.int32) 361 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) 362 | 363 | numbox = total_boxes.shape[0] 364 | if numbox > 0: 365 | # second stage 366 | tempimg = 
np.zeros((24, 24, 3, numbox)) 367 | for k in range(0, numbox): 368 | tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) 369 | tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = img[y[k] - 1:ey[k], x[k] - 1:ex[k], :] 370 | if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: 371 | tempimg[:, :, :, k] = imresample(tmp, (24, 24)) 372 | else: 373 | return np.empty() 374 | tempimg = (tempimg - 127.5) * 0.0078125 375 | tempimg1 = np.transpose(tempimg, (3, 1, 0, 2)) 376 | out = rnet(tempimg1) 377 | out0 = np.transpose(out[0]) 378 | out1 = np.transpose(out[1]) 379 | score = out1[1, :] 380 | ipass = np.where(score > threshold[1]) 381 | total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(), np.expand_dims(score[ipass].copy(), 1)]) 382 | mv = out0[:, ipass[0]] 383 | if total_boxes.shape[0] > 0: 384 | pick = nms(total_boxes, 0.7, 'Union') 385 | total_boxes = total_boxes[pick, :] 386 | total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:, pick])) 387 | total_boxes = rerec(total_boxes.copy()) 388 | 389 | numbox = total_boxes.shape[0] 390 | if numbox > 0: 391 | # third stage 392 | total_boxes = np.fix(total_boxes).astype(np.int32) 393 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) 394 | tempimg = np.zeros((48, 48, 3, numbox)) 395 | for k in range(0, numbox): 396 | tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) 397 | tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = img[y[k] - 1:ey[k], x[k] - 1:ex[k], :] 398 | if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: 399 | tempimg[:, :, :, k] = imresample(tmp, (48, 48)) 400 | else: 401 | return np.empty() 402 | tempimg = (tempimg - 127.5) * 0.0078125 403 | tempimg1 = np.transpose(tempimg, (3, 1, 0, 2)) 404 | out = onet(tempimg1) 405 | out0 = np.transpose(out[0]) 406 | out1 = np.transpose(out[1]) 407 | out2 = np.transpose(out[2]) 408 | score = out2[1, :] 409 | points = out1 410 | ipass = np.where(score > threshold[2]) 411 | points = points[:, ipass[0]] 412 | total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(), np.expand_dims(score[ipass].copy(), 1)]) 413 | mv = out0[:, ipass[0]] 414 | 415 | w = total_boxes[:, 2] - total_boxes[:, 0] + 1 416 | h = total_boxes[:, 3] - total_boxes[:, 1] + 1 417 | points[0:5, :] = np.tile(w, (5, 1)) * points[0:5, :] + np.tile(total_boxes[:, 0], (5, 1)) - 1 418 | points[5:10, :] = np.tile(h, (5, 1)) * points[5:10, :] + np.tile(total_boxes[:, 1], (5, 1)) - 1 419 | if total_boxes.shape[0] > 0: 420 | total_boxes = bbreg(total_boxes.copy(), np.transpose(mv)) 421 | pick = nms(total_boxes.copy(), 0.7, 'Min') 422 | total_boxes = total_boxes[pick, :] 423 | points = points[:, pick] 424 | 425 | return total_boxes, points 426 | 427 | 428 | def bulk_detect_face(images, detection_window_size_ratio, pnet, rnet, onet, threshold, factor): 429 | # im: input image 430 | # minsize: minimum of faces' size 431 | # pnet, rnet, onet: caffemodel 432 | # threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold [0-1] 433 | 434 | all_scales = [None] * len(images) 435 | images_with_boxes = [None] * len(images) 436 | 437 | for i in range(len(images)): 438 | images_with_boxes[i] = {'total_boxes': np.empty((0, 9))} 439 | 440 | # create scale pyramid 441 | for index, img in enumerate(images): 442 | all_scales[index] = [] 443 | h = img.shape[0] 444 | w = img.shape[1] 445 | minsize = int(detection_window_size_ratio * np.minimum(w, h)) 446 | factor_count = 0 447 | minl = np.amin([h, w]) 448 | if minsize <= 12: 449 | minsize = 12 450 | 451 | m = 12.0 / minsize 
452 | minl = minl * m 453 | while minl >= 12: 454 | all_scales[index].append(m * np.power(factor, factor_count)) 455 | minl = minl * factor 456 | factor_count += 1 457 | 458 | # # # # # # # # # # # # # 459 | # first stage - fast proposal network (pnet) to obtain face candidates 460 | # # # # # # # # # # # # # 461 | 462 | images_obj_per_resolution = {} 463 | 464 | # TODO: use some type of rounding to number module 8 to increase probability that pyramid images will have the same resolution across input images 465 | 466 | for index, scales in enumerate(all_scales): 467 | h = images[index].shape[0] 468 | w = images[index].shape[1] 469 | 470 | for scale in scales: 471 | hs = int(np.ceil(h * scale)) 472 | ws = int(np.ceil(w * scale)) 473 | 474 | if (ws, hs) not in images_obj_per_resolution: 475 | images_obj_per_resolution[(ws, hs)] = [] 476 | 477 | im_data = imresample(images[index], (hs, ws)) 478 | im_data = (im_data - 127.5) * 0.0078125 479 | img_y = np.transpose(im_data, (1, 0, 2)) # caffe uses different dimensions ordering 480 | images_obj_per_resolution[(ws, hs)].append({'scale': scale, 'image': img_y, 'index': index}) 481 | 482 | for resolution in images_obj_per_resolution: 483 | images_per_resolution = [i['image'] for i in images_obj_per_resolution[resolution]] 484 | outs = pnet(images_per_resolution) 485 | 486 | for index in range(len(outs[0])): 487 | scale = images_obj_per_resolution[resolution][index]['scale'] 488 | image_index = images_obj_per_resolution[resolution][index]['index'] 489 | out0 = np.transpose(outs[0][index], (1, 0, 2)) 490 | out1 = np.transpose(outs[1][index], (1, 0, 2)) 491 | 492 | boxes, _ = generateBoundingBox(out1[:, :, 1].copy(), out0[:, :, :].copy(), scale, threshold[0]) 493 | 494 | # inter-scale nms 495 | pick = nms(boxes.copy(), 0.5, 'Union') 496 | if boxes.size > 0 and pick.size > 0: 497 | boxes = boxes[pick, :] 498 | images_with_boxes[image_index]['total_boxes'] = np.append(images_with_boxes[image_index]['total_boxes'], 499 | boxes, 500 | axis=0) 501 | 502 | for index, image_obj in enumerate(images_with_boxes): 503 | numbox = image_obj['total_boxes'].shape[0] 504 | if numbox > 0: 505 | h = images[index].shape[0] 506 | w = images[index].shape[1] 507 | pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Union') 508 | image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] 509 | regw = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] 510 | regh = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] 511 | qq1 = image_obj['total_boxes'][:, 0] + image_obj['total_boxes'][:, 5] * regw 512 | qq2 = image_obj['total_boxes'][:, 1] + image_obj['total_boxes'][:, 6] * regh 513 | qq3 = image_obj['total_boxes'][:, 2] + image_obj['total_boxes'][:, 7] * regw 514 | qq4 = image_obj['total_boxes'][:, 3] + image_obj['total_boxes'][:, 8] * regh 515 | image_obj['total_boxes'] = np.transpose(np.vstack([qq1, qq2, qq3, qq4, image_obj['total_boxes'][:, 4]])) 516 | image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy()) 517 | image_obj['total_boxes'][:, 0:4] = np.fix(image_obj['total_boxes'][:, 0:4]).astype(np.int32) 518 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h) 519 | 520 | numbox = image_obj['total_boxes'].shape[0] 521 | tempimg = np.zeros((24, 24, 3, numbox)) 522 | 523 | if numbox > 0: 524 | for k in range(0, numbox): 525 | tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) 526 | tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :] 527 | if tmp.shape[0] > 0 and 
tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: 528 | tempimg[:, :, :, k] = imresample(tmp, (24, 24)) 529 | else: 530 | return np.empty() 531 | 532 | tempimg = (tempimg - 127.5) * 0.0078125 533 | image_obj['rnet_input'] = np.transpose(tempimg, (3, 1, 0, 2)) 534 | 535 | # # # # # # # # # # # # # 536 | # second stage - refinement of face candidates with rnet 537 | # # # # # # # # # # # # # 538 | 539 | bulk_rnet_input = np.empty((0, 24, 24, 3)) 540 | for index, image_obj in enumerate(images_with_boxes): 541 | if 'rnet_input' in image_obj: 542 | bulk_rnet_input = np.append(bulk_rnet_input, image_obj['rnet_input'], axis=0) 543 | 544 | out = rnet(bulk_rnet_input) 545 | out0 = np.transpose(out[0]) 546 | out1 = np.transpose(out[1]) 547 | score = out1[1, :] 548 | 549 | i = 0 550 | for index, image_obj in enumerate(images_with_boxes): 551 | if 'rnet_input' not in image_obj: 552 | continue 553 | 554 | rnet_input_count = image_obj['rnet_input'].shape[0] 555 | score_per_image = score[i:i + rnet_input_count] 556 | out0_per_image = out0[:, i:i + rnet_input_count] 557 | 558 | ipass = np.where(score_per_image > threshold[1]) 559 | image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(), 560 | np.expand_dims(score_per_image[ipass].copy(), 1)]) 561 | 562 | mv = out0_per_image[:, ipass[0]] 563 | 564 | if image_obj['total_boxes'].shape[0] > 0: 565 | h = images[index].shape[0] 566 | w = images[index].shape[1] 567 | pick = nms(image_obj['total_boxes'], 0.7, 'Union') 568 | image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] 569 | image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv[:, pick])) 570 | image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy()) 571 | 572 | numbox = image_obj['total_boxes'].shape[0] 573 | 574 | if numbox > 0: 575 | tempimg = np.zeros((48, 48, 3, numbox)) 576 | image_obj['total_boxes'] = np.fix(image_obj['total_boxes']).astype(np.int32) 577 | dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h) 578 | 579 | for k in range(0, numbox): 580 | tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) 581 | tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :] 582 | if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: 583 | tempimg[:, :, :, k] = imresample(tmp, (48, 48)) 584 | else: 585 | return np.empty() 586 | tempimg = (tempimg - 127.5) * 0.0078125 587 | image_obj['onet_input'] = np.transpose(tempimg, (3, 1, 0, 2)) 588 | 589 | i += rnet_input_count 590 | 591 | # # # # # # # # # # # # # 592 | # third stage - further refinement and facial landmarks positions with onet 593 | # # # # # # # # # # # # # 594 | 595 | bulk_onet_input = np.empty((0, 48, 48, 3)) 596 | for index, image_obj in enumerate(images_with_boxes): 597 | if 'onet_input' in image_obj: 598 | bulk_onet_input = np.append(bulk_onet_input, image_obj['onet_input'], axis=0) 599 | 600 | out = onet(bulk_onet_input) 601 | 602 | out0 = np.transpose(out[0]) 603 | out1 = np.transpose(out[1]) 604 | out2 = np.transpose(out[2]) 605 | score = out2[1, :] 606 | points = out1 607 | 608 | i = 0 609 | ret = [] 610 | for index, image_obj in enumerate(images_with_boxes): 611 | if 'onet_input' not in image_obj: 612 | ret.append(None) 613 | continue 614 | 615 | onet_input_count = image_obj['onet_input'].shape[0] 616 | 617 | out0_per_image = out0[:, i:i + onet_input_count] 618 | score_per_image = score[i:i + onet_input_count] 619 | points_per_image = points[:, i:i + 
onet_input_count] 620 | 621 | ipass = np.where(score_per_image > threshold[2]) 622 | points_per_image = points_per_image[:, ipass[0]] 623 | 624 | image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(), 625 | np.expand_dims(score_per_image[ipass].copy(), 1)]) 626 | mv = out0_per_image[:, ipass[0]] 627 | 628 | w = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] + 1 629 | h = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] + 1 630 | points_per_image[0:5, :] = np.tile(w, (5, 1)) * points_per_image[0:5, :] + np.tile( 631 | image_obj['total_boxes'][:, 0], (5, 1)) - 1 632 | points_per_image[5:10, :] = np.tile(h, (5, 1)) * points_per_image[5:10, :] + np.tile( 633 | image_obj['total_boxes'][:, 1], (5, 1)) - 1 634 | 635 | if image_obj['total_boxes'].shape[0] > 0: 636 | image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv)) 637 | pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Min') 638 | image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] 639 | points_per_image = points_per_image[:, pick] 640 | 641 | ret.append((image_obj['total_boxes'], points_per_image)) 642 | else: 643 | ret.append(None) 644 | 645 | i += onet_input_count 646 | 647 | return ret 648 | 649 | 650 | # function [boundingbox] = bbreg(boundingbox,reg) 651 | def bbreg(boundingbox, reg): 652 | # calibrate bounding boxes 653 | if reg.shape[1] == 1: 654 | reg = np.reshape(reg, (reg.shape[2], reg.shape[3])) 655 | 656 | w = boundingbox[:, 2] - boundingbox[:, 0] + 1 657 | h = boundingbox[:, 3] - boundingbox[:, 1] + 1 658 | b1 = boundingbox[:, 0] + reg[:, 0] * w 659 | b2 = boundingbox[:, 1] + reg[:, 1] * h 660 | b3 = boundingbox[:, 2] + reg[:, 2] * w 661 | b4 = boundingbox[:, 3] + reg[:, 3] * h 662 | boundingbox[:, 0:4] = np.transpose(np.vstack([b1, b2, b3, b4])) 663 | return boundingbox 664 | 665 | 666 | def generateBoundingBox(imap, reg, scale, t): 667 | # use heatmap to generate bounding boxes 668 | stride = 2 669 | cellsize = 12 670 | 671 | imap = np.transpose(imap) 672 | dx1 = np.transpose(reg[:, :, 0]) 673 | dy1 = np.transpose(reg[:, :, 1]) 674 | dx2 = np.transpose(reg[:, :, 2]) 675 | dy2 = np.transpose(reg[:, :, 3]) 676 | y, x = np.where(imap >= t) 677 | if y.shape[0] == 1: 678 | dx1 = np.flipud(dx1) 679 | dy1 = np.flipud(dy1) 680 | dx2 = np.flipud(dx2) 681 | dy2 = np.flipud(dy2) 682 | score = imap[(y, x)] 683 | reg = np.transpose(np.vstack([dx1[(y, x)], dy1[(y, x)], dx2[(y, x)], dy2[(y, x)]])) 684 | if reg.size == 0: 685 | reg = np.empty((0, 3)) 686 | bb = np.transpose(np.vstack([y, x])) 687 | q1 = np.fix((stride * bb + 1) / scale) 688 | q2 = np.fix((stride * bb + cellsize - 1 + 1) / scale) 689 | boundingbox = np.hstack([q1, q2, np.expand_dims(score, 1), reg]) 690 | return boundingbox, reg 691 | 692 | 693 | # function pick = nms(boxes,threshold,type) 694 | def nms(boxes, threshold, method): 695 | if boxes.size == 0: 696 | return np.empty((0, 3)) 697 | x1 = boxes[:, 0] 698 | y1 = boxes[:, 1] 699 | x2 = boxes[:, 2] 700 | y2 = boxes[:, 3] 701 | s = boxes[:, 4] 702 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 703 | I = np.argsort(s) 704 | pick = np.zeros_like(s, dtype=np.int16) 705 | counter = 0 706 | while I.size > 0: 707 | i = I[-1] 708 | pick[counter] = i 709 | counter += 1 710 | idx = I[0:-1] 711 | xx1 = np.maximum(x1[i], x1[idx]) 712 | yy1 = np.maximum(y1[i], y1[idx]) 713 | xx2 = np.minimum(x2[i], x2[idx]) 714 | yy2 = np.minimum(y2[i], y2[idx]) 715 | w = np.maximum(0.0, xx2 - xx1 + 1) 716 | h = np.maximum(0.0, yy2 - yy1 + 1) 717 | inter = w * 
h 718 | if method is 'Min': 719 | o = inter / np.minimum(area[i], area[idx]) 720 | else: 721 | o = inter / (area[i] + area[idx] - inter) 722 | I = I[np.where(o <= threshold)] 723 | pick = pick[0:counter] 724 | return pick 725 | 726 | 727 | # function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h) 728 | def pad(total_boxes, w, h): 729 | # compute the padding coordinates (pad the bounding boxes to square) 730 | tmpw = (total_boxes[:, 2] - total_boxes[:, 0] + 1).astype(np.int32) 731 | tmph = (total_boxes[:, 3] - total_boxes[:, 1] + 1).astype(np.int32) 732 | numbox = total_boxes.shape[0] 733 | 734 | dx = np.ones((numbox), dtype=np.int32) 735 | dy = np.ones((numbox), dtype=np.int32) 736 | edx = tmpw.copy().astype(np.int32) 737 | edy = tmph.copy().astype(np.int32) 738 | 739 | x = total_boxes[:, 0].copy().astype(np.int32) 740 | y = total_boxes[:, 1].copy().astype(np.int32) 741 | ex = total_boxes[:, 2].copy().astype(np.int32) 742 | ey = total_boxes[:, 3].copy().astype(np.int32) 743 | 744 | tmp = np.where(ex > w) 745 | edx.flat[tmp] = np.expand_dims(-ex[tmp] + w + tmpw[tmp], 1) 746 | ex[tmp] = w 747 | 748 | tmp = np.where(ey > h) 749 | edy.flat[tmp] = np.expand_dims(-ey[tmp] + h + tmph[tmp], 1) 750 | ey[tmp] = h 751 | 752 | tmp = np.where(x < 1) 753 | dx.flat[tmp] = np.expand_dims(2 - x[tmp], 1) 754 | x[tmp] = 1 755 | 756 | tmp = np.where(y < 1) 757 | dy.flat[tmp] = np.expand_dims(2 - y[tmp], 1) 758 | y[tmp] = 1 759 | 760 | return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph 761 | 762 | 763 | # function [bboxA] = rerec(bboxA) 764 | def rerec(bboxA): 765 | # convert bboxA to square 766 | h = bboxA[:, 3] - bboxA[:, 1] 767 | w = bboxA[:, 2] - bboxA[:, 0] 768 | l = np.maximum(w, h) 769 | bboxA[:, 0] = bboxA[:, 0] + w * 0.5 - l * 0.5 770 | bboxA[:, 1] = bboxA[:, 1] + h * 0.5 - l * 0.5 771 | bboxA[:, 2:4] = bboxA[:, 0:2] + np.transpose(np.tile(l, (2, 1))) 772 | return bboxA 773 | 774 | 775 | def imresample(img, sz): 776 | im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_AREA) # @UndefinedVariable 777 | return im_data 778 | 779 | # This method is kept for debugging purpose 780 | 781 | # h=img.shape[0] 782 | # w=img.shape[1] 783 | # hs, ws = sz 784 | # dx = float(w) / ws 785 | # dy = float(h) / hs 786 | # im_data = np.zeros((hs,ws,3)) 787 | # for a1 in range(0,hs): 788 | # for a2 in range(0,ws): 789 | # for a3 in range(0,3): 790 | # im_data[a1,a2,a3] = img[int(floor(a1*dy)),int(floor(a2*dx)),a3] 791 | # return im_data 792 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Linzaer/Face-Track-Detect-Extract/384bdb3a9f88127baf96a41dfd21cb3e56d56b6e/lib/__init__.py -------------------------------------------------------------------------------- /lib/face_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def judge_side_face(facial_landmarks): 5 | wide_dist = np.linalg.norm(facial_landmarks[0] - facial_landmarks[1]) 6 | high_dist = np.linalg.norm(facial_landmarks[0] - facial_landmarks[3]) 7 | dist_rate = high_dist / wide_dist 8 | 9 | # cal std 10 | vec_A = facial_landmarks[0] - facial_landmarks[2] 11 | vec_B = facial_landmarks[1] - facial_landmarks[2] 12 | vec_C = facial_landmarks[3] - facial_landmarks[2] 13 | vec_D = facial_landmarks[4] - facial_landmarks[2] 14 | dist_A = np.linalg.norm(vec_A) 15 | dist_B = np.linalg.norm(vec_B) 16 | dist_C = 
np.linalg.norm(vec_C) 17 | dist_D = np.linalg.norm(vec_D) 18 | 19 | # cal rate 20 | high_rate = dist_A / dist_C 21 | width_rate = dist_C / dist_D 22 | high_ratio_variance = np.fabs(high_rate - 1.1) # smaller is better 23 | width_ratio_variance = np.fabs(width_rate - 1) 24 | 25 | return dist_rate, high_ratio_variance, width_ratio_variance 26 | -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import uuid 5 | from operator import itemgetter 6 | 7 | import cv2 8 | import project_root_dir 9 | 10 | log_file_root_path = os.path.join(project_root_dir.project_dir, 'logs') 11 | log_time = time.strftime('%Y_%m_%d_%H_%M', time.localtime(time.time())) 12 | 13 | 14 | def mkdir(path): 15 | path.strip() 16 | path.rstrip('\\') 17 | isExists = os.path.exists(path) 18 | if not isExists: 19 | os.makedirs(path) 20 | 21 | 22 | def save_to_file(root_dic, tracker): 23 | filter_face_addtional_attribute_list = [] 24 | for item in tracker.face_addtional_attribute: 25 | if item[2] < 1.4 and item[4] < 1: # recommended thresold value 26 | filter_face_addtional_attribute_list.append(item) 27 | if len(filter_face_addtional_attribute_list) > 0: 28 | score_reverse_sorted_list = sorted(filter_face_addtional_attribute_list, key=itemgetter(4)) 29 | mkdir(root_dic) 30 | cv2.imwrite("{0}/{1}.jpg".format(root_dic, str(uuid.uuid1())), score_reverse_sorted_list[0][0]) 31 | 32 | 33 | class Logger: 34 | 35 | def __init__(self, module_name="MOT"): 36 | super().__init__() 37 | path_join = os.path.join(log_file_root_path, module_name) 38 | mkdir(path_join) 39 | 40 | self.logger = logging.getLogger(module_name) 41 | self.logger.setLevel(logging.INFO) 42 | log_file = os.path.join(path_join, '{}.log'.format(log_time)) 43 | if not self.logger.handlers: 44 | fh = logging.FileHandler(log_file, encoding='utf-8') 45 | fh.setLevel(logging.INFO) 46 | 47 | ch = logging.StreamHandler() 48 | ch.setLevel(logging.INFO) 49 | formatter = logging.Formatter( 50 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s - %(threadName)s - %(process)d ") 51 | ch.setFormatter(formatter) 52 | fh.setFormatter(formatter) 53 | self.logger.addHandler(ch) 54 | self.logger.addHandler(fh) 55 | 56 | def error(self, msg, *args, **kwargs): 57 | if self.logger is not None: 58 | self.logger.error(msg, *args, **kwargs) 59 | 60 | def info(self, msg, *args, **kwargs): 61 | if self.logger is not None: 62 | self.logger.info(msg, *args, **kwargs) 63 | 64 | def warn(self, msg, *args, **kwargs): 65 | if self.logger is not None: 66 | self.logger.warning(msg, *args, **kwargs) 67 | 68 | def warning(self, msg, *args, **kwargs): 69 | if self.logger is not None: 70 | self.logger.warning(msg, *args, **kwargs) 71 | 72 | def exception(self, msg, *args, exc_info=True, **kwargs): 73 | if self.logger is not None: 74 | self.logger.exception(msg, *args, exc_info=True, **kwargs) 75 | -------------------------------------------------------------------------------- /project_root_dir.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | project_dir = os.path.dirname(os.path.abspath(__file__)) 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | scikit-learn 3 | opencv-python 4 | numpy 5 | filterpy 6 | tensorflow-gpu 
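To make the landmark geometry in `judge_side_face` and the filter in `save_to_file` concrete, here is a minimal usage sketch (the landmark coordinates are invented for illustration):

```python
import numpy as np

from lib.face_utils import judge_side_face

# 5 MTCNN landmarks as (x, y): left eye, right eye, nose, left mouth corner, right mouth corner
facial_landmarks = np.array([[62.0, 80.0], [98.0, 80.0], [80.0, 102.0],
                             [66.0, 122.0], [95.0, 122.0]])

dist_rate, high_ratio_variance, width_rate = judge_side_face(facial_landmarks)

# save_to_file() keeps a crop only when dist_rate < 1.4 and width_rate < 1,
# then writes the crop with the smallest width_rate (the most frontal-looking face).
print(dist_rate, high_ratio_variance, width_rate)
```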
-------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Linzaer/Face-Track-Detect-Extract/384bdb3a9f88127baf96a41dfd21cb3e56d56b6e/src/__init__.py -------------------------------------------------------------------------------- /src/data_association.py: -------------------------------------------------------------------------------- 1 | """ 2 | As implemented in https://github.com/abewley/sort but with some modifications 3 | 4 | For each detected item, it computes the intersection over union (IOU) w.r.t. each tracked object. (IOU matrix) 5 | Then, it applies the Hungarian algorithm (via linear_assignment) to assign each det. item to the best possible 6 | tracked item (i.e. to the one with max. IOU). 7 | 8 | Note: a more recent approach uses a Deep Association Metric instead. 9 | see https://github.com/nwojke/deep_sort 10 | """ 11 | 12 | import numpy as np 13 | from numba import jit 14 | from sklearn.utils.linear_assignment_ import linear_assignment 15 | 16 | 17 | @jit 18 | def iou(bb_test, bb_gt): 19 | """ 20 | Computes IUO between two bboxes in the form [x1,y1,x2,y2] 21 | """ 22 | xx1 = np.maximum(bb_test[0], bb_gt[0]) 23 | yy1 = np.maximum(bb_test[1], bb_gt[1]) 24 | xx2 = np.minimum(bb_test[2], bb_gt[2]) 25 | yy2 = np.minimum(bb_test[3], bb_gt[3]) 26 | w = np.maximum(0., xx2 - xx1) 27 | h = np.maximum(0., yy2 - yy1) 28 | wh = w * h 29 | o = wh / ((bb_test[2] - bb_test[0]) * (bb_test[3] - bb_test[1]) 30 | + (bb_gt[2] - bb_gt[0]) * (bb_gt[3] - bb_gt[1]) - wh) 31 | return (o) 32 | 33 | 34 | def associate_detections_to_trackers(detections, trackers, iou_threshold=0.25): 35 | """ 36 | Assigns detections to tracked object (both represented as bounding boxes) 37 | 38 | Returns 3 lists of matches, unmatched_detections and unmatched_trackers 39 | """ 40 | if len(trackers) == 0: 41 | return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 5), dtype=int) 42 | iou_matrix = np.zeros((len(detections), len(trackers)), dtype=np.float32) 43 | 44 | for d, det in enumerate(detections): 45 | for t, trk in enumerate(trackers): 46 | iou_matrix[d, t] = iou(det, trk) 47 | '''The linear assignment module tries to minimise the total assignment cost. 
48 | In our case we pass -iou_matrix as we want to maximise the total IOU between track predictions and the frame detection.''' 49 | matched_indices = linear_assignment(-iou_matrix) 50 | 51 | unmatched_detections = [] 52 | for d, det in enumerate(detections): 53 | if d not in matched_indices[:, 0]: 54 | unmatched_detections.append(d) 55 | unmatched_trackers = [] 56 | for t, trk in enumerate(trackers): 57 | if t not in matched_indices[:, 1]: 58 | unmatched_trackers.append(t) 59 | 60 | # filter out matched with low IOU 61 | matches = [] 62 | for m in matched_indices: 63 | if iou_matrix[m[0], m[1]] < iou_threshold: 64 | unmatched_detections.append(m[0]) 65 | unmatched_trackers.append(m[1]) 66 | else: 67 | matches.append(m.reshape(1, 2)) 68 | if len(matches) == 0: 69 | matches = np.empty((0, 2), dtype=int) 70 | else: 71 | matches = np.concatenate(matches, axis=0) 72 | 73 | return matches, np.array(unmatched_detections), np.array(unmatched_trackers) 74 | -------------------------------------------------------------------------------- /src/kalman_tracker.py: -------------------------------------------------------------------------------- 1 | """ 2 | As implemented in https://github.com/abewley/sort but with some modifications 3 | """ 4 | 5 | import numpy as np 6 | from filterpy.kalman import KalmanFilter 7 | 8 | '''Motion Model''' 9 | 10 | 11 | class KalmanBoxTracker(object): 12 | """ 13 | This class represents the internal state of individual tracked objects observed as bbox. 14 | """ 15 | count = 0 16 | 17 | def __init__(self, bbox): 18 | """ 19 | Initialises a tracker using initial bounding box. 20 | """ 21 | # define constant velocity model 22 | self.kf = KalmanFilter(dim_x=7, dim_z=4) 23 | self.kf.F = np.array( 24 | [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0], 25 | [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1]]) 26 | self.kf.H = np.array( 27 | [[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0]]) 28 | 29 | self.kf.R[2:, 2:] *= 10. 30 | self.kf.P[4:, 4:] *= 1000. # give high uncertainty to the unobservable initial velocities 31 | self.kf.P *= 10. 32 | self.kf.Q[-1, -1] *= 0.01 33 | self.kf.Q[4:, 4:] *= 0.01 34 | 35 | self.kf.x[:4] = convert_bbox_to_z(bbox) 36 | self.time_since_update = 0 37 | self.id = KalmanBoxTracker.count 38 | KalmanBoxTracker.count += 1 39 | self.history = [] 40 | self.hits = 0 41 | self.hit_streak = 0 42 | self.age = 0 43 | 44 | self.predict_num = 0 # 解决画面中无人脸检测到时而导致的原有追踪器人像预测的漂移bug 45 | 46 | # addtional fields 47 | self.face_addtional_attribute = [] 48 | 49 | def update(self, bbox): 50 | """ 51 | Updates the state vector with observed bbox. 52 | """ 53 | self.time_since_update = 0 54 | self.history = [] 55 | self.hits += 1 56 | self.hit_streak += 1 57 | if bbox != []: 58 | self.kf.update(convert_bbox_to_z(bbox)) 59 | self.predict_num = 0 60 | else: 61 | self.predict_num += 1 62 | 63 | def predict(self): 64 | """ 65 | Advances the state vector and returns the predicted bounding box estimate. 66 | """ 67 | if (self.kf.x[6] + self.kf.x[2]) <= 0: 68 | self.kf.x[6] *= 0.0 69 | self.kf.predict() 70 | self.age += 1 71 | if self.time_since_update > 0: 72 | self.hit_streak = 0 73 | self.time_since_update += 1 74 | self.history.append(convert_x_to_bbox(self.kf.x)) 75 | return self.history[-1][0] 76 | 77 | def get_state(self): 78 | """ 79 | Returns the current bounding box estimate. 
80 | """ 81 | return convert_x_to_bbox(self.kf.x)[0] 82 | 83 | 84 | def convert_bbox_to_z(bbox): 85 | """ 86 | Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form 87 | [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is 88 | the aspect ratio 89 | """ 90 | w = bbox[2] - bbox[0] 91 | h = bbox[3] - bbox[1] 92 | x = bbox[0] + w / 2. 93 | y = bbox[1] + h / 2. 94 | s = w * h # scale is just area 95 | r = w / float(h) 96 | return np.array([x, y, s, r]).reshape((4, 1)) 97 | 98 | 99 | def convert_x_to_bbox(x, score=None): 100 | """ 101 | Takes a bounding box in the centre form [x,y,s,r] and returns it in the form 102 | [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right 103 | """ 104 | w = np.sqrt(x[2] * x[3]) 105 | h = x[2] / w 106 | if score is None: 107 | return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2.]).reshape((1, 4)) 108 | else: 109 | return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score]).reshape((1, 5)) 110 | -------------------------------------------------------------------------------- /src/sort.py: -------------------------------------------------------------------------------- 1 | """ 2 | As implemented in https://github.com/abewley/sort but with some modifications 3 | """ 4 | 5 | from __future__ import print_function 6 | 7 | import lib.utils as utils 8 | import numpy as np 9 | from src.data_association import associate_detections_to_trackers 10 | from src.kalman_tracker import KalmanBoxTracker 11 | 12 | logger = utils.Logger("MOT") 13 | 14 | 15 | class Sort: 16 | 17 | def __init__(self, max_age=1, min_hits=3): 18 | """ 19 | Sets key parameters for SORT 20 | """ 21 | self.max_age = max_age 22 | self.min_hits = min_hits 23 | self.trackers = [] 24 | self.frame_count = 0 25 | 26 | def update(self, dets, img_size, root_dic, addtional_attribute_list, predict_num): 27 | """ 28 | Params: 29 | dets - a numpy array of detections in the format [[x,y,w,h,score],[x,y,w,h,score],...] 30 | Requires: this method must be called once for each frame even with empty detections. 31 | Returns the a similar array, where the last column is the object ID. 32 | 33 | NOTE:as in practical realtime MOT, the detector doesn't run on every single frame 34 | """ 35 | self.frame_count += 1 36 | # get predicted locations from existing trackers. 
37 | trks = np.zeros((len(self.trackers), 5)) 38 | to_del = [] 39 | ret = [] 40 | for t, trk in enumerate(trks): 41 | pos = self.trackers[t].predict() # kalman predict ,very fast ,<1ms 42 | trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] 43 | if np.any(np.isnan(pos)): 44 | to_del.append(t) 45 | trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) 46 | for t in reversed(to_del): 47 | self.trackers.pop(t) 48 | if dets != []: 49 | matched, unmatched_dets, unmatched_trks = associate_detections_to_trackers(dets, trks) 50 | 51 | # update matched trackers with assigned detections 52 | for t, trk in enumerate(self.trackers): 53 | if t not in unmatched_trks: 54 | d = matched[np.where(matched[:, 1] == t)[0], 0] 55 | trk.update(dets[d, :][0]) 56 | trk.face_addtional_attribute.append(addtional_attribute_list[d[0]]) 57 | 58 | # create and initialise new trackers for unmatched detections 59 | for i in unmatched_dets: 60 | trk = KalmanBoxTracker(dets[i, :]) 61 | trk.face_addtional_attribute.append(addtional_attribute_list[i]) 62 | logger.info("new Tracker: {0}".format(trk.id + 1)) 63 | self.trackers.append(trk) 64 | 65 | i = len(self.trackers) 66 | for trk in reversed(self.trackers): 67 | if dets == []: 68 | trk.update([]) 69 | d = trk.get_state() 70 | if (trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits): 71 | ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) # +1 as MOT benchmark requires positive 72 | i -= 1 73 | # remove dead tracklet 74 | if trk.time_since_update >= self.max_age or trk.predict_num >= predict_num or d[2] < 0 or d[3] < 0 or d[0] > img_size[1] or d[1] > img_size[0]: 75 | if len(trk.face_addtional_attribute) >= 5: 76 | utils.save_to_file(root_dic, trk) 77 | logger.info('remove tracker: {0}'.format(trk.id + 1)) 78 | self.trackers.pop(i) 79 | if len(ret) > 0: 80 | return np.concatenate(ret) 81 | return np.empty((0, 5)) 82 | -------------------------------------------------------------------------------- /start.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from time import time 4 | 5 | import align.detect_face as detect_face 6 | import cv2 7 | import numpy as np 8 | import tensorflow as tf 9 | from lib.face_utils import judge_side_face 10 | from lib.utils import Logger, mkdir 11 | from project_root_dir import project_dir 12 | from src.sort import Sort 13 | 14 | logger = Logger() 15 | 16 | 17 | def main(): 18 | global colours, img_size 19 | args = parse_args() 20 | videos_dir = args.videos_dir 21 | output_path = args.output_path 22 | no_display = args.no_display 23 | detect_interval = args.detect_interval # you need to keep a balance between performance and fluency 24 | margin = args.margin # if the face is big in your video ,you can set it bigger for tracking easiler 25 | scale_rate = args.scale_rate # if set it smaller will make input frames smaller 26 | show_rate = args.show_rate # if set it smaller will dispaly smaller frames 27 | face_score_threshold = args.face_score_threshold 28 | 29 | mkdir(output_path) 30 | # for display 31 | if not no_display: 32 | colours = np.random.rand(32, 3) 33 | 34 | # init tracker 35 | tracker = Sort() # create instance of the SORT tracker 36 | 37 | logger.info('Start track and extract......') 38 | with tf.Graph().as_default(): 39 | with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True), 40 | log_device_placement=False)) as sess: 41 | pnet, rnet, onet = detect_face.create_mtcnn(sess, 
os.path.join(project_dir, "align")) 42 | 43 | minsize = 40 # minimum size of face for mtcnn to detect 44 | threshold = [0.6, 0.7, 0.7] # three steps's threshold 45 | factor = 0.709 # scale factor 46 | 47 | for filename in os.listdir(videos_dir): 48 | logger.info('All files:{}'.format(filename)) 49 | for filename in os.listdir(videos_dir): 50 | suffix = filename.split('.')[1] 51 | if suffix != 'mp4' and suffix != 'avi': # you can specify more video formats if you need 52 | continue 53 | video_name = os.path.join(videos_dir, filename) 54 | directoryname = os.path.join(output_path, filename.split('.')[0]) 55 | logger.info('Video_name:{}'.format(video_name)) 56 | cam = cv2.VideoCapture(video_name) 57 | c = 0 58 | while True: 59 | final_faces = [] 60 | addtional_attribute_list = [] 61 | ret, frame = cam.read() 62 | if not ret: 63 | logger.warning("ret false") 64 | break 65 | if frame is None: 66 | logger.warning("frame drop") 67 | break 68 | 69 | frame = cv2.resize(frame, (0, 0), fx=scale_rate, fy=scale_rate) 70 | r_g_b_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 71 | if c % detect_interval == 0: 72 | img_size = np.asarray(frame.shape)[0:2] 73 | mtcnn_starttime = time() 74 | faces, points = detect_face.detect_face(r_g_b_frame, minsize, pnet, rnet, onet, threshold, 75 | factor) 76 | logger.info("MTCNN detect face cost time : {} s".format( 77 | round(time() - mtcnn_starttime, 3))) # mtcnn detect ,slow 78 | face_sums = faces.shape[0] 79 | if face_sums > 0: 80 | face_list = [] 81 | for i, item in enumerate(faces): 82 | score = round(faces[i, 4], 6) 83 | if score > face_score_threshold: 84 | det = np.squeeze(faces[i, 0:4]) 85 | 86 | # face rectangle 87 | det[0] = np.maximum(det[0] - margin, 0) 88 | det[1] = np.maximum(det[1] - margin, 0) 89 | det[2] = np.minimum(det[2] + margin, img_size[1]) 90 | det[3] = np.minimum(det[3] + margin, img_size[0]) 91 | face_list.append(item) 92 | 93 | # face cropped 94 | bb = np.array(det, dtype=np.int32) 95 | 96 | # use 5 face landmarks to judge the face is front or side 97 | squeeze_points = np.squeeze(points[:, i]) 98 | tolist = squeeze_points.tolist() 99 | facial_landmarks = [] 100 | for j in range(5): 101 | item = [tolist[j], tolist[(j + 5)]] 102 | facial_landmarks.append(item) 103 | if args.face_landmarks: 104 | for (x, y) in facial_landmarks: 105 | cv2.circle(frame, (int(x), int(y)), 3, (0, 255, 0), -1) 106 | cropped = frame[bb[1]:bb[3], bb[0]:bb[2], :].copy() 107 | 108 | dist_rate, high_ratio_variance, width_rate = judge_side_face( 109 | np.array(facial_landmarks)) 110 | 111 | # face addtional attribute(index 0:face score; index 1:0 represents front face and 1 for side face ) 112 | item_list = [cropped, score, dist_rate, high_ratio_variance, width_rate] 113 | addtional_attribute_list.append(item_list) 114 | 115 | final_faces = np.array(face_list) 116 | 117 | trackers = tracker.update(final_faces, img_size, directoryname, addtional_attribute_list, detect_interval) 118 | 119 | c += 1 120 | 121 | for d in trackers: 122 | if not no_display: 123 | d = d.astype(np.int32) 124 | cv2.rectangle(frame, (d[0], d[1]), (d[2], d[3]), colours[d[4] % 32, :] * 255, 3) 125 | if final_faces != []: 126 | cv2.putText(frame, 'ID : %d DETECT' % (d[4]), (d[0] - 10, d[1] - 10), 127 | cv2.FONT_HERSHEY_SIMPLEX, 128 | 0.75, 129 | colours[d[4] % 32, :] * 255, 2) 130 | cv2.putText(frame, 'DETECTOR', (5, 45), cv2.FONT_HERSHEY_SIMPLEX, 0.75, 131 | (1, 1, 1), 2) 132 | else: 133 | cv2.putText(frame, 'ID : %d' % (d[4]), (d[0] - 10, d[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 134 | 0.75, 135 | 
colours[d[4] % 32, :] * 255, 2) 136 | 137 | if not no_display: 138 | frame = cv2.resize(frame, (0, 0), fx=show_rate, fy=show_rate) 139 | cv2.imshow("Frame", frame) 140 | if cv2.waitKey(1) & 0xFF == ord('q'): 141 | break 142 | 143 | 144 | def parse_args(): 145 | """Parse input arguments.""" 146 | parser = argparse.ArgumentParser() 147 | parser.add_argument("--videos_dir", type=str, 148 | help='Path to the data directory containing aligned your face patches.', default='videos') 149 | parser.add_argument('--output_path', type=str, 150 | help='Path to save face', 151 | default='facepics') 152 | parser.add_argument('--detect_interval', 153 | help='how many frames to make a detection', 154 | type=int, default=1) 155 | parser.add_argument('--margin', 156 | help='add margin for face', 157 | type=int, default=10) 158 | parser.add_argument('--scale_rate', 159 | help='Scale down or enlarge the original video img', 160 | type=float, default=0.7) 161 | parser.add_argument('--show_rate', 162 | help='Scale down or enlarge the imgs drawn by opencv', 163 | type=float, default=1) 164 | parser.add_argument('--face_score_threshold', 165 | help='The threshold of the extracted faces,range 0