├── .gitignore
├── README.md
├── __init__.py
├── config.py
├── core
│   ├── MtcnnDetector.py
│   ├── __init__.py
│   ├── detector.py
│   ├── fcn_detector.py
│   ├── imdb.py
│   ├── loader.py
│   ├── metric.py
│   ├── minibatch.py
│   ├── negativemining.py
│   └── symbol.py
├── demo.py
├── example
│   ├── __init__.py
│   ├── train.py
│   ├── train_O_net.py
│   ├── train_P_net.py
│   └── train_R_net.py
├── fddb_result.png
├── model
│   ├── onet-0016.params
│   ├── pnet-0016.params
│   └── rnet-0016.params
├── mxnet_diff.patch
├── prepare_data
│   ├── __init__.py
│   ├── gen_hard_example.py
│   ├── gen_imglist.py
│   ├── gen_pnet_data.py
│   ├── utils.py
│   └── wider_annotations
│       ├── __init__.py
│       ├── readme.txt
│       ├── transform.m
│       ├── transform.py
│       ├── wider_face_test.mat
│       ├── wider_face_train.mat
│       ├── wider_face_val.mat
│       ├── wider_loader.py
│       └── writeLabel.m
├── test01.jpg
├── test_fddb.py
└── tools
    ├── __init__.py
    ├── image_processing.py
    ├── load_model.py
    └── nms.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | This repository is an implementation of MTCNN in MXNet.
3 | * `core`: core routines for MTCNN training and testing.
4 | * `tools`: utilities for training and testing.
5 | * `data`: see `Data Folder Structure` below for the expected layout. A dataset usually contains `images` and `imglists`.
6 | * `model`: folder for saving training symbols and models.
7 | * `prepare_data`: scripts for generating training data for pnet, rnet and onet.
8 | 
9 | ## Useful information
10 | You're required to modify mxnet/src/operator/regression_output-inl.h according to mxnet_diff.patch before using the code for training.
11 | 
12 | * Dataset format
13 | The images used for training are stored in ./data/dataset_name/images/
14 | The annotation file is placed in ./data/dataset_name/imglists/
15 | 
16 | * For training:
17 | Each line of the annotation file describes one training sample.
18 | The format is:
19 | [path to image] [cls_label] [bbox_label]
20 | cls_label: 1 for positive, 0 for negative, -1 for part face.
21 | bbox_label holds the offsets of x1, y1, x2, y2, calculated as (xgt(ygt) - x(y)) / width(height).
22 | An example would be `12/positive/28 1 -0.05 0.11 -0.05 -0.11`.
23 | Note that all the fields are separated by spaces.
24 | 
25 | * For testing:
26 | Similar to training, but only the path to the image is needed.
27 | 
28 | * Data Folder Structure (suppose root is `data`)
29 | ```
30 | cache (created by imdb)
31 | -- name + image set + gt_roidb
32 | -- results (created by detection and evaluation)
33 | mtcnn # contains images and anno for training mtcnn
34 | -- images
35 | ---- 12 (images of size 12 x 12, used by pnet)
36 | ---- 24 (images of size 24 x 24, used by rnet)
37 | ---- 48 (images of size 48 x 48, used by onet)
38 | -- imglists
39 | ---- train_12.txt
40 | ---- train_24.txt
41 | ---- train_48.txt
42 | custom (datasets for testing)
43 | -- images
44 | -- imglists
45 | ---- image_set.txt
46 | ```
47 | 
48 | * Scripts to generate training data (from the WIDER FACE dataset); see the sketch after this list for the annotation format these scripts emit:
49 |   * run wider_annotations/transform.m (or transform.py) to get the annotation file in the format we need.
50 |   * gen_pnet_data.py: obtain training samples for pnet.
51 |   * gen_hard_example.py: prepare hard examples.
52 |     You can set test_mode to "pnet" to get training data for rnet,
53 |     or set test_mode to "rnet" to get training data for onet.
54 |   * gen_imglist.py: randomly sample the images generated by gen_pnet_data.py or gen_hard_example.py to form the training set.
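
A minimal sketch of how one annotation line in the format above can be produced (illustrative only; `bbox_offsets`, `crop_box` and `gt_box` are hypothetical names, not part of the repository):

```
# Offsets follow the convention above: (gt - crop) normalized by the
# crop's width/height, for each of x1, y1, x2, y2.
def bbox_offsets(crop_box, gt_box):
    x1, y1, x2, y2 = crop_box
    w = x2 - x1 + 1
    h = y2 - y1 + 1
    gx1, gy1, gx2, gy2 = gt_box
    return ((gx1 - x1) / float(w), (gy1 - y1) / float(h),
            (gx2 - x2) / float(w), (gy2 - y2) / float(h))

# a positive (cls_label = 1) sample line for imglists/train_12.txt:
offsets = bbox_offsets((10, 10, 21, 21), (9, 11, 20, 20))
line = '12/positive/28 1 ' + ' '.join('%.2f' % o for o in offsets)
```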
55 | 
56 | ## Results
57 | 
58 | ![image](https://github.com/Seanlinx/mtcnn/blob/master/fddb_result.png)
59 | 
60 | ## License
61 | MIT LICENSE
62 | 
63 | ## Reference
64 | Kaipeng Zhang, Zhanpeng Zhang, Zhifeng Li, Yu Qiao, "Joint Face Detection and Alignment Using Multi-task Cascaded Convolutional Networks," IEEE Signal Processing Letters
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/__init__.py
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from easydict import EasyDict as edict
3 | 
4 | config = edict()
5 | 
6 | config.BATCH_SIZE = 128
7 | 
8 | config.CLS_OHEM = True
9 | config.CLS_OHEM_RATIO = 0.7
10 | config.BBOX_OHEM = False
11 | config.BBOX_OHEM_RATIO = 0.7
12 | 
13 | config.EPS = 1e-14
14 | config.LR_EPOCH = [8, 14]
--------------------------------------------------------------------------------
/core/MtcnnDetector.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import mxnet as mx
3 | import time
4 | from tools import image_processing
5 | #from mx.model import FeedForward
6 | import numpy as np
7 | from config import config
8 | from tools.nms import py_nms
9 | 
10 | class MtcnnDetector(object):
11 |     """
12 |         Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Neural Networks
13 |         see https://github.com/kpzhang93/MTCNN_face_detection_alignment
14 |         this is an MXNet version
15 |     """
16 |     def __init__(self,
17 |                  detectors,
18 |                  min_face_size=24,
19 |                  stride=2,
20 |                  threshold=[0.6, 0.7, 0.7],
21 |                  scale_factor=0.709,
22 |                  ctx=mx.cpu(),
23 |                  slide_window=False):
24 | 
25 |         self.pnet_detector = detectors[0]
26 |         self.rnet_detector = detectors[1]
27 |         self.onet_detector = detectors[2]
28 |         self.min_face_size = min_face_size
29 |         self.stride = stride
30 |         self.thresh = threshold
31 |         self.ctx = ctx
32 |         self.scale_factor = scale_factor
33 |         self.slide_window = slide_window
34 | 
35 | 
36 |     def convert_to_square(self, bbox):
37 |         """
38 |             convert bbox to square
39 |             Parameters:
40 |             ----------
41 |                 bbox: numpy array, shape n x 5
42 |                     input bbox
43 |             Returns:
44 |             -------
45 |                 square bbox
46 |         """
47 |         square_bbox = bbox.copy()
48 | 
49 |         h = bbox[:, 3] - bbox[:, 1] + 1
50 |         w = bbox[:, 2] - bbox[:, 0] + 1
51 |         max_side = np.maximum(h, w)
52 |         square_bbox[:, 0] = bbox[:, 0] + w*0.5 - max_side*0.5
53 |         square_bbox[:, 1] = bbox[:, 1] + h*0.5 - max_side*0.5
54 |         square_bbox[:, 2] = square_bbox[:, 0] + max_side - 1
55 |         square_bbox[:, 3] = square_bbox[:, 1] + max_side - 1
56 |         return square_bbox
57 | 
58 |     def calibrate_box(self, bbox, reg):
59 |         """
60 |             calibrate bboxes
61 |             Parameters:
62 |             ----------
63 |                 bbox: numpy array, shape n x 5
64 |                     input bboxes
65 |                 reg: numpy array, shape n x 4
66 |                     bbox adjustments
67 |             Returns:
68 |             -------
69 |                 bboxes after refinement
70 |         """
71 | 
72 |         bbox_c = bbox.copy()
73 |         w = bbox[:, 2] - bbox[:, 0] + 1
74 |         w = np.expand_dims(w, 1)
75 |         h = bbox[:, 3] - bbox[:, 1] + 1
76 |         h = np.expand_dims(h, 1)
77 |         reg_m = np.hstack([w, h, w, h])
78 |         aug = reg_m * reg
79 |         bbox_c[:, 0:4] = bbox_c[:, 0:4] + aug
80 |         return bbox_c
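    # Note on calibrate_box (worked example, not from the original source):
    # reg holds offsets normalized by box width/height -- the same
    # convention as the training targets -- so the absolute adjustment is
    # reg * [w, h, w, h]. A box (0, 0, 9, 9) (w = h = 10) with
    # reg = (0.1, 0.1, -0.1, -0.1) is refined to (1, 1, 8, 8); the score
    # column is left untouched.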
81 | 
82 |     def generate_bbox(self, map, reg, scale, threshold):
83 |         """
84 |             generate bbox from feature map
85 |             Parameters:
86 |             ----------
87 |                 map: numpy array, n x m x 1
88 |                     detect score for each position
89 |                 reg: numpy array, n x m x 4
90 |                     bbox
91 |                 scale: float number
92 |                     scale of this detection
93 |                 threshold: float number
94 |                     detect threshold
95 |             Returns:
96 |             -------
97 |                 bbox array
98 |         """
99 |         stride = 2
100 |         cellsize = 12
101 | 
102 |         t_index = np.where(map > threshold)
103 | 
104 |         # find nothing
105 |         if t_index[0].size == 0:
106 |             return np.array([])
107 | 
108 |         dx1, dy1, dx2, dy2 = [reg[0, i, t_index[0], t_index[1]] for i in range(4)]
109 | 
110 |         reg = np.array([dx1, dy1, dx2, dy2])
111 |         score = map[t_index[0], t_index[1]]
112 |         boundingbox = np.vstack([np.round((stride*t_index[1])/scale),
113 |                                  np.round((stride*t_index[0])/scale),
114 |                                  np.round((stride*t_index[1]+cellsize)/scale),
115 |                                  np.round((stride*t_index[0]+cellsize)/scale),
116 |                                  score,
117 |                                  reg])
118 | 
119 |         return boundingbox.T
120 | 
121 | 
122 |     def resize_image(self, img, scale):
123 |         """
124 |             resize image and transform dimension to [batchsize, channel, height, width]
125 |             Parameters:
126 |             ----------
127 |                 img: numpy array, height x width x channel
128 |                     input image, channels in BGR order here
129 |                 scale: float number
130 |                     scale factor of resize operation
131 |             Returns:
132 |             -------
133 |                 transformed image tensor, 1 x channel x height x width
134 |         """
135 |         height, width, channels = img.shape
136 |         new_height = int(height * scale)  # resized new height
137 |         new_width = int(width * scale)  # resized new width
138 |         new_dim = (new_width, new_height)
139 |         img_resized = cv2.resize(img, new_dim, interpolation=cv2.INTER_LINEAR)  # resized image
140 |         img_resized = image_processing.transform(img_resized)
141 |         return img_resized  # (batch_size, c, h, w)
142 | 
143 | 
144 |     def pad(self, bboxes, w, h):
145 |         """
146 |             pad the bboxes, and also restrict their size
147 |             Parameters:
148 |             ----------
149 |                 bboxes: numpy array, n x 5
150 |                     input bboxes
151 |                 w: float number
152 |                     width of the input image
153 |                 h: float number
154 |                     height of the input image
155 |             Returns :
156 |             ------
157 |                 dy, dx : numpy array, n x 1
158 |                     start point of the bbox in target image
159 |                 edy, edx : numpy array, n x 1
160 |                     end point of the bbox in target image
161 |                 y, x : numpy array, n x 1
162 |                     start point of the bbox in original image
163 |                 ey, ex : numpy array, n x 1
164 |                     end point of the bbox in original image
165 |                 tmph, tmpw: numpy array, n x 1
166 |                     height and width of the bbox
167 |         """
168 |         tmpw, tmph = bboxes[:, 2] - bboxes[:, 0] + 1, bboxes[:, 3] - bboxes[:, 1] + 1
169 |         num_box = bboxes.shape[0]
170 | 
171 |         dx, dy = np.zeros((num_box, )), np.zeros((num_box, ))
172 |         edx, edy = tmpw.copy()-1, tmph.copy()-1
173 | 
174 |         x, y, ex, ey = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
175 | 
176 |         tmp_index = np.where(ex > w-1)
177 |         edx[tmp_index] = tmpw[tmp_index] + w - 2 - ex[tmp_index]
178 |         ex[tmp_index] = w - 1
179 | 
180 |         tmp_index = np.where(ey > h-1)
181 |         edy[tmp_index] = tmph[tmp_index] + h - 2 - ey[tmp_index]
182 |         ey[tmp_index] = h - 1
183 | 
184 |         tmp_index = np.where(x < 0)
185 |         dx[tmp_index] = 0 - x[tmp_index]
186 |         x[tmp_index] = 0
187 | 
188 |         tmp_index = np.where(y < 0)
189 |         dy[tmp_index] = 0 - y[tmp_index]
190 |         y[tmp_index] = 0
191 | 
192 |         return_list = [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph]
193 |         return_list = [item.astype(np.int32) for item in return_list]
194 | 
195 |         return return_list
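    # Note on pad (worked example, not from the original source): for each
    # box it returns where to read in the source image (y:ey+1, x:ex+1) and
    # where to write inside a zero-initialized tmph x tmpw patch
    # (dy:edy+1, dx:edx+1). A 10 x 10 box (-3, -3, 6, 6) on a sufficiently
    # large image gives x = y = 0 and dx = dy = 3, so the crop fills
    # tmp[3:10, 3:10] and the top-left border stays zero-padded.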
196 | 
197 | 
198 |     def detect_pnet(self, im):
199 |         """Get face candidates through pnet
200 | 
201 |         Parameters:
202 |         ----------
203 |         im: numpy array
204 |             input image array
205 | 
206 |         Returns:
207 |         -------
208 |         boxes: numpy array
209 |             detected boxes before calibration
210 |         boxes_c: numpy array
211 |             boxes after calibration
212 |         """
213 |         h, w, c = im.shape
214 |         net_size = 12
215 | 
216 |         current_scale = float(net_size) / self.min_face_size  # find initial scale
217 |         im_resized = self.resize_image(im, current_scale)
218 |         _, _, current_height, current_width = im_resized.shape
219 | 
220 |         if self.slide_window:
221 |             # sliding window
222 |             temp_rectangles = list()
223 |             rectangles = list()  # list of rectangles [x11, y11, x12, y12, confidence] (corresponding to original image)
224 |             all_cropped_ims = list()
225 |             while min(current_height, current_width) > net_size:
226 |                 current_y_list = range(0, current_height - net_size + 1, self.stride) if (current_height - net_size) % self.stride == 0 \
227 |                     else range(0, current_height - net_size + 1, self.stride) + [current_height - net_size]
228 |                 current_x_list = range(0, current_width - net_size + 1, self.stride) if (current_width - net_size) % self.stride == 0 \
229 |                     else range(0, current_width - net_size + 1, self.stride) + [current_width - net_size]
230 | 
231 |                 for current_y in current_y_list:
232 |                     for current_x in current_x_list:
233 |                         cropped_im = im_resized[:, :, current_y:current_y + net_size, current_x:current_x + net_size]
234 | 
235 |                         current_rectangle = [int(w * float(current_x) / current_width), int(h * float(current_y) / current_height),
236 |                                              int(w * float(current_x) / current_width) + int(w * float(net_size) / current_width),
237 |                                              int(h * float(current_y) / current_height) + int(h * float(net_size) / current_height),
238 |                                              0.0]
239 |                         temp_rectangles.append(current_rectangle)
240 |                         all_cropped_ims.append(cropped_im)
241 | 
242 |                 current_scale *= self.scale_factor
243 |                 im_resized = self.resize_image(im, current_scale)
244 |                 _, _, current_height, current_width = im_resized.shape
245 | 
246 |             '''
247 |             # helper for setting PNet batch size
248 |             num_boxes = len(all_cropped_ims)
249 |             batch_size = self.pnet_detector.batch_size
250 |             ratio = float(num_boxes) / batch_size
251 |             if ratio > 3 or ratio < 0.3:
252 |                 print "You may need to reset PNet batch size if this info appears frequently, \
253 |             face candidates:%d, current batch_size:%d"%(num_boxes, batch_size)
254 |             '''
255 |             all_cropped_ims = np.vstack(all_cropped_ims)
256 |             cls_scores, reg = self.pnet_detector.predict(all_cropped_ims)
257 | 
258 |             cls_scores = cls_scores[:, 1].flatten()
259 |             keep_inds = np.where(cls_scores > self.thresh[0])[0]
260 | 
261 |             if len(keep_inds) > 0:
262 |                 boxes = np.vstack([temp_rectangles[ind] for ind in keep_inds])
263 |                 boxes[:, 4] = cls_scores[keep_inds]
264 |                 reg = reg[keep_inds].reshape(-1, 4)
265 |             else:
266 |                 return None, None
267 | 
268 | 
269 |             keep = py_nms(boxes, 0.7, 'Union')
270 |             boxes = boxes[keep]
271 | 
272 |             boxes_c = self.calibrate_box(boxes, reg[keep])
273 | 
274 |         else:
275 |             # fcn
276 |             all_boxes = list()
277 |             while min(current_height, current_width) > net_size:
278 |                 cls_map, reg = self.pnet_detector.predict(im_resized)
279 |                 cls_map = cls_map.asnumpy()
280 |                 reg = reg.asnumpy()
281 |                 boxes = self.generate_bbox(cls_map[0, 1, :, :], reg, current_scale, self.thresh[0])
282 | 
283 |                 current_scale *= self.scale_factor
284 |                 im_resized = self.resize_image(im, 
current_scale) 285 | _, _, current_height, current_width = im_resized.shape 286 | 287 | if boxes.size == 0: 288 | continue 289 | keep = py_nms(boxes[:, :5], 0.5, 'Union') 290 | boxes = boxes[keep] 291 | all_boxes.append(boxes) 292 | 293 | if len(all_boxes) == 0: 294 | return None, None 295 | 296 | all_boxes = np.vstack(all_boxes) 297 | 298 | # merge the detection from first stage 299 | keep = py_nms(all_boxes[:, 0:5], 0.7, 'Union') 300 | all_boxes = all_boxes[keep] 301 | boxes = all_boxes[:, :5] 302 | 303 | bbw = all_boxes[:, 2] - all_boxes[:, 0] + 1 304 | bbh = all_boxes[:, 3] - all_boxes[:, 1] + 1 305 | 306 | # refine the boxes 307 | boxes_c = np.vstack([all_boxes[:, 0] + all_boxes[:, 5] * bbw, 308 | all_boxes[:, 1] + all_boxes[:, 6] * bbh, 309 | all_boxes[:, 2] + all_boxes[:, 7] * bbw, 310 | all_boxes[:, 3] + all_boxes[:, 8] * bbh, 311 | all_boxes[:, 4]]) 312 | boxes_c = boxes_c.T 313 | 314 | return boxes, boxes_c 315 | 316 | def detect_rnet(self, im, dets): 317 | """Get face candidates using rnet 318 | 319 | Parameters: 320 | ---------- 321 | im: numpy array 322 | input image array 323 | dets: numpy array 324 | detection results of pnet 325 | 326 | Returns: 327 | ------- 328 | boxes: numpy array 329 | detected boxes before calibration 330 | boxes_c: numpy array 331 | boxes after calibration 332 | """ 333 | h, w, c = im.shape 334 | dets = self.convert_to_square(dets) 335 | dets[:, 0:4] = np.round(dets[:, 0:4]) 336 | 337 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h) 338 | num_boxes = dets.shape[0] 339 | 340 | ''' 341 | # helper for setting RNet batch size 342 | batch_size = self.rnet_detector.batch_size 343 | ratio = float(num_boxes) / batch_size 344 | if ratio > 3 or ratio < 0.3: 345 | print "You may need to reset RNet batch size if this info appears frequently, \ 346 | face candidates:%d, current batch_size:%d"%(num_boxes, batch_size) 347 | ''' 348 | 349 | cropped_ims = np.zeros((num_boxes, 3, 24, 24), dtype=np.float32) 350 | for i in range(num_boxes): 351 | tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8) 352 | tmp[dy[i]:edy[i]+1, dx[i]:edx[i]+1, :] = im[y[i]:ey[i]+1, x[i]:ex[i]+1, :] 353 | cropped_ims[i, :, :, :] = image_processing.transform(cv2.resize(tmp, (24, 24))) 354 | 355 | cls_scores, reg = self.rnet_detector.predict(cropped_ims) 356 | cls_scores = cls_scores[:, 1].flatten() 357 | keep_inds = np.where(cls_scores > self.thresh[1])[0] 358 | 359 | if len(keep_inds) > 0: 360 | boxes = dets[keep_inds] 361 | boxes[:, 4] = cls_scores[keep_inds] 362 | reg = reg[keep_inds] 363 | else: 364 | return None, None 365 | 366 | keep = py_nms(boxes, 0.7) 367 | boxes = boxes[keep] 368 | 369 | boxes_c = self.calibrate_box(boxes, reg[keep]) 370 | 371 | return boxes, boxes_c 372 | 373 | def detect_onet(self, im, dets): 374 | """Get face candidates using onet 375 | 376 | Parameters: 377 | ---------- 378 | im: numpy array 379 | input image array 380 | dets: numpy array 381 | detection results of rnet 382 | 383 | Returns: 384 | ------- 385 | boxes: numpy array 386 | detected boxes before calibration 387 | boxes_c: numpy array 388 | boxes after calibration 389 | """ 390 | h, w, c = im.shape 391 | dets = self.convert_to_square(dets) 392 | dets[:, 0:4] = np.round(dets[:, 0:4]) 393 | 394 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h) 395 | num_boxes = dets.shape[0] 396 | 397 | ''' 398 | # helper for setting ONet batch size 399 | batch_size = self.onet_detector.batch_size 400 | ratio = float(num_boxes) / batch_size 401 | if ratio > 3 or ratio < 0.3: 402 | 
print "You may need to reset ONet batch size if this info appears frequently, \ 403 | face candidates:%d, current batch_size:%d"%(num_boxes, batch_size) 404 | ''' 405 | 406 | cropped_ims = np.zeros((num_boxes, 3, 48, 48), dtype=np.float32) 407 | for i in range(num_boxes): 408 | tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8) 409 | tmp[dy[i]:edy[i]+1, dx[i]:edx[i]+1, :] = im[y[i]:ey[i]+1, x[i]:ex[i]+1, :] 410 | cropped_ims[i, :, :, :] = image_processing.transform(cv2.resize(tmp, (48, 48))) 411 | cls_scores, reg = self.onet_detector.predict(cropped_ims) 412 | 413 | cls_scores = cls_scores[:, 1].flatten() 414 | keep_inds = np.where(cls_scores > self.thresh[2])[0] 415 | 416 | if len(keep_inds) > 0: 417 | boxes = dets[keep_inds] 418 | boxes[:, 4] = cls_scores[keep_inds] 419 | reg = reg[keep_inds] 420 | else: 421 | return None, None 422 | 423 | boxes_c = self.calibrate_box(boxes, reg) 424 | 425 | keep = py_nms(boxes_c, 0.7, "Minimum") 426 | boxes_c = boxes_c[keep] 427 | 428 | return boxes, boxes_c 429 | 430 | 431 | def detect_face(self, imdb, test_data, vis): 432 | """Detect face over image 433 | 434 | Parameters: 435 | ---------- 436 | imdb: imdb 437 | image database 438 | test_data: data iter 439 | test data iterator 440 | vis: bool 441 | whether to visualize detection results 442 | 443 | Returns: 444 | ------- 445 | """ 446 | all_boxes = list() 447 | batch_idx = 0 448 | for databatch in test_data: 449 | if batch_idx % 100 == 0: 450 | print "%d images done"%batch_idx 451 | im = databatch.data[0].asnumpy().astype(np.uint8) 452 | t = time.time() 453 | 454 | # pnet 455 | if self.pnet_detector: 456 | boxes, boxes_c = self.detect_pnet(im) 457 | if boxes_c is None: 458 | all_boxes.append(np.array([])) 459 | batch_idx += 1 460 | continue 461 | if vis: 462 | rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 463 | self.vis_two(rgb_im, boxes, boxes_c) 464 | 465 | t1 = time.time() - t 466 | t = time.time() 467 | 468 | # rnet 469 | if self.rnet_detector: 470 | boxes, boxes_c = self.detect_rnet(im, boxes_c) 471 | if boxes_c is None: 472 | all_boxes.append(np.array([])) 473 | batch_idx += 1 474 | continue 475 | if vis: 476 | self.vis_two(rgb_im, boxes, boxes_c) 477 | 478 | t2 = time.time() - t 479 | t = time.time() 480 | 481 | # onet 482 | if self.onet_detector: 483 | boxes, boxes_c = self.detect_onet(im, boxes_c) 484 | if boxes_c is None: 485 | all_boxes.append(np.array([])) 486 | batch_idx += 1 487 | continue 488 | # all_boxes.append(boxes_c) 489 | if vis: 490 | self.vis_two(rgb_im, boxes, boxes_c) 491 | 492 | t3 = time.time() - t 493 | t = time.time() 494 | print "time cost " + '{:.3f}'.format(t1+t2+t3) + ' pnet {:.3f} rnet {:.3f} onet {:.3f}'.format(t1, t2, t3) 495 | 496 | all_boxes.append(boxes_c) 497 | batch_idx += 1 498 | # save detections into fddb format 499 | # imdb.write_results(all_boxes) 500 | return all_boxes 501 | 502 | 503 | def vis_two(self, im_array, dets1, dets2, thresh=0.9): 504 | """Visualize detection results before and after calibration 505 | 506 | Parameters: 507 | ---------- 508 | im_array: numpy.ndarray, shape(1, c, h, w) 509 | test image in rgb 510 | dets1: numpy.ndarray([[x1 y1 x2 y2 score]]) 511 | detection results before calibration 512 | dets2: numpy.ndarray([[x1 y1 x2 y2 score]]) 513 | detection results after calibration 514 | thresh: float 515 | boxes with scores > thresh will be drawn in red otherwise yellow 516 | 517 | Returns: 518 | ------- 519 | """ 520 | import matplotlib.pyplot as plt 521 | import random 522 | 523 | figure = plt.figure() 524 | plt.subplot(121) 525 | 
plt.imshow(im_array) 526 | color = 'yellow' 527 | 528 | for i in range(dets1.shape[0]): 529 | bbox = dets1[i, :4] 530 | score = dets1[i, 4] 531 | if score > thresh: 532 | rect = plt.Rectangle((bbox[0], bbox[1]), 533 | bbox[2] - bbox[0], 534 | bbox[3] - bbox[1], fill=False, 535 | edgecolor='red', linewidth=0.7) 536 | plt.gca().add_patch(rect) 537 | plt.gca().text(bbox[0], bbox[1] - 2, 538 | '{:.3f}'.format(score), 539 | bbox=dict(facecolor='blue', alpha=0.5), fontsize=12, color='white') 540 | else: 541 | rect = plt.Rectangle((bbox[0], bbox[1]), 542 | bbox[2] - bbox[0], 543 | bbox[3] - bbox[1], fill=False, 544 | edgecolor=color, linewidth=0.5) 545 | plt.gca().add_patch(rect) 546 | 547 | plt.subplot(122) 548 | plt.imshow(im_array) 549 | color = 'yellow' 550 | 551 | for i in range(dets2.shape[0]): 552 | bbox = dets2[i, :4] 553 | score = dets2[i, 4] 554 | if score > thresh: 555 | rect = plt.Rectangle((bbox[0], bbox[1]), 556 | bbox[2] - bbox[0], 557 | bbox[3] - bbox[1], fill=False, 558 | edgecolor='red', linewidth=0.7) 559 | plt.gca().add_patch(rect) 560 | plt.gca().text(bbox[0], bbox[1] - 2, 561 | '{:.3f}'.format(score), 562 | bbox=dict(facecolor='blue', alpha=0.5), fontsize=12, color='white') 563 | else: 564 | rect = plt.Rectangle((bbox[0], bbox[1]), 565 | bbox[2] - bbox[0], 566 | bbox[3] - bbox[1], fill=False, 567 | edgecolor=color, linewidth=0.5) 568 | plt.gca().add_patch(rect) 569 | plt.show() 570 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/core/__init__.py -------------------------------------------------------------------------------- /core/detector.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import numpy as np 3 | 4 | from config import config 5 | 6 | class Detector(object): 7 | def __init__(self, symbol, data_size, batch_size, ctx=None, 8 | arg_params=None, aux_params=None): 9 | self.symbol = symbol 10 | self.data_size = data_size 11 | self.ctx = ctx 12 | if self.ctx is None: 13 | self.ctx = mx.cpu() 14 | self.arg_params = arg_params 15 | self.aux_params = aux_params 16 | 17 | self.batch_size = batch_size 18 | data_shapes = {'data': (self.batch_size, 3, self.data_size, self.data_size)} 19 | executor = self.symbol.simple_bind(self.ctx, grad_req='null', **dict(data_shapes)) 20 | executor.copy_params_from(self.arg_params, self.aux_params) 21 | self.executor = executor 22 | 23 | self.output_dict = None 24 | self.data_shape = data_shapes 25 | self.t = 0 26 | 27 | 28 | def predict(self, databatch): 29 | # access data 30 | # databatch: N x 3 x data_size x data_size 31 | scores = [] 32 | batch_size = self.batch_size 33 | 34 | minibatch = [] 35 | cur = 0 36 | n = databatch.shape[0] 37 | while cur < n: 38 | minibatch.append(databatch[cur:min(cur+batch_size, n), :, :, :]) 39 | cur += batch_size 40 | 41 | data_arrays = self.executor.arg_dict['data'] 42 | out_list = [[] for _ in range(len(self.executor.outputs))] 43 | 44 | for idx, data in enumerate(minibatch): 45 | m = data.shape[0] 46 | real_size = self.batch_size 47 | if m < batch_size: 48 | keep_inds = np.arange(m) 49 | gap = self.batch_size - m 50 | while gap >= len(keep_inds): 51 | gap -= len(keep_inds) 52 | keep_inds = np.concatenate((keep_inds, keep_inds)) 53 | if gap != 0: 54 | keep_inds = np.concatenate((keep_inds, keep_inds[:gap])) 55 | data = 
data[keep_inds] 56 | real_size = m 57 | 58 | data_arrays[:] = data 59 | self.executor.forward(is_train=False) 60 | 61 | for o_list, o_nd in zip(out_list, self.executor.outputs): 62 | o_list.append(o_nd[0:real_size].asnumpy()) 63 | 64 | out = list() 65 | 66 | for o in out_list: 67 | out.append(np.vstack(o)) 68 | 69 | return out 70 | -------------------------------------------------------------------------------- /core/fcn_detector.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import numpy as np 3 | 4 | from config import config 5 | 6 | class FcnDetector(object): 7 | def __init__(self, symbol, ctx=None, 8 | arg_params=None, aux_params=None): 9 | self.symbol = symbol 10 | self.ctx = ctx 11 | if self.ctx is None: 12 | self.ctx = mx.cpu() 13 | self.arg_params = arg_params 14 | self.aux_params = aux_params 15 | self.output_dict = None 16 | 17 | def predict(self, databatch): 18 | data_shape = {'data': databatch.shape} 19 | self.arg_params['data'] = mx.nd.array(databatch, self.ctx) 20 | 21 | arg_shapes, out_shape, aux_shapes = self.symbol.infer_shape(**data_shape) 22 | arg_shapes_dict = dict(zip(self.symbol.list_arguments(), arg_shapes)) 23 | 24 | self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=None, 25 | grad_req='null', aux_states=self.aux_params) 26 | 27 | self.executor.forward(is_train=False) 28 | outputs = self.executor.outputs 29 | 30 | return outputs 31 | -------------------------------------------------------------------------------- /core/imdb.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import cPickle 4 | import numpy as np 5 | from config import config 6 | 7 | class IMDB(object): 8 | def __init__(self, name, image_set, root_path, dataset_path, mode='train'): 9 | self.name = name + '_' + image_set 10 | self.image_set = image_set 11 | self.root_path = root_path 12 | self.data_path = dataset_path 13 | self.mode = mode 14 | 15 | self.classes = ['__background__', 'face'] 16 | self.num_classes = 2 17 | self.image_set_index = self.load_image_set_index() 18 | self.num_images = len(self.image_set_index) 19 | 20 | 21 | @property 22 | def cache_path(self): 23 | """Make a directory to store all caches 24 | 25 | Parameters: 26 | ---------- 27 | Returns: 28 | ------- 29 | cache_path: str 30 | directory to store caches 31 | """ 32 | cache_path = os.path.join(self.root_path, 'cache') 33 | if not os.path.exists(cache_path): 34 | os.mkdir(cache_path) 35 | return cache_path 36 | 37 | 38 | def load_image_set_index(self): 39 | """Get image index 40 | 41 | Parameters: 42 | ---------- 43 | Returns: 44 | ------- 45 | image_set_index: str 46 | relative path of image 47 | """ 48 | image_set_index_file = os.path.join(self.data_path, 'imglists', self.image_set + '.txt') 49 | assert os.path.exists(image_set_index_file), 'Path does not exist: {}'.format(image_set_index_file) 50 | with open(image_set_index_file, 'r') as f: 51 | image_set_index = [x.strip().split(' ')[0] for x in f.readlines()] 52 | return image_set_index 53 | 54 | 55 | def gt_imdb(self): 56 | """Get and save ground truth image database 57 | 58 | Parameters: 59 | ---------- 60 | Returns: 61 | ------- 62 | gt_imdb: dict 63 | image database with annotations 64 | """ 65 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') 66 | if os.path.exists(cache_file): 67 | with open(cache_file, 'rb') as f: 68 | imdb = cPickle.load(f) 69 | print '{} gt imdb loaded from 
{}'.format(self.name, cache_file) 70 | return imdb 71 | gt_imdb = self.load_annotations() 72 | with open(cache_file, 'wb') as f: 73 | cPickle.dump(gt_imdb, f, cPickle.HIGHEST_PROTOCOL) 74 | return gt_imdb 75 | 76 | 77 | def image_path_from_index(self, index): 78 | """Given image index, return full path 79 | 80 | Parameters: 81 | ---------- 82 | index: str 83 | relative path of image 84 | Returns: 85 | ------- 86 | image_file: str 87 | full path of image 88 | """ 89 | image_file = os.path.join(self.data_path, 'images', index) 90 | if "." not in image_file: 91 | image_file = image_file + '.jpg' 92 | assert os.path.exists(image_file), 'Path does not exist: {}'.format(image_file) 93 | return image_file 94 | 95 | 96 | def load_annotations(self): 97 | """Load annotations 98 | 99 | Parameters: 100 | ---------- 101 | Returns: 102 | ------- 103 | imdb: dict 104 | image database with annotations 105 | """ 106 | annotation_file = os.path.join(self.data_path, 'imglists', self.image_set + '.txt') 107 | assert os.path.exists(annotation_file), 'annotations not found at {}'.format(annotation_file) 108 | with open(annotation_file, 'r') as f: 109 | annotations = f.readlines() 110 | 111 | imdb = [] 112 | for i in range(self.num_images): 113 | annotation = annotations[i].strip().split(' ') 114 | index = annotation[0] 115 | im_path = self.image_path_from_index(index) 116 | imdb_ = dict() 117 | imdb_['image'] = im_path 118 | if self.mode == 'test': 119 | # gt_boxes = map(float, annotation[1:]) 120 | # boxes = np.array(bbox, dtype=np.float32).reshape(-1, 4) 121 | # imdb_['gt_boxes'] = boxes 122 | pass 123 | else: 124 | label = annotation[1] 125 | imdb_['label'] = int(label) 126 | imdb_['flipped'] = False 127 | imdb_['bbox_target'] = np.zeros((4,)) 128 | if len(annotation[2:]) == 4: 129 | bbox_target = annotation[2:] 130 | imdb_['bbox_target'] = np.array(bbox_target).astype(float) 131 | 132 | imdb.append(imdb_) 133 | return imdb 134 | 135 | 136 | def append_flipped_images(self, imdb): 137 | """append flipped images to imdb 138 | 139 | Parameters: 140 | ---------- 141 | imdb: imdb 142 | image database 143 | Returns: 144 | ------- 145 | imdb: dict 146 | image database with flipped image annotations added 147 | """ 148 | print 'append flipped images to imdb', len(imdb) 149 | for i in range(len(imdb)): 150 | imdb_ = imdb[i] 151 | m_bbox = imdb_['bbox_target'].copy() 152 | m_bbox[0], m_bbox[2] = -m_bbox[2], -m_bbox[0] 153 | 154 | entry = {'image': imdb_['image'], 155 | 'label': imdb_['label'], 156 | 'bbox_target': m_bbox, 157 | 'flipped': True} 158 | 159 | imdb.append(entry) 160 | self.image_set_index *= 2 161 | return imdb 162 | 163 | def write_results(self, all_boxes): 164 | """write results 165 | 166 | Parameters: 167 | ---------- 168 | all_boxes: list of numpy.ndarray 169 | detection results 170 | Returns: 171 | ------- 172 | """ 173 | print 'Writing fddb results' 174 | res_folder = os.path.join(self.cache_path, 'results') 175 | if not os.path.exists(res_folder): 176 | os.makedirs(res_folder) 177 | 178 | # save results to fddb format 179 | filename = os.path.join(res_folder, self.image_set + '-out.txt') 180 | with open(filename, 'w') as f: 181 | for im_ind, index in enumerate(self.image_set_index): 182 | f.write('%s\n'%index) 183 | dets = all_boxes[im_ind] 184 | f.write('%d\n'%dets.shape[0]) 185 | if len(dets) == 0: 186 | continue 187 | for k in range(dets.shape[0]): 188 | f.write('{:.2f} {:.2f} {:.2f} {:.2f} {:.5f}\n'. 
189 | format(dets[k, 0], dets[k, 1], dets[k, 2]-dets[k, 0], dets[k, 3]-dets[k, 1], dets[k, 4])) 190 | -------------------------------------------------------------------------------- /core/loader.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import numpy as np 3 | import minibatch 4 | from config import config 5 | 6 | class TestLoader(mx.io.DataIter): 7 | def __init__(self, imdb, batch_size=1, shuffle=False): 8 | self.imdb = imdb 9 | self.batch_size = batch_size 10 | self.shuffle = shuffle 11 | self.size = len(imdb) 12 | self.index = np.arange(self.size) 13 | 14 | self.cur = 0 15 | self.data = None 16 | self.label = None 17 | 18 | self.data_names = ['data'] 19 | self.label_names = [] 20 | 21 | self.reset() 22 | self.get_batch() 23 | 24 | @property 25 | def provide_data(self): 26 | return [(k, v.shape) for k, v in zip(self.data_names, self.data)] 27 | 28 | @property 29 | def provide_label(self): 30 | return [(k, v.shape) for k, v in zip(self.label_names, self.label)] 31 | 32 | def reset(self): 33 | self.cur = 0 34 | if self.shuffle: 35 | np.random.shuffle(self.index) 36 | 37 | def iter_next(self): 38 | return self.cur + self.batch_size <= self.size 39 | 40 | def next(self): 41 | if self.iter_next(): 42 | self.get_batch() 43 | self.cur += self.batch_size 44 | return mx.io.DataBatch(data=self.data, label=self.label, 45 | pad=self.getpad(), index=self.getindex(), 46 | provide_data=self.provide_data, provide_label=self.provide_label) 47 | else: 48 | raise StopIteration 49 | 50 | def getindex(self): 51 | return self.cur / self.batch_size 52 | 53 | def getpad(self): 54 | if self.cur + self.batch_size > self.size: 55 | return self.cur + self.batch_size - self.size 56 | else: 57 | return 0 58 | 59 | def get_batch(self): 60 | cur_from = self.cur 61 | cur_to = min(cur_from + self.batch_size, self.size) 62 | imdb = [self.imdb[self.index[i]] for i in range(cur_from, cur_to)] 63 | data, label = minibatch.get_testbatch(imdb) 64 | self.data = [mx.nd.array(data[name]) for name in self.data_names] 65 | self.label = [mx.nd.array(label[name]) for name in self.label_names] 66 | 67 | class ImageLoader(mx.io.DataIter): 68 | def __init__(self, imdb, im_size, batch_size=config.BATCH_SIZE, shuffle=False, ctx=None, work_load_list=None): 69 | 70 | super(ImageLoader, self).__init__() 71 | 72 | self.imdb = imdb 73 | self.batch_size = batch_size 74 | self.im_size = im_size 75 | self.shuffle = shuffle 76 | self.ctx = ctx 77 | if self.ctx is None: 78 | self.ctx = [mx.cpu()] 79 | self.work_load_list = work_load_list 80 | 81 | self.cur = 0 82 | self.size = len(imdb) 83 | self.index = np.arange(self.size) 84 | self.num_classes = 2 85 | 86 | self.batch = None 87 | self.data = None 88 | self.label = None 89 | 90 | self.label_names= ['label', 'bbox_target'] 91 | self.reset() 92 | self.get_batch() 93 | 94 | @property 95 | def provide_data(self): 96 | return [('data', self.data[0].shape)] 97 | # return [(k, v.shape) for k, v in zip(self.data_name, self.data)] 98 | 99 | 100 | @property 101 | def provide_label(self): 102 | return [(k, v.shape) for k, v in zip(self.label_names, self.label)] 103 | 104 | 105 | def reset(self): 106 | self.cur = 0 107 | if self.shuffle: 108 | np.random.shuffle(self.index) 109 | 110 | def iter_next(self): 111 | return self.cur + self.batch_size <= self.size 112 | 113 | def next(self): 114 | if self.iter_next(): 115 | self.get_batch() 116 | self.cur += self.batch_size 117 | return mx.io.DataBatch(data=self.data, label=self.label, 118 | 
pad=self.getpad(), index=self.getindex(), 119 | provide_data=self.provide_data, provide_label=self.provide_label) 120 | else: 121 | raise StopIteration 122 | 123 | def getindex(self): 124 | return self.cur / self.batch_size 125 | 126 | def getpad(self): 127 | if self.cur + self.batch_size > self.size: 128 | return self.cur + self.batch_size - self.size 129 | else: 130 | return 0 131 | 132 | def get_batch(self): 133 | cur_from = self.cur 134 | cur_to = min(cur_from + self.batch_size, self.size) 135 | imdb = [self.imdb[self.index[i]] for i in range(cur_from, cur_to)] 136 | data, label = minibatch.get_minibatch(imdb, self.num_classes, self.im_size) 137 | self.data = [mx.nd.array(data['data'])] 138 | self.label = [mx.nd.array(label[name]) for name in self.label_names] 139 | -------------------------------------------------------------------------------- /core/metric.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import numpy as np 3 | from config import config 4 | 5 | 6 | class Accuracy(mx.metric.EvalMetric): 7 | def __init__(self): 8 | super(Accuracy, self).__init__('Accuracy') 9 | 10 | def update(self, labels, preds): 11 | # output: cls_prob_output, bbox_pred_output, cls_keep_inds, bbox_keep_inds 12 | # label: label, bbox_target 13 | pred_label = mx.ndarray.argmax_channel(preds[0]).asnumpy().astype('int32') 14 | label = labels[0].asnumpy() 15 | 16 | # negative mining 17 | cls_keep = preds[2].asnumpy() 18 | keep = np.where(cls_keep == 1)[0] 19 | 20 | pred_label = pred_label[keep] 21 | label = label[keep] 22 | 23 | self.sum_metric += (pred_label.flat == label.flat).sum() 24 | self.num_inst += len(pred_label.flat) 25 | 26 | 27 | class LogLoss(mx.metric.EvalMetric): 28 | def __init__(self): 29 | super(LogLoss, self).__init__('LogLoss') 30 | 31 | def update(self, labels, preds): 32 | # output: cls_prob_output, bbox_pred_output, cls_keep_inds, bbox_keep_inds 33 | # label: label, bbox_target 34 | pred_cls = preds[0].asnumpy() 35 | label = labels[0].asnumpy().astype('int32') 36 | 37 | cls_keep = preds[2].asnumpy() 38 | keep = np.where(cls_keep == 1)[0] 39 | 40 | pred_cls = pred_cls[keep].reshape(-1, 2) 41 | label = label[keep] 42 | 43 | cls = pred_cls[np.arange(label.shape[0]), label.flat] 44 | 45 | cls += config.EPS 46 | cls_loss = -1 * np.log(cls) 47 | 48 | cls_loss = np.sum(cls_loss) 49 | self.sum_metric += cls_loss 50 | self.num_inst += label.shape[0] 51 | 52 | 53 | class BBOX_MSE(mx.metric.EvalMetric): 54 | def __init__(self): 55 | super(BBOX_MSE, self).__init__('BBOX_MSE') 56 | 57 | def update(self,labels, preds): 58 | pred_delta = preds[1].asnumpy() 59 | bbox_target = labels[1].asnumpy() 60 | 61 | bbox_keep = preds[3].asnumpy() 62 | keep = np.where(bbox_keep == 1)[0] 63 | 64 | pred_delta = pred_delta[keep] 65 | bbox_target = bbox_target[keep] 66 | 67 | e = (pred_delta - bbox_target)**2 68 | error = np.sum(e) 69 | self.sum_metric += error 70 | self.num_inst += e.size 71 | -------------------------------------------------------------------------------- /core/minibatch.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from tools import image_processing 3 | from config import config 4 | import numpy as np 5 | 6 | def get_minibatch(imdb, num_classes, im_size): 7 | # im_size: 12, 24 or 48 8 | num_images = len(imdb) 9 | processed_ims = list() 10 | cls_label = list() 11 | bbox_reg_target = list() 12 | for i in range(num_images): 13 | im = cv2.imread(imdb[i]['image']) 14 | h, w, c = 
im.shape 15 | cls = imdb[i]['label'] 16 | bbox_target = imdb[i]['bbox_target'] 17 | 18 | assert h == w == im_size, "image size wrong" 19 | if imdb[i]['flipped']: 20 | im = im[:, ::-1, :] 21 | 22 | im_tensor = image_processing.transform(im) 23 | processed_ims.append(im_tensor) 24 | cls_label.append(cls) 25 | bbox_reg_target.append(bbox_target) 26 | 27 | im_array = np.vstack(processed_ims) 28 | label_array = np.array(cls_label) 29 | bbox_target_array = np.vstack(bbox_reg_target) 30 | ''' 31 | bbox_reg_weight = np.ones(label_array.shape) 32 | invalid = np.where(label_array == 0)[0] 33 | bbox_reg_weight[invalid] = 0 34 | bbox_reg_weight = np.repeat(bbox_reg_weight, 4, axis=1) 35 | ''' 36 | if im_size == 12: 37 | label_array = label_array.reshape(-1, 1) 38 | 39 | data = {'data': im_array} 40 | label = {'label': label_array, 41 | 'bbox_target': bbox_target_array} 42 | 43 | return data, label 44 | 45 | def get_testbatch(imdb): 46 | assert len(imdb) == 1, "Single batch only" 47 | im = cv2.imread(imdb[0]['image']) 48 | im_array = im 49 | data = {'data': im_array} 50 | label = {} 51 | return data, label 52 | -------------------------------------------------------------------------------- /core/negativemining.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import numpy as np 3 | from config import config 4 | 5 | class NegativeMiningOperator(mx.operator.CustomOp): 6 | def __init__(self, cls_ohem=config.CLS_OHEM, cls_ohem_ratio=config.CLS_OHEM_RATIO, 7 | bbox_ohem=config.BBOX_OHEM, bbox_ohem_ratio=config.BBOX_OHEM_RATIO): 8 | super(NegativeMiningOperator, self).__init__() 9 | self.cls_ohem = cls_ohem 10 | self.cls_ohem_ratio = cls_ohem_ratio 11 | self.bbox_ohem = bbox_ohem 12 | self.bbox_ohem_ratio = bbox_ohem_ratio 13 | 14 | def forward(self, is_train, req, in_data, out_data, aux): 15 | cls_prob = in_data[0].asnumpy() # batchsize x 2 x 1 x 1 16 | bbox_pred = in_data[1].asnumpy() # batchsize x 4 17 | label = in_data[2].asnumpy().astype(int) # batchsize x 1 18 | bbox_target = in_data[3].asnumpy() # batchsize x 4 19 | 20 | self.assign(out_data[0], req[0], in_data[0]) 21 | self.assign(out_data[1], req[1], in_data[1]) 22 | 23 | # cls 24 | cls_prob = cls_prob.reshape(-1, 2) 25 | valid_inds = np.where(label > -1)[0] 26 | cls_keep = np.zeros(cls_prob.shape[0]) 27 | 28 | if self.cls_ohem: 29 | keep_num = int(len(valid_inds) * self.cls_ohem_ratio) 30 | cls_valid = cls_prob[valid_inds, :] 31 | label_valid = label.flatten()[valid_inds] 32 | 33 | cls = cls_valid[np.arange(len(valid_inds)), label_valid] + config.EPS 34 | log_loss = - np.log(cls) 35 | keep = np.argsort(log_loss)[::-1][:keep_num] 36 | cls_keep[valid_inds[keep]] = 1 37 | else: 38 | cls_keep[valid_inds] = 1 39 | self.assign(out_data[2], req[2], mx.nd.array(cls_keep)) 40 | 41 | # bbox 42 | valid_inds = np.where(abs(label) == 1)[0] 43 | bbox_keep = np.zeros(cls_prob.shape[0]) 44 | 45 | if self.bbox_ohem: 46 | keep_num = int(len(valid_inds) * self.bbox_ohem_ratio) 47 | bbox_valid = bbox_pred[valid_inds, :] 48 | bbox_target_valid = bbox_target[valid_inds, :] 49 | square_error = np.sum((bbox_valid - bbox_target_valid)**2, axis=1) 50 | keep = np.argsort(square_error)[::-1][:keep_num] 51 | bbox_keep[valid_inds[keep]] = 1 52 | else: 53 | bbox_keep[valid_inds] = 1 54 | self.assign(out_data[3], req[3], mx.nd.array(bbox_keep)) 55 | 56 | 57 | def backward(self, req, out_grad, in_data, out_data, in_grad, aux): 58 | cls_keep = out_data[2].asnumpy().reshape(-1, 1) 59 | bbox_keep = 
out_data[3].asnumpy().reshape(-1, 1) 60 | 61 | cls_grad = np.repeat(cls_keep, 2, axis=1) 62 | bbox_grad = np.repeat(bbox_keep, 4, axis=1) 63 | 64 | cls_grad /= len(np.where(cls_keep == 1)[0]) 65 | bbox_grad /= len(np.where(bbox_keep == 1)[0]) 66 | 67 | cls_grad = cls_grad.reshape(in_data[0].shape) 68 | self.assign(in_grad[0], req[0], mx.nd.array(cls_grad)) 69 | self.assign(in_grad[1], req[1], mx.nd.array(bbox_grad)) 70 | 71 | 72 | @mx.operator.register("negativemining") 73 | class NegativeMiningProp(mx.operator.CustomOpProp): 74 | def __init__(self): 75 | super(NegativeMiningProp, self).__init__(need_top_grad=False) 76 | 77 | def list_arguments(self): 78 | return ['cls_prob', 'bbox_pred', 'label', 'bbox_target'] 79 | 80 | def list_outputs(self): 81 | return ['cls_out', 'bbox_out', 'cls_keep', 'bbox_keep'] 82 | 83 | def infer_shape(self, in_shape): 84 | keep_shape = (in_shape[0][0], ) 85 | return in_shape, [in_shape[0], in_shape[1], keep_shape, keep_shape] 86 | 87 | def create_operator(self, ctx, shapes, dtypes): 88 | return NegativeMiningOperator() 89 | -------------------------------------------------------------------------------- /core/symbol.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import negativemining 3 | from config import config 4 | 5 | def P_Net(mode='train'): 6 | """ 7 | Proposal Network 8 | input shape 3 x 12 x 12 9 | """ 10 | data = mx.symbol.Variable(name="data") 11 | bbox_target = mx.symbol.Variable(name="bbox_target") 12 | label = mx.symbol.Variable(name="label") 13 | 14 | conv1 = mx.symbol.Convolution(data=data, kernel=(3, 3), num_filter=10, name="conv1") 15 | prelu1 = mx.symbol.LeakyReLU(data=conv1, act_type="prelu", name="prelu1") 16 | pool1 = mx.symbol.Pooling(data=prelu1, pool_type="max", pooling_convention="full", kernel=(2, 2), stride=(2, 2), name="pool1") 17 | 18 | conv2 = mx.symbol.Convolution(data=pool1, kernel=(3, 3), num_filter=16, name="conv2") 19 | prelu2 = mx.symbol.LeakyReLU(data=conv2, act_type="prelu", name="prelu2") 20 | 21 | conv3 = mx.symbol.Convolution(data=prelu2, kernel=(3, 3), num_filter=32, name="conv3") 22 | prelu3 = mx.symbol.LeakyReLU(data=conv3, act_type="prelu", name="prelu3") 23 | 24 | conv4_1 = mx.symbol.Convolution(data=prelu3, kernel=(1, 1), num_filter=2, name="conv4_1") 25 | conv4_2 = mx.symbol.Convolution(data=prelu3, kernel=(1, 1), num_filter=4, name="conv4_2") 26 | 27 | if mode == 'test': 28 | cls_prob = mx.symbol.SoftmaxActivation(data=conv4_1, mode="channel", name="cls_prob") 29 | bbox_pred = conv4_2 30 | group = mx.symbol.Group([cls_prob, bbox_pred]) 31 | 32 | else: 33 | cls_prob = mx.symbol.SoftmaxOutput(data=conv4_1, label=label, 34 | multi_output=True, use_ignore=True, 35 | out_grad=True, name="cls_prob") 36 | conv4_2_reshape = mx.symbol.Reshape(data = conv4_2, shape=(-1, 4), name="conv4_2_reshape") 37 | bbox_pred = mx.symbol.LinearRegressionOutput(data=conv4_2_reshape, label=bbox_target, 38 | grad_scale=1, out_grad=True, name="bbox_pred") 39 | 40 | out = mx.symbol.Custom(cls_prob=cls_prob, bbox_pred=bbox_pred, 41 | label=label, bbox_target=bbox_target, 42 | op_type='negativemining', name="negative_mining") 43 | group = mx.symbol.Group([out]) 44 | return group 45 | 46 | 47 | def R_Net(mode='train'): 48 | """ 49 | Refine Network 50 | input shape 3 x 24 x 24 51 | """ 52 | data = mx.symbol.Variable(name="data") 53 | bbox_target = mx.symbol.Variable(name="bbox_target") 54 | label = mx.symbol.Variable(name="label") 55 | 56 | conv1 = mx.symbol.Convolution(data=data, 
kernel=(3, 3), num_filter=28, name="conv1") 57 | prelu1 = mx.symbol.LeakyReLU(data=conv1, act_type="prelu", name="prelu1") 58 | pool1 = mx.symbol.Pooling(data=prelu1, pool_type="max", pooling_convention="full", kernel=(3, 3), stride=(2, 2), name="pool1") 59 | 60 | conv2 = mx.symbol.Convolution(data=pool1, kernel=(3, 3), num_filter=48, name="conv2") 61 | prelu2 = mx.symbol.LeakyReLU(data=conv2, act_type="prelu", name="prelu2") 62 | pool2 = mx.symbol.Pooling(data=prelu2, pool_type="max", pooling_convention="full", kernel=(3, 3), stride=(2, 2), name="pool2") 63 | 64 | conv3 = mx.symbol.Convolution(data=pool2, kernel=(2, 2), num_filter=64, name="conv3") 65 | prelu3 = mx.symbol.LeakyReLU(data=conv3, act_type="prelu", name="prelu3") 66 | 67 | fc1 = mx.symbol.FullyConnected(data=prelu3, num_hidden=128, name="fc1") 68 | prelu4 = mx.symbol.LeakyReLU(data=fc1, act_type="prelu", name="prelu4") 69 | 70 | fc2 = mx.symbol.FullyConnected(data=prelu4, num_hidden=2, name="fc2") 71 | fc3 = mx.symbol.FullyConnected(data=prelu4, num_hidden=4, name="fc3") 72 | 73 | cls_prob = mx.symbol.SoftmaxOutput(data=fc2, label=label, use_ignore=True, 74 | out_grad=True, name="cls_prob") 75 | if mode == 'test': 76 | cls_prob = mx.symbol.SoftmaxOutput(data=fc2, label=label, use_ignore=True, name="cls_prob") 77 | bbox_pred = fc3 78 | group = mx.symbol.Group([cls_prob, bbox_pred]) 79 | else: 80 | bbox_pred = mx.symbol.LinearRegressionOutput(data=fc3, label=bbox_target, 81 | out_grad=True, grad_scale=1, name="bbox_pred") 82 | 83 | out = mx.symbol.Custom(cls_prob=cls_prob, bbox_pred=bbox_pred, label=label, 84 | bbox_target=bbox_target, op_type='negativemining', name="negative_mining") 85 | 86 | group = mx.symbol.Group([out]) 87 | return group 88 | 89 | 90 | def O_Net(mode="train"): 91 | """ 92 | Refine Network 93 | input shape 3 x 48 x 48 94 | """ 95 | data = mx.symbol.Variable(name="data") 96 | bbox_target = mx.symbol.Variable(name="bbox_target") 97 | label = mx.symbol.Variable(name="label") 98 | 99 | conv1 = mx.symbol.Convolution(data=data, kernel=(3, 3), num_filter=32, name="conv1") 100 | prelu1 = mx.symbol.LeakyReLU(data=conv1, act_type="prelu", name="prelu1") 101 | pool1 = mx.symbol.Pooling(data=prelu1, pool_type="max", pooling_convention="full", kernel=(3, 3), stride=(2, 2), name="pool1") 102 | 103 | conv2 = mx.symbol.Convolution(data=pool1, kernel=(3, 3), num_filter=64, name="conv2") 104 | prelu2 = mx.symbol.LeakyReLU(data=conv2, act_type="prelu", name="prelu2") 105 | pool2 = mx.symbol.Pooling(data=prelu2, pool_type="max", pooling_convention="full", kernel=(3, 3), stride=(2, 2), name="pool2") 106 | 107 | conv3 = mx.symbol.Convolution(data=pool2, kernel=(3, 3), num_filter=64, name="conv3") 108 | prelu3 = mx.symbol.LeakyReLU(data=conv3, act_type="prelu", name="prelu3") 109 | pool3 = mx.symbol.Pooling(data=prelu3, pool_type="max", pooling_convention="full", kernel=(2, 2), stride=(2, 2), name="pool3") 110 | 111 | conv4 = mx.symbol.Convolution(data=pool3, kernel=(2, 2), num_filter=128, name="conv4") 112 | prelu4 = mx.symbol.LeakyReLU(data=conv4, act_type="prelu", name="prelu4") 113 | 114 | fc1 = mx.symbol.FullyConnected(data=prelu4, num_hidden=256, name="fc1") 115 | prelu5 = mx.symbol.LeakyReLU(data=fc1, act_type="prelu", name="prelu5") 116 | 117 | fc2 = mx.symbol.FullyConnected(data=prelu5, num_hidden=2, name="fc2") 118 | fc3 = mx.symbol.FullyConnected(data=prelu5, num_hidden=4, name="fc3") 119 | 120 | cls_prob = mx.symbol.SoftmaxOutput(data=fc2, label=label, use_ignore=True, out_grad=True, name="cls_prob") 121 | if mode == 
"test": 122 | bbox_pred = fc3 123 | group = mx.symbol.Group([cls_prob, bbox_pred]) 124 | else: 125 | bbox_pred = mx.symbol.LinearRegressionOutput(data=fc3, label=bbox_target, 126 | grad_scale=1, out_grad=True, name="bbox_pred") 127 | out = mx.symbol.Custom(cls_prob=cls_prob, bbox_pred=bbox_pred, label=label, 128 | bbox_target=bbox_target, op_type='negativemining', name="negative_mining") 129 | group = mx.symbol.Group([out]) 130 | return group 131 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mxnet as mx 3 | import argparse 4 | import cv2 5 | import time 6 | from core.symbol import P_Net, R_Net, O_Net 7 | from core.imdb import IMDB 8 | from config import config 9 | from core.loader import TestLoader 10 | from core.detector import Detector 11 | from core.fcn_detector import FcnDetector 12 | from tools.load_model import load_param 13 | from core.MtcnnDetector import MtcnnDetector 14 | 15 | 16 | def test_net(prefix, epoch, batch_size, ctx, 17 | thresh=[0.6, 0.6, 0.7], min_face_size=24, 18 | stride=2, slide_window=False): 19 | 20 | detectors = [None, None, None] 21 | 22 | # load pnet model 23 | args, auxs = load_param(prefix[0], epoch[0], convert=True, ctx=ctx) 24 | if slide_window: 25 | PNet = Detector(P_Net("test"), 12, batch_size[0], ctx, args, auxs) 26 | else: 27 | PNet = FcnDetector(P_Net("test"), ctx, args, auxs) 28 | detectors[0] = PNet 29 | 30 | # load rnet model 31 | args, auxs = load_param(prefix[1], epoch[0], convert=True, ctx=ctx) 32 | RNet = Detector(R_Net("test"), 24, batch_size[1], ctx, args, auxs) 33 | detectors[1] = RNet 34 | 35 | # load onet model 36 | args, auxs = load_param(prefix[2], epoch[2], convert=True, ctx=ctx) 37 | ONet = Detector(O_Net("test"), 48, batch_size[2], ctx, args, auxs) 38 | detectors[2] = ONet 39 | 40 | mtcnn_detector = MtcnnDetector(detectors=detectors, ctx=ctx, min_face_size=min_face_size, 41 | stride=stride, threshold=thresh, slide_window=slide_window) 42 | 43 | img = cv2.imread('test01.jpg') 44 | t1 = time.time() 45 | 46 | boxes, boxes_c = mtcnn_detector.detect_pnet(img) 47 | boxes, boxes_c = mtcnn_detector.detect_rnet(img, boxes_c) 48 | boxes, boxes_c = mtcnn_detector.detect_onet(img, boxes_c) 49 | 50 | print 'time: ',time.time() - t1 51 | 52 | if boxes_c is not None: 53 | draw = img.copy() 54 | font = cv2.FONT_HERSHEY_SIMPLEX 55 | for b in boxes_c: 56 | cv2.rectangle(draw, (int(b[0]), int(b[1])), (int(b[2]), int(b[3])), (0, 255, 255), 1) 57 | cv2.putText(draw, '%.3f'%b[4], (int(b[0]), int(b[1])), font, 0.4, (255, 255, 255), 1) 58 | 59 | cv2.imshow("detection result", draw) 60 | cv2.waitKey(0) 61 | 62 | 63 | 64 | def parse_args(): 65 | parser = argparse.ArgumentParser(description='Test mtcnn', 66 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 67 | parser.add_argument('--prefix', dest='prefix', help='prefix of model name', nargs="+", 68 | default=['model/pnet', 'model/rnet', 'model/onet'], type=str) 69 | parser.add_argument('--epoch', dest='epoch', help='epoch number of model to load', nargs="+", 70 | default=[16, 16, 16], type=int) 71 | parser.add_argument('--batch_size', dest='batch_size', help='list of batch size used in prediction', nargs="+", 72 | default=[2048, 256, 16], type=int) 73 | parser.add_argument('--thresh', dest='thresh', help='list of thresh for pnet, rnet, onet', nargs="+", 74 | default=[0.5, 0.5, 0.7], type=float) 75 | parser.add_argument('--min_face', 
63 | 
64 | def parse_args():
65 |     parser = argparse.ArgumentParser(description='Test mtcnn',
66 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
67 |     parser.add_argument('--prefix', dest='prefix', help='prefix of model name', nargs="+",
68 |                         default=['model/pnet', 'model/rnet', 'model/onet'], type=str)
69 |     parser.add_argument('--epoch', dest='epoch', help='epoch number of model to load', nargs="+",
70 |                         default=[16, 16, 16], type=int)
71 |     parser.add_argument('--batch_size', dest='batch_size', help='list of batch size used in prediction', nargs="+",
72 |                         default=[2048, 256, 16], type=int)
73 |     parser.add_argument('--thresh', dest='thresh', help='list of thresh for pnet, rnet, onet', nargs="+",
74 |                         default=[0.5, 0.5, 0.7], type=float)
75 |     parser.add_argument('--min_face', dest='min_face', help='minimum face size for detection',
76 |                         default=40, type=int)
77 |     parser.add_argument('--stride', dest='stride', help='stride of sliding window',
78 |                         default=2, type=int)
79 |     parser.add_argument('--sw', dest='slide_window', help='use sliding window in pnet', action='store_true')
80 |     parser.add_argument('--gpu', dest='gpu_id', help='GPU device to train with',
81 |                         default=0, type=int)
82 |     args = parser.parse_args()
83 |     return args
84 | 
85 | if __name__ == '__main__':
86 |     args = parse_args()
87 |     print 'Called with argument:'
88 |     print args
89 |     ctx = mx.gpu(args.gpu_id)
90 |     if args.gpu_id == -1:
91 |         ctx = mx.cpu(0)
92 |     test_net(args.prefix, args.epoch, args.batch_size,
93 |              ctx, args.thresh, args.min_face,
94 |              args.stride, args.slide_window)
95 | 
--------------------------------------------------------------------------------
/example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/example/__init__.py
--------------------------------------------------------------------------------
/example/train.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import mxnet as mx
3 | import core.metric as metric
4 | from mxnet.module.module import Module
5 | from core.loader import ImageLoader
6 | from core.imdb import IMDB
7 | from config import config
8 | from tools.load_model import load_param
9 | 
10 | def train_net(sym, prefix, ctx, pretrained, epoch, begin_epoch, end_epoch, imdb,
11 |               net=12, frequent=50, initialize=True, base_lr=0.01):
12 |     logger = logging.getLogger()
13 |     logger.setLevel(logging.INFO)
14 | 
15 |     train_data = ImageLoader(imdb, net, config.BATCH_SIZE, shuffle=True, ctx=ctx)
16 | 
17 |     if not initialize:
18 |         args, auxs = load_param(pretrained, epoch, convert=True)
19 | 
20 |     if initialize:
21 |         print "init weights and bias:"
22 |         data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
23 |         arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
24 |         arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
25 |         aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
26 |         init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
27 |         args = dict()
28 |         auxs = dict()
29 | 
30 |         for k in sym.list_arguments():
31 |             if k in data_shape_dict:
32 |                 continue
33 | 
34 |             print 'init', k
35 | 
36 |             args[k] = mx.nd.zeros(arg_shape_dict[k])
37 |             init(k, args[k])
38 |             if k.startswith('fc'):
39 |                 args[k][:] /= 10
40 | 
41 |             '''
42 |             if k.endswith('weight'):
43 |                 if k.startswith('conv'):
44 |                     args[k] = mx.random.normal(loc=0, scale=0.001, shape=arg_shape_dict[k])
45 |                 else:
46 |                     args[k] = mx.random.normal(loc=0, scale=0.01, shape=arg_shape_dict[k])
47 |             else:  # bias
48 |                 args[k] = mx.nd.zeros(shape=arg_shape_dict[k])
49 |             '''
50 | 
51 |         for k in sym.list_auxiliary_states():
52 |             auxs[k] = mx.nd.zeros(aux_shape_dict[k])
53 |             init(k, auxs[k])
54 | 
55 |     lr_factor = 0.1
56 |     lr_epoch = config.LR_EPOCH
57 |     lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
58 |     lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
59 |     lr_iters = [int(epoch * len(imdb) / config.BATCH_SIZE) for epoch in lr_epoch_diff]
60 |     print 'lr', lr, 'lr_epoch', lr_epoch, 'lr_epoch_diff', lr_epoch_diff
61 |     lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)
62 | 
63 |     data_names = [k[0] for k in 
train_data.provide_data] 64 | label_names = [k[0] for k in train_data.provide_label] 65 | 66 | batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent) 67 | epoch_end_callback = mx.callback.do_checkpoint(prefix) 68 | eval_metrics = mx.metric.CompositeEvalMetric() 69 | metric1 = metric.Accuracy() 70 | metric2 = metric.LogLoss() 71 | metric3 = metric.BBOX_MSE() 72 | for child_metric in [metric1, metric2, metric3]: 73 | eval_metrics.add(child_metric) 74 | optimizer_params = {'momentum': 0.9, 75 | 'wd': 0.00001, 76 | 'learning_rate': lr, 77 | 'lr_scheduler': lr_scheduler, 78 | 'rescale_grad': 1.0} 79 | 80 | mod = Module(sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx) 81 | mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, 82 | batch_end_callback=batch_end_callback, 83 | optimizer='sgd', optimizer_params=optimizer_params, 84 | arg_params=args, aux_params=auxs, begin_epoch=begin_epoch, num_epoch=end_epoch) 85 | 86 | -------------------------------------------------------------------------------- /example/train_O_net.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mxnet as mx 3 | from core.imdb import IMDB 4 | from train import train_net 5 | from core.symbol import O_Net 6 | 7 | def train_O_net(image_set, root_path, dataset_path, prefix, ctx, 8 | pretrained, epoch, begin_epoch, 9 | end_epoch, frequent, lr, resume): 10 | imdb = IMDB("mtcnn", image_set, root_path, dataset_path) 11 | gt_imdb = imdb.gt_imdb() 12 | gt_imdb = imdb.append_flipped_images(gt_imdb) 13 | sym = O_Net() 14 | 15 | train_net(sym, prefix, ctx, pretrained, epoch, begin_epoch, end_epoch, gt_imdb, 16 | 48, frequent, not resume, lr) 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(description='Train O_net(48-net)', 20 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 21 | parser.add_argument('--image_set', dest='image_set', help='training set', 22 | default='train_48', type=str) 23 | parser.add_argument('--root_path', dest='root_path', help='output data folder', 24 | default='data', type=str) 25 | parser.add_argument('--dataset_path', dest='dataset_path', help='dataset folder', 26 | default='data/mtcnn', type=str) 27 | parser.add_argument('--prefix', dest='prefix', help='new model prefix', 28 | default='model/onet', type=str) 29 | parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with', 30 | default='0', type=str) 31 | parser.add_argument('--pretrained', dest='pretrained', help='pretrained prefix', 32 | default='model/onet', type=str) 33 | parser.add_argument('--epoch', dest='epoch', help='load epoch', 34 | default=0, type=int) 35 | parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training', 36 | default=0, type=int) 37 | parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training', 38 | default=16, type=int) 39 | parser.add_argument('--frequent', dest='frequent', help='frequency of logging', 40 | default=200, type=int) 41 | parser.add_argument('--lr', dest='lr', help='learning rate', 42 | default=0.01, type=float) 43 | parser.add_argument('--resume', dest='resume', help='continue training', action='store_true') 44 | args = parser.parse_args() 45 | return args 46 | 47 | if __name__ == '__main__': 48 | args = parse_args() 49 | print 'Called with argument:' 50 | print args 51 | ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')] 52 | train_O_net(args.image_set, 
53 |                 ctx, args.pretrained, args.epoch, args.begin_epoch,
54 |                 args.end_epoch, args.frequent, args.lr, args.resume)
55 | 
--------------------------------------------------------------------------------
/example/train_P_net.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import mxnet as mx
3 | from core.imdb import IMDB
4 | from train import train_net
5 | from core.symbol import P_Net
6 | 
7 | def train_P_net(image_set, root_path, dataset_path, prefix, ctx,
8 |                 pretrained, epoch, begin_epoch,
9 |                 end_epoch, frequent, lr, resume):
10 |     imdb = IMDB("mtcnn", image_set, root_path, dataset_path)
11 |     gt_imdb = imdb.gt_imdb()
12 |     gt_imdb = imdb.append_flipped_images(gt_imdb)
13 |     sym = P_Net()
14 | 
15 |     train_net(sym, prefix, ctx, pretrained, epoch, begin_epoch, end_epoch, gt_imdb,
16 |               12, frequent, not resume, lr)
17 | 
18 | def parse_args():
19 |     parser = argparse.ArgumentParser(description='Train proposal net(12-net)',
20 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
21 |     parser.add_argument('--image_set', dest='image_set', help='training set',
22 |                         default='train_12', type=str)
23 |     parser.add_argument('--root_path', dest='root_path', help='output data folder',
24 |                         default='data', type=str)
25 |     parser.add_argument('--dataset_path', dest='dataset_path', help='dataset folder',
26 |                         default='data/mtcnn', type=str)
27 |     parser.add_argument('--prefix', dest='prefix', help='new model prefix',
28 |                         default='model/pnet', type=str)
29 |     parser.add_argument('--gpus', dest='gpu_ids', help='GPU devices to train with, e.g. 0 or 0,1',
30 |                         default='0', type=str)
31 |     parser.add_argument('--pretrained', dest='pretrained', help='pretrained prefix',
32 |                         default='model/pnet', type=str)
33 |     parser.add_argument('--epoch', dest='epoch', help='load epoch',
34 |                         default=0, type=int)
35 |     parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
36 |                         default=0, type=int)
37 |     parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training',
38 |                         default=16, type=int)
39 |     parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
40 |                         default=200, type=int)
41 |     parser.add_argument('--lr', dest='lr', help='learning rate',
42 |                         default=0.01, type=float)
43 |     parser.add_argument('--resume', dest='resume', help='continue training', action='store_true')
44 |     args = parser.parse_args()
45 |     return args
46 | 
47 | if __name__ == '__main__':
48 |     args = parse_args()
49 |     print 'Called with arguments:'
50 |     print args
51 |     ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]
52 |     train_P_net(args.image_set, args.root_path, args.dataset_path, args.prefix, ctx,
53 |                 args.pretrained, args.epoch,
54 |                 args.begin_epoch, args.end_epoch, args.frequent, args.lr, args.resume)
55 | 
--------------------------------------------------------------------------------
/example/train_R_net.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import mxnet as mx
3 | from core.imdb import IMDB
4 | from train import train_net
5 | from core.symbol import R_Net
6 | 
7 | def train_R_net(image_set, root_path, dataset_path, prefix, ctx,
8 |                 pretrained, epoch, begin_epoch,
9 |                 end_epoch, frequent, lr, resume):
10 |     imdb = IMDB("mtcnn", image_set, root_path, dataset_path)
11 |     gt_imdb = imdb.gt_imdb()
12 |     gt_imdb = imdb.append_flipped_images(gt_imdb)
13 |     sym = R_Net()
14 | 
15 |     train_net(sym, prefix, ctx, pretrained, epoch, begin_epoch, end_epoch, gt_imdb,
16 |               24, frequent, not resume, lr)
17 | 
18 | def parse_args():
19 |     parser = argparse.ArgumentParser(description='Train refine net(24-net)',
20 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
21 |     parser.add_argument('--image_set', dest='image_set', help='training set',
22 |                         default='train_24', type=str)
23 |     parser.add_argument('--root_path', dest='root_path', help='output data folder',
24 |                         default='data', type=str)
25 |     parser.add_argument('--dataset_path', dest='dataset_path', help='dataset folder',
26 |                         default='data/mtcnn', type=str)
27 |     parser.add_argument('--prefix', dest='prefix', help='new model prefix',
28 |                         default='model/rnet', type=str)
29 |     parser.add_argument('--gpus', dest='gpu_ids', help='GPU devices to train with, e.g. 0 or 0,1',
30 |                         default='0', type=str)
31 |     parser.add_argument('--pretrained', dest='pretrained', help='pretrained prefix',
32 |                         default='model/rnet', type=str)
33 |     parser.add_argument('--epoch', dest='epoch', help='load epoch',
34 |                         default=0, type=int)
35 |     parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training',
36 |                         default=0, type=int)
37 |     parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training',
38 |                         default=16, type=int)
39 |     parser.add_argument('--frequent', dest='frequent', help='frequency of logging',
40 |                         default=200, type=int)
41 |     parser.add_argument('--lr', dest='lr', help='learning rate',
42 |                         default=0.01, type=float)
43 |     parser.add_argument('--resume', dest='resume', help='continue training', action='store_true')
44 |     args = parser.parse_args()
45 |     return args
46 | 
47 | if __name__ == '__main__':
48 |     args = parse_args()
49 |     print 'Called with arguments:'
50 |     print args
51 |     ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')]
52 |     train_R_net(args.image_set, args.root_path, args.dataset_path, args.prefix,
53 |                 ctx, args.pretrained, args.epoch, args.begin_epoch,
54 |                 args.end_epoch, args.frequent, args.lr, args.resume)
55 | 
--------------------------------------------------------------------------------
/fddb_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/fddb_result.png
--------------------------------------------------------------------------------
/model/onet-0016.params:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/model/onet-0016.params
--------------------------------------------------------------------------------
/model/pnet-0016.params:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/model/pnet-0016.params
--------------------------------------------------------------------------------
/model/rnet-0016.params:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/model/rnet-0016.params
--------------------------------------------------------------------------------
/mxnet_diff.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/operator/regression_output-inl.h b/src/operator/regression_output-inl.h
2 | index d70066d..acebc2c 100644
3 | --- a/src/operator/regression_output-inl.h
4 | +++ b/src/operator/regression_output-inl.h
5 | @@ -25,9 +25,12 @@ enum RegressionOutputType {kLinear, kLogistic, kMAE};
6 | 
7 |  struct RegressionOutputParam : public dmlc::Parameter<RegressionOutputParam> {
8 |    float grad_scale;
9 | +  bool out_grad;
10 |    DMLC_DECLARE_PARAMETER(RegressionOutputParam) {
11 |      DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f)
12 |      .describe("Scale the gradient by a float factor");
13 | +    DMLC_DECLARE_FIELD(out_grad).set_default(false)
14 | +    .describe("Apply weighting from output gradient");
15 |    };
16 |  };
17 | 
18 | @@ -75,6 +78,10 @@ class RegressionOutputOp : public Operator {
19 |          .get_with_shape<xpu, 2, real_t>(out.shape_, s);
20 |      Assign(grad, req[reg_enum::kData], param_.grad_scale/num_output*
21 |             F<BackwardOp>(out, reshape(label, grad.shape_)));
22 | +    if (param_.out_grad) {
23 | +      Tensor<xpu, 2> ograd = out_grad[reg_enum::kOut].FlatTo2D<xpu, real_t>(s);
24 | +      grad *= ograd;
25 | +    }
26 |    }
27 | 
28 |   private:
29 | @@ -148,7 +155,12 @@ class RegressionOutputProp : public OperatorProperty {
30 |                                               const std::vector<int> &out_grad,
31 |                                               const std::vector<int> &in_data,
32 |                                               const std::vector<int> &out_data) const override {
33 | -    return {in_data[reg_enum::kLabel], out_data[reg_enum::kOut]};
34 | +    if (param_.out_grad) {
35 | +      return {in_data[reg_enum::kLabel], out_data[reg_enum::kOut],
36 | +              out_grad[reg_enum::kOut]};
37 | +    } else {
38 | +      return {in_data[reg_enum::kLabel], out_data[reg_enum::kOut]};
39 | +    }
40 |    }
41 | 
42 |    std::vector<std::pair<int, void*> > BackwardInplaceOption(
43 | 
--------------------------------------------------------------------------------
/prepare_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/prepare_data/__init__.py
--------------------------------------------------------------------------------
/prepare_data/gen_hard_example.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import mxnet as mx
3 | import argparse
4 | import os
5 | import cPickle
6 | import cv2
7 | from core.symbol import P_Net, R_Net, O_Net
8 | from core.imdb import IMDB
9 | from config import config
10 | from core.loader import TestLoader
11 | from core.detector import Detector
12 | from core.fcn_detector import FcnDetector
13 | from tools.load_model import load_param
14 | from core.MtcnnDetector import MtcnnDetector
15 | from utils import *
16 | 
17 | def save_hard_example(net):
18 | 
19 |     image_dir = "./data/wider/images"
20 |     if net == "rnet":
21 |         image_size = 24
22 |     elif net == "onet":
23 |         image_size = 48
24 |     # hard-coded output dirs from the author's setup, sized to match the target net
25 |     neg_save_dir = "/data3/seanlx/mtcnn1/%d/negative"%image_size
26 |     pos_save_dir = "/data3/seanlx/mtcnn1/%d/positive"%image_size
27 |     part_save_dir = "/data3/seanlx/mtcnn1/%d/part"%image_size
28 | 
29 |     # load ground truth from annotation file
30 |     # format of each line: image/path [x1,y1,x2,y2] for each gt_box in this image
31 |     anno_file = './prepare_data/wider_annotations/anno.txt'
32 |     with open(anno_file, 'r') as f:
33 |         annotations = f.readlines()
34 | 
35 |     im_idx_list = list()
36 |     gt_boxes_list = list()
37 |     num_of_images = len(annotations)
38 |     print "processing %d images in total"%num_of_images
39 | 
40 |     for annotation in annotations:
41 |         annotation = annotation.strip().split(' ')
42 |         im_idx = annotation[0]
43 | 
44 |         boxes = map(float, annotation[1:])
45 |         boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4)
46 |         im_idx_list.append(im_idx)
47 |         gt_boxes_list.append(boxes)
48 | 
49 |     save_path = "./prepare_data/%s"%net
50 |     f1 = open(os.path.join(save_path, 'pos_%d.txt'%image_size), 'w')
51 |     f2 = open(os.path.join(save_path, 'neg_%d.txt'%image_size), 'w')
52 |     f3 = open(os.path.join(save_path, 'part_%d.txt'%image_size), 'w')
53 | 
54 |     det_boxes = cPickle.load(open(os.path.join(save_path, 'detections.pkl'), 'rb'))
55 |     assert len(det_boxes) == num_of_images, "incorrect detections or ground truths"
56 | 
57 |     # index of neg, pos and part face, used as their image names
58 |     n_idx = 0
59 |     p_idx = 0
60 |     d_idx = 0
61 |     image_done = 0
62 |     for im_idx, dets, gts in zip(im_idx_list, det_boxes, gt_boxes_list):
63 |         if image_done % 100 == 0:
64 |             print "%d images done"%image_done
65 |         image_done += 1
66 | 
67 |         if dets.shape[0] == 0:
68 |             continue
69 |         img = cv2.imread(os.path.join(image_dir, im_idx+'.jpg'))
70 |         dets = convert_to_square(dets)
71 |         dets[:, 0:4] = np.round(dets[:, 0:4])
72 | 
73 |         for box in dets:
74 |             x_left, y_top, x_right, y_bottom, _ = box.astype(int)
75 |             width = x_right - x_left + 1
76 |             height = y_bottom - y_top + 1
77 | 
78 |             # ignore box that is too small or beyond image border
79 |             if width < 20 or x_left < 0 or y_top < 0 or x_right > img.shape[1] - 1 or y_bottom > img.shape[0] - 1:
80 |                 continue
81 | 
82 |             # compute intersection over union(IoU) between current box and all gt boxes
83 |             Iou = IoU(box, gts)
84 |             cropped_im = img[y_top:y_bottom + 1, x_left:x_right + 1, :]
85 |             resized_im = cv2.resize(cropped_im, (image_size, image_size),
86 |                                     interpolation=cv2.INTER_LINEAR)
87 | 
88 |             # save negative images and write label
89 |             if np.max(Iou) < 0.3:
90 |                 # IoU with all gts must be below 0.3
91 |                 save_file = os.path.join(neg_save_dir, "%s.jpg"%n_idx)
92 |                 f2.write("%s/negative/%s"%(image_size, n_idx) + ' 0\n')
93 |                 cv2.imwrite(save_file, resized_im)
94 |                 n_idx += 1
95 |             else:
96 |                 # find gt_box with the highest iou
97 |                 idx = np.argmax(Iou)
98 |                 assigned_gt = gts[idx]
99 |                 x1, y1, x2, y2 = assigned_gt
100 | 
101 |                 # compute bbox reg label
102 |                 offset_x1 = (x1 - x_left) / float(width)
103 |                 offset_y1 = (y1 - y_top) / float(height)
104 |                 offset_x2 = (x2 - x_right) / float(width)
105 |                 offset_y2 = (y2 - y_bottom) / float(height)
106 | 
107 |                 # save positive and part-face images and write labels
108 |                 if np.max(Iou) >= 0.65:
109 |                     save_file = os.path.join(pos_save_dir, "%s.jpg"%p_idx)
110 |                     f1.write("%s/positive/%s"%(image_size, p_idx) + ' 1 %.2f %.2f %.2f %.2f\n'%(offset_x1, offset_y1, offset_x2, offset_y2))
111 |                     cv2.imwrite(save_file, resized_im)
112 |                     p_idx += 1
113 | 
114 |                 elif np.max(Iou) >= 0.4:
115 |                     save_file = os.path.join(part_save_dir, "%s.jpg"%d_idx)
116 |                     f3.write("%s/part/%s"%(image_size, d_idx) + ' -1 %.2f %.2f %.2f %.2f\n'%(offset_x1, offset_y1, offset_x2, offset_y2))
117 |                     cv2.imwrite(save_file, resized_im)
118 |                     d_idx += 1
119 |     f1.close()
120 |     f2.close()
121 |     f3.close()
122 | 
123 | def test_net(root_path, dataset_path, image_set, prefix, epoch,
124 |              batch_size, ctx, test_mode="rnet",
125 |              thresh=[0.6, 0.6, 0.7], min_face_size=24,
126 |              stride=2, slide_window=False, shuffle=False, vis=False):
127 | 
128 |     detectors = [None, None, None]
129 | 
130 |     # load pnet model
131 |     args, auxs = load_param(prefix[0], epoch[0], convert=True, ctx=ctx)
132 |     if slide_window:
133 |         PNet = Detector(P_Net("test"), 12, batch_size[0], ctx, args, auxs)
134 |     else:
135 |         PNet = FcnDetector(P_Net("test"), ctx, args, auxs)
136 |     detectors[0] = PNet
137 | 
138 |     # load rnet model
139 |     if test_mode in ["rnet", "onet"]:
140 |         args, auxs = load_param(prefix[1], epoch[1], convert=True, ctx=ctx)  # epoch[1] is the rnet checkpoint epoch
141 |         RNet = Detector(R_Net("test"), 24, batch_size[1], ctx, args, auxs)
142 |         detectors[1] = RNet
143 | 
144 |     # load onet model
145 |     if test_mode == "onet":
146 |         args, auxs = load_param(prefix[2], epoch[2], convert=True, ctx=ctx)
147 |         ONet = Detector(O_Net("test"), 48, batch_size[2], ctx, args, auxs)
148 |         detectors[2] = ONet
149 | 
150 |     mtcnn_detector = MtcnnDetector(detectors=detectors, ctx=ctx, min_face_size=min_face_size,
151 |                                    stride=stride, threshold=thresh, slide_window=slide_window)
152 | 
153 | 
154 |     imdb = IMDB("wider", image_set, root_path, dataset_path, 'test')
155 |     gt_imdb = imdb.gt_imdb()
156 | 
157 |     test_data = TestLoader(gt_imdb)
158 |     detections = mtcnn_detector.detect_face(imdb, test_data, vis=vis)
159 | 
160 |     if test_mode == "pnet":
161 |         net = "rnet"
162 |     elif test_mode == "rnet":
163 |         net = "onet"
164 |     else:
165 |         raise ValueError("test_mode should be 'pnet' or 'rnet' when mining hard examples")
166 |     save_path = "./prepare_data/%s"%net
167 |     if not os.path.exists(save_path):
168 |         os.mkdir(save_path)
169 |     save_file = os.path.join(save_path, "detections.pkl")
170 |     with open(save_file, 'wb') as f:
171 |         cPickle.dump(detections, f, cPickle.HIGHEST_PROTOCOL)
172 | 
173 |     save_hard_example(net)
174 | 
175 | def parse_args():
176 |     parser = argparse.ArgumentParser(description='Test mtcnn',
177 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
178 |     parser.add_argument('--root_path', dest='root_path', help='output data folder',
179 |                         default='data', type=str)
180 |     parser.add_argument('--dataset_path', dest='dataset_path', help='dataset folder',
181 |                         default='data/wider', type=str)
182 |     parser.add_argument('--image_set', dest='image_set', help='image set',
183 |                         default='train', type=str)
184 |     parser.add_argument('--test_mode', dest='test_mode', help='net to run: pnet or rnet (training data is mined for the next net)',
185 |                         default='pnet', type=str)
186 |     parser.add_argument('--prefix', dest='prefix', help='prefix of model name', nargs="+",
187 |                         default=['model/pnet', 'model/rnet', 'model/onet'], type=str)
188 |     parser.add_argument('--epoch', dest='epoch', help='epoch number of model to load', nargs="+",
189 |                         default=[16, 16, 16], type=int)
190 |     parser.add_argument('--batch_size', dest='batch_size', help='list of batch size used in prediction', nargs="+",
191 |                         default=[2048, 256, 16], type=int)
192 |     parser.add_argument('--thresh', dest='thresh', help='list of thresh for pnet, rnet, onet', nargs="+",
193 |                         default=[0.6, 0.7, 0.7], type=float)
194 |     parser.add_argument('--min_face', dest='min_face', help='minimum face size for detection',
195 |                         default=24, type=int)
196 |     parser.add_argument('--stride', dest='stride', help='stride of sliding window',
197 |                         default=2, type=int)
198 |     parser.add_argument('--sw', dest='slide_window', help='use sliding window in pnet', action='store_true')
199 |     parser.add_argument('--gpu', dest='gpu_id', help='GPU device to use, -1 for CPU',
200 |                         default=0, type=int)
201 |     parser.add_argument('--shuffle', dest='shuffle', help='shuffle data on visualization', action='store_true')
202 |     parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
203 |     args = parser.parse_args()
204 |     return args
205 | 
206 | if __name__ == '__main__':
207 |     args = parse_args()
208 |     print 'Called with arguments:'
209 |     print args
210 |     ctx = mx.gpu(args.gpu_id)
211 |     if args.gpu_id == -1:
212 |         ctx = mx.cpu(0)
213 |     test_net(args.root_path, args.dataset_path, args.image_set, args.prefix,
214 |              args.epoch, args.batch_size, ctx, args.test_mode,
215 |              args.thresh, args.min_face, args.stride,
216 |              args.slide_window, args.shuffle, args.vis)
217 | 
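218 | # Usage sketch (illustrative, not part of the original script; flags shown are the
219 | # argparse defaults above -- point --prefix/--epoch at your own checkpoints):
220 | #   python prepare_data/gen_hard_example.py --test_mode pnet   # mines training data for rnet
221 | #   python prepare_data/gen_hard_example.py --test_mode rnet   # mines training data for onet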
--------------------------------------------------------------------------------
/prepare_data/gen_imglist.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy.random as npr
3 | 
4 | size = 12  # set to 12/24/48 to build the imglist for pnet/rnet/onet
5 | 
6 | if size == 12:
7 |     net = "pnet"
8 | elif size == 24:
9 |     net = "rnet"
10 | elif size == 48:
11 |     net = "onet"
12 | 
13 | with open('%s/pos_%s.txt'%(net, size), 'r') as f:
14 |     pos = f.readlines()
15 | 
16 | with open('%s/neg_%s.txt'%(net, size), 'r') as f:
17 |     neg = f.readlines()
18 | 
19 | with open('%s/part_%s.txt'%(net, size), 'r') as f:
20 |     part = f.readlines()
21 | 
22 | 
23 | with open("%s/train_%s.txt"%(net, size), "w") as f:
24 |     f.writelines(pos)
25 |     neg_keep = npr.choice(len(neg), size=min(600000, len(neg)), replace=False)  # clamp so choice() cannot oversample
26 |     part_keep = npr.choice(len(part), size=min(300000, len(part)), replace=False)
27 |     for i in neg_keep:
28 |         f.write(neg[i])
29 |     for i in part_keep:
30 |         f.write(part[i])
31 | 
--------------------------------------------------------------------------------
/prepare_data/gen_pnet_data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | import os
4 | import numpy.random as npr
5 | from utils import IoU
6 | 
7 | anno_file = "./wider_annotations/anno.txt"
8 | im_dir = "/home/seanlx/Dataset/wider_face/WIDER_train/images"
9 | neg_save_dir = "/data3/seanlx/mtcnn1/12/negative"
10 | pos_save_dir = "/data3/seanlx/mtcnn1/12/positive"
11 | part_save_dir = "/data3/seanlx/mtcnn1/12/part"
12 | 
13 | save_dir = "./pnet"
14 | if not os.path.exists(save_dir):
15 |     os.mkdir(save_dir)
16 | f1 = open(os.path.join(save_dir, 'pos_12.txt'), 'w')
17 | f2 = open(os.path.join(save_dir, 'neg_12.txt'), 'w')
18 | f3 = open(os.path.join(save_dir, 'part_12.txt'), 'w')
19 | 
20 | with open(anno_file, 'r') as f:
21 |     annotations = f.readlines()
22 | 
23 | num = len(annotations)
24 | print "%d images in total" % num
25 | p_idx = 0 # positive
26 | n_idx = 0 # negative
27 | d_idx = 0 # don't care (part faces)
28 | idx = 0
29 | box_idx = 0
30 | for annotation in annotations:
31 |     annotation = annotation.strip().split(' ')
32 |     im_path = annotation[0]
33 |     bbox = map(float, annotation[1:])
34 |     boxes = np.array(bbox, dtype=np.float32).reshape(-1, 4)
35 |     img = cv2.imread(os.path.join(im_dir, im_path + '.jpg'))
36 |     idx += 1
37 |     if idx % 100 == 0:
38 |         print idx, "images done"
39 | 
40 |     height, width, channel = img.shape
41 | 
42 |     neg_num = 0
43 |     while neg_num < 50:
44 |         size = npr.randint(12, min(width, height) / 2)
45 |         nx = npr.randint(0, width - size)
46 |         ny = npr.randint(0, height - size)
47 |         crop_box = np.array([nx, ny, nx + size, ny + size])
48 | 
49 |         Iou = IoU(crop_box, boxes)
50 | 
51 |         cropped_im = img[ny : ny + size, nx : nx + size, :]
52 |         resized_im = cv2.resize(cropped_im, (12, 12), interpolation=cv2.INTER_LINEAR)
53 | 
54 |         if np.max(Iou) < 0.3:
55 |             # IoU with all gts must be below 0.3
56 |             save_file = os.path.join(neg_save_dir, "%s.jpg"%n_idx)
57 |             f2.write("12/negative/%s"%n_idx + ' 0\n')
58 |             cv2.imwrite(save_file, resized_im)
59 |             n_idx += 1
60 |             neg_num += 1
61 | 
62 | 
63 |     for box in boxes:
64 |         # box (x_left, y_top, x_right, y_bottom)
65 |         x1, y1, x2, y2 = box
66 |         w = x2 - x1 + 1
67 |         h = y2 - y1 + 1
68 | 
69 |         # ignore small faces
70 |         # in case the ground truth boxes of small faces are not accurate
71 |         if max(w, h) < 40 or x1 < 0 or y1 < 0:
72 |             continue
73 | 
74 |         # generate negative examples that have overlap with gt
75 |         for i in range(5):
76 |             size = npr.randint(12, min(width, height) / 2)
77 |             # delta_x and delta_y are offsets of (x1, y1)
78 |             delta_x = npr.randint(int(max(-size, -x1)), int(w))
79 |             delta_y = npr.randint(int(max(-size, -y1)), int(h))
80 |             nx1 = int(max(0, x1 + delta_x))  # cast to int so the slice below is valid
81 |             ny1 = int(max(0, y1 + delta_y))
82 |             if nx1 + size > width or ny1 + size > height:
83 |                 continue
84 |             crop_box = np.array([nx1, ny1, nx1 + size, ny1 + size])
85 |             Iou = IoU(crop_box, boxes)
86 | 
87 |             cropped_im = img[ny1 : ny1 + size, nx1 : nx1 + size, :]
88 |             resized_im = cv2.resize(cropped_im, (12, 12), interpolation=cv2.INTER_LINEAR)
89 | 
90 |             if np.max(Iou) < 0.3:
91 |                 # IoU with all gts must be below 0.3
92 |                 save_file = os.path.join(neg_save_dir, "%s.jpg"%n_idx)
93 |                 f2.write("12/negative/%s"%n_idx + ' 0\n')
94 |                 cv2.imwrite(save_file, resized_im)
95 |                 n_idx += 1
96 | 
97 |         # generate positive examples and part faces
98 |         for i in range(20):
99 |             size = npr.randint(int(min(w, h) * 0.8), int(np.ceil(1.25 * max(w, h))))
100 | 
101 |             # delta here is the offset of box center; +1 keeps the range non-empty for tiny boxes
102 |             delta_x = npr.randint(int(-w * 0.2), int(w * 0.2) + 1)
103 |             delta_y = npr.randint(int(-h * 0.2), int(h * 0.2) + 1)
104 | 
105 |             nx1 = int(max(x1 + w / 2 + delta_x - size / 2, 0))
106 |             ny1 = int(max(y1 + h / 2 + delta_y - size / 2, 0))
107 |             nx2 = nx1 + size
108 |             ny2 = ny1 + size
109 | 
110 |             if nx2 > width or ny2 > height:
111 |                 continue
112 |             crop_box = np.array([nx1, ny1, nx2, ny2])
113 | 
114 |             offset_x1 = (x1 - nx1) / float(size)
115 |             offset_y1 = (y1 - ny1) / float(size)
116 |             offset_x2 = (x2 - nx2) / float(size)
117 |             offset_y2 = (y2 - ny2) / float(size)
118 | 
119 |             cropped_im = img[ny1 : ny2, nx1 : nx2, :]
120 |             resized_im = cv2.resize(cropped_im, (12, 12), interpolation=cv2.INTER_LINEAR)
121 | 
122 |             box_ = box.reshape(1, -1)
123 |             if IoU(crop_box, box_) >= 0.65:
124 |                 save_file = os.path.join(pos_save_dir, "%s.jpg"%p_idx)
125 |                 f1.write("12/positive/%s"%p_idx + ' 1 %.2f %.2f %.2f %.2f\n'%(offset_x1, offset_y1, offset_x2, offset_y2))
126 |                 cv2.imwrite(save_file, resized_im)
127 |                 p_idx += 1
128 |             elif IoU(crop_box, box_) >= 0.4:
129 |                 save_file = os.path.join(part_save_dir, "%s.jpg"%d_idx)
130 |                 f3.write("12/part/%s"%d_idx + ' -1 %.2f %.2f %.2f %.2f\n'%(offset_x1, offset_y1, offset_x2, offset_y2))
131 |                 cv2.imwrite(save_file, resized_im)
132 |                 d_idx += 1
133 |         box_idx += 1
134 |     print "%s images done, pos: %s part: %s neg: %s"%(idx, p_idx, d_idx, n_idx)
135 | 
136 | f1.close()
137 | f2.close()
138 | f3.close()
139 | 
--------------------------------------------------------------------------------
/prepare_data/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def IoU(box, boxes):
4 |     """Compute IoU between detect box and gt boxes
5 | 
6 |     Parameters:
7 |     ----------
8 |     box: numpy array, shape (5, ): x1, y1, x2, y2, score
9 |         input box
10 |     boxes: numpy array, shape (n, 4): x1, y1, x2, y2
11 |         input ground truth boxes
12 | 
13 |     Returns:
14 |     -------
15 |     ovr: numpy.array, shape (n, )
16 |         IoU
17 |     """
18 |     box_area = (box[2] - box[0] + 1) * (box[3] - box[1] + 1)
19 |     area = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
20 |     xx1 = np.maximum(box[0], boxes[:, 0])
21 |     yy1 = np.maximum(box[1], boxes[:, 1])
22 |     xx2 = np.minimum(box[2], boxes[:, 2])
23 |     yy2 = np.minimum(box[3], boxes[:, 3])
24 | 
25 |     # compute the width and height of the bounding box
26 |     w = np.maximum(0, xx2 - xx1 + 1)
27 |     h = np.maximum(0, yy2 - yy1 + 1)
28 | 
29 |     inter = w * h
30 |     ovr = inter / (box_area + area - inter)
31 |     return ovr
32 | 
33 | 
34 | def convert_to_square(bbox):
35 |     """Convert bbox to square
36 | 
37 |     Parameters:
38 |     ----------
39 |     bbox: numpy array, shape n x 5
40 |         input bbox
41 | 
42 |     Returns:
43 |     -------
44 |     square bbox
45 |     """
46 |     square_bbox = bbox.copy()
47 | 
48 |     h = bbox[:, 3] - bbox[:, 1] + 1
49 |     w = bbox[:, 2] - bbox[:, 0] + 1
50 |     max_side = np.maximum(h, w)
51 |     square_bbox[:, 0] = bbox[:, 0] + w*0.5 - max_side*0.5
52 |     square_bbox[:, 1] = bbox[:, 1] + h*0.5 - max_side*0.5
53 |     square_bbox[:, 2] = square_bbox[:, 0] + max_side - 1
54 |     square_bbox[:, 3] = square_bbox[:, 1] + max_side - 1
55 |     return square_bbox
56 | 
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/prepare_data/wider_annotations/__init__.py
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/readme.txt:
--------------------------------------------------------------------------------
1 | Attached are the mappings between attribute names and label values.
2 | 
3 | blur:
4 | clear->0
5 | normal blur->1
6 | heavy blur->2
7 | 
8 | expression:
9 | typical expression->0
10 | exaggerate expression->1
11 | 
12 | illumination:
13 | normal illumination->0
14 | extreme illumination->1
15 | 
16 | occlusion:
17 | no occlusion->0
18 | partial occlusion->1
19 | heavy occlusion->2
20 | 
21 | pose:
22 | typical pose->0
23 | atypical pose->1
24 | 
25 | invalid:
26 | false->0(valid image)
27 | true->1(invalid image)
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/transform.m:
--------------------------------------------------------------------------------
1 | writeLabel('train');
2 | 
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/transform.py:
--------------------------------------------------------------------------------
1 | from wider_loader import WIDER
2 | import cv2
3 | import time
4 | 
5 | # WIDER face original images path
6 | path_to_image = '/home/seanlx/Dataset/wider_face/WIDER_train/images'
7 | 
8 | # matlab annotation file path
9 | file_to_label = './wider_face_train.mat'
10 | 
11 | # target file path; each output line: image_path x1 y1 x2 y2 [x1 y1 x2 y2 ...]
12 | target_file = './anno.txt'
13 | 
14 | wider = WIDER(file_to_label, path_to_image)
15 | 
16 | 
17 | line_count = 0
18 | box_count = 0
19 | 
20 | print 'start transforming....'
21 | t = time.time()
22 | 
23 | with open(target_file, 'w') as f:
24 |     # press ctrl-C to stop the process
25 |     for data in wider.next():
26 |         line = []
27 |         line.append(str(data.image_name))
28 |         line_count += 1
29 |         for i, box in enumerate(data.bboxes):
30 |             box_count += 1
31 |             for j, bvalue in enumerate(box):
32 |                 line.append(str(bvalue))
33 | 
34 |         line.append('\n')
35 | 
36 |         line_str = ' '.join(line)
37 |         f.write(line_str)
38 | 
39 | st = time.time() - t
40 | print 'end transforming'
41 | 
42 | print 'time spent: %.2fs' % st
43 | print 'total lines(images): %d' % line_count
44 | print 'total boxes(faces): %d' % box_count
45 | 
46 | 
47 | 
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/wider_face_test.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/prepare_data/wider_annotations/wider_face_test.mat
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/wider_face_train.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/prepare_data/wider_annotations/wider_face_train.mat
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/wider_face_val.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/prepare_data/wider_annotations/wider_face_val.mat
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/wider_loader.py:
--------------------------------------------------------------------------------
1 | import h5py
2 | import os
3 | 
4 | 
5 | class DATA(object):
6 |     def __init__(self, image_name, bboxes):
7 |         self.image_name = image_name
8 |         self.bboxes = bboxes
9 | 
10 | 
11 | class WIDER(object):
12 |     def __init__(self, file_to_label, path_to_image):
13 |         self.file_to_label = file_to_label
14 |         self.path_to_image = path_to_image
15 | 
16 |         self.f = h5py.File(file_to_label, 'r')
17 |         self.event_list = self.f.get('event_list')
18 |         self.file_list = self.f.get('file_list')
19 |         self.face_bbx_list = self.f.get('face_bbx_list')
20 | 
21 |     def next(self):
22 |         # generator over (image path, bboxes) pairs stored in the .mat file
23 |         for event_idx, event in enumerate(self.event_list.value[0]):
24 |             directory = self.f[event].value.tostring().decode('utf-16')
25 |             for im_idx, im in enumerate(
26 |                     self.f[self.file_list.value[0][event_idx]].value[0]):
27 | 
28 |                 im_name = self.f[im].value.tostring().decode('utf-16')
29 |                 face_bbx = self.f[self.f[self.face_bbx_list.value
30 |                                          [0][event_idx]].value[0][im_idx]].value
31 | 
32 |                 bboxes = []
33 | 
34 |                 for i in range(face_bbx.shape[1]):
35 |                     xmin = int(face_bbx[0][i])
36 |                     ymin = int(face_bbx[1][i])
37 |                     xmax = int(face_bbx[0][i] + face_bbx[2][i])
38 |                     ymax = int(face_bbx[1][i] + face_bbx[3][i])
39 |                     bboxes.append((xmin, ymin, xmax, ymax))
40 | 
41 |                 yield DATA(os.path.join(self.path_to_image, directory,
42 |                                         im_name + '.jpg'), bboxes)
--------------------------------------------------------------------------------
/prepare_data/wider_annotations/writeLabel.m:
--------------------------------------------------------------------------------
1 | function writeLabel(image_set)
2 | 
3 | f = load(sprintf('wider_face_%s.mat', image_set));
4 | fid = fopen(sprintf('%s.txt', image_set), 'a');
5 | for i = 1 : length(f.event_list)
6 |     for j = 1 : length(f.file_list{i})
7 |         folder_name = f.event_list{i};
8 |         file_name = f.file_list{i}{j};
9 |         face_bboxes = f.face_bbx_list{i}{j};
10 |         fprintf(fid, '%s/%s ', folder_name, file_name);
11 |         for k = 1 : size(face_bboxes, 1)
12 |             bbox = face_bboxes(k, :);
13 |             bbox(3) = bbox(1) + bbox(3);  % convert width/height to x2/y2
14 |             bbox(4) = bbox(2) + bbox(4);
15 |             for id = 1:4
16 |                 fprintf(fid, '%.2f ', bbox(id));
17 |             end
18 |         end
19 |         fprintf(fid, '\n');
20 |     end
21 | end
22 | fclose(fid);
23 | 
--------------------------------------------------------------------------------
/test01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/test01.jpg
--------------------------------------------------------------------------------
/test_fddb.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import mxnet as mx
3 | import argparse
4 | from core.symbol import P_Net, R_Net, O_Net
5 | from core.imdb import IMDB
6 | from config import config
7 | from core.loader import TestLoader
8 | from core.detector import Detector
9 | from core.fcn_detector import FcnDetector
10 | from tools.load_model import load_param
11 | from core.MtcnnDetector import MtcnnDetector
12 | 
13 | 
14 | def test_net(root_path, dataset_path, prefix, epoch,
15 |              batch_size, ctx, test_mode="onet",
16 |              thresh=[0.6, 0.6, 0.7], min_face_size=24,
17 |              stride=2, slide_window=False, shuffle=False, vis=False):
18 | 
19 |     detectors = [None, None, None]
20 | 
21 |     # load pnet model
22 |     args, auxs = load_param(prefix[0], epoch[0], convert=True, ctx=ctx)
23 |     if slide_window:
24 |         PNet = Detector(P_Net("test"), 12, batch_size[0], ctx, args, auxs)
25 |     else:
26 |         PNet = FcnDetector(P_Net("test"), ctx, args, auxs)
27 |     detectors[0] = PNet
28 | 
29 |     # load rnet model
30 |     if test_mode in ["rnet", "onet"]:
31 |         args, auxs = load_param(prefix[1], epoch[1], convert=True, ctx=ctx)  # epoch[1] is the rnet checkpoint epoch
32 |         RNet = Detector(R_Net("test"), 24, batch_size[1], ctx, args, auxs)
33 |         detectors[1] = RNet
34 | 
35 |     # load onet model
36 |     if test_mode == "onet":
37 |         args, auxs = load_param(prefix[2], epoch[2], convert=True, ctx=ctx)
38 |         ONet = Detector(O_Net("test"), 48, batch_size[2], ctx, args, auxs)
39 |         detectors[2] = ONet
40 | 
41 |     mtcnn_detector = MtcnnDetector(detectors=detectors, ctx=ctx, min_face_size=min_face_size,
42 |                                    stride=stride, threshold=thresh, slide_window=slide_window)
43 | 
44 |     for i in range(1, 11):
45 |         image_set = "fold-" + str(i).zfill(2)
46 |         imdb = IMDB("fddb", image_set, root_path, dataset_path, 'test')
47 |         gt_imdb = imdb.gt_imdb()
48 | 
49 |         test_data = TestLoader(gt_imdb)
50 |         all_boxes = mtcnn_detector.detect_face(imdb, test_data, vis=vis)
51 |         imdb.write_results(all_boxes)
52 | 
53 | 
54 | 
55 | def parse_args():
56 |     parser = argparse.ArgumentParser(description='Test mtcnn',
57 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
58 |     parser.add_argument('--root_path', dest='root_path', help='output data folder',
59 |                         default='data', type=str)
60 |     parser.add_argument('--dataset_path', dest='dataset_path', help='dataset folder',
61 |                         default='data/fddb', type=str)
62 |     parser.add_argument('--test_mode', dest='test_mode', help='test net type, can be pnet, rnet or onet',
63 |                         default='onet', type=str)
64 |     parser.add_argument('--prefix', dest='prefix', help='prefix of model name', nargs="+",
65 |                         default=['model/pnet', 'model/rnet', 'model/onet'], type=str)
66 |     parser.add_argument('--epoch', dest='epoch', help='epoch number of model to load', nargs="+",
67 |                         default=[16, 16, 16], type=int)
68 |     parser.add_argument('--batch_size', dest='batch_size', help='list of batch size used in prediction', nargs="+",
69 |                         default=[2048, 256, 16], type=int)
70 |     parser.add_argument('--thresh', dest='thresh', help='list of thresh for pnet, rnet, onet', nargs="+",
71 |                         default=[0.6, 0.7, 0.7], type=float)
72 |     parser.add_argument('--min_face', dest='min_face', help='minimum face size for detection',
73 |                         default=24, type=int)
74 |     parser.add_argument('--stride', dest='stride', help='stride of sliding window',
75 |                         default=2, type=int)
76 |     parser.add_argument('--sw', dest='slide_window', help='use sliding window in pnet', action='store_true')
77 |     parser.add_argument('--gpu', dest='gpu_id', help='GPU device to use, -1 for CPU',
78 |                         default=0, type=int)
79 |     parser.add_argument('--shuffle', dest='shuffle', help='shuffle data on visualization', action='store_true')
80 |     parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true')
81 |     args = parser.parse_args()
82 |     return args
83 | 
84 | if __name__ == '__main__':
85 |     args = parse_args()
86 |     print 'Called with arguments:'
87 |     print args
88 |     ctx = mx.gpu(args.gpu_id)
89 |     if args.gpu_id == -1:
90 |         ctx = mx.cpu(0)
91 |     test_net(args.root_path, args.dataset_path, args.prefix,
92 |              args.epoch, args.batch_size, ctx, args.test_mode,
93 |              args.thresh, args.min_face, args.stride,
94 |              args.slide_window, args.shuffle, args.vis)
95 | 
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Seanlinx/mtcnn/feacb3b6ae1bf177664f2a0b676ed4cfd3f5ca55/tools/__init__.py
--------------------------------------------------------------------------------
/tools/image_processing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def transform(im):
4 |     """
5 |     transform into mxnet tensor
6 |     subtract the pixel mean (127.5) and scale by 1/128
7 |     :param im: [height, width, channel] in BGR
8 |     :return: [batch, channel, height, width]
9 |     """
10 |     im_tensor = im.transpose(2, 0, 1)
11 |     im_tensor = im_tensor[np.newaxis, :]
12 |     im_tensor = (im_tensor - 127.5) * 0.0078125
13 |     return im_tensor
14 | 
--------------------------------------------------------------------------------
/tools/load_model.py:
--------------------------------------------------------------------------------
1 | import mxnet as mx
2 | 
3 | 
4 | def load_checkpoint(prefix, epoch):
5 |     """
6 |     Load model checkpoint from file.
7 |     :param prefix: Prefix of model name.
8 |     :param epoch: Epoch number of model we would like to load.
9 |     :return: (arg_params, aux_params)
10 |     arg_params : dict of str to NDArray
11 |         Model parameter, dict of name to NDArray of net's weights.
12 |     aux_params : dict of str to NDArray
13 |         Model parameter, dict of name to NDArray of net's auxiliary states.
14 | """ 15 | save_dict = mx.nd.load('%s-%04d.params' % (prefix, epoch)) 16 | arg_params = {} 17 | aux_params = {} 18 | for k, v in save_dict.items(): 19 | tp, name = k.split(':', 1) 20 | if tp == 'arg': 21 | arg_params[name] = v 22 | if tp == 'aux': 23 | aux_params[name] = v 24 | return arg_params, aux_params 25 | 26 | 27 | def convert_context(params, ctx): 28 | """ 29 | :param params: dict of str to NDArray 30 | :param ctx: the context to convert to 31 | :return: dict of str of NDArray with context ctx 32 | """ 33 | new_params = dict() 34 | for k, v in params.items(): 35 | new_params[k] = v.as_in_context(ctx) 36 | return new_params 37 | 38 | 39 | def load_param(prefix, epoch, convert=False, ctx=None): 40 | """ 41 | wrapper for load checkpoint 42 | :param prefix: Prefix of model name. 43 | :param epoch: Epoch number of model we would like to load. 44 | :param convert: reference model should be converted to GPU NDArray first 45 | :param ctx: if convert then ctx must be designated. 46 | :return: (arg_params, aux_params) 47 | """ 48 | arg_params, aux_params = load_checkpoint(prefix, epoch) 49 | if convert: 50 | if ctx is None: 51 | ctx = mx.cpu() 52 | arg_params = convert_context(arg_params, ctx) 53 | aux_params = convert_context(aux_params, ctx) 54 | return arg_params, aux_params 55 | -------------------------------------------------------------------------------- /tools/nms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def py_nms(dets, thresh, mode="Union"): 5 | """ 6 | greedily select boxes with high confidence 7 | keep boxes overlap <= thresh 8 | rule out overlap > thresh 9 | :param dets: [[x1, y1, x2, y2 score]] 10 | :param thresh: retain overlap <= thresh 11 | :return: indexes to keep 12 | """ 13 | x1 = dets[:, 0] 14 | y1 = dets[:, 1] 15 | x2 = dets[:, 2] 16 | y2 = dets[:, 3] 17 | scores = dets[:, 4] 18 | 19 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 20 | order = scores.argsort()[::-1] 21 | 22 | keep = [] 23 | while order.size > 0: 24 | i = order[0] 25 | keep.append(i) 26 | xx1 = np.maximum(x1[i], x1[order[1:]]) 27 | yy1 = np.maximum(y1[i], y1[order[1:]]) 28 | xx2 = np.minimum(x2[i], x2[order[1:]]) 29 | yy2 = np.minimum(y2[i], y2[order[1:]]) 30 | 31 | w = np.maximum(0.0, xx2 - xx1 + 1) 32 | h = np.maximum(0.0, yy2 - yy1 + 1) 33 | inter = w * h 34 | if mode == "Union": 35 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 36 | elif mode == "Minimum": 37 | ovr = inter / np.minimum(areas[i], areas[order[1:]]) 38 | 39 | inds = np.where(ovr <= thresh)[0] 40 | order = order[inds + 1] 41 | 42 | return keep 43 | --------------------------------------------------------------------------------