├── tmp ├── .gitkeep ├── onet_iter_90000.caffemodel ├── pnet_iter_446000.caffemodel └── rnet_iter_116000.caffemodel ├── jfda ├── __init__.py ├── mdetector.py ├── bbox.pyx ├── config.py ├── minibatch.py ├── train.py ├── lnet.py ├── utils.py ├── detector.py └── prepare.py ├── layers ├── __init__.py ├── caffe.proto ├── copy.sh ├── jfda_loss_layer.cu ├── data_layer.py ├── jfda_loss_layer.hpp ├── test_jfda_loss_layer.cpp └── jfda_loss_layer.cpp ├── 9387493245278.jpg ├── proto ├── o_solver.prototxt ├── p_solver.prototxt ├── r_solver.prototxt ├── p.prototxt ├── r.prototxt ├── o.prototxt ├── p_train_val.prototxt ├── r_train_val.prototxt └── o_train_val.prototxt ├── cpp ├── mtcnn_detect.h ├── simplemain.cpp └── mtcnn_detect.cpp ├── LICENSE ├── README.md ├── summary.md ├── simplewebcam.py └── simpledemo.py /tmp/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jfda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /9387493245278.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincolnhard/mtcnn-head-detection/HEAD/9387493245278.jpg -------------------------------------------------------------------------------- /tmp/onet_iter_90000.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincolnhard/mtcnn-head-detection/HEAD/tmp/onet_iter_90000.caffemodel -------------------------------------------------------------------------------- /tmp/pnet_iter_446000.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincolnhard/mtcnn-head-detection/HEAD/tmp/pnet_iter_446000.caffemodel -------------------------------------------------------------------------------- /tmp/rnet_iter_116000.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincolnhard/mtcnn-head-detection/HEAD/tmp/rnet_iter_116000.caffemodel -------------------------------------------------------------------------------- /layers/caffe.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package caffe; 4 | 5 | message LayerParameter { 6 | // 23333 may need to be changed due to caffe's proto 7 | optional JfdaLossParameter jfda_loss_param = 23333; 8 | } 9 | 10 | message JfdaLossParameter { 11 | optional float drop_loss_rate = 1 [default = 0]; // online hard mining drop rate 12 | } 13 | -------------------------------------------------------------------------------- /layers/copy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cp layers/jfda_loss_layer.hpp $CAFFE_HOME/include/caffe/layers/jfda_loss_layer.hpp 4 | cp layers/jfda_loss_layer.cpp $CAFFE_HOME/src/caffe/layers/jfda_loss_layer.cpp 5 | cp layers/jfda_loss_layer.cu $CAFFE_HOME/src/caffe/layers/jfda_loss_layer.cu 6 | cp layers/test_jfda_loss_layer.cpp $CAFFE_HOME/src/caffe/test/test_jfda_loss_layer.cpp 7 | mkdir data 8 | 
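
The `drop_loss_rate` field in caffe.proto above is the knob for the online hard example mining described in summary.md: each iteration, the lowest-loss (easiest) fraction of samples is excluded from the backward pass. A minimal NumPy sketch of that selection rule (function and variable names here are illustrative, not from the repo):

```python
import numpy as np

def hard_sample_mask(losses, drop_loss_rate=0.3):
    """Keep the hardest (1 - drop_loss_rate) fraction of samples.

    losses: 1-D array of per-sample loss values.
    Returns a boolean mask; gradients of masked-out samples
    would be zeroed before the backward pass.
    """
    n = len(losses)
    n_keep = n - int(n * drop_loss_rate)
    order = np.argsort(losses)[::-1]   # hardest (largest loss) first
    mask = np.zeros(n, dtype=bool)
    mask[order[:n_keep]] = True
    return mask

# e.g. with drop_loss_rate = 0.3, 7 of 10 samples survive
print(hard_sample_mask(np.random.rand(10), 0.3).sum())  # -> 7
```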
--------------------------------------------------------------------------------
/proto/o_solver.prototxt:
--------------------------------------------------------------------------------
1 | net: "proto/o_train_val.prototxt"
2 | 
3 | test_initialization: false
4 | test_iter: 600
5 | test_interval: 2500
6 | 
7 | base_lr: 0.001
8 | momentum: 0.9
9 | weight_decay: 0.0005
10 | 
11 | lr_policy: "step"
12 | gamma: 0.5
13 | stepsize: 25000
14 | 
15 | display: 500
16 | average_loss: 500
17 | 
18 | max_iter: 100000
19 | snapshot: 5000
20 | snapshot_prefix: "tmp/onet"
21 | 
22 | solver_mode: GPU
23 | 
--------------------------------------------------------------------------------
/proto/p_solver.prototxt:
--------------------------------------------------------------------------------
1 | net: "proto/p_train_val.prototxt"
2 | 
3 | test_initialization: false
4 | test_iter: 600
5 | test_interval: 2500
6 | 
7 | base_lr: 0.001
8 | momentum: 0.9
9 | weight_decay: 0.0005
10 | 
11 | lr_policy: "step"
12 | gamma: 0.5
13 | stepsize: 25000
14 | 
15 | display: 500
16 | average_loss: 500
17 | 
18 | max_iter: 150000
19 | snapshot: 1000
20 | snapshot_prefix: "tmp/pnet"
21 | 
22 | solver_mode: GPU
23 | 
--------------------------------------------------------------------------------
/proto/r_solver.prototxt:
--------------------------------------------------------------------------------
1 | net: "proto/r_train_val.prototxt"
2 | 
3 | test_initialization: false
4 | test_iter: 600
5 | test_interval: 2500
6 | 
7 | base_lr: 0.001
8 | momentum: 0.9
9 | weight_decay: 0.0005
10 | 
11 | lr_policy: "step"
12 | gamma: 0.5
13 | stepsize: 25000
14 | 
15 | display: 500
16 | average_loss: 500
17 | 
18 | max_iter: 100000
19 | snapshot: 5000
20 | snapshot_prefix: "tmp/rnet"
21 | 
22 | solver_mode: GPU
23 | 
--------------------------------------------------------------------------------
/layers/jfda_loss_layer.cu:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include "caffe/util/math_functions.hpp"
3 | #include "caffe/layers/jfda_loss_layer.hpp"
4 | 
5 | namespace caffe {
6 | 
7 | template <typename Dtype>
8 | void JfdaLossLayer<Dtype>::Forward_gpu(
9 |     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
10 |   Forward_cpu(bottom, top);
11 | }
12 | 
13 | template <typename Dtype>
14 | void JfdaLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
15 |     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
16 |   Backward_cpu(top, propagate_down, bottom);
17 | }
18 | 
19 | INSTANTIATE_LAYER_GPU_FUNCS(JfdaLossLayer);
20 | 
21 | } // namespace caffe
22 | 
--------------------------------------------------------------------------------
/jfda/mdetector.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=bad-indentation, no-member, invalid-name, line-too-long
2 | from .detector import JfdaDetector
3 | import minicaffe as caffe
4 | 
5 | 
6 | class MiniCaffeDetector(JfdaDetector):
7 | 
8 |   def __init__(self, nets):
9 |     assert len(nets) in [2, 4, 6, 8], 'wrong number of nets'
10 |     self.pnet, self.rnet, self.onet, self.lnet = None, None, None, None
11 |     if len(nets) >= 2:
12 |       self.pnet = caffe.Net(nets[0], nets[1])
13 |     if len(nets) >= 4:
14 |       self.rnet = caffe.Net(nets[2], nets[3])
15 |     if len(nets) >= 6:
16 |       self.onet = caffe.Net(nets[4], nets[5])
17 |     if len(nets) >= 8:
18 |       self.lnet = caffe.Net(nets[6], nets[7])
19 |     self.pnet_single_forward = False
20 | 
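
All three solvers share the `step` learning-rate policy (`gamma: 0.5`, `stepsize: 25000`), i.e. the learning rate halves every 25000 iterations; note that train.py later overwrites several of these fields dynamically. A quick sketch of the standard Caffe `step` schedule:

```python
def step_lr(iteration, base_lr=0.001, gamma=0.5, stepsize=25000):
    """Caffe 'step' policy: lr = base_lr * gamma ^ floor(iter / stepsize)."""
    return base_lr * gamma ** (iteration // stepsize)

for it in (0, 25000, 50000, 100000):
    print(it, step_lr(it))
# 0 0.001, 25000 0.0005, 50000 0.00025, 100000 6.25e-05
```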
--------------------------------------------------------------------------------
/cpp/mtcnn_detect.h:
--------------------------------------------------------------------------------
1 | #ifndef MTCNN_DETECT_H
2 | #define MTCNN_DETECT_H
3 | 
4 | #include <vector>
5 | #include <string>
6 | #include <opencv2/core/core.hpp>
7 | #include <opencv2/imgproc/imgproc.hpp>
8 | #include <opencv2/highgui/highgui.hpp>
9 | 
10 | static const float pnet_th = 0.7f;
11 | static const float rnet_th = 0.7f;
12 | static const float onet_th = 0.2f;
13 | static const float min_objsize = 24.0f;
14 | static const float pyramid_factor = 0.666f;
15 | static const float max_objsize = 70.0f;
16 | static const int max_pnet_bbox_num = 100; // 100
17 | static const int max_rnet_bbox_num = 50; // 50
18 | static const float pnet_merge_th = 0.6f;
19 | static const float rnet_merge_th = 0.5f;
20 | 
21 | typedef struct
22 | {
23 |     float xmin;
24 |     float ymin;
25 |     float xmax;
26 |     float ymax;
27 |     float score;
28 | } obj_box;
29 | 
30 | typedef struct
31 | {
32 |     float bbox_reg[4];
33 |     obj_box bbox;
34 | } obj_info;
35 | 
36 | void init_mtcnn
37 | (
38 |     const int srcw,
39 |     const int srch
40 | );
41 | 
42 | void run_mtcnn
43 | (
44 |     cv::Mat& im,
45 |     std::vector<obj_info>& onet_boxes
46 | );
47 | 
48 | #endif // MTCNN_DETECT_H
49 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016, zhangjie
2 | 
3 | All rights reserved.
4 | 
5 | Redistribution and use in source and binary forms, with or without modification,
6 | are permitted provided that the following conditions are met:
7 | 
8 | * Redistributions of source code must retain the above copyright notice,
9 |   this list of conditions and the following disclaimer.
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | * Neither the name of Joint-Face-Detection-and-Alignment nor the names of its contributors
14 |   may be used to endorse or promote products derived from this software
15 |   without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
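
The constants in mtcnn_detect.h bound the detector to heads between `min_objsize` and `max_objsize` pixels, with an image pyramid shrinking by `pyramid_factor` per level. The pyramid construction itself lives in mtcnn_detect.cpp (not shown here), so the following is only a plausible sketch of how those constants map to pyramid scales for the 12x12 PNet window:

```python
def pyramid_scales(min_objsize=24.0, max_objsize=70.0, factor=0.666, win=12.0):
    """Scales at which a win x win PNet window covers objects in
    [min_objsize, max_objsize] pixels of the original image."""
    scales = []
    scale = win / min_objsize          # largest scale: smallest detectable object
    while win / scale <= max_objsize:  # stop once the window maps past max_objsize
        scales.append(scale)
        scale *= factor
    return scales

print(pyramid_scales())  # e.g. [0.5, 0.333, 0.2218] for the header's constants
```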
--------------------------------------------------------------------------------
/jfda/bbox.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Sergey Karayev
6 | # --------------------------------------------------------
7 | 
8 | cimport cython
9 | import numpy as np
10 | cimport numpy as np
11 | 
12 | DTYPE = np.float32
13 | ctypedef np.float32_t DTYPE_t
14 | 
15 | def bbox_overlaps(
16 |     np.ndarray[DTYPE_t, ndim=2] boxes,
17 |     np.ndarray[DTYPE_t, ndim=2] query_boxes):
18 |   """
19 |   Parameters
20 |   ----------
21 |   boxes: (N, 4) ndarray of float
22 |   query_boxes: (K, 4) ndarray of float
23 |   Returns
24 |   -------
25 |   overlaps: (N, K) ndarray of overlap between boxes and query_boxes
26 |   """
27 |   cdef unsigned int N = boxes.shape[0]
28 |   cdef unsigned int K = query_boxes.shape[0]
29 |   cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
30 |   cdef DTYPE_t iw, ih, box_area
31 |   cdef DTYPE_t ua
32 |   cdef unsigned int k, n
33 |   for k in range(K):
34 |     box_area = (
35 |         (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
36 |         (query_boxes[k, 3] - query_boxes[k, 1] + 1)
37 |     )
38 |     for n in range(N):
39 |       iw = (
40 |           min(boxes[n, 2], query_boxes[k, 2]) -
41 |           max(boxes[n, 0], query_boxes[k, 0]) + 1
42 |       )
43 |       if iw > 0:
44 |         ih = (
45 |             min(boxes[n, 3], query_boxes[k, 3]) -
46 |             max(boxes[n, 1], query_boxes[k, 1]) + 1
47 |         )
48 |         if ih > 0:
49 |           ua = float(
50 |               (boxes[n, 2] - boxes[n, 0] + 1) *
51 |               (boxes[n, 3] - boxes[n, 1] + 1) +
52 |               box_area - iw * ih
53 |           )
54 |           overlaps[n, k] = iw * ih / ua
55 |   return overlaps
56 | 
--------------------------------------------------------------------------------
/cpp/simplemain.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <time.h>
3 | #include "mtcnn_detect.h"
4 | 
5 | 
6 | double what_time_is_it_now
7 | (
8 |     void
9 | )
10 | {
11 |     struct timespec now;
12 |     clock_gettime(CLOCK_REALTIME, &now);
13 |     return now.tv_sec + now.tv_nsec*1e-9;
14 | }
15 | 
16 | int main
17 | (
18 |     int ac,
19 |     char *av[]
20 | )
21 | {
22 |     cv::VideoCapture cap;
23 |     if (ac == 2)
24 |     {
25 |         cap.open(av[1]);
26 |     }
27 |     else
28 |     {
29 |         cap.open(0);
30 |     }
31 | 
32 |     cv::Mat im;
33 |     cap >> im;
34 |     const int IMW = im.cols;
35 |     const int IMH = im.rows;
36 | 
37 |     init_mtcnn(IMW, IMH);
38 | 
39 |     int cnt = 0;
40 |     double timesum = 0.0;
41 |     while (1)
42 |     {
43 |         cap >> im;
44 | 
45 |         std::vector<obj_info> detectedobj_info;
46 | 
47 |         double time1 = what_time_is_it_now();
48 | 
49 |         run_mtcnn(im, detectedobj_info);
50 | 
51 |         double time2 = what_time_is_it_now();
52 |         if (cnt < 50)
53 |         {
54 |             double duration = time2 - time1;
55 |             timesum += duration;
56 |             ++cnt;
57 |         }
58 |         else
59 |         {
60 |             std::cout << "fps: " << (double)cnt / timesum << std::endl;
61 |             timesum = 0.0;
62 |             cnt = 0;
63 |         }
64 | 
65 | 
66 |         unsigned int num_onet_boxes = detectedobj_info.size();
67 |         for (unsigned int i = 0; i < num_onet_boxes; ++i)
68 |         {
69 |             cv::rectangle(im, cv::Point(detectedobj_info[i].bbox.xmin, detectedobj_info[i].bbox.ymin),
70 |                 cv::Point(detectedobj_info[i].bbox.xmax, detectedobj_info[i].bbox.ymax), cv::Scalar(0, 0, 255), 1, 16);
71 |         }
72 | 
73 |         cv::imshow("demo", im);
74 |         unsigned char key = cv::waitKey(1);
75 |         if (key == 27)
76 |         {
77 |             break;
78 |         }
79 | 
80 |     }
81 | 
82 |     return 0;
83 | }
84 | 
85 | 
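
The Cython `bbox_overlaps` above is a hot loop taken from py-faster-rcnn; for sanity checking, a vectorized NumPy equivalent (same +1 pixel area convention) can be written as follows — a reference sketch, not part of the repo:

```python
import numpy as np

def bbox_overlaps_np(boxes, query_boxes):
    """(N,4) x (K,4) float arrays -> (N,K) IoU matrix, matching bbox.pyx."""
    b = boxes[:, None, :]                      # (N,1,4), broadcasts against q
    q = query_boxes[None, :, :]                # (1,K,4)
    iw = np.minimum(b[..., 2], q[..., 2]) - np.maximum(b[..., 0], q[..., 0]) + 1
    ih = np.minimum(b[..., 3], q[..., 3]) - np.maximum(b[..., 1], q[..., 1]) + 1
    inter = np.maximum(iw, 0) * np.maximum(ih, 0)
    area_b = (b[..., 2] - b[..., 0] + 1) * (b[..., 3] - b[..., 1] + 1)
    area_q = (q[..., 2] - q[..., 0] + 1) * (q[..., 3] - q[..., 1] + 1)
    return inter / (area_b + area_q - inter)   # union is always > 0 for valid boxes
```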
-------------------------------------------------------------------------------- /proto/p.prototxt: -------------------------------------------------------------------------------- 1 | name: "pNet" 2 | layer { 3 | name: "data" 4 | type: "Input" 5 | top: "data" 6 | input_param { 7 | shape: { 8 | dim: 1 9 | dim: 3 10 | dim: 12 11 | dim: 12 12 | } 13 | } 14 | } 15 | 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | convolution_param { 22 | num_output: 10 23 | kernel_size: 3 24 | stride: 1 25 | } 26 | } 27 | layer { 28 | name: "prelu1" 29 | type: "PReLU" 30 | bottom: "conv1" 31 | top: "conv1" 32 | } 33 | layer { 34 | name: "pool1" 35 | type: "Pooling" 36 | bottom: "conv1" 37 | top: "pool1" 38 | pooling_param { 39 | pool: MAX 40 | kernel_size: 2 41 | stride: 2 42 | } 43 | } 44 | 45 | layer { 46 | name: "conv2" 47 | type: "Convolution" 48 | bottom: "pool1" 49 | top: "conv2" 50 | convolution_param { 51 | num_output: 16 52 | kernel_size: 3 53 | stride: 1 54 | } 55 | } 56 | layer { 57 | name: "prelu2" 58 | type: "PReLU" 59 | bottom: "conv2" 60 | top: "conv2" 61 | } 62 | 63 | layer { 64 | name: "conv3" 65 | type: "Convolution" 66 | bottom: "conv2" 67 | top: "conv3" 68 | convolution_param { 69 | num_output: 32 70 | kernel_size: 3 71 | stride: 1 72 | } 73 | } 74 | layer { 75 | name: "prelu3" 76 | type: "PReLU" 77 | bottom: "conv3" 78 | top: "conv3" 79 | } 80 | 81 | # score 82 | layer { 83 | name: "score" 84 | type: "Convolution" 85 | bottom: "conv3" 86 | top: "score" 87 | convolution_param { 88 | num_output: 2 89 | kernel_size: 1 90 | stride: 1 91 | } 92 | } 93 | layer { 94 | name: "prob" 95 | type: "Softmax" 96 | bottom: "score" 97 | top: "prob" 98 | softmax_param { 99 | axis: 1 100 | } 101 | } 102 | # bbox 103 | layer { 104 | name: "bbox_pred" 105 | type: "Convolution" 106 | bottom: "conv3" 107 | top: "bbox_pred" 108 | convolution_param { 109 | num_output: 4 110 | kernel_size: 1 111 | stride: 1 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /layers/data_layer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import caffe 3 | from jfda.config import cfg 4 | 5 | 6 | class FaceDataLayer(caffe.Layer): 7 | '''Custom Data Layer 8 | LayerOutput 9 | top[0]: image data 10 | top[1]: bbox target 11 | top[2]: landmark target 12 | top[3]: face data type / label, 0 for negatives, 1 for positives 13 | 2 for part faces, 3 for landmark faces 14 | 15 | Howto 16 | layer { 17 | name: "data" 18 | type: "Python" 19 | top: "data" 20 | top: "bbox_target" 21 | top: "landmark_target" 22 | top: "label" 23 | python_param { 24 | module: "layers.data_layer" 25 | layer: "FaceDataLayer" 26 | } 27 | } 28 | ''' 29 | 30 | def set_batch_num(self, n1, n2, n3, n4): 31 | '''set data type number 32 | n1 for negatives, n2 for positives, n3 for part faces, n4 for landmark faces 33 | net_input_size for network input size (width, height) 34 | ''' 35 | self.n1 = n1 36 | self.n2 = n2 37 | self.n3 = n3 38 | self.n4 = n4 39 | self.n = n1 + n2 + n3 + n4 40 | self.net_input_size = cfg.NET_INPUT_SIZE[cfg.NET_TYPE] 41 | 42 | def set_data_queue(self, queue): 43 | '''the queue should put a minibatch with size of (negatives, positives, part faces, landmark faces) = 44 | (n1, n2, n3, n4) in a dict 45 | ''' 46 | self.data_queue = queue 47 | 48 | def setup(self, bottom, top): 49 | self.n1 = 1 50 | self.n2 = 1 51 | self.n3 = 1 52 | self.n4 = 1 53 | self.n = 4 54 | self.net_input_size = 
cfg.NET_INPUT_SIZE[cfg.NET_TYPE]
55 |     self.reshape(bottom, top)
56 | 
57 |   def reshape(self, bottom, top):
58 |     top[0].reshape(self.n, 3, self.net_input_size, self.net_input_size)
59 |     top[1].reshape(self.n, 4)
60 |     top[2].reshape(self.n, 10)
61 |     top[3].reshape(self.n)
62 | 
63 |   def forward(self, bottom, top):
64 |     minibatch = self._get_minibatch()
65 |     # face data
66 |     top[0].data[...] = minibatch['data']
67 |     top[1].data[...] = minibatch['bbox_target']
68 |     top[2].data[...] = minibatch['landmark_target']
69 |     top[3].data[...] = minibatch['label']
70 | 
71 |   def backward(self, bottom, top):
72 |     pass
73 | 
74 |   def _get_minibatch(self):
75 |     minibatch = self.data_queue.get()
76 |     return minibatch
77 | 
--------------------------------------------------------------------------------
/layers/jfda_loss_layer.hpp:
--------------------------------------------------------------------------------
1 | #ifndef CAFFE_JFDA_LOSS_LAYER_HPP_
2 | #define CAFFE_JFDA_LOSS_LAYER_HPP_
3 | 
4 | #include <vector>
5 | #include "caffe/blob.hpp"
6 | #include "caffe/layer.hpp"
7 | #include "caffe/proto/caffe.pb.h"
8 | #include "caffe/layers/loss_layer.hpp"
9 | 
10 | namespace caffe {
11 | 
12 | // Howto
13 | // layer {
14 | //   name: "loss"
15 | //   type: "JfdaLoss"
16 | //   bottom: "score"
17 | //   bottom: "bbox_pred"
18 | //   bottom: "landmark_pred"
19 | //   bottom: "bbox_target"
20 | //   bottom: "landmark_target"
21 | //   bottom: "label"
22 | //   top: "face_cls_loss"
23 | //   top: "bbox_reg_loss"
24 | //   top: "landmark_reg_loss"
25 | //   top: "face_cls_pos_acc"
26 | //   top: "face_cls_neg_acc"
27 | //   loss_weight: 1  # face_cls_loss
28 | //   loss_weight: 0.5  # bbox_reg_loss
29 | //   loss_weight: 0.5  # landmark_reg_loss
30 | //   loss_weight: 0  # no loss for neg acc
31 | //   loss_weight: 0  # no loss for pos acc
32 | // }
33 | 
34 | template <typename Dtype>
35 | class JfdaLossLayer : public Layer<Dtype> {
36 |  public:
37 |   explicit JfdaLossLayer(const LayerParameter& param)
38 |       : Layer<Dtype>(param) {}
39 | 
40 |   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
41 |       const vector<Blob<Dtype>*>& top);
42 |   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
43 |       const vector<Blob<Dtype>*>& top);
44 | 
45 |   virtual inline const char* type() const { return "JfdaLoss"; }
46 | 
47 |   virtual inline int ExactNumBottomBlobs() const { return 6; }
48 |   virtual inline int ExactNumTopBlobs() const { return 5; }
49 |   virtual inline bool AutoTopBlobs() const { return true; }
50 | 
51 |  protected:
52 |   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
53 |       const vector<Blob<Dtype>*>& top);
54 |   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
55 |       const vector<Blob<Dtype>*>& top);
56 | 
57 |   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
58 |       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
59 |   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
60 |       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
61 | 
62 |  public:
63 |   float drop_loss_rate_;
64 |   Blob<Dtype> mask_;
65 |   Blob<Dtype> prob_;
66 |   Blob<Dtype> bbox_diff_;
67 |   Blob<Dtype> landmark_diff_;
68 | };
69 | 
70 | } // namespace caffe
71 | 
72 | #endif // CAFFE_JFDA_LOSS_LAYER_HPP_
73 | 
--------------------------------------------------------------------------------
/proto/r.prototxt:
--------------------------------------------------------------------------------
1 | name: "rNet"
2 | layer {
3 |   name: "data"
4 |   type: "Input"
5 |   top: "data"
6 |   input_param {
7 |     shape: {
8 |       dim: 1
9 |       dim: 3
10 |       dim: 24
11 |       dim: 24
12 |     }
13 |   }
14 | }
15 | 
16 | layer {
17 |   name: "conv1"
18 |   type: "Convolution"
19 |   bottom: "data"
20 |   top: "conv1"
21 |   convolution_param {
22 |     num_output: 28
23 |     kernel_size: 3
24 |     stride: 
1 25 | } 26 | } 27 | layer { 28 | name: "prelu1" 29 | type: "PReLU" 30 | bottom: "conv1" 31 | top: "conv1" 32 | } 33 | layer { 34 | name: "pool1" 35 | type: "Pooling" 36 | bottom: "conv1" 37 | top: "pool1" 38 | pooling_param { 39 | pool: MAX 40 | kernel_size: 3 41 | stride: 2 42 | } 43 | } 44 | 45 | layer { 46 | name: "conv2" 47 | type: "Convolution" 48 | bottom: "pool1" 49 | top: "conv2" 50 | convolution_param { 51 | num_output: 48 52 | kernel_size: 3 53 | stride: 1 54 | } 55 | } 56 | layer { 57 | name: "prelu2" 58 | type: "PReLU" 59 | bottom: "conv2" 60 | top: "conv2" 61 | } 62 | layer { 63 | name: "pool2" 64 | type: "Pooling" 65 | bottom: "conv2" 66 | top: "pool2" 67 | pooling_param { 68 | pool: MAX 69 | kernel_size: 3 70 | stride: 2 71 | } 72 | } 73 | 74 | layer { 75 | name: "conv3" 76 | type: "Convolution" 77 | bottom: "pool2" 78 | top: "conv3" 79 | convolution_param { 80 | num_output: 64 81 | kernel_size: 2 82 | stride: 1 83 | } 84 | } 85 | layer { 86 | name: "prelu3" 87 | type: "PReLU" 88 | bottom: "conv3" 89 | top: "conv3" 90 | } 91 | 92 | layer { 93 | name: "fc" 94 | type: "InnerProduct" 95 | bottom: "conv3" 96 | top: "fc" 97 | inner_product_param { 98 | num_output: 128 99 | } 100 | } 101 | layer { 102 | name: "prelu4" 103 | type: "PReLU" 104 | bottom: "fc" 105 | top: "fc" 106 | } 107 | 108 | # score 109 | layer { 110 | name: "score" 111 | type: "InnerProduct" 112 | bottom: "fc" 113 | top: "score" 114 | inner_product_param { 115 | num_output: 2 116 | } 117 | } 118 | layer { 119 | name: "prob" 120 | type: "Softmax" 121 | bottom: "score" 122 | top: "prob" 123 | } 124 | # bbox 125 | layer { 126 | name: "bbox_pred" 127 | type: "InnerProduct" 128 | bottom: "fc" 129 | top: "bbox_pred" 130 | inner_product_param { 131 | num_output: 4 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | MTCNN head detection 2 | ================================== 3 | 4 | ## Set up 5 | 6 | Set up environment and copy C++ layer code to Caffe's source code tree. 7 | 8 | ``` 9 | $ export PYTHONPATH=/path/to/mtcnn-head-detection:$PYTHONPATH 10 | $ export CAFFE_HOME=/path/to/caffe 11 | $ pip install easydict 12 | $ pip install lmdb 13 | $ sh layers/copy.sh 14 | ``` 15 | 16 | Compile Caffe following its document. 17 | 18 | ## Prepare data 19 | 20 | Download dataset [SCUT-HEAD](https://github.com/HCIILAB/SCUT-HEAD-Dataset-Release). 21 | Unzip and put them in data directory. 22 | 23 | ## Train 24 | 25 | **pnet** 26 | ``` 27 | python jfda/prepare.py --net p --wider --worker 8 28 | python jfda/train.py --net p --gpu 0 --size 128 --lr 0.05 --lrw 0.1 --lrp 5 --wd 0.0001 --epoch 25 29 | ``` 30 | **rnet** 31 | 32 | Choose appropriate pnet caffemodel to generate prior for rnet, and edit ```cfg.PROPOSAL_NETS``` in ```config.py``` 33 | ``` 34 | python jfda/prepare.py --net r --gpu 0 --detect --wider --worker 4 35 | python jfda/train.py --net r --gpu 0 --size 128 --lr 0.05 --lrw 0.1 --lrp 5 --wd 0.0001 --epoch 25 36 | ``` 37 | **onet** 38 | 39 | Choose appropriate rnet caffemodel to generate prior for onet, and edit ```cfg.PROPOSAL_NETS``` in ```config.py``` 40 | ``` 41 | python jfda/prepare.py --net o --gpu 0 --detect --wider --worker 4 42 | python jfda/train.py --net o --gpu $GPU --size 64 --lr 0.05 --lrw 0.1 --lrp 7 --wd 0.0001 --epoch 35 43 | ``` 44 | 45 | ## Test 46 | 47 | ``` 48 | python simpledemo.py 49 | ``` 50 | 51 | ## Note 52 | 53 | 1. 
Landmark alignment from the original mtcnn has been removed in this repo; only object classification and bounding box regression are performed.
54 | 
55 | 2. The number of kernels in each convolutional layer of onet has been reduced for faster network inference.
56 | 
57 | ## Result
58 | 
59 | **pnet**
60 | 
61 | ![pnet1](https://user-images.githubusercontent.com/16308037/53081537-059a1180-3536-11e9-8aa6-4ecfa8639bee.jpg)
62 | 
63 | **rnet**
64 | 
65 | ![rnet1](https://user-images.githubusercontent.com/16308037/53081792-7fca9600-3536-11e9-8341-16b176bb9b12.jpg)
66 | 
67 | **onet**
68 | 
69 | ![onet1](https://user-images.githubusercontent.com/16308037/53081747-6c1f2f80-3536-11e9-84bc-6885cf991468.jpg)
70 | 
71 | ## References
72 | 
73 | - [A Convolutional Neural Network Cascade for Face Detection](http://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Li_A_Convolutional_Neural_2015_CVPR_paper.pdf)
74 | - [Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks](http://arxiv.org/abs/1604.02878)
75 | - [MTCNN_face_detection_alignment](https://github.com/kpzhang93/MTCNN_face_detection_alignment)
76 | - [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn)
77 | - [opencv-mtcnn](https://github.com/ksachdeva/opencv-mtcnn)
78 | 
--------------------------------------------------------------------------------
/jfda/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from easydict import EasyDict
3 | 
4 | 
5 | cfg = EasyDict()
6 | 
7 | # data directories
8 | cfg.Data_DIR = 'data/'
9 | cfg.CelebA_DIR = 'data/CelebA/'
10 | cfg.WIDER_DIR = 'data/WIDER/'
11 | cfg.FDDB_DIR = 'data/fddb/'
12 | 
13 | # cnn input size
14 | cfg.NET_TYPE = 'p'
15 | cfg.NET_INPUT_SIZE = {'p': 12, 'r': 24, 'o': 48}
16 | 
17 | # data prepare
18 | cfg.USE_DETECT = False
19 | cfg.GPU_ID = -1
20 | cfg.WORKER_N = 4
21 | 
22 | # random seed
23 | cfg.RNG_SEED = 1
24 | 
25 | # shuffle data buff before save to lmdb
26 | cfg.SHUFFLE_SIZE = 10000
27 | 
28 | # ratios
29 | cfg.FACE_OVERLAP = 0.65  # (0.65, 1] is positives
30 | cfg.NONFACE_OVERLAP = 0.3  # [0, 0.3] is negatives
31 | cfg.PARTFACE_OVERLAP = 0.4  # (0.4, 0.65] is part faces
32 | 
33 | # face proposal
34 | # cfg.PROPOSAL_SCALES = [0.8, 1.0, 1.2]  # proposal sliding window scales
35 | # cfg.PROPOSAL_STRIDES = [0.1]  # proposal sliding window strides
36 | cfg.POS_PROPOSAL_SCALES = [0.6, 0.8, 1.0, 1.2]
37 | cfg.POS_PROPOSAL_STRIDE = 0.1
38 | cfg.PART_PROPOSAL_SCALES = [0.6, 0.8, 1.0, 1.2]
39 | cfg.PART_PROPOSAL_STRIDE = 0.1
40 | cfg.NEG_PROPOSAL_SCALES = [0.6, 0.8, 1.0, 1.2]
41 | cfg.NEG_PROPOSAL_STRIDE = 0.1
42 | cfg.POS_PER_FACE = 10  # positive faces per face region
43 | cfg.PART_PER_FACE = 10  # part faces per face region
44 | cfg.NEG_PER_FACE = 10  # non-faces per face region
45 | cfg.LANDMARK_PER_FACE = 10  # landmark faces per face region
46 | cfg.NEG_FROM_FR_RATIO = 0.5  # ratio of non-faces from face regions; non-faces come from face regions or random global image patches
47 | cfg.NEG_FORCE_BALANCE = True  # whether to balance the ratio of negatives from face regions and global crops; this reduces the total number of negative samples
48 | cfg.NEG_PER_IMAGE = 256  # non-faces per image, set to 128 if you set cfg.NEG_FORCE_BALANCE=False
49 | cfg.NEG_DETECT_PER_IMAGE = 128  # used for rnet and onet
50 | cfg.NEG_PROPOSAL_RATIO = 10  # total proposal size equals NEG_PER_IMAGE * NEG_PROPOSAL_RATIO
51 | cfg.NEG_MIN_SIZE = 12  # minimum random crop patch size for non-faces
52 | 
53 | # using previous network to generate data
54 | cfg.PROPOSAL_NETS = {
55 
| 'p': None, 56 | 'r': ['proto/p.prototxt', 'tmp/pnet_iter_446000.caffemodel'], 57 | 'o': ['proto/p.prototxt', 'tmp/pnet_iter_446000.caffemodel', 'proto/r.prototxt', 'tmp/rnet_iter_116000.caffemodel'], 58 | } 59 | cfg.DETECT_PARAMS = { 60 | 'min_size': 24, 61 | 'ths': [0.5, 0.5, 0.5], 62 | 'factor': 0.709 63 | } 64 | 65 | # training data ratio in a minibatch, [negative, positive, part, landmark] 66 | cfg.DATA_RATIO = { 67 | 'p': [3, 1, 1], 68 | 'r': [3, 1, 1], 69 | 'o': [3, 2, 1], 70 | } 71 | 72 | # data augment 73 | cfg.GRAY_PROB = 0.1 74 | cfg.FLIP_PROB = 0.5 75 | 76 | # lnet 77 | cfg.SAMPLE_RADIUS = 0.1 78 | cfg.DATASIZE_PER_H5 = 100000 79 | cfg.LNET_SAMPLE_PER_FACE = 3 80 | cfg.LNET_FACE_SCALES = [0.3] 81 | -------------------------------------------------------------------------------- /proto/o.prototxt: -------------------------------------------------------------------------------- 1 | name: "oNet" 2 | layer { 3 | name: "data" 4 | type: "Input" 5 | top: "data" 6 | input_param { 7 | shape: { 8 | dim: 1 9 | dim: 3 10 | dim: 48 11 | dim: 48 12 | } 13 | } 14 | } 15 | 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | convolution_param { 22 | num_output: 16 23 | kernel_size: 3 24 | stride: 1 25 | } 26 | } 27 | layer { 28 | name: "prelu1" 29 | type: "PReLU" 30 | bottom: "conv1" 31 | top: "conv1" 32 | } 33 | layer { 34 | name: "pool1" 35 | type: "Pooling" 36 | bottom: "conv1" 37 | top: "pool1" 38 | pooling_param { 39 | pool: MAX 40 | kernel_size: 3 41 | stride: 2 42 | } 43 | } 44 | 45 | layer { 46 | name: "conv2" 47 | type: "Convolution" 48 | bottom: "pool1" 49 | top: "conv2" 50 | convolution_param { 51 | num_output: 32 52 | kernel_size: 3 53 | stride: 1 54 | } 55 | } 56 | layer { 57 | name: "prelu2" 58 | type: "PReLU" 59 | bottom: "conv2" 60 | top: "conv2" 61 | } 62 | layer { 63 | name: "pool2" 64 | type: "Pooling" 65 | bottom: "conv2" 66 | top: "pool2" 67 | pooling_param { 68 | pool: MAX 69 | kernel_size: 3 70 | stride: 2 71 | } 72 | } 73 | 74 | layer { 75 | name: "conv3" 76 | type: "Convolution" 77 | bottom: "pool2" 78 | top: "conv3" 79 | convolution_param { 80 | num_output: 48 81 | kernel_size: 3 82 | stride: 1 83 | } 84 | } 85 | layer { 86 | name: "prelu3" 87 | type: "PReLU" 88 | bottom: "conv3" 89 | top: "conv3" 90 | } 91 | layer { 92 | name: "pool3" 93 | type: "Pooling" 94 | bottom: "conv3" 95 | top: "pool3" 96 | pooling_param { 97 | pool: MAX 98 | kernel_size: 2 99 | stride: 2 100 | } 101 | } 102 | 103 | layer { 104 | name: "conv4" 105 | type: "Convolution" 106 | bottom: "pool3" 107 | top: "conv4" 108 | convolution_param { 109 | num_output: 64 110 | kernel_size: 2 111 | stride: 1 112 | } 113 | } 114 | layer { 115 | name: "prelu4" 116 | type: "PReLU" 117 | bottom: "conv4" 118 | top: "conv4" 119 | } 120 | 121 | layer { 122 | name: "fc" 123 | type: "InnerProduct" 124 | bottom: "conv4" 125 | top: "fc" 126 | inner_product_param { 127 | num_output: 96 128 | } 129 | } 130 | layer { 131 | name: "prelu5" 132 | type: "PReLU" 133 | bottom: "fc" 134 | top: "fc" 135 | } 136 | 137 | # score 138 | layer { 139 | name: "score" 140 | type: "InnerProduct" 141 | bottom: "fc" 142 | top: "score" 143 | inner_product_param { 144 | num_output: 2 145 | } 146 | } 147 | layer { 148 | name: "prob" 149 | type: "Softmax" 150 | bottom: "score" 151 | top: "prob" 152 | softmax_param { 153 | axis: 1 154 | } 155 | } 156 | # bbox 157 | layer { 158 | name: "bbox_pred" 159 | type: "InnerProduct" 160 | bottom: "fc" 161 | top: "bbox_pred" 162 | inner_product_param { 163 | 
num_output: 4 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /proto/p_train_val.prototxt: -------------------------------------------------------------------------------- 1 | name: "pNet" 2 | 3 | layer { 4 | name: "data" 5 | type: "Python" 6 | top: "data" 7 | top: "bbox_target" 8 | top: "label" 9 | python_param { 10 | module: "layers.data_layer" 11 | layer: "FaceDataLayer" 12 | } 13 | include { 14 | phase: TRAIN 15 | } 16 | } 17 | layer { 18 | name: "data" 19 | type: "Python" 20 | top: "data" 21 | top: "bbox_target" 22 | top: "label" 23 | python_param { 24 | module: "layers.data_layer" 25 | layer: "FaceDataLayer" 26 | } 27 | include { 28 | phase: TEST 29 | } 30 | } 31 | 32 | layer { 33 | name: "conv1" 34 | type: "Convolution" 35 | bottom: "data" 36 | top: "conv1" 37 | param { 38 | lr_mult: 1 39 | decay_mult: 1 40 | } 41 | param { 42 | lr_mult: 2 43 | decay_mult: 0 44 | } 45 | convolution_param { 46 | num_output: 10 47 | kernel_size: 3 48 | stride: 1 49 | weight_filler { 50 | type: "xavier" 51 | } 52 | bias_filler { 53 | type: "constant" 54 | } 55 | } 56 | } 57 | layer { 58 | name: "prelu1" 59 | type: "PReLU" 60 | bottom: "conv1" 61 | top: "conv1" 62 | } 63 | layer { 64 | name: "pool1" 65 | type: "Pooling" 66 | bottom: "conv1" 67 | top: "pool1" 68 | pooling_param { 69 | pool: MAX 70 | kernel_size: 2 71 | stride: 2 72 | } 73 | } 74 | 75 | layer { 76 | name: "conv2" 77 | type: "Convolution" 78 | bottom: "pool1" 79 | top: "conv2" 80 | param { 81 | lr_mult: 1 82 | decay_mult: 1 83 | } 84 | param { 85 | lr_mult: 2 86 | decay_mult: 0 87 | } 88 | convolution_param { 89 | num_output: 16 90 | kernel_size: 3 91 | stride: 1 92 | weight_filler { 93 | type: "xavier" 94 | } 95 | bias_filler { 96 | type: "constant" 97 | } 98 | } 99 | } 100 | layer { 101 | name: "prelu2" 102 | type: "PReLU" 103 | bottom: "conv2" 104 | top: "conv2" 105 | } 106 | 107 | layer { 108 | name: "conv3" 109 | type: "Convolution" 110 | bottom: "conv2" 111 | top: "conv3" 112 | param { 113 | lr_mult: 1 114 | decay_mult: 1 115 | } 116 | param { 117 | lr_mult: 2 118 | decay_mult: 0 119 | } 120 | convolution_param { 121 | num_output: 32 122 | kernel_size: 3 123 | stride: 1 124 | weight_filler { 125 | type: "xavier" 126 | } 127 | bias_filler { 128 | type: "constant" 129 | } 130 | } 131 | } 132 | layer { 133 | name: "prelu3" 134 | type: "PReLU" 135 | bottom: "conv3" 136 | top: "conv3" 137 | } 138 | 139 | # score 140 | layer { 141 | name: "score" 142 | type: "Convolution" 143 | bottom: "conv3" 144 | top: "score" 145 | param { 146 | lr_mult: 1 147 | decay_mult: 1 148 | } 149 | param { 150 | lr_mult: 2 151 | decay_mult: 0 152 | } 153 | convolution_param { 154 | num_output: 2 155 | kernel_size: 1 156 | stride: 1 157 | weight_filler { 158 | type: "xavier" 159 | } 160 | bias_filler { 161 | type: "constant" 162 | } 163 | } 164 | } 165 | 166 | # bbox 167 | layer { 168 | name: "bbox_pred" 169 | type: "Convolution" 170 | bottom: "conv3" 171 | top: "bbox_pred" 172 | param { 173 | lr_mult: 1 174 | decay_mult: 1 175 | } 176 | param { 177 | lr_mult: 2 178 | decay_mult: 0 179 | } 180 | convolution_param { 181 | num_output: 4 182 | kernel_size: 1 183 | stride: 1 184 | weight_filler { 185 | type: "xavier" 186 | } 187 | bias_filler { 188 | type: "constant" 189 | } 190 | } 191 | } 192 | 193 | # loss 194 | layer { 195 | name: "loss" 196 | type: "JfdaLoss" 197 | bottom: "score" 198 | bottom: "bbox_pred" 199 | bottom: "bbox_target" 200 | bottom: "label" 201 | top: "face_cls_loss" 202 | top: "bbox_reg_loss" 
203 | top: "face_cls_neg_acc" 204 | top: "face_cls_pos_acc" 205 | loss_weight: 1.0 206 | loss_weight: 0.5 207 | loss_weight: 0 208 | loss_weight: 0 209 | jfda_loss_param { 210 | drop_loss_rate: 0.3 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /jfda/minibatch.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=bad-indentation, no-member, invalid-name, line-too-long 2 | import multiprocessing 3 | import cv2 4 | import lmdb 5 | import numpy as np 6 | from jfda.config import cfg 7 | 8 | 9 | class MiniBatcher(multiprocessing.Process): 10 | '''generate minibatch 11 | given a queue, put (negatives, positives, part faces, landmark faces) = (n1, n2, n3, n4) 12 | ''' 13 | 14 | def __init__(self, db_names, ns, net_type): 15 | '''order: negatives, positives, part faces, landmark faces 16 | ''' 17 | super(MiniBatcher, self).__init__() 18 | self.ns = ns 19 | self.n = reduce(lambda x, acc: acc + x, ns, 0) 20 | self._start = [0 for _ in range(3)] 21 | self.net_type = net_type 22 | self.db_names = db_names 23 | self.db = [lmdb.open(db_name) for db_name in db_names] 24 | self.tnx = [db.begin() for db in self.db] 25 | self.db_size = [int(tnx.get('size')) for tnx in self.tnx] 26 | 27 | def __del__(self): 28 | for tnx in self.tnx: 29 | tnx.abort() 30 | for db in self.db: 31 | db.close() 32 | 33 | def set_queue(self, queue): 34 | self.queue = queue 35 | 36 | def get_size(self): 37 | return self.db_size 38 | 39 | def _make_transform(self, data, bbox=None): 40 | # gray scale 41 | if np.random.rand() < cfg.GRAY_PROB: 42 | gray = cv2.cvtColor(data, cv2.COLOR_BGR2GRAY) 43 | data[:, :, 0] = gray 44 | data[:, :, 1] = gray 45 | data[:, :, 2] = gray 46 | # flip 47 | if np.random.rand() < cfg.FLIP_PROB: 48 | data = data[:, ::-1, :] 49 | if bbox is not None: 50 | # [dx1 dy1 dx2 dy2] --> [-dx2 dy1 -dx1 dy2] 51 | bbox[0], bbox[2] = -bbox[2], -bbox[0] 52 | data = data.transpose((2, 0, 1)) 53 | return data, bbox 54 | 55 | def run(self): 56 | intpu_size = cfg.NET_INPUT_SIZE[self.net_type] 57 | data_shape = (intpu_size, intpu_size, 3) 58 | bbox_shape = (4,) 59 | n = self.n 60 | while True: 61 | data = np.zeros((n, 3, intpu_size, intpu_size), dtype=np.float32) 62 | bbox_target = np.zeros((n, 4), dtype=np.float32) 63 | label = np.zeros(n, dtype=np.float32) 64 | 65 | start = self._start 66 | end = [start[i] + self.ns[i] for i in range(3)] 67 | for i in range(3): 68 | if end[i] > self.db_size[i]: 69 | end[i] -= self.db_size[i] 70 | start[i] = end[i] 71 | end[i] = start[i] + self.ns[i] 72 | 73 | idx = 0 74 | # negatives 75 | for i in xrange(start[0], end[0]): 76 | data_key = '%08d_data'%i 77 | _data = np.fromstring(self.tnx[0].get(data_key), dtype=np.uint8).reshape(data_shape) 78 | data[idx], _1 = self._make_transform(_data) 79 | idx += 1 80 | # positives 81 | for i in xrange(start[1], end[1]): 82 | data_key = '%08d_data'%i 83 | bbox_key = '%08d_bbox'%i 84 | _data = np.fromstring(self.tnx[1].get(data_key), dtype=np.uint8).reshape(data_shape) 85 | _bbox_target = np.fromstring(self.tnx[1].get(bbox_key), dtype=np.float32).reshape(bbox_shape) 86 | data[idx], bbox_target[idx] = self._make_transform(_data, _bbox_target) 87 | idx += 1 88 | # part faces 89 | for i in xrange(start[2], end[2]): 90 | data_key = '%08d_data'%i 91 | bbox_key = '%08d_bbox'%i 92 | _data = np.fromstring(self.tnx[2].get(data_key), dtype=np.uint8).reshape(data_shape) 93 | _bbox_target = np.fromstring(self.tnx[2].get(bbox_key), 
dtype=np.float32).reshape(bbox_shape) 94 | data[idx], bbox_target[idx] = self._make_transform(_data, _bbox_target) 95 | idx += 1 96 | # label 97 | label[:self.ns[0]] = 0 98 | label[self.ns[0]: self.ns[0]+self.ns[1]] = 1 99 | label[self.ns[0]+self.ns[1]: self.ns[0]+self.ns[1]+self.ns[2]] = 2 100 | label[self.ns[0]+self.ns[1]+self.ns[2]:] = 3 101 | 102 | self._start = end 103 | data = (data - 128) / 128 # simple normalization 104 | minibatch = {'data': data, 105 | 'bbox_target': bbox_target, 106 | 'label': label} 107 | self.queue.put(minibatch) 108 | -------------------------------------------------------------------------------- /proto/r_train_val.prototxt: -------------------------------------------------------------------------------- 1 | name: "rNet" 2 | 3 | layer { 4 | name: "data" 5 | type: "Python" 6 | top: "data" 7 | top: "bbox_target" 8 | top: "label" 9 | python_param { 10 | module: "layers.data_layer" 11 | layer: "FaceDataLayer" 12 | } 13 | include { 14 | phase: TRAIN 15 | } 16 | } 17 | layer { 18 | name: "data" 19 | type: "Python" 20 | top: "data" 21 | top: "bbox_target" 22 | top: "label" 23 | python_param { 24 | module: "layers.data_layer" 25 | layer: "FaceDataLayer" 26 | } 27 | include { 28 | phase: TEST 29 | } 30 | } 31 | 32 | layer { 33 | name: "conv1" 34 | type: "Convolution" 35 | bottom: "data" 36 | top: "conv1" 37 | param { 38 | lr_mult: 1 39 | decay_mult: 1 40 | } 41 | param { 42 | lr_mult: 2 43 | decay_mult: 0 44 | } 45 | convolution_param { 46 | num_output: 28 47 | kernel_size: 3 48 | stride: 1 49 | weight_filler { 50 | type: "xavier" 51 | } 52 | bias_filler { 53 | type: "constant" 54 | } 55 | } 56 | } 57 | layer { 58 | name: "prelu1" 59 | type: "PReLU" 60 | bottom: "conv1" 61 | top: "conv1" 62 | } 63 | layer { 64 | name: "pool1" 65 | type: "Pooling" 66 | bottom: "conv1" 67 | top: "pool1" 68 | pooling_param { 69 | pool: MAX 70 | kernel_size: 3 71 | stride: 2 72 | } 73 | } 74 | 75 | layer { 76 | name: "conv2" 77 | type: "Convolution" 78 | bottom: "pool1" 79 | top: "conv2" 80 | param { 81 | lr_mult: 1 82 | decay_mult: 1 83 | } 84 | param { 85 | lr_mult: 2 86 | decay_mult: 0 87 | } 88 | convolution_param { 89 | num_output: 48 90 | kernel_size: 3 91 | stride: 1 92 | weight_filler { 93 | type: "xavier" 94 | } 95 | bias_filler { 96 | type: "constant" 97 | } 98 | } 99 | } 100 | layer { 101 | name: "prelu2" 102 | type: "PReLU" 103 | bottom: "conv2" 104 | top: "conv2" 105 | } 106 | layer { 107 | name: "pool2" 108 | type: "Pooling" 109 | bottom: "conv2" 110 | top: "pool2" 111 | pooling_param { 112 | pool: MAX 113 | kernel_size: 3 114 | stride: 2 115 | } 116 | } 117 | 118 | layer { 119 | name: "conv3" 120 | type: "Convolution" 121 | bottom: "pool2" 122 | top: "conv3" 123 | param { 124 | lr_mult: 1 125 | decay_mult: 1 126 | } 127 | param { 128 | lr_mult: 2 129 | decay_mult: 0 130 | } 131 | convolution_param { 132 | num_output: 64 133 | kernel_size: 2 134 | stride: 1 135 | weight_filler { 136 | type: "xavier" 137 | } 138 | bias_filler { 139 | type: "constant" 140 | } 141 | } 142 | } 143 | layer { 144 | name: "prelu3" 145 | type: "PReLU" 146 | bottom: "conv3" 147 | top: "conv3" 148 | } 149 | 150 | layer { 151 | name: "fc" 152 | type: "InnerProduct" 153 | bottom: "conv3" 154 | top: "fc" 155 | param { 156 | lr_mult: 1 157 | decay_mult: 1 158 | } 159 | param { 160 | lr_mult: 2 161 | decay_mult: 0 162 | } 163 | inner_product_param { 164 | num_output: 128 165 | weight_filler { 166 | type: "xavier" 167 | } 168 | bias_filler { 169 | type: "constant" 170 | } 171 | } 172 | } 173 | layer { 174 
| name: "prelu4" 175 | type: "PReLU" 176 | bottom: "fc" 177 | top: "fc" 178 | } 179 | 180 | # score 181 | layer { 182 | name: "score" 183 | type: "InnerProduct" 184 | bottom: "fc" 185 | top: "score" 186 | param { 187 | lr_mult: 1 188 | decay_mult: 1 189 | } 190 | param { 191 | lr_mult: 2 192 | decay_mult: 0 193 | } 194 | inner_product_param { 195 | num_output: 2 196 | weight_filler { 197 | type: "xavier" 198 | } 199 | bias_filler { 200 | type: "constant" 201 | } 202 | } 203 | } 204 | # bbox 205 | layer { 206 | name: "bbox_pred" 207 | type: "InnerProduct" 208 | bottom: "fc" 209 | top: "bbox_pred" 210 | param { 211 | lr_mult: 1 212 | decay_mult: 1 213 | } 214 | param { 215 | lr_mult: 2 216 | decay_mult: 0 217 | } 218 | inner_product_param { 219 | num_output: 4 220 | weight_filler { 221 | type: "xavier" 222 | } 223 | bias_filler { 224 | type: "constant" 225 | } 226 | } 227 | } 228 | 229 | # loss 230 | layer { 231 | name: "loss" 232 | type: "JfdaLoss" 233 | bottom: "score" 234 | bottom: "bbox_pred" 235 | bottom: "bbox_target" 236 | bottom: "label" 237 | top: "face_cls_loss" 238 | top: "bbox_reg_loss" 239 | top: "face_cls_neg_acc" 240 | top: "face_cls_pos_acc" 241 | loss_weight: 1.0 242 | loss_weight: 0.5 243 | loss_weight: 0 244 | loss_weight: 0 245 | jfda_loss_param { 246 | drop_loss_rate: 0.3 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /jfda/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # pylint: disable=bad-indentation, no-member, invalid-name, line-too-long 3 | 4 | import shutil 5 | import argparse 6 | import multiprocessing 7 | import numpy as np 8 | import caffe 9 | from caffe.proto import caffe_pb2 10 | from google.protobuf import text_format 11 | from jfda.config import cfg 12 | from jfda.minibatch import MiniBatcher 13 | 14 | 15 | class Solver: 16 | 17 | def __init__(self, solver_prototxt, args): 18 | net_type = args.net 19 | self.net_type = net_type 20 | input_size = cfg.NET_INPUT_SIZE[net_type] 21 | db_names_train = ['data/%snet_negative_train'%net_type, 22 | 'data/%snet_positive_train'%net_type, 23 | 'data/%snet_part_train'%net_type] 24 | db_names_test = ['data/%snet_negative_val'%net_type, 25 | 'data/%snet_positive_val'%net_type, 26 | 'data/%snet_part_val'%net_type] 27 | base_size = args.size 28 | ns = [r*base_size for r in cfg.DATA_RATIO[net_type]] 29 | # batcher setup 30 | batcher_train = MiniBatcher(db_names_train, ns, net_type) 31 | batcher_test = MiniBatcher(db_names_test, ns, net_type) 32 | # data queue setup 33 | queue_train = multiprocessing.Queue(32) 34 | queue_test = multiprocessing.Queue(32) 35 | batcher_train.set_queue(queue_train) 36 | batcher_test.set_queue(queue_test) 37 | # solver parameter setup 38 | size_train = batcher_train.get_size() 39 | size_test = batcher_test.get_size() 40 | iter_train = sum([x/y for x, y in zip(size_train, ns)]) / len(ns) # train epoch size 41 | iter_test = sum([x/y for x, y in zip(size_test, ns)]) / len(ns) # test epoch size 42 | max_iter = args.epoch * iter_train 43 | self.final_model = 'tmp/%snet_iter_%d.caffemodel'%(net_type, max_iter) 44 | solver_param = caffe_pb2.SolverParameter() 45 | with open(solver_prototxt, 'r') as fin: 46 | text_format.Merge(fin.read(), solver_param) 47 | solver_param.max_iter = max_iter # max training iterations 48 | #solver_param.snapshot = iter_train # save after an epoch 49 | solver_param.snapshot = 1000 50 | solver_param.test_interval = iter_train 51 | 
solver_param.test_iter[0] = iter_test 52 | solver_param.base_lr = args.lr 53 | solver_param.gamma = args.lrw 54 | solver_param.stepsize = args.lrp * iter_train 55 | solver_param.weight_decay = args.wd 56 | tmp_solver_prototxt = 'tmp/%s_solver.prototxt'%net_type 57 | with open(tmp_solver_prototxt, 'w') as fout: 58 | fout.write(text_format.MessageToString(solver_param)) 59 | # solver setup 60 | self.solver = caffe.SGDSolver(tmp_solver_prototxt) 61 | # data layer setup 62 | layer_train = self.solver.net.layers[0] 63 | layer_test = self.solver.test_nets[0].layers[0] 64 | layer_train.set_batch_num(ns[0], ns[1], ns[2]) 65 | layer_test.set_batch_num(ns[0], ns[1], ns[2]) 66 | layer_train.set_data_queue(queue_train) 67 | layer_test.set_data_queue(queue_test) 68 | # copy init weight if any 69 | if args.weight: 70 | self.solver.net.copy_from(args.weight) 71 | self.solver.test_nets[0].copy_from(args.weight) 72 | # start batcher 73 | batcher_train.start() 74 | batcher_test.start() 75 | def cleanup(): 76 | batcher_train.terminate() 77 | batcher_test.terminate() 78 | batcher_train.join() 79 | batcher_test.join() 80 | import atexit 81 | atexit.register(cleanup) 82 | 83 | def train_model(self, snapshot_model=None): 84 | self.solver.solve(snapshot_model) 85 | # copy model 86 | shutil.copyfile(self.final_model, 'model/%s.caffemodel'%self.net_type) 87 | 88 | 89 | def init_caffe(cfg): 90 | np.random.seed(cfg.RNG_SEED) 91 | caffe.set_random_seed(cfg.RNG_SEED) 92 | if cfg.GPU_ID < 0: 93 | caffe.set_mode_cpu() 94 | else: 95 | caffe.set_mode_gpu() 96 | caffe.set_device(cfg.GPU_ID) 97 | 98 | 99 | if __name__ == '__main__': 100 | parser = argparse.ArgumentParser() 101 | parser.add_argument('--gpu', type=int, default=0, help='gpu id to use, -1 for cpu') 102 | parser.add_argument('--net', type=str, default='p', help='net type, p, r, o') 103 | parser.add_argument('--size', type=int, default=128, help='base batch size') 104 | parser.add_argument('--epoch', type=int, default=20, help='train epoches') 105 | parser.add_argument('--snapshot', type=str, default=None, help='snapshot model') 106 | parser.add_argument('--lr', type=float, default=0.01, help='initial learning rate') 107 | parser.add_argument('--lrw', type=float, default=0.1, help='lr decay rate') 108 | parser.add_argument('--lrp', type=int, default=2, help='number of epoches to decay the lr') 109 | parser.add_argument('--wd', type=float, default=5e-4, help='weight decay') 110 | parser.add_argument('--weight', type=str, default=None, help='init weight for the model') 111 | args = parser.parse_args() 112 | 113 | print args 114 | 115 | net_type = args.net 116 | # check args 117 | assert net_type in ['p', 'r', 'o'], "net should be 'p', 'r', 'o'" 118 | cfg.NET_TYPE = net_type 119 | cfg.GPU_ID = args.gpu 120 | init_caffe(cfg) 121 | 122 | solver_prototxt = 'proto/%s_solver.prototxt'%net_type 123 | solver = Solver(solver_prototxt, args) 124 | solver.train_model(args.snapshot) 125 | -------------------------------------------------------------------------------- /proto/o_train_val.prototxt: -------------------------------------------------------------------------------- 1 | name: "oNet" 2 | 3 | layer { 4 | name: "data" 5 | type: "Python" 6 | top: "data" 7 | top: "bbox_target" 8 | top: "label" 9 | python_param { 10 | module: "layers.data_layer" 11 | layer: "FaceDataLayer" 12 | } 13 | include { 14 | phase: TRAIN 15 | } 16 | } 17 | layer { 18 | name: "data" 19 | type: "Python" 20 | top: "data" 21 | top: "bbox_target" 22 | top: "label" 23 | python_param { 24 | module: 
"layers.data_layer" 25 | layer: "FaceDataLayer" 26 | } 27 | include { 28 | phase: TEST 29 | } 30 | } 31 | 32 | layer { 33 | name: "conv1" 34 | type: "Convolution" 35 | bottom: "data" 36 | top: "conv1" 37 | param { 38 | lr_mult: 1 39 | decay_mult: 1 40 | } 41 | param { 42 | lr_mult: 2 43 | decay_mult: 0 44 | } 45 | convolution_param { 46 | num_output: 16 47 | kernel_size: 3 48 | stride: 1 49 | weight_filler { 50 | type: "xavier" 51 | } 52 | bias_filler { 53 | type: "constant" 54 | } 55 | } 56 | } 57 | layer { 58 | name: "prelu1" 59 | type: "PReLU" 60 | bottom: "conv1" 61 | top: "conv1" 62 | } 63 | layer { 64 | name: "pool1" 65 | type: "Pooling" 66 | bottom: "conv1" 67 | top: "pool1" 68 | pooling_param { 69 | pool: MAX 70 | kernel_size: 3 71 | stride: 2 72 | } 73 | } 74 | 75 | layer { 76 | name: "conv2" 77 | type: "Convolution" 78 | bottom: "pool1" 79 | top: "conv2" 80 | param { 81 | lr_mult: 1 82 | decay_mult: 1 83 | } 84 | param { 85 | lr_mult: 2 86 | decay_mult: 0 87 | } 88 | convolution_param { 89 | num_output: 32 90 | kernel_size: 3 91 | stride: 1 92 | weight_filler { 93 | type: "xavier" 94 | } 95 | bias_filler { 96 | type: "constant" 97 | } 98 | } 99 | } 100 | layer { 101 | name: "prelu2" 102 | type: "PReLU" 103 | bottom: "conv2" 104 | top: "conv2" 105 | } 106 | layer { 107 | name: "pool2" 108 | type: "Pooling" 109 | bottom: "conv2" 110 | top: "pool2" 111 | pooling_param { 112 | pool: MAX 113 | kernel_size: 3 114 | stride: 2 115 | } 116 | } 117 | 118 | layer { 119 | name: "conv3" 120 | type: "Convolution" 121 | bottom: "pool2" 122 | top: "conv3" 123 | param { 124 | lr_mult: 1 125 | decay_mult: 1 126 | } 127 | param { 128 | lr_mult: 2 129 | decay_mult: 0 130 | } 131 | convolution_param { 132 | num_output: 48 133 | kernel_size: 3 134 | stride: 1 135 | weight_filler { 136 | type: "xavier" 137 | } 138 | bias_filler { 139 | type: "constant" 140 | } 141 | } 142 | } 143 | layer { 144 | name: "prelu3" 145 | type: "PReLU" 146 | bottom: "conv3" 147 | top: "conv3" 148 | } 149 | layer { 150 | name: "pool3" 151 | type: "Pooling" 152 | bottom: "conv3" 153 | top: "pool3" 154 | pooling_param { 155 | pool: MAX 156 | kernel_size: 2 157 | stride: 2 158 | } 159 | } 160 | 161 | layer { 162 | name: "conv4" 163 | type: "Convolution" 164 | bottom: "pool3" 165 | top: "conv4" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 64 176 | kernel_size: 2 177 | stride: 1 178 | weight_filler { 179 | type: "xavier" 180 | } 181 | bias_filler { 182 | type: "constant" 183 | } 184 | } 185 | } 186 | layer { 187 | name: "prelu4" 188 | type: "PReLU" 189 | bottom: "conv4" 190 | top: "conv4" 191 | } 192 | 193 | layer { 194 | name: "fc" 195 | type: "InnerProduct" 196 | bottom: "conv4" 197 | top: "fc" 198 | param { 199 | lr_mult: 1 200 | decay_mult: 1 201 | } 202 | param { 203 | lr_mult: 2 204 | decay_mult: 0 205 | } 206 | inner_product_param { 207 | num_output: 96 208 | weight_filler { 209 | type: "xavier" 210 | } 211 | bias_filler { 212 | type: "constant" 213 | } 214 | } 215 | } 216 | layer { 217 | name: "prelu5" 218 | type: "PReLU" 219 | bottom: "fc" 220 | top: "fc" 221 | } 222 | 223 | # score 224 | layer { 225 | name: "score" 226 | type: "InnerProduct" 227 | bottom: "fc" 228 | top: "score" 229 | param { 230 | lr_mult: 1 231 | decay_mult: 1 232 | } 233 | param { 234 | lr_mult: 2 235 | decay_mult: 0 236 | } 237 | inner_product_param { 238 | num_output: 2 239 | weight_filler { 240 | type: "xavier" 241 | } 242 | 
bias_filler {
243 |       type: "constant"
244 |     }
245 |   }
246 | }
247 | # bbox
248 | layer {
249 |   name: "bbox_pred"
250 |   type: "InnerProduct"
251 |   bottom: "fc"
252 |   top: "bbox_pred"
253 |   param {
254 |     lr_mult: 1
255 |     decay_mult: 1
256 |   }
257 |   param {
258 |     lr_mult: 2
259 |     decay_mult: 0
260 |   }
261 |   inner_product_param {
262 |     num_output: 4
263 |     weight_filler {
264 |       type: "xavier"
265 |     }
266 |     bias_filler {
267 |       type: "constant"
268 |     }
269 |   }
270 | }
271 | 
272 | # loss
273 | layer {
274 |   name: "loss"
275 |   type: "JfdaLoss"
276 |   bottom: "score"
277 |   bottom: "bbox_pred"
278 |   bottom: "bbox_target"
279 |   bottom: "label"
280 |   top: "face_cls_loss"
281 |   top: "bbox_reg_loss"
282 |   top: "face_cls_neg_acc"
283 |   top: "face_cls_pos_acc"
284 |   loss_weight: 1.0
285 |   loss_weight: 0.5
286 |   loss_weight: 0
287 |   loss_weight: 0
288 |   jfda_loss_param {
289 |     drop_loss_rate: 0.3
290 |   }
291 | }
292 | 
--------------------------------------------------------------------------------
/summary.md:
--------------------------------------------------------------------------------
1 | JFDA Summary
2 | =========
3 | 
4 | This document summarizes the process of training JFDA. JFDA is a reproduction of the paper [Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks](https://arxiv.org/abs/1604.02878).
5 | 
6 | JFDA's face detection architecture is similar to the earlier [Cascade CNN](http://users.eecs.northwestern.edu/~xsh835/assets/cvpr2015_cascnn.pdf) paper; both recreate the traditional cascade approach with deep learning. The difference is that the classifiers used by traditional methods are weak, so boosting has to cascade a very large number of weak classifiers into one strong face classifier, whereas the DL approach uses small CNNs as classifiers, and only a few of them (typically 3) match the classification performance of a traditional cascade of thousands of stages. At detection time JFDA, like the traditional methods, scans the image with sliding windows; thanks to the nature of CNNs, the first network in JFDA (PNet) is designed as a fully convolutional network, which makes the sliding-window step fast. The three networks in JFDA are referred to as PNet, RNet and ONet.
7 | 
8 | ### Training data
9 | 
10 | The training data for JFDA comes mainly from the WIDER and CelebA datasets; CelebA provides faces annotated with 5 landmarks for training the landmark task, while WIDER Face provides positive samples (pos), negative samples (neg) and partial faces (part). The training data for each stage is collected from these two datasets in different ways. JFDA trains multiple tasks in one network: face classification (face cls), bounding box regression (bbox reg) and facial landmark localization (landmark reg). Different samples participate in different tasks; unlike typical multi-task setups where every sample carries a label for every task, here a negative sample, for example, only has a face cls label and no bbox reg or landmark reg labels, so feeding the data to the network and computing the losses differ accordingly.
11 | 
12 | ### About the loss
13 | 
14 | The paper proposes an idea different from traditional hard negative mining, called online hard sample mining. The traditional cascade idea is to run the existing classifiers over the training images to collect negatives that those classifiers fail to reject; these are called hard negatives and are used to train the subsequent classifiers. JFDA does something similar between stages: after one network is trained, the training data for the next network comes from its detections on the training set, e.g. RNet's training data comes from PNet's detection results. The online hard sample mining in the paper, however, happens while a network is being trained: during training, the gradients of easy samples are dropped so that they do not participate in the subsequent backprop. Whether a sample is easy is judged by its loss value: high-loss samples are hard, low-loss ones are simple, and a fixed fraction of the simple gradients is discarded at each step. The paper says this improves results; I did not run a comparison.
15 | 
16 | The loss behavior described above clearly cannot be implemented with stock Caffe layers, so a separate loss layer was written in C++; it merges the loss computation of all tasks into one layer and performs the hard sample mining at the same time. This layer could also have been written in Python, but it was done in C++ at the time and never changed. To use the layer, copy the corresponding code into the Caffe source tree and compile; it can then be used from prototxt.
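
As a minimal sketch of the per-sample task participation described above (the actual routing happens inside the C++ loss layer; these names are illustrative, not the layer's code):

```python
import numpy as np

def task_masks(label):
    """label: 0 = negative, 1 = positive, 2 = part face, 3 = landmark face.
    Each sample only feeds the loss terms it has targets for."""
    cls_mask      = (label == 0) | (label == 1)  # face classification
    bbox_mask     = (label == 1) | (label == 2)  # bbox regression
    landmark_mask = (label == 3)                 # landmark regression
    return cls_mask, bbox_mask, landmark_mask

print(task_masks(np.array([0, 1, 2, 3])))
```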
17 | 
18 | ### Preparing the training data
19 | 
20 | This is the most critical step in all of JFDA: collecting pos, neg and part samples from WIDER Face for training. I defined a proposal function which takes an image plus the ground truth of all faces in that image and returns pos, neg and part data. During proposal, each ground truth bbox is enlarged by some ratio and then slid over with windows at different scales and strides; each resulting box is classified as pos, neg or part according to its overlap with the ground truth. One point needs real attention: **always apply random jitter to the sliding-window regions**, otherwise the bbox offsets computed from the windows follow a fixed pattern (a few fixed values), which severely hurts the subsequent bbox reg training. Another point: to label a box as neg, its overlap must be computed against all ground truth bboxes in the image; comparing only against the current ground truth is wrong. In the actual implementation, pos, neg and part can each use their own scales and strides for sliding-window sampling.
21 | 
22 | About neg data: part of the negatives are collected in face regions as above, and negatives also need to be randomly cropped from the whole image. My initial strategy took all negatives from random whole-image crops, which gave PNet far too many false positives on fddb; on real images PNet's output was also very large, although recall was basically fine. After switching to 50% from around face regions and 50% from the whole image, PNet's false positives on fddb dropped a lot with essentially no impact on recall. With 100% from face regions, false positives are indeed very low, but recall drops. Regarding the 50%, @tornadomeet thinks this value is probably a good choice.
23 | 
24 | RNet and ONet need the detection output of the preceding networks; the proposal function can accept a detector, and the detection results are split directly into pos, neg and part. CelebA data is sampled through the same proposal function, but only pos samples are kept. Since CelebA has many large images, I initially did not run the earlier networks over it and processed the landmark faces for the later two networks the same way as PNet's data, which yields rather many landmark faces. I later re-ran PNet and RNet over CelebA to prepare the data; it takes much longer, but the amount of landmark data is much smaller.
25 | 
26 | |Type |Pos |Neg |Part |Landmark |
27 | |---------------|--------|----------|-------|-----------|
28 | |PNet-Train |888351 |1179356 |970000 |1620790 |
29 | |PNet-Val |218478 |295192 |240000 |405200 |
30 | |RNet-Train |97642 |1585937 |250000 |353873 |
31 | |RNet-Val |24461 |393851 |60000 |89107 |
32 | |ONet-Train |142354 |38143 |90000 |360634 |
33 | |ONet-Val |35199 |9078 |20000 |89586 |
34 | 
35 | ### Data Layer
36 | 
37 | Each mini-batch must keep a certain ratio of pos, neg, part and landmark samples (1:3:1:2), and I set different ratios for different networks. The Data Layer is written in Python; every mini-batch is assembled online, with data augmentation applied with some probability, including random grayscale conversion and random face flips. Note that the flip operation changes sample labels; the transforms must be worked out carefully, including the bbox offset transform and the landmark point transform. It is also possible not to keep the sample ratio within each mini-batch, but then the gradients need to be normalized and the per-task gradients rescaled by the ratios; in that case, it is best to guarantee that the whole training set is generated with fixed ratios. tornadomeet used this approach and the training results were fine.
38 | 
39 | For data loading I used the lmdb format; performance-wise it is merely OK. With small networks the io overhead is clearly a bit large; the key cost is that online data augment is time-consuming, leaving the GPU underloaded. The problem is obvious for PNet and somewhat better for ONet. The Data Layer uses a producer-consumer pattern: one process keeps loading the various sample types from lmdb, applies data augment, assembles mini-batches and puts them on a queue, while the Python Data Layer, as the consumer, takes batches directly from the queue and hands them to the network for training.
40 | 
41 | ### About training
42 | 
43 | Training mainly uses the Caffe framework together with my own loss layer. I deliberately output the forward-pass accuracy on positive and negative samples to make the error easier to characterize; the original softmax loss is of course also visible. The weights between tasks were not tuned much. Note that the sample ratio within a mini-batch is already a form of weighting, yet extra per-task weights are still applied when merging the losses; I did not dig into this or tune it much. The two kinds of ratios play different roles: the sample ratio characterizes the distribution across data types in the dataset, while the task weights characterize the importance of the tasks. Tuning the training otherwise comes down to the usual Caffe hyperparameters. I wrote the train code separately because Caffe's solver parameters have no notion of an epoch, only test-every-N-iterations logic; I dynamically compute how many iterations an epoch takes from the dataset size and batch size, then dynamically rewrite the solver parameters before training, and other parameters are adjusted on the same principle. Consequently you cannot train directly with the caffe command; training must go through train.py.
44 | 
45 | For PNet, the model should reach positive-sample accuracy > 80% and negative-sample accuracy > 97%, with face cls loss around 0.1, to ensure high recall without too many false positives.
46 | 
47 | For RNet, the model should reach positive-sample accuracy > 93% and negative-sample accuracy > 99%, with face cls loss around 0.04.
48 | 
49 | For ONet, the model should reach positive-sample accuracy > 95.5% to protect the final recall, negative-sample accuracy > 95.5%, face cls loss around 0.099, and landmark reg loss < 0.0008. ONet itself is relatively hard to train because of its training data constraints.
50 | 
51 | See the configuration in train.sh for the exact training parameters.
52 | 
53 | ### About recall and false positives
54 | 
55 | Currently, when preparing data for the later networks, the output threshold of every network is set to 0.5. This value may be on the high side, leaving the later networks' recall without headroom, but false positives are much fewer. Consider lowering the detection output thresholds; they can be changed in jfda/config.py.
56 | 
57 | ### About LNet
58 | 
59 | LNet does not appear in the paper, but it does appear in the models released by the authors. It refines the 5 landmarks, following the same idea as CUHK's earlier [Cascade CNN](http://mmlab.ie.cuhk.edu.hk/archive/CNN_FacePoint.htm) work: take a patch around each point, send it to a network to predict a correction for the point, and expect a lower localization error. Considering that the patches come from different regions of the face, the original authors used several sub-network branches for the convolutions, concatenated the features and then trained a regression per point. We can achieve the same effect with the group parameter of convolutions; using plain convolutions over the patches without group does not train well. The group strategy is in principle equivalent to multiple sub-networks, but it noticeably speeds up the network; I also compressed the network parameters to further speed up LNet, and the corresponding benchmark with the cpp code shows a clear speedup. During training, the total model loss should preferably stay below 0.016.
60 | 
61 | ### Performance of each network on fddb
62 | 
63 | caffe-mtcnn is the performance of the authors' released model under mxnet, and mx-mtcnn is @tornadomeet's reproduction under mxnet; the data for these two were provided by @tornadomeet. jfda is my reproduction under Caffe.
64 | 
65 | ![fddb-pnet](fddb/fddb-pnet.png)
66 | 
67 | ![fddb-rnet](fddb/fddb-rnet.png)
68 | 
69 | ![fddb-onet](fddb/fddb-onet.png)
70 | 
71 | ### Overall performance
72 | 
73 | I evaluated the model on fddb with different thresholds: in fddb-a the thresholds of the three networks are 0.6, 0.7 and 0.8, while in fddb-b they are 0.1, 0.1 and 0.1.
74 | 
75 | ![fddb-a](fddb/fddb-a.png)
76 | 
77 | ![fddb-b](fddb/fddb-b.png)
78 | 
79 | ### Network diagrams
80 | 
81 | ![pnet](img/pnet.png)
82 | 
83 | ![rnet](img/rnet.png)
84 | 
85 | ![onet](img/onet.png)
86 
-------------------------------------------------------------------------------- /simplewebcam.py: --------------------------------------------------------------------------------
1 | import math
2 | import cv2
3 | import caffe
4 | import numpy as np
5 | 
6 | def gen_bbox(hotmap, offset, scale, th):
7 |     h, w = hotmap.shape
8 |     stride = 2
9 |     win_size = 12
10 |     hotmap = hotmap.reshape((h, w))
11 |     keep = hotmap > th
12 |     pos = np.where(keep)
13 |     score = hotmap[keep]
14 |     offset = offset[:, keep]
15 |     x, y = pos[1], pos[0]
16 |     x1 = stride * x
17 |     y1 = stride * y
18 |     x2 = x1 + win_size
19 |     y2 = y1 + win_size
20 |     x1 = x1 / scale
21 |     y1 = y1 / scale
22 |     x2 = x2 / scale
23 |     y2 = y2 / scale
24 |     bbox = np.vstack([x1, y1, x2, y2, score, offset]).transpose()
25 |     return bbox.astype(np.float32)
26 | 
27 | def nms(dets, thresh, meth='Union'):
28 |     x1 = dets[:, 0]
29 |     y1 = dets[:, 1]
30 |     x2 = dets[:, 2]
31 |     y2 = dets[:, 3]
32 |     scores = dets[:, 4]
33 |     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
34 |     order = scores.argsort()[::-1]
35 |     keep = []
36 |     while order.size > 0:
37 |         i = order[0]
38 |         keep.append(i)
39 |         xx1 = np.maximum(x1[i], x1[order[1:]])
40 |         yy1 = np.maximum(y1[i], y1[order[1:]])
41 |         xx2 = np.minimum(x2[i], x2[order[1:]])
42 |         yy2 = np.minimum(y2[i], y2[order[1:]])
43 |         w = np.maximum(0.0, xx2 - xx1 + 1)
44 |         h = np.maximum(0.0, yy2 - yy1 + 1)
45 |         inter = w * h
46 |         if meth == 'Union':
47 |             ovr = inter / (areas[i] + areas[order[1:]] - inter)
48 |         else:
49 |             ovr = inter / np.minimum(areas[i], areas[order[1:]])
50 |         inds = np.where(ovr <= thresh)[0]
51 |         order = order[inds + 1]
52 |     return keep
53 | 
54 | def bbox_reg(bboxes):
55 |     w = bboxes[:, 2] - bboxes[:, 0]
56 |     h = bboxes[:, 3] - bboxes[:, 1]
57 |     bboxes[:, 0] += bboxes[:, 5] * w
58 |     bboxes[:, 1] += bboxes[:, 6] * h
59 |     bboxes[:, 2] += bboxes[:, 7] * w
60 |     bboxes[:, 3] += bboxes[:, 8] * h
61 |     return bboxes
62 | 
63 | def make_square(bboxes):
64 |     x_center = (bboxes[:, 0] + bboxes[:, 2]) / 2
65 |     y_center = (bboxes[:, 1] + bboxes[:, 3]) / 2
66 |     w = bboxes[:, 2] - bboxes[:, 0]
67 |     h = bboxes[:, 3] - bboxes[:, 1]
68 |     size = np.vstack([w, h]).max(axis=0).transpose()
69 |     bboxes[:, 0] = x_center - size / 2
70 |     bboxes[:, 2] = x_center + size / 2
71 |     bboxes[:, 1] = y_center - size / 2
72 |     bboxes[:, 3] = y_center + size / 2
73 |     return bboxes
74 | 
75 | def crop_face(img, bbox, wrap=True):
76 |     height, width = img.shape[:-1]
77 |     x1, y1, x2, y2 = bbox
78 |     x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
79 |     if x1 >= width or y1 >= height or x2 <= 0 or y2 <= 0:
80 |         print '[WARN] ridiculous x1, y1, x2, y2'
81 |         return None
82 |     if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
83 |         # out of boundary, still crop the face
84 |         if not wrap:
85 |             return None
86 |         h, w = y2 - y1, x2 - x1
87 |         patch = np.zeros((h, w, 3), dtype=np.uint8)
88 |         vx1 = 0 if x1 < 0 else x1
89 |         vy1 = 0 if y1 < 0 else y1
90 |         vx2 = width if x2 > width else x2
91 |         vy2 = height if y2 > height else y2
92 |         sx = -x1 if x1 < 0 else 0
93 |         sy = -y1 if y1 < 0 else 0
94 |         vw = vx2 - vx1
95 |         vh = vy2 - vy1
96 |         patch[sy:sy+vh, sx:sx+vw] = img[vy1:vy2, vx1:vx2]
97 |         return patch
98 |     return img[y1:y2, x1:x2]
99 | 
100 | def mtcnn_detection(img, scales, width, height):
101 |     ### pnet ###
102 |     bboxes_in_all_scales = np.zeros((0, 4 + 1 + 4), dtype=np.float32)
103 |     for scale in scales:
104 |         w, h = int(math.ceil(scale * width)), int(math.ceil(scale * height))
105 |         data = cv2.resize(img, (w, h))
106 |         data = data.transpose((2, 0, 1)).astype(np.float32)  # order now: ch, height, width
107 |         data = (data - 128) / 128
108 |         data = data.reshape((1, 3, h, w))  # order now: batch, ch, height, width
109 |         pnet.blobs['data'].reshape(*data.shape)
110 |         pnet.blobs['data'].data[...] = data
111 |         pnet.forward()
112 |         prob = pnet.blobs['prob'].data
113 |         bbox_pred = pnet.blobs['bbox_pred'].data
114 |         bboxes = gen_bbox(prob[0][1], bbox_pred[0], scale, 0.6)
115 |         keep = nms(bboxes, 0.5)  # nms in each scale
116 |         bboxes = bboxes[keep]
117 |         bboxes_in_all_scales = np.vstack([bboxes_in_all_scales, bboxes])
118 |     # nms in total
119 |     keep = nms(bboxes_in_all_scales, 0.7)
120 |     bboxes_in_all_scales = bboxes_in_all_scales[keep]
121 |     bboxes_in_all_scales = bbox_reg(bboxes_in_all_scales)
122 |     bboxes_in_all_scales = make_square(bboxes_in_all_scales)
123 |     if len(bboxes_in_all_scales) == 0:
124 |         return bboxes_in_all_scales
125 | 
126 | 
127 |     ### rnet ###
128 |     n = len(bboxes_in_all_scales)
129 |     data = np.zeros((n, 3, 24, 24), dtype=np.float32)
130 |     for i, bbox in enumerate(bboxes_in_all_scales):
131 |         face = crop_face(img, bbox[:4])
132 |         data[i] = cv2.resize(face, (24, 24)).transpose((2, 0, 1))
133 |     data = (data - 128) / 128
134 |     rnet.blobs['data'].reshape(*data.shape)
135 |     rnet.blobs['data'].data[...] = data
136 |     rnet.forward()
137 |     prob = rnet.blobs['prob'].data
138 |     bbox_pred = rnet.blobs['bbox_pred'].data
139 |     prob = prob.reshape(n, 2)
140 |     bbox_pred = bbox_pred.reshape(n, 4)
141 |     keep = prob[:, 1] > 0.7
142 |     bboxes_in_all_scales = bboxes_in_all_scales[keep]
143 |     bboxes_in_all_scales[:, 4] = prob[keep, 1]
144 |     bboxes_in_all_scales[:, 5:9] = bbox_pred[keep]
145 |     keep = nms(bboxes_in_all_scales, 0.7)
146 |     bboxes_in_all_scales = bboxes_in_all_scales[keep]
147 |     bboxes_in_all_scales = bbox_reg(bboxes_in_all_scales)
148 |     bboxes_in_all_scales = make_square(bboxes_in_all_scales)
149 |     if len(bboxes_in_all_scales) == 0:
150 |         return bboxes_in_all_scales
151 | 
152 |     ### onet ###
153 |     n = len(bboxes_in_all_scales)
154 |     data = np.zeros((n, 3, 48, 48), dtype=np.float32)
155 |     for i, bbox in enumerate(bboxes_in_all_scales):
156 |         face = crop_face(img, bbox[:4])
157 |         data[i] = cv2.resize(face, (48, 48)).transpose((2, 0, 1))
158 |     data = (data - 128) / 128
159 |     onet.blobs['data'].reshape(*data.shape)
160 |     onet.blobs['data'].data[...] = data
161 |     onet.forward()
162 |     prob = onet.blobs['prob'].data
163 |     bbox_pred = onet.blobs['bbox_pred'].data
164 |     prob = prob.reshape(n, 2)
165 |     bbox_pred = bbox_pred.reshape(n, 4)
166 |     keep = prob[:, 1] > 0.4
167 |     bboxes_in_all_scales = bboxes_in_all_scales[keep]
168 |     bboxes_in_all_scales[:, 4] = prob[keep, 1]
169 |     bboxes_in_all_scales[:, 5:9] = bbox_pred[keep]
170 |     bboxes_in_all_scales = bbox_reg(bboxes_in_all_scales)
171 |     keep = nms(bboxes_in_all_scales, 0.5, 'Min')
172 |     bboxes_in_all_scales = bboxes_in_all_scales[keep]
173 |     return bboxes_in_all_scales
174 | 
175 | 
176 | 
177 | 
178 | # use the caffemodels actually shipped in tmp/
179 | pnet = caffe.Net('proto/p.prototxt', 'tmp/pnet_iter_446000.caffemodel', caffe.TEST)
180 | rnet = caffe.Net('proto/r.prototxt', 'tmp/rnet_iter_116000.caffemodel', caffe.TEST)
181 | onet = caffe.Net('proto/o.prototxt', 'tmp/onet_iter_90000.caffemodel', caffe.TEST)
182 | 
183 | cap = cv2.VideoCapture(0)
184 | ret, img = cap.read()
185 | 
186 | min_size = 24
187 | factor = 0.709
188 | base = 12. / min_size
189 | height, width = img.shape[:-1]
190 | l = min(width, height)
191 | l *= base
192 | scales = []
193 | while l > 12:
194 |     scales.append(base)
195 |     base *= factor
196 |     l *= factor
197 | 
198 | while True:
199 |     ret, img = cap.read()
200 |     onet_boxes = mtcnn_detection(img, scales, width, height)
201 |     imgdraw_onet = img.copy()
202 |     for i in range(len(onet_boxes)):
203 |         x1, y1, x2, y2, score = onet_boxes[i, :5]
204 |         x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
205 |         cv2.rectangle(imgdraw_onet, (x1, y1), (x2, y2), (0, 0, 255), 2)
206 |         cv2.putText(imgdraw_onet, '%.03f'%score, (x1, y1), cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 0))
207 | 
208 |     cv2.imshow("mtcnn", imgdraw_onet)
209 |     k = cv2.waitKey(1) & 0xff
210 |     if k == 27:
211 |         break
212 | cap.release()
213 | cv2.destroyAllWindows()
-------------------------------------------------------------------------------- /layers/test_jfda_loss_layer.cpp: --------------------------------------------------------------------------------
1 | #include <cmath>
2 | #include <cstring>
3 | #include <vector>
4 | #include <iostream>
5 | #include "gtest/gtest.h"
6 | #include "caffe/blob.hpp"
7 | #include "caffe/common.hpp"
8 | #include "caffe/filler.hpp"
9 | #include "caffe/layers/jfda_loss_layer.hpp"
10 | #include "caffe/test/test_caffe_main.hpp"
11 | #include "caffe/test/test_gradient_check_util.hpp"
12 | #include <boost/scoped_ptr.hpp>
13 | 
14 | using boost::scoped_ptr;
15 | using std::cout;
16 | using std::endl;
17 | 
18 | namespace caffe {
19 | 
20 | static const int kBaseSize = 16;
21 | static const int kBatchSize = kBaseSize * 7;  // 3 neg : 1 pos : 1 part : 2 landmark
22 | 
23 | template <typename TypeParam>
24 | class JfdaLossLayerTest : public MultiDeviceTest<TypeParam> {
25 |   typedef typename TypeParam::Dtype Dtype;
26 | 
27 |  protected:
28 |   JfdaLossLayerTest()
29 |       : score(new Blob<Dtype>(kBatchSize, 2, 1, 1)),
30 |         label(new Blob<Dtype>(kBatchSize, 1, 1, 1)),
31 |         bbox_pred(new Blob<Dtype>(kBatchSize, 4, 1, 1)),
32 |         bbox_target(new Blob<Dtype>(kBatchSize, 4, 1, 1)),
33 |         landmark_pred(new Blob<Dtype>(kBatchSize, 10, 1, 1)),
34 |         landmark_target(new Blob<Dtype>(kBatchSize, 10, 1, 1)),
35 |         blob_top_data1_(new Blob<Dtype>()),
36 |         blob_top_data2_(new Blob<Dtype>()),
37 |         blob_top_data3_(new Blob<Dtype>()),
38 |         blob_top_data4_(new Blob<Dtype>()),
39 |         blob_top_data5_(new Blob<Dtype>()) {
40 |     Caffe::set_random_seed(0);
41 |     FillerParameter filler_param;
42 |     filler_param.set_std(10);
43 |     GaussianFiller<Dtype> filler(filler_param);
44 |     filler.Fill(this->score);
45 |     filler.Fill(this->bbox_pred);
46 |     filler.Fill(this->bbox_target);
47 |     filler.Fill(this->landmark_pred);
48 |     filler.Fill(this->landmark_target);
49 |     int n1 = 3 * kBaseSize;
50 |     int n2 = kBaseSize;
51 |     int n3 = kBaseSize;
52 |     int n4 = 2 * kBaseSize;
53 |     Dtype* label_data = label->mutable_cpu_data();
54 |     for (int i = 0; i < n1; i++) {
55 |       label_data[i] = 0;
56 |     }
57 |     for (int i = 0; i < n2; i++) {
58 |       label_data[i + n1] = 1;
59 |     }
60 |     for (int i = 0; i < n3; i++) {
61 |       label_data[i + n1 + n2] = 2;
62 |     }
63 |     for (int i = 0; i < n4; i++) {
64 |       label_data[i + n1 + n2 + n3] = 3;
65 |     }
66 |     blob_bottom_vec_.push_back(score);
67 |     blob_bottom_vec_.push_back(bbox_pred);
68 |     blob_bottom_vec_.push_back(landmark_pred);
69 |     blob_bottom_vec_.push_back(bbox_target);
70 |     blob_bottom_vec_.push_back(landmark_target);
71 |     blob_bottom_vec_.push_back(label);
72 |     blob_top_vec_.push_back(blob_top_data1_);
73 |     blob_top_vec_.push_back(blob_top_data2_);
74 |     blob_top_vec_.push_back(blob_top_data3_);
75 |     blob_top_vec_.push_back(blob_top_data4_);
76 |     blob_top_vec_.push_back(blob_top_data5_);
77 |   }
78 | 
79 |   virtual ~JfdaLossLayerTest() {
80 |     delete score;
81 |     delete label;
82 |     delete bbox_pred;
83 |     delete bbox_target;
84 |     delete landmark_pred;
85 |     delete landmark_target;
86 |     delete blob_top_data1_;
87 |     delete blob_top_data2_;
88 |     delete blob_top_data3_;
89 |     delete blob_top_data4_;
90 |     delete blob_top_data5_;
91 |   }
92 | 
93 |   Blob<Dtype>* const score;
94 |   Blob<Dtype>* const label;
95 |   Blob<Dtype>* const bbox_pred;
96 |   Blob<Dtype>* const bbox_target;
97 |   Blob<Dtype>* const landmark_pred;
98 |   Blob<Dtype>* const landmark_target;
99 |   Blob<Dtype>* const blob_top_data1_;
100 |   Blob<Dtype>* const blob_top_data2_;
101 |   Blob<Dtype>* const blob_top_data3_;
102 |   Blob<Dtype>* const blob_top_data4_;
103 |   Blob<Dtype>* const blob_top_data5_;
104 |   vector<Blob<Dtype>*> blob_bottom_vec_;
105 |   vector<Blob<Dtype>*> blob_top_vec_;
106 | };  // class JfdaLossLayerTest
107 | 
108 | TYPED_TEST_CASE(JfdaLossLayerTest, TestDtypesAndDevices);
109 | 
110 | TYPED_TEST(JfdaLossLayerTest, TestForward) {
111 |   typedef typename TypeParam::Dtype Dtype;
112 |   LayerParameter layer_param;
113 |   scoped_ptr<JfdaLossLayer<Dtype> > layer(
114 |       new JfdaLossLayer<Dtype>(layer_param));
115 |   layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
116 |   layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
117 | 
118 |   const Dtype* top_loss1 = this->blob_top_data1_->cpu_data();
119 |   const Dtype* top_loss2 = this->blob_top_data2_->cpu_data();
120 |   const Dtype* top_loss3 = this->blob_top_data3_->cpu_data();
121 | 
122 |   cout << "Loss-1 " << top_loss1[0] << endl;
123 |   cout << "Loss-2 " << top_loss2[0] << endl;
124 |   cout << "Loss-3 " << top_loss3[0] << endl;
125 | 
126 |   // bbox loss over positives and part faces, recomputed by hand
127 |   const Dtype* bpd = this->bbox_pred->cpu_data();
128 |   const Dtype* btd = this->bbox_target->cpu_data();
129 |   Dtype loss = 0;
130 |   for (int i = 3*kBaseSize; i < 5*kBaseSize; i++) {
131 |     int offset = this->bbox_pred->offset(i, 0, 0, 0);
132 |     for (int j = 0; j < 4; j++) {
133 |       Dtype diff = bpd[offset + j] - btd[offset + j];
134 |       loss += diff * diff;
135 |     }
136 |   }
137 |   loss /= 2 * kBatchSize;
138 |   cout << "BBox loss " << loss << endl;
139 | }
140 | 
141 | TYPED_TEST(JfdaLossLayerTest, TestBackward) {
142 |   typedef typename TypeParam::Dtype Dtype;
143 |   LayerParameter layer_param;
144 |   scoped_ptr<JfdaLossLayer<Dtype> > layer(
145 |       new JfdaLossLayer<Dtype>(layer_param));
146 |   layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
147 |   layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
148 |   // set loss weight
149 |   this->blob_top_data1_->mutable_cpu_diff()[0] = 1;
150 |   this->blob_top_data2_->mutable_cpu_diff()[0] = 0.5;
151 |   this->blob_top_data3_->mutable_cpu_diff()[0] = 0.5;
152 |   vector<bool> propagate_down(this->blob_bottom_vec_.size(), true);
153 |   layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
154 | 
155 |   cout << "========================" << endl;
156 |   cout << "score gradient" << endl;
157 |   const Dtype* score_grad = this->score->cpu_diff();
158 |   for (int i = 0; i < 4*kBaseSize; i++) {
159 |     cout << score_grad[2 * i] << " " << score_grad[2 * i + 1] << endl;
160 |   }
161 |   cout << "========================" << endl;
162 |   cout << "bbox gradient" << endl;
163 |   const Dtype* bbox_grad = this->bbox_pred->cpu_diff();
164 |   for (int i = 3*kBaseSize; i < 5*kBaseSize; i++) {
165 |     for (int j = 0; j < 4; j++) {
166 |       cout << bbox_grad[this->bbox_pred->offset(i, j, 0, 0)] << " ";
167 |     }
168 |     cout << endl;
169 |   }
170 |   cout << "========================" << endl;
171 |   cout << "bbox diff" << endl;
172 |   const Dtype* bbox_diff = layer->bbox_diff_.cpu_data();
173 |   for (int i = 3*kBaseSize; i < 5*kBaseSize; i++) {
174 |     for (int j = 0; j < 4; j++) {
175 |       cout << bbox_diff[layer->bbox_diff_.offset(i, j, 0, 0)] << " ";
176 |     }
177 |     cout << endl;
178 |   }
179 | 
180 |   // gradient check of bbox regression
181 |   cout << "========================" << endl;
182 |   cout << "bbox gradient check" << endl;
183 |   Dtype loss1, loss2, gradient1, gradient2, eps;
184 |   eps = 1e-2;
185 |   int offset = this->bbox_pred->offset(4*kBaseSize, 2);
186 |   gradient1 = this->bbox_pred->cpu_diff()[offset];
187 |   Dtype x = this->bbox_pred->cpu_data()[offset];
188 |   this->bbox_pred->mutable_cpu_data()[offset] = x - eps;
189 |   layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
190 |   loss1 = this->blob_top_data1_->cpu_data()[0] * this->blob_top_data1_->cpu_diff()[0] +
191 |       this->blob_top_data2_->cpu_data()[0] * this->blob_top_data2_->cpu_diff()[0] +
192 |       this->blob_top_data3_->cpu_data()[0] * this->blob_top_data3_->cpu_diff()[0];
193 |   this->bbox_pred->mutable_cpu_data()[offset] = x + eps;
194 |   layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
195 |   loss2 = this->blob_top_data1_->cpu_data()[0] * this->blob_top_data1_->cpu_diff()[0] +
196 |       this->blob_top_data2_->cpu_data()[0] * this->blob_top_data2_->cpu_diff()[0] +
197 |       this->blob_top_data3_->cpu_data()[0] * this->blob_top_data3_->cpu_diff()[0];
198 |   gradient2 = (loss2 - loss1) / (2 * eps);
199 |   cout << "calculate gradient " << gradient2 << endl;
200 |   cout << "backward gradient " << gradient1 << endl;
201 | }
202 | 
203 | }  // namespace caffe
204 | 
-------------------------------------------------------------------------------- /jfda/lnet.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python2.7
2 | # pylint: disable=bad-indentation, no-member, invalid-name, line-too-long
3 | 
4 | import os
5 | import shutil
6 | import random
7 | import argparse
8 | import multiprocessing as mp
9 | import cv2
10 | import h5py
11 | import caffe
12 | import numpy as np
13 | from caffe.proto import caffe_pb2
14 | from google.protobuf import text_format
15 | from jfda.config import cfg
16 | from jfda.utils import load_celeba, get_logger, crop_face
17 | 
18 | 
19 | logger = get_logger()
20 | 
21 | 
22 | def fill_queues(data, qs):
23 |   data_n = len(data)
24 |   queue_n = len(qs)
25 |   for i in range(data_n):
26 |     qs[i%queue_n].put(data[i])
27 | 
28 | def remove(f):
29 |   if os.path.exists(f):
30 |     os.remove(f)
31 | 
32 | 
33 | # =========== prepare data ================
34 | 
35 | def lnet_reader_func(q_in, q_out):
36 |   counter = 0
37 |   while not q_in.empty():
38 |     item = q_in.get()
39 |     counter += 1
40 |     if counter%10000 == 0:
41 |       logger.info('%s reads %d', mp.current_process().name, counter)
42 |     img_path, bbox, landmark = item
43 |     img = cv2.imread(img_path, cv2.IMREAD_COLOR)
44 |     if img is None:
45 |       logger.warn('read %s failed', img_path)
46 |       continue
47 |     x1, y1, x2, y2 = bbox
48 |     w, h = x2 - x1, y2 - y1
49 |     # assert w == h, 'bbox is not a square'
50 |     landmark = landmark.reshape((5, 2))
51 |     for _ in range(cfg.LNET_SAMPLE_PER_FACE):
52 |       offset = np.random.rand(5, 2).astype(np.float32)
53 |       offset = (2*offset - 1) * cfg.SAMPLE_RADIUS
54 |       for scale in cfg.LNET_FACE_SCALES:
55 |         l = w * scale
56 |         target = offset.copy()
57 |         # target = target * w / l
58 |         target /= scale
59 |         target = target.reshape(10)
60 |         data = np.zeros((24, 24, 15), dtype=np.uint8)
61 |         for i in range(5):
62 |           x, y = landmark[i]
63 |           x_offset, y_offset = offset[i] * w
64 |           x_center, y_center = x+x_offset, y+y_offset
65 |           patch_bbox = [x_center - l/2, y_center - l/2,
66 |                         x_center + l/2, y_center + l/2]
67 |           patch = crop_face(img, patch_bbox)
68 |           # # debug
69 |           # print patch.shape, scale, x_offset, y_offset, target[2*i], target[2*i+1]
70 |           # patch = patch.copy()
71 |           # patch_x, patch_y = patch_bbox[:2]
72 |           # cv2.circle(patch, (int(x_center - patch_x), int(y_center - patch_y)), 1, (0, 255, 0), -1)
73 |           # cv2.circle(patch, (int(x - patch_x), int(y - patch_y)), 1, (0, 0, 255), -1)
74 |           # cv2.imshow('patch', patch)
75 |           # cv2.waitKey(0)
76 |           patch = cv2.resize(patch, (24, 24))
77 |           data[:, :, (3*i):(3*i+3)] = patch
78 |         data = data.transpose((2, 0, 1))  # 15x24x24, uint8
79 |         target *= -1
80 |         q_out.put(('data', [data, target]))
81 | 
82 | 
83 | def lnet_writer_func(q_out, txt):
84 |   file_counter = 0
85 |   item_counter = 0
86 |   fout_pattern = 'data/lnet_train_%03d.h5' if 'train' in txt else 'data/lnet_val_%03d.h5'
87 |   fouts = []
88 |   q = []
89 | 
90 |   def output_data(q, file_counter):
91 |     file_counter += 1
92 |     random.shuffle(q)
93 |     n = len(q)
94 |     data = np.zeros((n, 15, 24, 24), dtype=np.float32)
95 |     target = np.zeros((n, 10), dtype=np.float32)
96 |     for idx, (one_data, one_target) in enumerate(q):
97 |       data[idx] = one_data
98 |       target[idx] = one_target
99 |     data = (data - 128.) / 128.  # process data
100 |     fout = fout_pattern%file_counter
101 |     fouts.append(fout)
102 |     remove(fout)
103 |     logger.info('write to %s', fout)
104 |     with h5py.File(fout, 'w') as h5:
105 |       h5['data'] = data
106 |       h5['target'] = target
107 |     return file_counter
108 | 
109 |   while True:
110 |     stat, item = q_out.get()
111 |     if stat == 'finish':
112 |       file_counter = output_data(q, file_counter)
113 |       q = []
114 |       break
115 |     item_counter += 1
116 |     if item_counter%10000 == 0:
117 |       logger.info('wrote %d landmark samples', item_counter)
118 |     q.append(item)
119 |     if len(q) >= cfg.DATASIZE_PER_H5:
120 |       file_counter = output_data(q, file_counter)
121 |       q = []
122 | 
123 |   remove(txt)
124 |   with open(txt, 'w') as txt_out:
125 |     for fout in fouts:
126 |       txt_out.write('%s\n'%fout)
127 |   logger.info("Finish")
128 | 
129 | 
130 | def prepare(args):
131 |   '''prepare data for lnet
132 |   '''
133 | 
134 |   logger.info('loading CelebA')
135 |   train_data, val_data = load_celeba()
136 | 
137 |   def gen(data, is_train):
138 |     txt = 'data/lnet_train.txt' if is_train else 'data/lnet_val.txt'
139 |     remove(txt)
140 |     q_in = [mp.Queue() for i in range(cfg.WORKER_N)]
141 |     q_out = mp.Queue(1024)
142 |     fill_queues(data, q_in)
143 |     readers = [mp.Process(target=lnet_reader_func, args=(q_in[i], q_out)) \
144 |                for i in range(cfg.WORKER_N)]
145 |     for p in readers:
146 |       p.start()
147 |     writer = mp.Process(target=lnet_writer_func, args=(q_out, txt))
148 |     writer.start()
149 |     for p in readers:
150 |       p.join()
151 |     q_out.put(('finish', []))
152 |     writer.join()
153 | 
154 |   logger.info('writing train data')
155 |   gen(train_data, True)
156 |   logger.info('writing val data')
157 |   gen(val_data, False)
158 | 
159 | 
160 | # =========== train lnet ================
161 | 
162 | def train(args):
163 |   '''train lnet using data prepared by `prepare()`
164 |   '''
165 | 
166 |   def get_data_size(txt):
167 |     size = 0
168 |     with open(txt, 'r') as fin:
169 |       for line in fin.readlines():
170 |         line = line.strip()
171 |         data = h5py.File(line, 'r')
172 |         size += data['target'].shape[0]
173 |         data.close()
174 |     return size
175 | 
176 |   # init caffe
177 |   np.random.seed(cfg.RNG_SEED)
178 |   caffe.set_random_seed(cfg.RNG_SEED)
179 |   if cfg.GPU_ID < 0:
180 |     caffe.set_mode_cpu()
181 |   else:
182 |     caffe.set_mode_gpu()
183 |     caffe.set_device(cfg.GPU_ID)
184 |   # solver parameter setup
185 |   batch_size = 128
186 |   train_size = get_data_size('data/lnet_train.txt')
187 |   val_size = get_data_size('data/lnet_val.txt')
188 |   iter_train = train_size / batch_size
189 |   iter_test = val_size / batch_size
190 |   max_iter = args.epoch * iter_train
191 |   final_model = 'tmp/lnet_iter_%d.caffemodel'%max_iter
192 |   solver_param = caffe_pb2.SolverParameter()
193 |   with open('proto/l_solver.prototxt', 'r') as fin:
194 |     text_format.Merge(fin.read(), solver_param)
195 |   solver_param.max_iter = max_iter
196 |   solver_param.snapshot = iter_train
197 |   solver_param.test_interval = iter_train
198 |   solver_param.test_iter[0] = iter_test
199 |   solver_param.base_lr = args.lr
200 |   solver_param.gamma = args.lrw
201 |   solver_param.stepsize = args.lrp * iter_train
202 |   tmp_solver_prototxt = 'tmp/l_solver.prototxt'
203 |   with open(tmp_solver_prototxt, 'w') as fout:
204 |     fout.write(text_format.MessageToString(solver_param))
205 |   # solver setup
206 |   solver = caffe.SGDSolver(tmp_solver_prototxt)
207 |   # train
208 |   solver.solve(args.snapshot)
209 |   shutil.copyfile(final_model, 'model/l.caffemodel')
210 | 
211 | 
212 | if __name__ == '__main__':
213 |   parser = argparse.ArgumentParser()
214 |   parser.add_argument('--prepare', action='store_true', help='prepare training data for lnet')
215 |   parser.add_argument('--train', action='store_true', help='train lnet')
216 |   parser.add_argument('--worker', type=int, default=8, help='workers to process the data')
217 |   parser.add_argument('--gpu', type=int, default=0, help='gpu id to use, -1 for cpu')
218 |   parser.add_argument('--epoch', type=int, default=20, help='training epochs')
219 |   parser.add_argument('--snapshot', type=str, default=None, help='snapshot model')
220 |   parser.add_argument('--lr', type=float, default=0.01, help='initial learning rate')
221 |   parser.add_argument('--lrw', type=float, default=0.1, help='lr decay rate')
222 |   parser.add_argument('--lrp', type=int, default=2, help='number of epochs between lr decays')
223 |   args = parser.parse_args()
224 | 
225 |   cfg.GPU_ID = args.gpu
226 |   cfg.WORKER_N = args.worker
227 | 
228 |   print args
229 | 
230 |   if args.prepare:
231 |     prepare(args)
232 |   if args.train:
233 |     train(args)
234 | 
-------------------------------------------------------------------------------- /simpledemo.py: --------------------------------------------------------------------------------
1 | import math
2 | import cv2
3 | import caffe
4 | import numpy as np
5 | 
6 | def gen_bbox(hotmap, offset, scale, th):
7 |     h, w = hotmap.shape
8 |     stride = 2
9 |     win_size = 12
10 |     hotmap = hotmap.reshape((h, w))
11 |     keep = hotmap > th
12 |     pos = np.where(keep)
13 |     score = hotmap[keep]
14 |     offset = offset[:, keep]
15 |     x, y = pos[1], pos[0]
16 |     x1 = stride * x
17 |     y1 = stride * y
18 |     x2 = x1 + win_size
19 |     y2 = y1 + win_size
20 |     x1 = x1 / scale
21 |     y1 = y1 / scale
22 |     x2 = x2 / scale
23 |     y2 = y2 / scale
24 |     bbox = np.vstack([x1, y1, x2, y2, score, offset]).transpose()
25 |     return bbox.astype(np.float32)
26 | 
27 | def nms(dets, thresh, meth='Union'):
28 |     x1 = dets[:, 0]
29 |     y1 = dets[:, 1]
30 |     x2 = dets[:, 2]
31 |     y2 = dets[:, 3]
32 |     scores = dets[:, 4]
33 |     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
34 |     order = scores.argsort()[::-1]
35 |     keep = []
36 |     while order.size > 0:
37 |         i = order[0]
38 |         keep.append(i)
39 |         xx1 = np.maximum(x1[i], x1[order[1:]])
40 |         yy1 = np.maximum(y1[i], y1[order[1:]])
41 |         xx2 = np.minimum(x2[i], x2[order[1:]])
42 |         yy2 = np.minimum(y2[i], y2[order[1:]])
43 |         w = np.maximum(0.0, xx2 - xx1 + 1)
44 |         h = np.maximum(0.0, yy2 - yy1 + 1)
45 |         inter = w
* h 46 | if meth == 'Union': 47 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 48 | else: 49 | ovr = inter / np.minimum(areas[i], areas[order[1:]]) 50 | inds = np.where(ovr <= thresh)[0] 51 | order = order[inds + 1] 52 | return keep 53 | 54 | def bbox_reg(bboxes): 55 | w = bboxes[:, 2] - bboxes[:, 0] 56 | h = bboxes[:, 3] - bboxes[:, 1] 57 | bboxes[:, 0] += bboxes[:, 5] * w 58 | bboxes[:, 1] += bboxes[:, 6] * h 59 | bboxes[:, 2] += bboxes[:, 7] * w 60 | bboxes[:, 3] += bboxes[:, 8] * h 61 | return bboxes 62 | 63 | def make_square(bboxes): 64 | x_center = (bboxes[:, 0] + bboxes[:, 2]) / 2 65 | y_center = (bboxes[:, 1] + bboxes[:, 3]) / 2 66 | w = bboxes[:, 2] - bboxes[:, 0] 67 | h = bboxes[:, 3] - bboxes[:, 1] 68 | size = np.vstack([w, h]).max(axis=0).transpose() 69 | bboxes[:, 0] = x_center - size / 2 70 | bboxes[:, 2] = x_center + size / 2 71 | bboxes[:, 1] = y_center - size / 2 72 | bboxes[:, 3] = y_center + size / 2 73 | return bboxes 74 | 75 | def crop_face(img, bbox, wrap=True): 76 | height, width = img.shape[:-1] 77 | x1, y1, x2, y2 = bbox 78 | x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 79 | if x1 >= width or y1 >= height or x2 <= 0 or y2 <= 0: 80 | print '[WARN] ridiculous x1, y1, x2, y2' 81 | return None 82 | if x1 < 0 or y1 < 0 or x2 > width or y2 > height: 83 | # out of boundary, still crop the face 84 | if not wrap: 85 | return None 86 | h, w = y2 - y1, x2 - x1 87 | patch = np.zeros((h, w, 3), dtype=np.uint8) 88 | vx1 = 0 if x1 < 0 else x1 89 | vy1 = 0 if y1 < 0 else y1 90 | vx2 = width if x2 > width else x2 91 | vy2 = height if y2 > height else y2 92 | sx = -x1 if x1 < 0 else 0 93 | sy = -y1 if y1 < 0 else 0 94 | vw = vx2 - vx1 95 | vh = vy2 - vy1 96 | patch[sy:sy+vh, sx:sx+vw] = img[vy1:vy2, vx1:vx2] 97 | return patch 98 | return img[y1:y2, x1:x2] 99 | 100 | pnet = caffe.Net('proto/p.prototxt', 'tmp/pnet_iter_446000.caffemodel', caffe.TEST) 101 | rnet = caffe.Net('proto/r.prototxt', 'tmp/rnet_iter_116000.caffemodel', caffe.TEST) 102 | onet = caffe.Net('proto/o.prototxt', 'tmp/onet_iter_90000.caffemodel', caffe.TEST) 103 | 104 | img = cv2.imread('9387493245278.jpg', cv2.IMREAD_COLOR) 105 | min_size = 24 106 | factor = 0.709 107 | base = 12. / min_size 108 | height, width = img.shape[:-1] 109 | l = min(width, height) 110 | l *= base 111 | scales = [] 112 | while l > 12: 113 | scales.append(base) 114 | base *= factor 115 | l *= factor 116 | 117 | ### pnet ### 118 | bboxes_in_all_scales = np.zeros((0, 4 + 1 + 4), dtype=np.float32) 119 | for scale in scales: 120 | w, h = int(math.ceil(scale * width)), int(math.ceil(scale * height)) 121 | data = cv2.resize(img, (w, h)) 122 | data = data.transpose((2, 0, 1)).astype(np.float32) # order now: ch, height, width 123 | data = (data - 128) / 128 124 | data = data.reshape((1, 3, h, w)) # order now: batch, ch, height, width 125 | pnet.blobs['data'].reshape(*data.shape) 126 | pnet.blobs['data'].data[...] 
= data
127 |     pnet.forward()
128 |     prob = pnet.blobs['prob'].data
129 |     bbox_pred = pnet.blobs['bbox_pred'].data
130 |     bboxes = gen_bbox(prob[0][1], bbox_pred[0], scale, 0.6)
131 |     keep = nms(bboxes, 0.5)  # nms in each scale
132 |     bboxes = bboxes[keep]
133 |     bboxes_in_all_scales = np.vstack([bboxes_in_all_scales, bboxes])
134 | # nms in total
135 | keep = nms(bboxes_in_all_scales, 0.7)
136 | bboxes_in_all_scales = bboxes_in_all_scales[keep]
137 | bboxes_in_all_scales = bbox_reg(bboxes_in_all_scales)
138 | bboxes_in_all_scales = make_square(bboxes_in_all_scales)
139 | pnet_boxes = bboxes_in_all_scales.copy()
140 | imgdraw_pnet = img.copy()
141 | for i in range(len(pnet_boxes)):
142 |     x1, y1, x2, y2, score = pnet_boxes[i, :5]
143 |     x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
144 |     cv2.rectangle(imgdraw_pnet, (x1, y1), (x2, y2), (0, 0, 255), 2)
145 |     cv2.putText(imgdraw_pnet, '%.03f'%score, (x1, y1), cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 0))
146 | cv2.imshow("pnet", imgdraw_pnet)
147 | cv2.imwrite("pnet.jpg", imgdraw_pnet)
148 | cv2.waitKey(0)
149 | 
150 | # maybe redundant: reset the data blob with a dummy forward
151 | fake = np.zeros((1, 3, 12, 12), dtype=np.float32)
152 | pnet.blobs['data'].reshape(*fake.shape)
153 | pnet.blobs['data'].data[...] = fake
154 | pnet.forward()
155 | 
156 | ### rnet ###
157 | n = len(bboxes_in_all_scales)
158 | data = np.zeros((n, 3, 24, 24), dtype=np.float32)
159 | for i, bbox in enumerate(bboxes_in_all_scales):
160 |     face = crop_face(img, bbox[:4])
161 |     data[i] = cv2.resize(face, (24, 24)).transpose((2, 0, 1))
162 | data = (data - 128) / 128
163 | rnet.blobs['data'].reshape(*data.shape)
164 | rnet.blobs['data'].data[...] = data
165 | rnet.forward()
166 | prob = rnet.blobs['prob'].data
167 | bbox_pred = rnet.blobs['bbox_pred'].data
168 | prob = prob.reshape(n, 2)
169 | bbox_pred = bbox_pred.reshape(n, 4)
170 | keep = prob[:, 1] > 0.7
171 | bboxes_in_all_scales = bboxes_in_all_scales[keep]
172 | bboxes_in_all_scales[:, 4] = prob[keep, 1]
173 | bboxes_in_all_scales[:, 5:9] = bbox_pred[keep]
174 | keep = nms(bboxes_in_all_scales, 0.7)
175 | bboxes_in_all_scales = bboxes_in_all_scales[keep]
176 | bboxes_in_all_scales = bbox_reg(bboxes_in_all_scales)
177 | bboxes_in_all_scales = make_square(bboxes_in_all_scales)
178 | rnet_boxes = bboxes_in_all_scales.copy()
179 | imgdraw_rnet = img.copy()
180 | for i in range(len(rnet_boxes)):
181 |     x1, y1, x2, y2, score = rnet_boxes[i, :5]
182 |     x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
183 |     cv2.rectangle(imgdraw_rnet, (x1, y1), (x2, y2), (0, 0, 255), 2)
184 |     cv2.putText(imgdraw_rnet, '%.03f'%score, (x1, y1), cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 0))
185 | cv2.imshow("rnet", imgdraw_rnet)
186 | cv2.imwrite("rnet.jpg", imgdraw_rnet)
187 | cv2.waitKey(0)
188 | 
189 | # maybe redundant: reset the data blob with a dummy forward
190 | fake = np.zeros((1, 3, 24, 24), dtype=np.float32)
191 | rnet.blobs['data'].reshape(*fake.shape)
192 | rnet.blobs['data'].data[...] = fake
193 | rnet.forward()
194 | 
195 | ### onet ###
196 | n = len(bboxes_in_all_scales)
197 | data = np.zeros((n, 3, 48, 48), dtype=np.float32)
198 | for i, bbox in enumerate(bboxes_in_all_scales):
199 |     face = crop_face(img, bbox[:4])
200 |     data[i] = cv2.resize(face, (48, 48)).transpose((2, 0, 1))
201 | data = (data - 128) / 128
202 | onet.blobs['data'].reshape(*data.shape)
203 | onet.blobs['data'].data[...]
= data 204 | onet.forward() 205 | prob = onet.blobs['prob'].data 206 | bbox_pred = onet.blobs['bbox_pred'].data 207 | prob = prob.reshape(n, 2) 208 | bbox_pred = bbox_pred.reshape(n, 4) 209 | keep = prob[:, 1] > 0.4 210 | bboxes_in_all_scales = bboxes_in_all_scales[keep] 211 | bboxes_in_all_scales[:, 4] = prob[keep, 1] 212 | bboxes_in_all_scales[:, 5:9] = bbox_pred[keep] 213 | bboxes_in_all_scales = bbox_reg(bboxes_in_all_scales) 214 | keep = nms(bboxes_in_all_scales, 0.5, 'Min') 215 | bboxes_in_all_scales = bboxes_in_all_scales[keep] 216 | onet_boxes = bboxes_in_all_scales.copy() 217 | imgdraw_onet = img.copy() 218 | for i in range(len(onet_boxes)): 219 | x1, y1, x2, y2, score = onet_boxes[i, :5] 220 | x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 221 | cv2.rectangle(imgdraw_onet, (x1, y1), (x2, y2), (0, 0, 255), 2) 222 | cv2.putText(imgdraw_onet, '%.03f'%score, (x1, y1), cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 0)) 223 | cv2.imshow("onet", imgdraw_onet) 224 | cv2.imwrite("onet.jpg", imgdraw_onet) 225 | cv2.waitKey(0) 226 | -------------------------------------------------------------------------------- /jfda/utils.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=bad-indentation, no-member, invalid-name, line-too-long 2 | import os 3 | import time 4 | import logging 5 | import cv2 6 | import numpy as np 7 | from jfda.config import cfg 8 | import xml.etree.ElementTree as ET 9 | 10 | def load_scutbrainwashcheat(): 11 | 12 | train_image_dirs = ['/media/lincolnhard/11e5b063-4031-49ed-bf24-312fc250e90c/lincoln/SCUT_HEAD_Part_B/JPEGImages/', '/media/lincolnhard/11e5b063-4031-49ed-bf24-312fc250e90c/lincoln/SCUT_HEAD_Part_A/JPEGImages/', '/media/lincolnhard/11e5b063-4031-49ed-bf24-312fc250e90c/lincoln/cheat/JPEGImages/', '/media/lincolnhard/11e5b063-4031-49ed-bf24-312fc250e90c/lincoln/brainwash/JPEGImages/'] 13 | train_data = [] 14 | for trainimdir in train_image_dirs: 15 | print 'parsing ' + trainimdir + ' ...' 16 | for name in os.listdir(trainimdir): 17 | if name[-4:] == '.jpg': 18 | impath = trainimdir + name 19 | labelpath = impath.replace('.jpg', '.xml') 20 | labelpath = labelpath.replace('JPEGImages', 'Annotations') 21 | tree = ET.parse(labelpath) 22 | root = tree.getroot() 23 | bboxes = [] 24 | for obj in root.iter('object'): 25 | xmlbox = obj.find('bndbox') 26 | xmin = int(xmlbox.find('xmin').text) 27 | ymin = int(xmlbox.find('ymin').text) 28 | xmax = int(xmlbox.find('xmax').text) 29 | ymax = int(xmlbox.find('ymax').text) 30 | w = xmax - xmin 31 | h = ymax - ymin 32 | size = min(w, h) 33 | # only large enough 34 | if size > 12: 35 | bbox = [xmin, ymin, xmax, ymax] 36 | bboxes.append(bbox) 37 | if len(bboxes) > 0: 38 | bboxes = np.asarray(bboxes, dtype=np.float32) 39 | train_data.append([impath, bboxes]) 40 | return (train_data, train_data) 41 | 42 | def load_cheat(): 43 | 44 | train_image_dirs = ['/media/lincolnhard/11e5b063-4031-49ed-bf24-312fc250e90c/lincoln/cheat/JPEGImages/'] 45 | train_data = [] 46 | for trainimdir in train_image_dirs: 47 | print 'parsing ' + trainimdir + ' ...' 
48 | for name in os.listdir(trainimdir): 49 | if name[-4:] == '.jpg': 50 | impath = trainimdir + name 51 | labelpath = impath.replace('.jpg', '.xml') 52 | labelpath = labelpath.replace('JPEGImages', 'Annotations') 53 | tree = ET.parse(labelpath) 54 | root = tree.getroot() 55 | bboxes = [] 56 | for obj in root.iter('object'): 57 | xmlbox = obj.find('bndbox') 58 | xmin = int(xmlbox.find('xmin').text) 59 | ymin = int(xmlbox.find('ymin').text) 60 | xmax = int(xmlbox.find('xmax').text) 61 | ymax = int(xmlbox.find('ymax').text) 62 | w = xmax - xmin 63 | h = ymax - ymin 64 | size = min(w, h) 65 | # only large enough 66 | if size > 12: 67 | bbox = [xmin, ymin, xmax, ymax] 68 | bboxes.append(bbox) 69 | if len(bboxes) > 0: 70 | bboxes = np.asarray(bboxes, dtype=np.float32) 71 | train_data.append([impath, bboxes]) 72 | 73 | val_image_dirs = ['/media/lincolnhard/11e5b063-4031-49ed-bf24-312fc250e90c/lincoln/cheat/JPEGImages/'] 74 | val_data = [] 75 | for valimdir in val_image_dirs: 76 | print 'parsing ' + valimdir + ' ...' 77 | for name in os.listdir(valimdir): 78 | if name[-4:] == '.jpg': 79 | impath = valimdir + name 80 | labelpath = impath.replace('.jpg', '.xml') 81 | labelpath = labelpath.replace('JPEGImages', 'Annotations') 82 | tree = ET.parse(labelpath) 83 | root = tree.getroot() 84 | bboxes = [] 85 | for obj in root.iter('object'): 86 | xmlbox = obj.find('bndbox') 87 | xmin = int(xmlbox.find('xmin').text) 88 | ymin = int(xmlbox.find('ymin').text) 89 | xmax = int(xmlbox.find('xmax').text) 90 | ymax = int(xmlbox.find('ymax').text) 91 | w = xmax - xmin 92 | h = ymax - ymin 93 | size = min(w, h) 94 | # only large enough 95 | if size > 12: 96 | bbox = [xmin, ymin, xmax, ymax] 97 | bboxes.append(bbox) 98 | if len(bboxes) > 0: 99 | bboxes = np.asarray(bboxes, dtype=np.float32) 100 | val_data.append([impath, bboxes]) 101 | 102 | return (train_data, val_data) 103 | 104 | def load_wider(): 105 | """load wider face dataset 106 | data: [img_path, bboxes]+ 107 | bboxes: [x1, y1, x2, y2] 108 | """ 109 | 110 | def get_dirmapper(dirpath): 111 | """return dir mapper for wider face 112 | """ 113 | mapper = {} 114 | for d in os.listdir(dirpath): 115 | dir_id = d.split('--')[0] 116 | mapper[dir_id] = os.path.join(dirpath, d) 117 | return mapper 118 | 119 | train_mapper = get_dirmapper(os.path.join(cfg.WIDER_DIR, 'WIDER_train', 'images')) 120 | val_mapper = get_dirmapper(os.path.join(cfg.WIDER_DIR, 'WIDER_val', 'images')) 121 | 122 | def gen(text, mapper): 123 | fin = open(text, 'r') 124 | 125 | result = [] 126 | while True: 127 | line = fin.readline() 128 | if not line: break # eof 129 | name = line.strip() 130 | dir_id = name.split('_')[0] 131 | img_path = os.path.join(mapper[dir_id], name + '.jpg') 132 | face_n = int(fin.readline().strip()) 133 | 134 | bboxes = [] 135 | for i in range(face_n): 136 | line = fin.readline().strip() 137 | components = line.split(' ') 138 | x, y, w, h = [float(_) for _ in components] 139 | 140 | size = min(w, h) 141 | # only large enough 142 | if size > 12: 143 | bbox = [x, y, w, h] 144 | bboxes.append(bbox) 145 | 146 | # # for debug 147 | # img = cv2.imread(img_path) 148 | # for bbox in bboxes: 149 | # x, y, w, h = bbox 150 | # cv2.rectangle(img, (int(x), int(y)), (int(x+w), int(y+h)), (0,0,255), 1) 151 | # cv2.imshow('img', img) 152 | # cv2.waitKey(0) 153 | 154 | if len(bboxes) > 0: 155 | bboxes = np.asarray(bboxes, dtype=np.float32) 156 | bboxes[:, 2] += bboxes[:, 0] 157 | bboxes[:, 3] += bboxes[:, 1] 158 | result.append([img_path, bboxes]) 159 | fin.close() 160 | return result 
161 | 
162 |   txt_dir = os.path.join(cfg.WIDER_DIR, 'wider_face_split')
163 |   train_data = gen(os.path.join(txt_dir, 'wider_face_train.txt'), train_mapper)
164 |   val_data = gen(os.path.join(txt_dir, 'wider_face_val.txt'), val_mapper)
165 |   return (train_data, val_data)
166 | 
167 | 
168 | def load_celeba():
169 |   """load celeba dataset and crop the face bbox
170 |   note: the face bbox may go out of the image range
171 |   data: [img_path, bbox, landmark]
172 |   bbox: [x1, y1, x2, y2]
173 |   landmark: [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5], align to top left of the image
174 |   """
175 |   text = os.path.join(cfg.CelebA_DIR, 'list_landmarks_celeba.txt')
176 |   fin = open(text, 'r')
177 |   n = int(fin.readline().strip())
178 |   fin.readline()  # drop the header line
179 | 
180 |   result = []
181 |   for i in range(n):
182 |     line = fin.readline().strip()
183 |     components = line.split()
184 |     img_path = os.path.join(cfg.CelebA_DIR, 'img_celeba', components[0])
185 |     landmark = np.asarray([int(_) for _ in components[1:]], dtype=np.float32)
186 |     landmark = landmark.reshape((-1, 2))  # 5x2
187 | 
188 |     # crop face bbox
189 |     x_max, y_max = landmark.max(0)
190 |     x_min, y_min = landmark.min(0)
191 |     w, h = x_max - x_min, y_max - y_min
192 |     w = h = max(w, h)
193 |     ratio = 0.5
194 |     x_new = x_min - w*ratio
195 |     y_new = y_min - h*ratio
196 |     w_new = w*(1 + 2*ratio)
197 |     h_new = h*(1 + 2*ratio)
198 |     bbox = [x_new, y_new, x_new + w_new, y_new + h_new]
199 |     bbox = [int(_) for _ in bbox]
200 | 
201 |     # # for debug
202 |     # img = cv2.imread(img_path)
203 |     # x1, y1, x2, y2 = bbox
204 |     # cv2.rectangle(img, (x1, y1), (x2, y2), (0,0,255), 1)
205 |     # for j in range(5):
206 |     #   cv2.circle(img, (int(landmark[j, 0]), int(landmark[j, 1])), 2, (0,255,0), -1)
207 |     # cv2.imshow('img', img)
208 |     # cv2.waitKey(0)
209 | 
210 |     # # normalize landmark
211 |     # landmark[:, 0] = (landmark[:, 0] - bbox[0]) / w_new
212 |     # landmark[:, 1] = (landmark[:, 1] - bbox[1]) / h_new
213 | 
214 |     landmark = landmark.reshape(-1)
215 |     result.append([img_path, bbox, landmark])
216 | 
217 |   fin.close()
218 |   ratio = 0.8
219 |   train_n = int(len(result) * ratio)
220 |   train = result[:train_n]
221 |   val = result[train_n:]
222 |   return train, val
223 | 
224 | 
225 | def get_logger(name=None):
226 |   """return a logger
227 |   """
228 |   logger = logging.getLogger(name)
229 |   logger.setLevel(logging.INFO)
230 |   sh = logging.StreamHandler()
231 |   sh.setLevel(logging.INFO)
232 |   formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')
233 |   sh.setFormatter(formatter)
234 |   logger.addHandler(sh)
235 |   return logger
236 | 
237 | 
238 | def crop_face(img, bbox, wrap=True):
239 |   height, width = img.shape[:-1]
240 |   x1, y1, x2, y2 = bbox
241 |   x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
242 |   if x1 >= width or y1 >= height or x2 <= 0 or y2 <= 0:
243 |     print '[WARN] ridiculous x1, y1, x2, y2'
244 |     return None
245 |   if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
246 |     # out of boundary, still crop the face
247 |     if not wrap:
248 |       return None
249 |     h, w = y2 - y1, x2 - x1
250 |     patch = np.zeros((h, w, 3), dtype=np.uint8)
251 |     vx1 = 0 if x1 < 0 else x1
252 |     vy1 = 0 if y1 < 0 else y1
253 |     vx2 = width if x2 > width else x2
254 |     vy2 = height if y2 > height else y2
255 |     sx = -x1 if x1 < 0 else 0
256 |     sy = -y1 if y1 < 0 else 0
257 |     vw = vx2 - vx1
258 |     vh = vy2 - vy1
259 |     patch[sy:sy+vh, sx:sx+vw] = img[vy1:vy2, vx1:vx2]
260 |     return patch
261 |   return img[y1:y2, x1:x2]
262 | 
263 | 
264 | class Timer:
265 | 
266 |   def __init__(self):
267 |     self.start_time = 0
268 |     self.total_time = 0
269 | 
270 |   def tic(self):
271 |     self.start_time = time.time()
272 | 
273 |   def toc(self):
274 |     self.total_time = time.time() - self.start_time
275 | 
276 |   def elapsed(self):
277 |     return self.total_time
278 | 
279 | 
280 | if __name__ == '__main__':
281 |   img = cv2.imread('test.jpg')
282 |   bbox = [-100, -200, 300, 400]
283 |   patch = crop_face(img, bbox)
284 |   cv2.imshow('patch', patch)
285 |   cv2.waitKey(0)
286 | 
-------------------------------------------------------------------------------- /layers/jfda_loss_layer.cpp: --------------------------------------------------------------------------------
1 | #include <algorithm>
2 | #include <cmath>
3 | #include "caffe/layers/jfda_loss_layer.hpp"
4 | #include <cfloat>
5 | 
6 | using std::cout;
7 | using std::endl;
8 | 
9 | namespace caffe {
10 | 
11 | // JfdaLossLayer
12 | // LayerInput
13 | //    1. face classification score
14 | //    2. face bbox regression pred
15 | //    3. face landmark pred
16 | //    4. face bbox regression target
17 | //    5. face landmark target
18 | //    6. face data type / label, 0 for negatives, 1 for positives,
19 | //       2 for part faces, 3 for landmark faces
20 | // LayerOutput
21 | //    1. face classification loss
22 | //    2. face bbox regression loss
23 | //    3. face landmark loss
24 | //    4. face classification accuracy x 2
25 | // Training data layout
26 | //    negatives, positives, part faces, landmark faces
27 | // Bottom
28 | //    bottom[0]: face classification score
29 | //    bottom[1]: face bbox pred
30 | //    bottom[2]: face landmark pred
31 | //    bottom[3]: face bbox target
32 | //    bottom[4]: face landmark target
33 | //    bottom[5]: face data type / label
34 | // Top
35 | //    top[0]: face classification loss
36 | //    top[1]: face bbox regression loss
37 | //    top[2]: face landmark regression loss
38 | //    top[3]: face classification negative accuracy
39 | //    top[4]: face classification positive accuracy
40 | 
41 | template <typename Dtype>
42 | void JfdaLossLayer<Dtype>::LayerSetUp(
43 |     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
44 |   drop_loss_rate_ = this->layer_param_.jfda_loss_param().drop_loss_rate();
45 |   if (drop_loss_rate_ < 0.f) drop_loss_rate_ = 0.f;
46 |   if (drop_loss_rate_ > 1.f) drop_loss_rate_ = 1.f;
47 | }
48 | 
49 | template <typename Dtype>
50 | void JfdaLossLayer<Dtype>::Reshape(
51 |     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
52 |   mask_.ReshapeLike(*bottom[5]);
53 |   prob_.ReshapeLike(*bottom[0]);
54 |   bbox_diff_.ReshapeLike(*bottom[1]);
55 |   landmark_diff_.ReshapeLike(*bottom[2]);
56 |   vector<int> loss_shape(0);  // scalar outputs
57 |   top[0]->Reshape(loss_shape);
58 |   top[1]->Reshape(loss_shape);
59 |   top[2]->Reshape(loss_shape);
60 |   top[3]->Reshape(loss_shape);
61 |   top[4]->Reshape(loss_shape);
62 | }
63 | 
64 | template <typename Dtype>
65 | void _QSort_(vector<Dtype>& loss, vector<int>& idx, int left, int right) {  // sort descending
66 |   int i = left;
67 |   int j = right;
68 |   Dtype t = loss[(i + j) / 2];
69 |   do {
70 |     while (loss[i] > t) i++;
71 |     while (loss[j] < t) j--;
72 |     if (i <= j) {
73 |       std::swap(loss[i], loss[j]);
74 |       std::swap(idx[i], idx[j]);
75 |       i++;
76 |       j--;
77 |     }
78 |   } while (i <= j);
79 |   if (left < j) _QSort_(loss, idx, left, j);
80 |   if (i < right) _QSort_(loss, idx, i, right);
81 | }
82 | 
83 | template <typename Dtype>
84 | void JfdaLossLayer<Dtype>::Forward_cpu(
85 |     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
86 |   int n1, n2, n3, n4;
87 |   n1 = n2 = n3 = n4 = 0;
88 |   const Dtype* label_data = bottom[5]->cpu_data();
89 |   const int batch_size = bottom[5]->num();
90 |   for (int i = 0; i < batch_size; i++) {
91 |     const int label = static_cast<int>(label_data[i]);
92 |     if (label == 0) n1++;
93 |     else if (label == 1) n2++;
94 |     else if (label == 2) n3++;
95 |     else n4++;
96 |   }
97 | 
98 |   // face classification
99 |   int n = n1 + n2;
100 |   const Dtype* score_data = bottom[0]->cpu_data();
101 |   Dtype* prob_data = prob_.mutable_cpu_data();
102 |   vector<Dtype> loss_data(n);
103 |   Dtype face_cls_loss = 0;
104 |   Dtype pos_acc = 0;
105 |   Dtype neg_acc = 0;
106 |   for (int i = 0; i < n; i++) {
107 |     const Dtype max_input = std::max(score_data[2 * i], score_data[2 * i + 1]);
108 |     prob_data[2 * i] = std::exp(score_data[2 * i] - max_input);
109 |     prob_data[2 * i + 1] = std::exp(score_data[2 * i + 1] - max_input);
110 |     const Dtype sum = prob_data[2 * i] + prob_data[2 * i + 1];
111 |     prob_data[2 * i] /= sum;
112 |     prob_data[2 * i + 1] /= sum;
113 |     const int label = static_cast<int>(label_data[i]);
114 |     if (label == 0) {
115 |       loss_data[i] = -std::log(std::max(prob_data[2 * i], Dtype(FLT_MIN)));
116 |       if (prob_data[2 * i] > prob_data[2 * i + 1]) neg_acc += 1;
117 |     }
118 |     else {
119 |       loss_data[i] = -std::log(std::max(prob_data[2 * i + 1], Dtype(FLT_MIN)));
120 |       if (prob_data[2 * i + 1] > prob_data[2 * i]) pos_acc += 1;
121 |     }
122 |     face_cls_loss += loss_data[i];
123 |   }
124 |   face_cls_loss /= batch_size;
125 |   top[0]->mutable_cpu_data()[0] = face_cls_loss;
126 |   neg_acc /= n1;
127 |   pos_acc /= n2;
128 |   top[3]->mutable_cpu_data()[0] = neg_acc;
129 |   top[4]->mutable_cpu_data()[0] = pos_acc;
130 | 
131 |   // bbox regression
132 |   // n = n1 + n2 + n3;
133 |   // Dtype bbox_reg_loss = 0;
134 |   // for (int i = n1; i < n; i++) {
135 |   //   const Dtype* bbox_pred_data = bottom[1]->cpu_data() + bottom[1]->offset(i);
136 |   //   const Dtype* bbox_target_data = bottom[3]->cpu_data() + bottom[3]->offset(i);
137 |   //   const int m = bottom[1]->channels();
138 |   //   for (int j = 0; j < m; j++) {
139 |   //     bbox_reg_loss += (bbox_pred_data[j] - bbox_target_data[j]) *
140 |   //         (bbox_pred_data[j] - bbox_target_data[j]);
141 |   //   }
142 |   // }
143 |   // bbox_reg_loss /= 2 * batch_size;
144 |   // top[1]->mutable_cpu_data()[0] = bbox_reg_loss;
145 | 
146 |   const Dtype* bbox_pred_data = bottom[1]->cpu_data() + bottom[1]->offset(n1);
147 |   const Dtype* bbox_target_data = bottom[3]->cpu_data() + bottom[3]->offset(n1);
148 |   Dtype* bbox_diff_data = bbox_diff_.mutable_cpu_data() + bbox_diff_.offset(n1);
149 |   const int bbox_count = (n2 + n3) * bottom[1]->channels();
150 |   caffe_sub(
151 |       bbox_count,
152 |       bbox_pred_data,
153 |       bbox_target_data,
154 |       bbox_diff_data);
155 |   Dtype bbox_dot = caffe_cpu_dot(bbox_count, bbox_diff_data, bbox_diff_data);
156 |   Dtype bbox_reg_loss = bbox_dot / batch_size / Dtype(2);
157 |   top[1]->mutable_cpu_data()[0] = bbox_reg_loss;
158 |   // cout << "bbox pred [" << bbox_pred_data[0] << ", "
159 |   //     << bbox_pred_data[1] << ", "
160 |   //     << bbox_pred_data[2] << ", "
161 |   //     << bbox_pred_data[3] << "]" << endl;
162 | 
163 |   // landmark regression
164 |   // n = n1 + n2 + n3 + n4;
165 |   // Dtype landmark_reg_loss = 0;
166 |   // for (int i = n1 + n2 + n3; i < n; i++) {
167 |   //   const Dtype* landmark_pred_data = bottom[2]->cpu_data() + bottom[2]->offset(i);
168 |   //   const Dtype* landmark_target_data = bottom[4]->cpu_data() + bottom[4]->offset(i);
169 |   //   const int m = bottom[2]->channels();
170 |   //   for (int j = 0; j < m; j++) {
171 |   //     landmark_reg_loss += (landmark_pred_data[j] - landmark_target_data[j]) *
172 |   //         (landmark_pred_data[j] - landmark_target_data[j]);
173 |   //   }
174 |   // }
175 |   // landmark_reg_loss /= 2 * batch_size;
176 |   // top[2]->mutable_cpu_data()[0] = landmark_reg_loss;
177 | 
178 |   const Dtype* landmark_pred_data = bottom[2]->cpu_data() + bottom[2]->offset(n1 + n2 + n3);
179 |   const Dtype* landmark_target_data = bottom[4]->cpu_data() + bottom[4]->offset(n1 + n2 + n3);
180 |   Dtype* landmark_diff_data = landmark_diff_.mutable_cpu_data() + landmark_diff_.offset(n1 + n2 + n3);
181 |   const int landmark_count = n4 * bottom[2]->channels();
182 |   caffe_sub(
183 |       landmark_count,
184 |       landmark_pred_data,
185 |       landmark_target_data,
186 |       landmark_diff_data);
187 |   Dtype landmark_dot = caffe_cpu_dot(landmark_count, landmark_diff_data, landmark_diff_data);
188 |   Dtype landmark_reg_loss = landmark_dot / batch_size / Dtype(2);
189 |   top[2]->mutable_cpu_data()[0] = landmark_reg_loss;
190 | 
191 |   // set backward mask for face classification (online hard example mining)
192 |   vector<int> idx(loss_data.size());
193 |   for (int i = 0; i < idx.size(); i++) {
194 |     idx[i] = i;
195 |   }
196 |   _QSort_(loss_data, idx, 0, loss_data.size() - 1);
197 |   const Dtype th = static_cast<Dtype>(1.f - drop_loss_rate_);
198 |   const int remained = static_cast<int>(loss_data.size() * th);  // keep the hardest samples
199 |   Dtype* mask_data = mask_.mutable_cpu_data();
200 |   for (int i = 0; i < remained; i++) {
201 |     mask_data[idx[i]] = Dtype(1);
202 |   }
203 |   for (int i = remained; i < loss_data.size(); i++) {
204 |     mask_data[idx[i]] = Dtype(0);
205 |   }
206 | }
207 | 
208 | template <typename Dtype>
209 | void JfdaLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
210 |     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
211 |   int n1, n2, n3, n4;
212 |   n1 = n2 = n3 = n4 = 0;
213 |   const Dtype* label_data = bottom[5]->cpu_data();
214 |   const int batch_size = bottom[5]->num();
215 |   for (int i = 0; i < batch_size; i++) {
216 |     const int label = static_cast<int>(label_data[i]);
217 |     if (label == 0) n1++;
218 |     else if (label == 1) n2++;
219 |     else if (label == 2) n3++;
220 |     else n4++;
221 |   }
222 | 
223 |   // face classification
224 |   int n = n1 + n2;
225 |   const Dtype* prob_data = prob_.cpu_data();
226 |   const Dtype* mask_data = mask_.cpu_data();
227 |   Dtype* cls_diff = bottom[0]->mutable_cpu_diff();
228 |   caffe_copy(2 * n, prob_data, cls_diff);
229 |   for (int i = 0; i < n; i++) {
230 |     const int label = static_cast<int>(label_data[i]);
231 |     const Dtype weight = mask_data[i] * top[0]->cpu_diff()[0] / batch_size;
232 |     if (label == 0) {
233 |       cls_diff[2 * i] -= 1;
234 |     }
235 |     else {
236 |       cls_diff[2 * i + 1] -= 1;
237 |     }
238 |     cls_diff[2 * i] *= weight;
239 |     cls_diff[2 * i + 1] *= weight;
240 |   }
241 | 
242 |   // // bbox regression
243 |   // n = n1 + n2 + n3;
244 |   // for (int i = n1; i < n; i++) {
245 |   //   const Dtype* bbox_pred_data = bottom[1]->cpu_data() + bottom[1]->offset(i);
246 |   //   const Dtype* bbox_target_data = bottom[3]->cpu_data() + bottom[3]->offset(i);
247 |   //   Dtype* reg_diff = bottom[1]->mutable_cpu_diff() + bottom[1]->offset(i);
248 |   //   const int m = bottom[1]->channels();
249 |   //   const Dtype weight = top[1]->cpu_diff()[0] / batch_size;
250 |   //   for (int j = 0; j < m; j++) {
251 |   //     reg_diff[j] = weight * (bbox_pred_data[j] - bbox_target_data[j]);
252 |   //   }
253 |   // }
254 | 
255 |   const Dtype* bbox_diff_data = bbox_diff_.cpu_data() + bbox_diff_.offset(n1);
256 |   Dtype* bbox_reg_diff = bottom[1]->mutable_cpu_diff() + bottom[1]->offset(n1);
257 |   const int bbox_count = (n2 + n3) * bottom[1]->channels();
258 |   const Dtype bbox_alpha = top[1]->cpu_diff()[0] / batch_size;
259 |   caffe_cpu_axpby(
260 |       bbox_count,
261 |       bbox_alpha,
262 |       bbox_diff_data,
263 |       Dtype(0),
264 |       bbox_reg_diff);
265 | 
266 |   // // landmark regression
267 |   // n = n1 + n2 + n3 + n4;
268 |   // for (int i = n1 + n2 + n3; i < n; i++) {
269 |   //   const Dtype* landmark_pred_data = bottom[2]->cpu_data() + bottom[2]->offset(i);
270 |   //   const Dtype* landmark_target_data = bottom[4]->cpu_data() + bottom[4]->offset(i);
271 |   //   Dtype* reg_diff = bottom[2]->mutable_cpu_diff() + bottom[2]->offset(i);
272 |   //   const int m = bottom[2]->channels();
273 |   //   const Dtype weight = top[2]->cpu_diff()[0] / batch_size;
274 |   //   for (int j = 0; j < m; j++) {
275 |   //     reg_diff[j] = weight * (landmark_pred_data[j] - landmark_target_data[j]);
276 |   //   }
277 |   // }
278 | 
279 |   const Dtype* landmark_diff_data = landmark_diff_.cpu_data() + landmark_diff_.offset(n1 + n2 + n3);
280 |   Dtype* landmark_reg_diff = bottom[2]->mutable_cpu_diff() + bottom[2]->offset(n1 + n2 + n3);
281 |   const int landmark_count = n4 * bottom[2]->channels();
282 |   const Dtype landmark_alpha = top[2]->cpu_diff()[0] / batch_size;
283 |   caffe_cpu_axpby(
284 |       landmark_count,
285 |       landmark_alpha,
286 |       landmark_diff_data,
287 |       Dtype(0),
288 |       landmark_reg_diff);
289 | }
290 | 
291 | #ifdef CPU_ONLY
292 | STUB_GPU(JfdaLossLayer)
293 | #endif  // CPU_ONLY
294 | 
295 | INSTANTIATE_CLASS(JfdaLossLayer);
296 | REGISTER_LAYER_CLASS(JfdaLoss);
297 | 
298 | }  // namespace caffe
299 | 
-------------------------------------------------------------------------------- /jfda/detector.py: --------------------------------------------------------------------------------
1 | # pylint: disable=bad-indentation, no-member, invalid-name, line-too-long
2 | import math
3 | import cv2
4 | import caffe
5 | import numpy as np
6 | from jfda.utils import crop_face, Timer
7 | 
8 | 
9 | class JfdaDetector:
10 |   '''JfdaDetector
11 |   '''
12 | 
13 |   def __init__(self, nets):
14 |     assert len(nets) in [2, 4, 6, 8], 'wrong number of nets'
15 |     self.pnet, self.rnet, self.onet, self.lnet = None, None, None, None
16 |     if len(nets) >= 2:
17 |       self.pnet = caffe.Net(nets[0], caffe.TEST, weights=nets[1])
18 |     if len(nets) >= 4:
19 |       self.rnet = caffe.Net(nets[2], caffe.TEST, weights=nets[3])
20 |     if len(nets) >= 6:
21 |       self.onet = caffe.Net(nets[4], caffe.TEST, weights=nets[5])
22 |     if len(nets) >= 8:
23 |       self.lnet = caffe.Net(nets[6], caffe.TEST, weights=nets[7])
24 |     self.pnet_single_forward = False
25 | 
26 |   def set_pnet_single_forward(self, single_forward=True):
27 |     '''convert image pyramid to a single image and forward once
28 |     '''
29 |     self.pnet_single_forward = single_forward
30 | 
31 |   def detect(self, img, ths, min_size, factor, debug=False):
32 |     '''detect face, return bboxes, [bbox score offset landmark]
33 |     if debug is on, return bboxes of every stage and time consumption
34 |     '''
35 |     timer = Timer()
36 |     ts = [0, 0, 0, 0]
37 |     bb = [[], [], [], []]
38 |     # stage-1
39 |     timer.tic()
40 |     base = 12.
/ min_size 41 | height, width = img.shape[:-1] 42 | l = min(width, height) 43 | l *= base 44 | scales = [] 45 | while l > 12: 46 | scales.append(base) 47 | base *= factor 48 | l *= factor 49 | if not self.pnet_single_forward or len(scales) <= 1: 50 | bboxes = np.zeros((0, 4 + 1 + 4), dtype=np.float32) 51 | for scale in scales: 52 | w, h = int(math.ceil(scale * width)), int(math.ceil(scale * height)) 53 | data = cv2.resize(img, (w, h)) 54 | data = data.transpose((2, 0, 1)).astype(np.float32) 55 | data = (data - 128) / 128 56 | data = data.reshape((1, 3, h, w)) 57 | prob, bbox_pred = self._forward(self.pnet, data, ['prob', 'bbox_pred']) 58 | _bboxes = self._gen_bbox(prob[0][1], bbox_pred[0], scale, ths[0]) 59 | keep = nms(_bboxes, 0.5) 60 | _bboxes = _bboxes[keep] 61 | bboxes = np.vstack([bboxes, _bboxes]) 62 | else: 63 | # convert to a single image 64 | data, pyramid_info = convert_image_pyramid(img, scales, interval=2) 65 | # forward pnet 66 | data = data.astype(np.float32) 67 | data = (data.transpose((2, 0, 1)) - 128) / 128 68 | data = data[np.newaxis, :, :, :] 69 | prob, bbox_pred = self._forward(self.pnet, data, ['prob', 'bbox_pred']) 70 | bboxes = self._gen_bbox(prob[0][1], bbox_pred[0], 1, ths[0]) 71 | # nms over every pyramid 72 | keep = nms(bboxes, 0.5) 73 | bboxes = bboxes[keep] 74 | # map to original image 75 | bboxes = get_original_bboxes(bboxes, pyramid_info) 76 | keep = nms(bboxes, 0.7) 77 | bboxes = bboxes[keep] 78 | bboxes = self._bbox_reg(bboxes) 79 | bboxes = self._make_square(bboxes) 80 | timer.toc() 81 | ts[0] = timer.elapsed() 82 | bb[0] = bboxes.copy() 83 | self._clear_network_buffer(self.pnet) 84 | # stage-2 85 | if self.rnet is None or len(bboxes) == 0: 86 | if debug is True: 87 | return bb, ts 88 | else: 89 | return bboxes 90 | timer.tic() 91 | n = len(bboxes) 92 | data = np.zeros((n, 3, 24, 24), dtype=np.float32) 93 | for i, bbox in enumerate(bboxes): 94 | face = crop_face(img, bbox[:4]) 95 | data[i] = cv2.resize(face, (24, 24)).transpose((2, 0, 1)) 96 | data = (data - 128) / 128 97 | prob, bbox_pred = self._forward(self.rnet, data, ['prob', 'bbox_pred']) 98 | prob = prob.reshape(n, 2) 99 | bbox_pred = bbox_pred.reshape(n, 4) 100 | keep = prob[:, 1] > ths[1] 101 | bboxes = bboxes[keep] 102 | bboxes[:, 4] = prob[keep, 1] 103 | bboxes[:, 5:9] = bbox_pred[keep] 104 | keep = nms(bboxes, 0.7) 105 | bboxes = bboxes[keep] 106 | bboxes = self._bbox_reg(bboxes) 107 | bboxes = self._make_square(bboxes) 108 | timer.toc() 109 | ts[1] = timer.elapsed() 110 | bb[1] = bboxes.copy() 111 | self._clear_network_buffer(self.rnet) 112 | # stage-3 113 | if self.onet is None or len(bboxes) == 0: 114 | if debug is True: 115 | return bb, ts 116 | else: 117 | return bboxes 118 | timer.tic() 119 | n = len(bboxes) 120 | data = np.zeros((n, 3, 48, 48), dtype=np.float32) 121 | for i, bbox in enumerate(bboxes): 122 | face = crop_face(img, bbox[:4]) 123 | data[i] = cv2.resize(face, (48, 48)).transpose((2, 0, 1)) 124 | data = (data - 128) / 128 125 | prob, bbox_pred = self._forward(self.onet, data, ['prob', 'bbox_pred']) 126 | prob = prob.reshape(n, 2) 127 | bbox_pred = bbox_pred.reshape(n, 4) 128 | keep = prob[:, 1] > ths[2] 129 | bboxes = bboxes[keep] 130 | bboxes[:, 4] = prob[keep, 1] 131 | bboxes[:, 5:9] = bbox_pred[keep] 132 | bboxes = self._bbox_reg(bboxes) 133 | keep = nms(bboxes, 0.7, 'Min') 134 | bboxes = bboxes[keep] 135 | timer.toc() 136 | ts[2] = timer.elapsed() 137 | bb[2] = bboxes.copy() 138 | self._clear_network_buffer(self.onet) 139 | # stage-4 140 | if self.lnet is None or 
141 |       if debug is True:
142 |         return bb, ts
143 |       else:
144 |         return bboxes
145 |     timer.tic()
146 |     n = len(bboxes)
147 |     data = np.zeros((n, 15, 24, 24), dtype=np.float32)
148 |     w, h = bboxes[:, 2]-bboxes[:, 0], bboxes[:, 3]-bboxes[:, 1]
149 |     l = np.maximum(w, h) * 0.25
150 |     for i in range(len(bboxes)):
151 |       x1, y1, x2, y2 = bboxes[i, :4]
152 |       landmark = bboxes[i, 9:].reshape((5, 2))
153 |       for j in range(5):
154 |         x, y = landmark[j]
155 |         patch_bbox = [x-l[i]/2, y-l[i]/2, x+l[i]/2, y+l[i]/2]
156 |         patch = crop_face(img, patch_bbox)
157 |         patch = cv2.resize(patch, (24, 24))
158 |         patch = patch.transpose((2, 0, 1))
159 |         data[i, (3*j):(3*j+3)] = patch
160 |     data = (data - 128) / 128
161 |     offset = self._forward(self.lnet, data, ['landmark_offset'])[0]
162 |     offset = offset.reshape(n, 10)
163 |     offset *= l.reshape((-1, 1))
164 |     bboxes[:, 9:] += offset
165 |     timer.toc()
166 |     ts[3] = timer.elapsed()
167 |     bb[3] = bboxes.copy()
168 |     self._clear_network_buffer(self.lnet)
169 |     if debug is True:
170 |       return bb, ts
171 |     else:
172 |       return bboxes
173 | 
174 |   def _forward(self, net, data, outs):
175 |     '''forward a net with given data, return blobs[out]
176 |     '''
177 |     net.blobs['data'].reshape(*data.shape)
178 |     net.blobs['data'].data[...] = data
179 |     net.forward()
180 |     return [net.blobs[out].data for out in outs]
181 | 
182 |   def _clear_network_buffer(self, net):
183 |     if net is self.pnet:
184 |       fake = np.zeros((1, 3, 12, 12), dtype=np.float32)
185 |     elif net is self.rnet:
186 |       fake = np.zeros((1, 3, 24, 24), dtype=np.float32)
187 |     elif net is self.onet:
188 |       fake = np.zeros((1, 3, 48, 48), dtype=np.float32)
189 |     else:
190 |       fake = np.zeros((1, 15, 24, 24), dtype=np.float32)
191 |     net.blobs['data'].reshape(*fake.shape)
192 |     net.blobs['data'].data[...] = fake
193 |     net.forward()
194 | 
195 |   def _gen_bbox(self, hotmap, offset, scale, th):
196 |     '''[x1, y1, x2, y2, score, offset_x1, offset_y1, offset_x2, offset_y2]
197 |     '''
198 |     h, w = hotmap.shape
199 |     stride = 2
200 |     win_size = 12
201 |     hotmap = hotmap.reshape((h, w))
202 |     keep = hotmap > th
203 |     pos = np.where(keep)
204 |     score = hotmap[keep]
205 |     offset = offset[:, keep]
206 |     x, y = pos[1], pos[0]
207 |     x1 = stride * x
208 |     y1 = stride * y
209 |     x2 = x1 + win_size
210 |     y2 = y1 + win_size
211 |     x1 = x1 / scale
212 |     y1 = y1 / scale
213 |     x2 = x2 / scale
214 |     y2 = y2 / scale
215 |     bbox = np.vstack([x1, y1, x2, y2, score, offset]).transpose()
216 |     return bbox.astype(np.float32)
217 | 
218 |   def _bbox_reg(self, bboxes):
219 |     w = bboxes[:, 2] - bboxes[:, 0]
220 |     h = bboxes[:, 3] - bboxes[:, 1]
221 |     bboxes[:, 0] += bboxes[:, 5] * w
222 |     bboxes[:, 1] += bboxes[:, 6] * h
223 |     bboxes[:, 2] += bboxes[:, 7] * w
224 |     bboxes[:, 3] += bboxes[:, 8] * h
225 |     return bboxes
226 | 
227 |   def _make_square(self, bboxes):
228 |     '''make bboxes square
229 |     '''
230 |     x_center = (bboxes[:, 0] + bboxes[:, 2]) / 2
231 |     y_center = (bboxes[:, 1] + bboxes[:, 3]) / 2
232 |     w = bboxes[:, 2] - bboxes[:, 0]
233 |     h = bboxes[:, 3] - bboxes[:, 1]
234 |     size = np.vstack([w, h]).max(axis=0).transpose()
235 |     bboxes[:, 0] = x_center - size / 2
236 |     bboxes[:, 2] = x_center + size / 2
237 |     bboxes[:, 1] = y_center - size / 2
238 |     bboxes[:, 3] = y_center + size / 2
239 |     return bboxes
240 | 
241 | 
242 | def nms(dets, thresh, meth='Union'):
243 |   '''nms from py-faster-rcnn
244 |   '''
245 |   x1 = dets[:, 0]
246 |   y1 = dets[:, 1]
247 |   x2 = dets[:, 2]
248 |   y2 = dets[:, 3]
249 |   scores = dets[:, 4]
250 | 
251 |   areas = (x2 - x1 + 1) * (y2 - y1 + 1)
252 |   order = scores.argsort()[::-1]
253 | 
254 |   keep = []
255 |   while order.size > 0:
256 |     i = order[0]
257 |     keep.append(i)
258 |     xx1 = np.maximum(x1[i], x1[order[1:]])
259 |     yy1 = np.maximum(y1[i], y1[order[1:]])
260 |     xx2 = np.minimum(x2[i], x2[order[1:]])
261 |     yy2 = np.minimum(y2[i], y2[order[1:]])
262 | 
263 |     w = np.maximum(0.0, xx2 - xx1 + 1)
264 |     h = np.maximum(0.0, yy2 - yy1 + 1)
265 |     inter = w * h
266 |     if meth == 'Union':
267 |       ovr = inter / (areas[i] + areas[order[1:]] - inter)
268 |     else:
269 |       ovr = inter / np.minimum(areas[i], areas[order[1:]])
270 | 
271 |     inds = np.where(ovr <= thresh)[0]
272 |     order = order[inds + 1]
273 | 
274 |   return keep
275 | 
276 | 
277 | def convert_image_pyramid(img, scales, interval=2):
278 |   """convert image pyramid to a single image
279 | 
280 |   Parameters
281 |   ==========
282 |   img: image
283 |   scales: pyramid scales
284 |   interval: interval pixels between pyramid images
285 | 
286 |   Returns
287 |   =======
288 |   result: image pyramid in a single image
289 |   bboxes: every pyramid image in the result image with position and scale information,
290 |     (x, y, w, h, scale)
291 |   """
292 |   assert len(scales) >= 2
293 |   height, width = img.shape[:2]
294 |   pyramids = []
295 |   for scale in scales:
296 |     w, h = int(math.ceil(scale*width)), int(math.ceil(scale*height))
297 |     img_pyramid = cv2.resize(img, (w, h))
298 |     pyramids.append(img_pyramid)
299 | 
300 |   input_h, input_w = pyramids[0].shape[:2]
301 |   # x, y, w, h
302 |   bboxes = [[0, 0, img.shape[1], img.shape[0], scale] for img, scale in zip(pyramids, scales)]
303 |   if input_h < input_w:
304 |     output_h = input_h + interval + pyramids[1].shape[0]
305 |     output_w = 0
306 |     available = [[0, 0]]
307 |     for bbox in bboxes:
308 |       min_used_width = 3 * width
309 |       chosen = -1
310 |       for i, (x, y) in enumerate(available):
311 |         if y + bbox[3] <= output_h and x + bbox[2] < min_used_width:
312 |           min_used_width = x + bbox[2]
313 |           bbox[0], bbox[1] = x, y
314 |           chosen = i
315 |       assert chosen != -1, "No suitable position for this pyramid scale"
316 |       # extend available positions
317 |       x, y = available[chosen]
318 |       w, h = bbox[2:4]
319 |       available[chosen][0] = x + interval + w
320 |       available[chosen][1] = y
321 |       available.append([x, y + interval + h])
322 |       output_w = max(output_w, min_used_width)
323 |   else:
324 |     output_w = input_w + interval + pyramids[1].shape[1]
325 |     output_h = 0
326 |     available = [[0, 0]]
327 |     for bbox in bboxes:
328 |       min_used_height = 3 * height
329 |       chosen = -1
330 |       for i, (x, y) in enumerate(available):
331 |         if x + bbox[2] <= output_w and y + bbox[3] < min_used_height:
332 |           min_used_height = y + bbox[3]
333 |           bbox[0], bbox[1] = x, y
334 |           chosen = i
335 |       assert chosen != -1, "No suitable position for this pyramid scale"
336 |       # extend available positions
337 |       x, y = available[chosen]
338 |       w, h = bbox[2:4]
339 |       available[chosen][0] = x + interval + w
340 |       available[chosen][1] = y
341 |       available.append([x, y + interval + h])
342 |       output_h = max(output_h, min_used_height)
343 |   # convert to a single image
344 |   result = np.zeros((output_h, output_w, 3), dtype=np.uint8)
345 |   for bbox, pyramid in zip(bboxes, pyramids):
346 |     x, y, w, h, scale = bbox
347 |     assert pyramid.shape[0] == h and pyramid.shape[1] == w
348 |     result[y:y+h, x:x+w, :] = pyramid
349 | 
350 |   return result, bboxes
351 | 
352 | 
353 | def get_original_bboxes(bboxes, pyramid_info):
354 |   """get original bboxes
355 | 
356 |   Parameters
357 |   ==========
358 |   bboxes: detected bboxes
359 |   pyramid_info: information of pyramid from `convert_image_pyramid`
360 | 
361 |   Returns
362 |   =======
363 |   bboxes_ori: bboxes in original image
364 |   """
365 |   count = 0
366 |   bboxes_ori = np.zeros((0, bboxes.shape[1]), dtype=np.float32)
367 |   for x, y, w, h, scale in pyramid_info:
368 |     x1, y1, x2, y2 = x, y, x+w, y+h
369 |     idx = np.logical_and(
370 |       np.logical_and(bboxes[:, 0] >= x1, bboxes[:, 1] >= y1),
371 |       np.logical_and(bboxes[:, 2] <= x2, bboxes[:, 3] <= y2))
372 |     bboxes[idx, 0] = (bboxes[idx, 0] - x1) / scale
373 |     bboxes[idx, 1] = (bboxes[idx, 1] - y1) / scale
374 |     bboxes[idx, 2] = (bboxes[idx, 2] - x1) / scale
375 |     bboxes[idx, 3] = (bboxes[idx, 3] - y1) / scale
376 |     bboxes_ori = np.vstack([bboxes_ori, bboxes[idx]])
377 |     count += idx.sum()
378 |   #assert count == len(bboxes), "generate bboxes gives wrong number"
379 |   return bboxes_ori
380 | 
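A minimal usage sketch for the detector above (not part of the original source). The prototxt/caffemodel paths and the test image come from this repository's tree; the ths, min_size and factor values are illustrative assumptions, not values taken from the repository:

import cv2
from jfda.detector import JfdaDetector

# (prototxt, caffemodel) pairs in cascade order: pnet, rnet, onet
nets = ['proto/p.prototxt', 'tmp/pnet_iter_446000.caffemodel',
        'proto/r.prototxt', 'tmp/rnet_iter_116000.caffemodel',
        'proto/o.prototxt', 'tmp/onet_iter_90000.caffemodel']
detector = JfdaDetector(nets)
img = cv2.imread('9387493245278.jpg', cv2.IMREAD_COLOR)
# ths: per-stage score thresholds (assumed), min_size: smallest head side in
# pixels, factor: downscale ratio between consecutive pyramid levels
bboxes = detector.detect(img, ths=[0.6, 0.7, 0.8], min_size=24, factor=0.709)
for bbox in bboxes:
  x1, y1, x2, y2 = [int(v) for v in bbox[:4]]
  cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv2.imwrite('result.jpg', img)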
--------------------------------------------------------------------------------
/cpp/mtcnn_detect.cpp:
--------------------------------------------------------------------------------
1 | #include "mtcnn_detect.h"
2 | #include <algorithm>
3 | 
4 | static cv::dnn::Net pnet;
5 | static cv::dnn::Net rnet;
6 | static cv::dnn::Net onet;
7 | static const std::vector<cv::String> nets_outblob_names{"prob", "bbox_pred"};
8 | static std::vector<float> working_scales;
9 | static const float pnet_winsize = 12.0f;
10 | static unsigned int num_working_scale;
11 | static const int rnet_winsize = 24;
12 | 
13 | 
14 | static void nms_bounding_box
15 | (
16 |     std::vector<obj_info>& inboxes,
17 |     float thresh,
18 |     char method_type,
19 |     std::vector<obj_info>& outboxes
20 | )
21 | {
22 |     if (inboxes.size() == 0)
23 |     {
24 |         return;
25 |     }
26 | 
27 |     std::sort(inboxes.begin(), inboxes.end(), [](const obj_info &a, const obj_info &b){ return a.bbox.score > b.bbox.score;});
28 | 
29 |     int select_idx = 0;
30 |     int num_bbox = inboxes.size();
31 |     std::vector<int> mask_merged(num_bbox, 0);
32 |     unsigned char all_merged = 0;
33 | 
34 |     while (!all_merged)
35 |     {
36 |         while (select_idx < num_bbox && mask_merged[select_idx] == 1)
37 |         {
38 |             ++select_idx;
39 |         }
40 |         if (select_idx == num_bbox)
41 |         {
42 |             all_merged = 1;
43 |             continue;
44 |         }
45 | 
46 |         outboxes.push_back(inboxes[select_idx]);
47 |         mask_merged[select_idx] = 1;
48 | 
49 |         obj_box select_bbox = inboxes[select_idx].bbox;
50 |         float area1 = (select_bbox.xmax - select_bbox.xmin + 1) * (select_bbox.ymax - select_bbox.ymin + 1);
51 |         float x1 = select_bbox.xmin;
52 |         float y1 = select_bbox.ymin;
53 |         float x2 = select_bbox.xmax;
54 |         float y2 = select_bbox.ymax;
55 | 
56 |         ++select_idx;
57 |         //#pragma omp parallel for num_threads(threads_num)
58 |         for (int i = select_idx; i < num_bbox; ++i)
59 |         {
60 |             if (mask_merged[i] == 1)
61 |             {
62 |                 continue;
63 |             }
64 |             obj_box & bbox_i = inboxes[i].bbox;
65 |             float x = std::max(x1, bbox_i.xmin);
66 |             float y = std::max(y1, bbox_i.ymin);
67 |             float w = std::min(x2, bbox_i.xmax) - x + 1;
68 |             float h = std::min(y2, bbox_i.ymax) - y + 1;
69 |             if (w <= 0 || h <= 0)
70 |             {
71 |                 continue;
72 |             }
73 |             float area2 = (bbox_i.xmax - bbox_i.xmin + 1) * (bbox_i.ymax - bbox_i.ymin + 1);
74 |             float area_intersect = w * h;
75 | 
76 |             switch (method_type)
77 |             {
78 |             case 'u':
79 |                 if (area_intersect / (area1 + area2 - area_intersect) > thresh)
80 |                 {
81 |                     mask_merged[i] = 1;
82 |                 }
83 |                 break;
84 |             case 'm':
85 |                 if (area_intersect / std::min(area1, area2) > thresh)
86 |                 {
87 |                     mask_merged[i] = 1;
88 |                 }
89 |                 break;
90 |             default:
91 |                 break;
92 |             }
93 |         }
94 |     }
95 | }
96 | 
97 | static void regress_boxes
98 | (
99 |     std::vector<obj_info>& bboxes
100 | )
101 | {
102 |     unsigned int num_bboxes = bboxes.size();
103 |     //#pragma omp parallel for num_threads(threads_num)
104 |     for (unsigned int i = 0; i < num_bboxes; ++i)
105 |     {
106 |         obj_box &bbox = bboxes[i].bbox;
107 |         float *bbox_reg = bboxes[i].bbox_reg;
108 |         float w = bbox.xmax - bbox.xmin;
109 |         float h = bbox.ymax - bbox.ymin;
110 |         bbox.xmin += bbox_reg[0] * w;
111 |         bbox.ymin += bbox_reg[1] * h;
112 |         bbox.xmax += bbox_reg[2] * w;
113 |         bbox.ymax += bbox_reg[3] * h;
114 |     }
115 | }
116 | 
117 | static void make_square
118 | (
119 |     std::vector<obj_info>& bboxes
120 | )
121 | {
122 |     unsigned int num_bboxes = bboxes.size();
123 |     //#pragma omp parallel for num_threads(threads_num)
124 |     for (unsigned int i = 0; i < num_bboxes; ++i)
125 |     {
126 |         obj_box &bbox = bboxes[i].bbox;
127 |         float w = bbox.xmax - bbox.xmin;
128 |         float h = bbox.ymax - bbox.ymin;
129 |         float xcenter = (bbox.xmax + bbox.xmin) * 0.5f;
130 |         float ycenter = (bbox.ymax + bbox.ymin) * 0.5f;
131 |         float side = h > w ? h : w;
132 |         side *= 0.5f;
133 |         bbox.xmin = xcenter - side;
134 |         bbox.ymin = ycenter - side;
135 |         bbox.xmax = xcenter + side;
136 |         bbox.ymax = ycenter + side;
137 |     }
138 | }
139 | 
140 | void init_mtcnn
141 | (
142 |     const int srcw,
143 |     const int srch
144 | )
145 | {
146 |     // head detector with onet parameter reduction
147 |     pnet = cv::dnn::readNetFromCaffe("proto/p.prototxt", "tmp/pnet_iter_446000.caffemodel");
148 |     rnet = cv::dnn::readNetFromCaffe("proto/r.prototxt", "tmp/rnet_iter_116000.caffemodel");
149 |     onet = cv::dnn::readNetFromCaffe("proto/o.prototxt", "tmp/onet_iter_90000.caffemodel");
150 | 
151 |     float scale_base = pnet_winsize / min_objsize;
152 |     float current_netim_side_size = std::min(srcw, srch);
153 |     const float min_netim_side_size = pnet_winsize * current_netim_side_size / max_objsize;
154 |     current_netim_side_size *= scale_base;
155 | 
156 |     while (current_netim_side_size > min_netim_side_size)
157 |     {
158 |         working_scales.push_back(scale_base);
159 |         scale_base *= pyramid_factor;
160 |         current_netim_side_size *= pyramid_factor;
161 |     }
162 |     num_working_scale = working_scales.size();
163 | 
164 |     std::cout << "scales: " << std::endl;
165 |     for(unsigned int i = 0; i < num_working_scale; ++i)
166 |     {
167 |         std::cout << working_scales[i] << std::endl;
168 |     }
169 |     std::cout << "------------" << std::endl;
170 | }
171 | 
172 | void run_mtcnn
173 | (
174 |     cv::Mat& im,
175 |     std::vector<obj_info>& onet_boxes
176 | )
177 | {
178 |     const int IMW = im.cols;
179 |     const int IMH = im.rows;
180 |     std::vector<obj_info> pnet_boxes_in_all_scales;
181 |     for (unsigned int i = 0; i < num_working_scale; ++i)
182 |     {
183 |         const float scale = working_scales[i];
184 |         const float scale_inv = 1.0f / scale;
185 |         const float candidate_winsize = pnet_winsize * scale_inv;
186 |         int netw = std::ceil(IMW * working_scales[i]);
187 |         int neth = std::ceil(IMH * working_scales[i]);
188 |         cv::Mat netim;
189 |         cv::resize(im, netim, cv::Size(netw, neth));
190 |         cv::Mat inblob = cv::dnn::blobFromImage(netim, 0.0078125f, cv::Size(), cv::Scalar(128, 128, 128), false);
191 |         pnet.setInput(inblob, "data");
192 |         std::vector<cv::Mat> outblobs;
193 |         pnet.forward(outblobs, nets_outblob_names);
194 | 
195 |         cv::Mat clsprob = outblobs[0];
196 |         cv::Mat boxroi = outblobs[1];
197 |         const int netoutw = clsprob.size[3];
198 |         const int netouth = clsprob.size[2];
199 |         const int netoutsize = netoutw * netouth;
200 |         const float *scores_data = (float *)(clsprob.data);
201 |         const float *reg_data = (float *)(boxroi.data);
202 |         // get positive obj prob
203 |         scores_data += netoutsize;
204 |         // generate bounding box
205 |         std::vector<obj_info> candidate_boxes;
206 |         int idx = 0;
207 |         for (int y = 0; y < netouth; ++y)
208 |         {
209 |             for (int x = 0; x < netoutw; ++x)
210 |             {
211 |                 if (scores_data[idx] > pnet_th)
212 |                 {
213 |                     obj_info instance_info;
214 |                     obj_box &instance_box = instance_info.bbox;
215 |                     instance_box.xmin = (x << 1) * scale_inv;
216 |                     instance_box.ymin = (y << 1) * scale_inv;
217 |                     instance_box.xmax = instance_box.xmin + candidate_winsize;
218 |                     instance_box.ymax = instance_box.ymin + candidate_winsize;
219 |                     instance_box.score = scores_data[idx];
220 |                     instance_info.bbox_reg[0] = reg_data[idx];
221 |                     instance_info.bbox_reg[1] = reg_data[idx + netoutsize];
222 |                     instance_info.bbox_reg[2] = reg_data[idx + netoutsize + netoutsize];
223 |                     instance_info.bbox_reg[3] = reg_data[idx + netoutsize + netoutsize + netoutsize];
224 |                     candidate_boxes.push_back(instance_info);
225 |                 }
226 |                 ++idx;
227 |             }
228 |         }
229 |         std::vector<obj_info> nms_boxes;
230 |         nms_bounding_box(candidate_boxes, 0.5f, 'u', nms_boxes);
231 |         if (nms_boxes.size() > 0)
232 |         {
233 |             pnet_boxes_in_all_scales.insert(pnet_boxes_in_all_scales.end(), nms_boxes.begin(), nms_boxes.end());
234 |         }
235 |     }
236 | #ifdef SHOW_PNET_RESULT
237 |     if (pnet_boxes_in_all_scales.size() != 0)
238 |     {
239 |         nms_bounding_box(pnet_boxes_in_all_scales, pnet_merge_th, 'u', onet_boxes);
240 |         regress_boxes(onet_boxes);
241 |         make_square(onet_boxes);
242 |     }
243 | #else
244 |     std::vector<obj_info> pnet_boxes;
245 |     if (pnet_boxes_in_all_scales.size() != 0)
246 |     {
247 |         nms_bounding_box(pnet_boxes_in_all_scales, pnet_merge_th, 'u', pnet_boxes);
248 |         regress_boxes(pnet_boxes);
249 |         make_square(pnet_boxes);
250 |     }
251 |     unsigned int num_pnet_boxes = pnet_boxes.size();
252 |     if (num_pnet_boxes == 0)
253 |     {
254 |         return;
255 |     }
256 | 
257 | 
258 |     // rnet
259 |     if (num_pnet_boxes > max_pnet_bbox_num)
260 |     {
261 |         num_pnet_boxes = max_pnet_bbox_num;
262 |     }
263 |     //std::cout << "p: " << num_pnet_boxes << std::endl;
264 |     std::vector<cv::Mat> rnet_inputs;
265 |     for (unsigned int n = 0; n < num_pnet_boxes; ++n)
266 |     {
267 |         obj_box &box = pnet_boxes[n].bbox;
268 |         const int x1 = (int)(box.xmin);
269 |         const int y1 = (int)(box.ymin);
270 |         const int x2 = (int)(box.xmax);
271 |         const int y2 = (int)(box.ymax);
272 |         const int h = y2 - y1;
273 |         const int w = x2 - x1;
274 |         cv::Mat roi = cv::Mat::zeros(h, w, CV_8UC3);
275 |         if (x1 < 0 || y1 < 0 || x2 > IMW || y2 > IMH)
276 |         {
277 |             int vx1 = x1;
278 |             int sx = 0;
279 |             if (x1 < 0)
280 |             {
281 |                 vx1 = 0;
282 |                 sx = -x1;
283 |             }
284 |             int vy1 = y1;
285 |             int sy = 0;
286 |             if (y1 < 0)
287 |             {
288 |                 vy1 = 0;
289 |                 sy = -y1;
290 |             }
291 |             int vx2 = x2;
292 |             if (x2 > IMW)
293 |             {
294 |                 vx2 = IMW;
295 |             }
296 |             int vy2 = y2;
297 |             if (y2 > IMH)
298 |             {
299 |                 vy2 = IMH;
300 |             }
301 |             const int vw = vx2 - vx1;
302 |             const int vh = vy2 - vy1;
303 |             im(cv::Range(vy1, vy2), cv::Range(vx1, vx2)).copyTo(roi(cv::Range(sy, sy + vh), cv::Range(sx, sx + vw)));
304 |         }
305 |         else
306 |         {
307 |             im(cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2))).copyTo(roi);
308 |         }
309 | 
310 |         cv::resize(roi, roi, cv::Size(rnet_winsize, rnet_winsize));
311 |         rnet_inputs.push_back(roi);
312 |     }
313 | 
314 |     cv::Mat inblob = cv::dnn::blobFromImages(rnet_inputs, 0.0078125f, cv::Size(), cv::Scalar(128, 128, 128), false);
315 |     rnet.setInput(inblob, "data");
316 |     std::vector<cv::Mat> rnet_outblobs;
317 |     rnet.forward(rnet_outblobs, nets_outblob_names);
318 | 
319 |     cv::Mat clsprob = rnet_outblobs[0];
320 |     cv::Mat boxroi = rnet_outblobs[1];
321 |     const float *scores_data = (float *)(clsprob.data);
322 |     const float *reg_data = (float *)(boxroi.data);
323 | 
324 |     std::vector<obj_info> rnet_candidate_boxes;
325 |     for (unsigned int k = 0; k < num_pnet_boxes; ++k)
326 |     {
327 |         const float score = scores_data[2 * k + 1];
328 |         if (score > rnet_th)
329 |         {
330 |             obj_info instance_info;
331 |             instance_info.bbox = pnet_boxes[k].bbox;
332 |             instance_info.bbox.score = score;
333 |             instance_info.bbox_reg[0] = reg_data[4 * k];
334 |             instance_info.bbox_reg[1] = reg_data[4 * k + 1];
335 |             instance_info.bbox_reg[2] = reg_data[4 * k + 2];
336 |             instance_info.bbox_reg[3] = reg_data[4 * k + 3];
337 |             rnet_candidate_boxes.push_back(instance_info);
338 |         }
339 |     }
340 | #ifdef SHOW_RNET_RESULT
341 |     nms_bounding_box(rnet_candidate_boxes, rnet_merge_th, 'u', onet_boxes);
342 |     regress_boxes(onet_boxes);
343 |     make_square(onet_boxes);
344 | #else
345 |     std::vector<obj_info> rnet_boxes;
346 |     nms_bounding_box(rnet_candidate_boxes, rnet_merge_th, 'u', rnet_boxes);
347 |     regress_boxes(rnet_boxes);
348 |     make_square(rnet_boxes);
349 |     unsigned int num_rnet_boxes = rnet_boxes.size();
350 |     if (num_rnet_boxes == 0)
351 |     {
352 |         return;
353 |     }
354 | 
355 | 
356 |     // onet
357 |     if (num_rnet_boxes > max_rnet_bbox_num)
358 |     {
359 |         num_rnet_boxes = max_rnet_bbox_num;
360 |     }
361 |     //std::cout << "r: " << num_rnet_boxes << std::endl;
362 |     const int onet_winsize = 48;
363 |     std::vector<cv::Mat> onet_inputs;
364 |     for (unsigned int n = 0; n < num_rnet_boxes; ++n)
365 |     {
366 |         obj_box &box = rnet_boxes[n].bbox;
367 |         //std::cout << box.xmin << ", " << box.ymin << ", " << box.xmax << ", " << box.ymax << std::endl;
368 |         const int x1 = (int)(box.xmin);
369 |         const int y1 = (int)(box.ymin);
370 |         const int x2 = (int)(box.xmax);
371 |         const int y2 = (int)(box.ymax);
372 |         const int h = y2 - y1;
373 |         const int w = x2 - x1;
374 |         cv::Mat roi = cv::Mat::zeros(h, w, CV_8UC3);
375 |         if (x1 < 0 || y1 < 0 || x2 > IMW || y2 > IMH)
376 |         {
377 |             int vx1 = x1;
378 |             int sx = 0;
379 |             if (x1 < 0)
380 |             {
381 |                 vx1 = 0;
382 |                 sx = -x1;
383 |             }
384 |             int vy1 = y1;
385 |             int sy = 0;
386 |             if (y1 < 0)
387 |             {
388 |                 vy1 = 0;
389 |                 sy = -y1;
390 |             }
391 |             int vx2 = x2;
392 |             if (x2 > IMW)
393 |             {
394 |                 vx2 = IMW;
395 |             }
396 |             int vy2 = y2;
397 |             if (y2 > IMH)
398 |             {
399 |                 vy2 = IMH;
400 |             }
401 |             const int vw = vx2 - vx1;
402 |             const int vh = vy2 - vy1;
403 |             im(cv::Range(vy1, vy2), cv::Range(vx1, vx2)).copyTo(roi(cv::Range(sy, sy + vh), cv::Range(sx, sx + vw)));
404 |         }
405 |         else
406 |         {
407 |             im(cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2))).copyTo(roi);
408 |         }
409 |         cv::resize(roi, roi, cv::Size(onet_winsize, onet_winsize));
410 |         onet_inputs.push_back(roi);
411 |     }
412 | 
413 |     inblob = cv::dnn::blobFromImages(onet_inputs, 0.0078125f, cv::Size(), cv::Scalar(128, 128, 128), false);
414 |     onet.setInput(inblob, "data");
415 |     std::vector<cv::Mat> onet_outblobs;
416 |     onet.forward(onet_outblobs, nets_outblob_names);
417 |     clsprob = onet_outblobs[0];
418 |     boxroi = onet_outblobs[1];
419 |     scores_data = (float *)(clsprob.data);
420 |     reg_data = (float *)(boxroi.data);
421 |     std::vector<obj_info> onet_candidate_boxes;
422 |     for (unsigned int k = 0; k < num_rnet_boxes; ++k)
423 |     {
424 |         const float score = scores_data[2 * k + 1];
425 |         if (score > onet_th)
426 |         {
427 |             obj_info instance_info;
428 |             instance_info.bbox = rnet_boxes[k].bbox;
429 |             instance_info.bbox.score = score;
430 |             instance_info.bbox_reg[0] = reg_data[4 * k];
431 |             instance_info.bbox_reg[1] = reg_data[4 * k + 1];
432 |             instance_info.bbox_reg[2] = reg_data[4 * k + 2];
433 |             instance_info.bbox_reg[3] = reg_data[4 * k + 3];
434 |             onet_candidate_boxes.push_back(instance_info);
435 |         }
436 |     }
437 | 
438 |     regress_boxes(onet_candidate_boxes);
439 |     nms_bounding_box(onet_candidate_boxes, 0.5f, 'm', onet_boxes);
440 | #endif
441 | #endif
442 | }
443 | 
--------------------------------------------------------------------------------
/jfda/prepare.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2.7
2 | # pylint: disable=bad-indentation, no-member, invalid-name, line-too-long
3 | 
4 | import os
5 | import shutil
6 | import random
7 | import argparse
8 | import multiprocessing
9 | import cv2
10 | import lmdb
11 | import caffe
12 | import numpy as np
13 | from jfda.config import cfg
14 | from jfda.utils import load_wider, load_celeba, load_scutbrainwashcheat, load_cheat
15 | from jfda.utils import get_logger, crop_face
16 | from jfda.detector import JfdaDetector
17 | 
18 | import pyximport
19 | pyximport.install(setup_args={'include_dirs': np.get_include()})
20 | from bbox import bbox_overlaps
21 | 
22 | 
23 | logger = get_logger()
24 | 
25 | G8 = 8*1024*1024*1024
26 | G16 = 2*G8
27 | G24 = 3*G8
28 | G32 = 4*G8
29 | 
30 | 
31 | def fill_queues(data, qs):
32 |   data_n = len(data)
33 |   queue_n = len(qs)
34 |   for i in range(len(data)):
35 |     qs[i%queue_n].put(data[i])
36 | 
37 | def remove_if_exists(db):
38 |   if os.path.exists(db):
39 |     logger.info('remove %s'%db)
40 |     shutil.rmtree(db)
41 | 
42 | def get_detector():
43 |   nets = cfg.PROPOSAL_NETS[cfg.NET_TYPE]
44 |   if nets is None or not cfg.USE_DETECT:
45 |     detector = None
46 |   else:
47 |     if cfg.GPU_ID >= 0:
48 |       caffe.set_mode_gpu()
49 |       caffe.set_device(cfg.GPU_ID)
50 |     else:
51 |       caffe.set_mode_cpu()
52 |     detector = JfdaDetector(nets)
53 |   return detector
54 | 
55 | 
56 | # =========== region proposal =============================
57 | 
58 | def sliding_windows(x, y, width, height, kw, kh, sw, sh):
59 |   '''given a region (x, y, width, height), return sliding window locations (x1, y1, x2, y2)
60 |   x, y: region top left position
61 |   width, height: region width and height
62 |   kw, kh: window width and height
63 |   sw, sh: stride width and height
64 |   '''
65 |   xs = np.arange(0, width-kw, sw)
66 |   ys = np.arange(0, height-kh, sh)
67 |   xs, ys = np.meshgrid(xs, ys)
68 |   xy = np.vstack([xs.ravel(), ys.ravel()]).transpose()
69 |   wh = np.array([kw, kh])
70 |   bbox = np.hstack([xy, np.tile(wh, (len(xy), 1))])
71 |   bbox[:, 0] += x
72 |   bbox[:, 1] += y
73 |   bbox[:, 2] += bbox[:, 0]
74 |   bbox[:, 3] += bbox[:, 1]
75 |   return bbox.astype(np.float32)
76 | 
77 | 
78 | def proposal(img, gt_bboxes, detector=None):
79 |   '''given an image with face bboxes, propose negatives, positives and part faces.
80 |   for rNet and oNet, we use the previous networks to propose candidate bboxes
81 |   Return
82 |     (negatives, positives, part)
83 |     negatives: [(data, bbox)]
84 |     positives: [(data, bbox, bbox_target)]
85 |     part: [(data, bbox, bbox_target)]
86 |   '''
87 |   # ======================= proposal for rnet and onet ==============
88 |   if detector is not None:
89 |     assert isinstance(detector, JfdaDetector)
90 |     bboxes = detector.detect(img, **cfg.DETECT_PARAMS)
91 |     # # maybe sort it by score in descending order
92 |     # bboxes = bboxes[bboxes[:, 4].argsort()[::-1]]
93 |     # keep bbox info, drop score, offset and landmark
94 |     bboxes = bboxes[:, :4]
95 |     ovs = bbox_overlaps(bboxes, gt_bboxes)
96 |     ovs_max = ovs.max(axis=1)
97 |     ovs_idx = ovs.argmax(axis=1)
98 |     pos_idx = np.where(ovs_max > cfg.FACE_OVERLAP)[0]
99 |     neg_idx = np.where(ovs_max < cfg.NONFACE_OVERLAP)[0]
100 |     part_idx = np.where(np.logical_and(ovs_max > cfg.PARTFACE_OVERLAP, ovs_max <= cfg.FACE_OVERLAP))[0]
101 |     # pos
102 |     positives = []
103 |     for idx in pos_idx:
104 |       bbox = bboxes[idx].reshape(4)
105 |       gt_bbox = gt_bboxes[ovs_idx[idx]]
106 |       data = crop_face(img, bbox)
107 |       if data is None:
108 |         continue
109 |       # cv2.imshow('pos', data)
110 |       # cv2.waitKey()
111 |       k = bbox[2] - bbox[0]
112 |       bbox_target = (gt_bbox - bbox) / k
113 |       positives.append((data, bbox, bbox_target))
114 |     # part
115 |     part = []
116 |     for idx in part_idx:
117 |       bbox = bboxes[idx].reshape(4)
118 |       gt_bbox = gt_bboxes[ovs_idx[idx]]
119 |       data = crop_face(img, bbox)
120 |       if data is None:
121 |         continue
122 |       # cv2.imshow('part', data)
123 |       # cv2.waitKey()
124 |       k = bbox[2] - bbox[0]
125 |       bbox_target = (gt_bbox - bbox) / k
126 |       part.append((data, bbox, bbox_target))
127 |     # neg
128 |     negatives = []
129 |     np.random.shuffle(neg_idx)
130 |     for idx in neg_idx[:cfg.NEG_DETECT_PER_IMAGE]:
131 |       bbox = bboxes[idx].reshape(4)
132 |       data = crop_face(img, bbox)
133 |       if data is None:
134 |         continue
135 |       # cv2.imshow('neg', data)
136 |       # cv2.waitKey()
137 |       negatives.append((data, bbox))
138 |     return negatives, positives, part
139 | 
140 |   # ======================= proposal for pnet =======================
141 |   height, width = img.shape[:-1]
142 |   negatives, positives, part = [], [], []
143 | 
144 |   # ===== proposal positives =====
145 |   for gt_bbox in gt_bboxes:
146 |     x, y = gt_bbox[:2]
147 |     w, h = gt_bbox[2]-gt_bbox[0], gt_bbox[3]-gt_bbox[1]
148 |     this_positives = []
149 |     for scale in cfg.POS_PROPOSAL_SCALES:
150 |       k = max(w, h) * scale
151 |       stride = cfg.POS_PROPOSAL_STRIDE
152 |       s = k * stride
153 |       offset_x = (0.5 + np.random.rand()) * k / 2.
154 |       offset_y = (0.5 + np.random.rand()) * k / 2.
155 |       candidates = sliding_windows(x-offset_x, y-offset_y, w+2*offset_x, h+2*offset_y, k, k, s, s)
156 |       ovs = bbox_overlaps(candidates, gt_bbox.reshape((1, 4)))
157 |       ovs = ovs.reshape((1, len(candidates)))[0]
158 |       pos_bboxes = candidates[ovs > cfg.FACE_OVERLAP, :]
159 |       if len(pos_bboxes) > 0:
160 |         np.random.shuffle(pos_bboxes)
161 |         for bbox in pos_bboxes[:cfg.POS_PER_FACE]:
162 |           data = crop_face(img, bbox)
163 |           if data is None:
164 |             continue
165 |           # cv2.imshow('positive', data)
166 |           # cv2.waitKey()
167 |           bbox_target = (gt_bbox - bbox) / k
168 |           this_positives.append((data, bbox, bbox_target))
169 |     random.shuffle(this_positives)
170 |     positives.extend(this_positives[:cfg.POS_PER_FACE])
171 | 
172 |   # ===== proposal part faces =====
173 |   for gt_bbox in gt_bboxes:
174 |     x, y = gt_bbox[:2]
175 |     w, h = gt_bbox[2]-gt_bbox[0], gt_bbox[3]-gt_bbox[1]
176 |     this_part = []
177 |     for scale in cfg.PART_PROPOSAL_SCALES:
178 |       k = max(w, h) * scale
179 |       stride = cfg.PART_PROPOSAL_STRIDE
180 |       s = k * stride
181 |       offset_x = (0.5 + np.random.rand()) * k / 2.
182 |       offset_y = (0.5 + np.random.rand()) * k / 2.
183 |       candidates = sliding_windows(x-offset_x, y-offset_y, w+2*offset_x, h+2*offset_y, k, k, s, s)
184 |       ovs = bbox_overlaps(candidates, gt_bbox.reshape((1, 4)))
185 |       ovs = ovs.reshape((1, len(candidates)))[0]
186 |       part_bboxes = candidates[np.logical_and(ovs > cfg.PARTFACE_OVERLAP, ovs <= cfg.FACE_OVERLAP), :]
187 |       if len(part_bboxes) > 0:
188 |         np.random.shuffle(part_bboxes)
189 |         for bbox in part_bboxes[:cfg.PART_PER_FACE]:
190 |           data = crop_face(img, bbox)
191 |           if data is None:
192 |             continue
193 |           # cv2.imshow('part', data)
194 |           # cv2.waitKey()
195 |           bbox_target = (gt_bbox - bbox) / k
196 |           this_part.append((data, bbox, bbox_target))
197 |     random.shuffle(this_part)
198 |     part.extend(this_part[:cfg.PART_PER_FACE])
199 | 
200 |   # ===== proposal negatives =====
201 |   for gt_bbox in gt_bboxes:
202 |     x, y = gt_bbox[:2]
203 |     w, h = gt_bbox[2]-gt_bbox[0], gt_bbox[3]-gt_bbox[1]
204 |     this_negatives = []
205 |     for scale in cfg.NEG_PROPOSAL_SCALES:
206 |       k = max(w, h) * scale
207 |       stride = cfg.NEG_PROPOSAL_STRIDE
208 |       s = k * stride
209 |       offset_x = (0.5 + np.random.rand()) * k / 2.
210 |       offset_y = (0.5 + np.random.rand()) * k / 2.
211 |       candidates = sliding_windows(x-offset_x, y-offset_y, w+2*offset_x, h+2*offset_y, k, k, s, s)
212 |       ovs = bbox_overlaps(candidates, gt_bboxes)
213 |       neg_bboxes = candidates[ovs.max(axis=1) < cfg.NONFACE_OVERLAP, :]
214 |       if len(neg_bboxes) > 0:
215 |         np.random.shuffle(neg_bboxes)
216 |         for bbox in neg_bboxes[:cfg.NEG_PER_FACE]:
217 |           data = crop_face(img, bbox)
218 |           if data is None:
219 |             continue
220 |           # cv2.imshow('negative', data)
221 |           # cv2.waitKey()
222 |           this_negatives.append((data, bbox))
223 |     random.shuffle(this_negatives)
224 |     negatives.extend(this_negatives[:cfg.NEG_PER_FACE])
225 | 
226 |   # negatives from global image random crop
227 |   max_num_from_fr = int(cfg.NEG_PER_IMAGE * cfg.NEG_FROM_FR_RATIO)
228 |   if len(negatives) > max_num_from_fr:
229 |     random.shuffle(negatives)
230 |     negatives = negatives[:max_num_from_fr]
231 |   bbox_neg = []
232 |   range_x, range_y = width - cfg.NEG_MIN_SIZE, height - cfg.NEG_MIN_SIZE
233 |   for i in xrange(cfg.NEG_PROPOSAL_RATIO * cfg.NEG_PER_IMAGE):
234 |     x1, y1 = np.random.randint(range_x), np.random.randint(range_y)
235 |     w = h = np.random.randint(low=cfg.NEG_MIN_SIZE, high=min(width-x1, height-y1))
236 |     x2, y2 = x1 + w, y1 + h
237 |     bbox_neg.append([x1, y1, x2, y2])
238 |     if x2 > width or y2 > height:
239 |       print 'warning: random negative crop out of image bounds'
240 |   bbox_neg = np.asarray(bbox_neg, dtype=gt_bboxes.dtype)
241 |   ovs = bbox_overlaps(bbox_neg, gt_bboxes)
242 |   bbox_neg = bbox_neg[ovs.max(axis=1) < cfg.NONFACE_OVERLAP]
243 |   np.random.shuffle(bbox_neg)
244 |   if not cfg.NEG_FORCE_BALANCE:
245 |     remain = cfg.NEG_PER_IMAGE - len(negatives)
246 |   else:
247 |     # balance ratio from face region and global crop
248 |     remain = len(negatives) * (1. - cfg.NEG_FROM_FR_RATIO) / cfg.NEG_FROM_FR_RATIO
249 |     remain = int(remain)
250 |   bbox_neg = bbox_neg[:remain]
251 | 
252 |   # for bbox in bbox_neg:
253 |   #   x1, y1, x2, y2 = bbox
254 |   #   x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
255 |   #   cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 1)
256 |   # cv2.imshow('neg', img)
257 |   # cv2.waitKey()
258 | 
259 |   for bbox in bbox_neg:
260 |     data = crop_face(img, bbox)
261 |     negatives.append((data, bbox))
262 |   return negatives, positives, part
263 | 
264 | 
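# --- added illustration, not part of the original prepare.py ---------------
# proposal() above partitions candidate windows by their max IoU against the
# ground truth. This self-contained helper reproduces that partition with
# plain NumPy in place of the Cython bbox_overlaps; the three threshold
# values are assumed stand-ins for cfg.FACE_OVERLAP / cfg.PARTFACE_OVERLAP /
# cfg.NONFACE_OVERLAP, not the repository's actual settings.
def _iou_partition_demo():
  boxes = np.array([[10, 10, 50, 50], [30, 30, 70, 70], [200, 200, 240, 240]], dtype=np.float32)
  gts = np.array([[12, 12, 52, 52]], dtype=np.float32)
  # pairwise intersection-over-union: boxes (N) x gts (M)
  x1 = np.maximum(boxes[:, None, 0], gts[None, :, 0])
  y1 = np.maximum(boxes[:, None, 1], gts[None, :, 1])
  x2 = np.minimum(boxes[:, None, 2], gts[None, :, 2])
  y2 = np.minimum(boxes[:, None, 3], gts[None, :, 3])
  inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
  area_a = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
  area_b = (gts[:, 2] - gts[:, 0]) * (gts[:, 3] - gts[:, 1])
  ovs_max = (inter / (area_a[:, None] + area_b[None, :] - inter)).max(axis=1)
  face_ov, part_ov, nonface_ov = 0.65, 0.4, 0.3  # assumed threshold values
  pos = ovs_max > face_ov                                       # positives
  neg = ovs_max < nonface_ov                                    # negatives
  prt = np.logical_and(ovs_max > part_ov, ovs_max <= face_ov)   # part faces
  return pos, neg, prt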
265 | # =========== WIDER ================
266 | 
267 | def gen_wider():
268 |   logger.info('loading WIDER')
269 |   #train_data, val_data = load_wider()
270 |   train_data, val_data = load_scutbrainwashcheat()
271 |   #train_data, val_data = load_cheat()
272 |   logger.info('total images, train: %d, val: %d', len(train_data), len(val_data))
273 |   train_faces = reduce(lambda acc, x: acc + len(x[1]), train_data, 0)
274 |   val_faces = reduce(lambda acc, x: acc + len(x[1]), val_data, 0)
275 |   logger.info('total faces, train: %d, val: %d', train_faces, val_faces)
276 | 
277 |   def gen(data, db_names):
278 |     for db_name in db_names: remove_if_exists(db_name)
279 |     logger.info('fill queues')
280 |     q_in = [multiprocessing.Queue() for i in range(cfg.WORKER_N)]
281 |     q_out = multiprocessing.Queue(1024)
282 |     fill_queues(data, q_in)
283 |     readers = [multiprocessing.Process(target=wider_reader_func, args=(q_in[i], q_out)) \
284 |                for i in range(cfg.WORKER_N)]
285 |     for p in readers:
286 |       p.start()
287 |     writer = multiprocessing.Process(target=wider_writer_func, args=(q_out, db_names))
288 |     writer.start()
289 |     for p in readers:
290 |       p.join()
291 |     q_out.put(('finish', []))
292 |     writer.join()
293 | 
294 |   logger.info('writing train data, %d images', len(train_data))
295 |   db_names = ['data/%snet_positive_train'%cfg.NET_TYPE,
296 |               'data/%snet_negative_train'%cfg.NET_TYPE,
297 |               'data/%snet_part_train'%cfg.NET_TYPE]
298 |   gen(train_data, db_names)
299 |   logger.info('writing val data, %d images', len(val_data))
300 |   db_names = ['data/%snet_positive_val'%cfg.NET_TYPE,
301 |               'data/%snet_negative_val'%cfg.NET_TYPE,
302 |               'data/%snet_part_val'%cfg.NET_TYPE]
303 |   gen(val_data, db_names)
304 | 
305 | 
306 | def wider_reader_func(q_in, q_out):
307 |   input_size = cfg.NET_INPUT_SIZE[cfg.NET_TYPE]
308 |   detector = get_detector()
309 |   counter = 0
310 |   while not q_in.empty():
311 |     item = q_in.get()
312 |     counter += 1
313 |     if counter % 1000 == 0:
314 |       logger.info('%s reads %d', multiprocessing.current_process().name, counter)
315 |     img_path, bboxes = item
316 |     img = cv2.imread(img_path, cv2.IMREAD_COLOR)
317 |     if img is None:
318 |       logger.warning('read %s failed', img_path)
319 |       continue
320 |     negatives, positives, part = proposal(img, bboxes, detector)
321 | 
322 |     for data, _ in negatives:
323 |       data = cv2.resize(data, (input_size, input_size))
324 |       data = data.tostring()  # string for lmdb, uint8
325 |       q_out.put(('negative', [data]))
326 |     for data, _, bbox_target in positives:
327 |       data = cv2.resize(data, (input_size, input_size))
328 |       data = data.tostring()  # string for lmdb, uint8
329 |       bbox_target = bbox_target.astype(np.float32).tostring()  # float32
330 |       q_out.put(('positive', [data, bbox_target]))
331 |     for data, _, bbox_target in part:
332 |       data = cv2.resize(data, (input_size, input_size))
333 |       data = data.tostring()  # string for lmdb, uint8
334 |       bbox_target = bbox_target.astype(np.float32).tostring()  # float32
335 |       q_out.put(('part', [data, bbox_target]))
336 | 
337 | 
338 | def wider_writer_func(q_out, db_names):
339 |   db_pos = lmdb.open(db_names[0], map_size=G16)
340 |   db_neg = lmdb.open(db_names[1], map_size=G16)
341 |   db_part = lmdb.open(db_names[2], map_size=G16)
342 |   txn_pos = db_pos.begin(write=True)
343 |   txn_neg = db_neg.begin(write=True)
344 |   txn_part = db_part.begin(write=True)
345 | 
346 |   idx_pos, idx_neg, idx_part = 0, 0, 0
347 |   q_pos, q_neg, q_part = [], [], []
348 | 
349 |   def fill(txn, items, idx, has_bbox=True):
350 |     random.shuffle(items)
351 |     for item in items:
352 |       data_key = '%08d_data'%idx
353 |       txn.put(data_key, item[0])
354 |       if has_bbox:
355 |         bbox_key = '%08d_bbox'%idx
356 |         txn.put(bbox_key, item[1])
357 |       idx += 1
358 |     return idx
359 | 
360 |   counter = 0
361 |   pos_counter, neg_counter, part_counter = 0, 0, 0
362 |   while True:
363 |     stat, item = q_out.get()
364 |     counter += 1
365 |     if counter % 10000 == 0:
366 |       logger.info('writes %d positives, %d negatives, %d part', pos_counter, neg_counter, part_counter)
367 |     if stat == 'positive':
368 |       pos_counter += 1
369 |       q_pos.append(item)
370 |       if len(q_pos) >= cfg.SHUFFLE_SIZE:
371 |         idx_pos = fill(txn_pos, q_pos, idx_pos, True)
372 |         q_pos = []
373 |     elif stat == 'negative':
374 |       neg_counter += 1
375 |       q_neg.append(item)
376 |       if len(q_neg) >= cfg.SHUFFLE_SIZE:
377 |         idx_neg = fill(txn_neg, q_neg, idx_neg, False)
378 |         q_neg = []
379 |     elif stat == 'part':
380 |       part_counter += 1
381 |       q_part.append(item)
382 |       if len(q_part) >= cfg.SHUFFLE_SIZE:
383 |         idx_part = fill(txn_part, q_part, idx_part, True)
384 |         q_part = []
385 |     else:
386 |       # stat == 'finish'
387 |       idx_pos = fill(txn_pos, q_pos, idx_pos, True)
388 |       txn_pos.put('size', str(idx_pos))
389 |       idx_neg = fill(txn_neg, q_neg, idx_neg, False)
390 |       txn_neg.put('size', str(idx_neg))
391 |       idx_part = fill(txn_part, q_part, idx_part, True)
392 |       txn_part.put('size', str(idx_part))
393 |       break
394 | 
395 |   txn_pos.commit()
396 |   txn_neg.commit()
397 |   txn_part.commit()
398 |   db_pos.close()
399 |   db_neg.close()
400 |   db_part.close()
401 |   logger.info('Finish')
402 | 
403 | 
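# --- added illustration, not part of the original prepare.py ---------------
# Read-back sketch for the LMDBs written by wider_writer_func above, showing
# the key layout: '%08d_data' (uint8 image bytes), '%08d_bbox' (4 float32
# regression targets) and 'size'. The db path and the 12x12 crop size are
# assumptions here (pnet-style data; rnet/onet use 24/48).
def _lmdb_read_back_demo(db_name='data/pnet_positive_train', input_size=12):
  env = lmdb.open(db_name, readonly=True)
  with env.begin() as txn:
    size = int(txn.get('size'))
    img = np.fromstring(txn.get('%08d_data'%0), dtype=np.uint8)
    img = img.reshape((input_size, input_size, 3))  # HWC uint8, as written by cv2
    target = np.fromstring(txn.get('%08d_bbox'%0), dtype=np.float32)  # 4 offsets
  env.close()
  return size, img, target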
404 | # =========== CelebA ===============
405 | 
406 | def gen_celeba():
407 |   logger.info('loading CelebA')
408 |   train_data, val_data = load_celeba()
409 |   logger.info('total images, train: %d, val: %d', len(train_data), len(val_data))
410 | 
411 |   def gen(data, db_name):
412 |     remove_if_exists(db_name)
413 |     logger.info('fill queues')
414 |     q_in = [multiprocessing.Queue() for i in range(cfg.WORKER_N)]
415 |     q_out = multiprocessing.Queue(1024)
416 |     fill_queues(data, q_in)
417 |     readers = [multiprocessing.Process(target=celeba_reader_func, args=(q_in[i], q_out)) \
418 |                for i in range(cfg.WORKER_N)]
419 |     for p in readers:
420 |       p.start()
421 |     writer = multiprocessing.Process(target=celeba_writer_func, args=(q_out, db_name))
422 |     writer.start()
423 |     for p in readers:
424 |       p.join()
425 |     q_out.put(('finish', []))
426 |     writer.join()
427 | 
428 |   logger.info('writing train data, %d images', len(train_data))
429 |   gen(train_data, 'data/%snet_landmark_train'%cfg.NET_TYPE)
430 |   logger.info('writing val data, %d images', len(val_data))
431 |   gen(val_data, 'data/%snet_landmark_val'%cfg.NET_TYPE)
432 | 
433 | 
434 | def celeba_reader_func(q_in, q_out):
435 | 
436 |   def verify_bbox(bbox, landmark):
437 |     return True  # placeholder: accepts every bbox in this version
438 | 
439 |   input_size = cfg.NET_INPUT_SIZE[cfg.NET_TYPE]
440 |   detector = get_detector()
441 |   counter = 0
442 |   while not q_in.empty():
443 |     item = q_in.get()
444 |     counter += 1
445 |     if counter%1000 == 0:
446 |       logger.info('%s reads %d', multiprocessing.current_process().name, counter)
447 |     img_path, bbox, landmark = item
448 |     img = cv2.imread(img_path, cv2.IMREAD_COLOR)
449 |     if img is None:
450 |       logger.warning('read %s failed', img_path)
451 |       continue
452 |     bbox = np.asarray(bbox, dtype=np.float32).reshape((1, -1))
453 |     _1, bboxes, _2 = proposal(img, bbox, detector)
454 |     np.random.shuffle(bboxes)
455 |     for data, bbox, _ in bboxes[:cfg.LANDMARK_PER_FACE]:
456 |       # make sure landmark points are in bbox
457 |       landmark1 = landmark.reshape((-1, 2)).copy()
458 |       if not verify_bbox(bbox, landmark1):
459 |         continue
460 |       # # debug
461 |       # img1 = img.copy()
462 |       # x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
463 |       # cv2.rectangle(img1, (x1, y1), (x2, y2), (0, 0, 255), 2)
464 |       # for x, y in landmark1:
465 |       #   x, y = int(x), int(y)
466 |       #   cv2.circle(img1, (x, y), 2, (0, 255, 0), -1)
467 |       # cv2.imshow('landmark', img1)
468 |       # cv2.waitKey(0)
469 |       # normalize landmark
470 |       w, h = bbox[2]-bbox[0], bbox[3]-bbox[1]
471 |       landmark1[:, 0] = (landmark1[:, 0] - bbox[0]) / w
472 |       landmark1[:, 1] = (landmark1[:, 1] - bbox[1]) / h
473 |       landmark1 = landmark1.reshape(-1)
474 |       # format data
475 |       data = cv2.resize(data, (input_size, input_size))
476 |       data = data.tostring()  # string for lmdb, uint8
477 |       landmark1 = landmark1.astype(np.float32).tostring()  # float32
478 |       q_out.put(('data', [data, landmark1]))
479 | 
480 | 
481 | def celeba_writer_func(q_out, db_name):
482 |   map_size = G16
483 |   db = lmdb.open(db_name, map_size=map_size)
484 |   counter = 0
485 |   with db.begin(write=True) as txn:
486 |     while True:
487 |       stat, item = q_out.get()
488 |       if stat == 'finish':
489 |         txn.put('size', str(counter))
490 |         break
491 |       data, landmark = item
492 |       data_key = '%08d_data'%counter
493 |       landmark_key = '%08d_landmark'%counter
494 |       txn.put(data_key, data)
495 |       txn.put(landmark_key, landmark)
496 |       counter += 1
497 |       if counter%1000 == 0:
498 |         logger.info('writes %d landmark faces', counter)
499 |   db.close()
500 |   logger.info('Finish')
501 | 
502 | 
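# --- added illustration, not part of the original prepare.py ---------------
# Worked example of the landmark normalization in celeba_reader_func above:
# absolute pixel coordinates become bbox-relative values in [0, 1]; the
# numbers here are made up.
def _landmark_norm_demo():
  bbox = np.array([100., 80., 180., 160.])            # x1, y1, x2, y2
  landmark = np.array([[120., 100.], [160., 100.]])   # two points, pixel coords
  w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]         # an 80 x 80 box
  norm = landmark.copy()
  norm[:, 0] = (norm[:, 0] - bbox[0]) / w             # -> 0.25, 0.75
  norm[:, 1] = (norm[:, 1] - bbox[1]) / h             # -> 0.25, 0.25
  return norm.reshape(-1)                             # flattened, as stored in lmdb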
503 | def test():
504 |   os.system('rm -rf tmp/pos/*')
505 |   os.system('rm -rf tmp/neg/*')
506 |   os.system('rm -rf tmp/part/*')
507 |   logger.info('Load WIDER')
508 |   train_data, val_data = load_wider()
509 |   img_path, bboxes = train_data[np.random.choice(len(train_data))]
510 |   bboxes = np.asarray(bboxes)
511 |   img = cv2.imread(img_path, cv2.IMREAD_COLOR)
512 |   detector = JfdaDetector(cfg.PROPOSAL_NETS['r'])
513 |   negatives, positives, part = proposal(img, bboxes, detector)
514 |   logger.info('%d gt_bboxes', len(bboxes))
515 |   logger.info('%d negatives, %d positives, %d part', len(negatives), len(positives), len(part))
516 |   for i, (data, bbox, bbox_target) in enumerate(positives):
517 |     cv2.imwrite('tmp/pos/%03d.jpg'%i, data)
518 |   for i, (data, bbox) in enumerate(negatives):
519 |     cv2.imwrite('tmp/neg/%03d.jpg'%i, data)
520 |   for i, (data, bbox, bbox_target) in enumerate(part):
521 |     cv2.imwrite('tmp/part/%03d.jpg'%i, data)
522 |   cv2.imwrite('tmp/test.jpg', img)
523 | 
524 | 
525 | if __name__ == '__main__':
526 |   parser = argparse.ArgumentParser()
527 |   parser.add_argument('--net', type=str, default='p', help='net type')
528 |   parser.add_argument('--celeba', action='store_true', help='generate landmark data')
529 |   parser.add_argument('--wider', action='store_true', help='generate face data')
530 |   parser.add_argument('--gpu', type=int, default=0, help='gpu device')
531 |   parser.add_argument('--detect', action='store_true', help='use previous network detection')
532 |   parser.add_argument('--worker', type=int, default=8, help='workers to process the data')
533 |   parser.add_argument('--test', action='store_true', help='just simple test')
534 |   args = parser.parse_args()
535 | 
536 |   cfg.GPU_ID = args.gpu
537 |   cfg.NET_TYPE = args.net
538 |   cfg.USE_DETECT = args.detect
539 |   cfg.WORKER_N = args.worker
540 | 
541 |   if args.test:
542 |     test()
543 |   if args.wider:
544 |     gen_wider()
545 |   if args.celeba:
546 |     gen_celeba()
547 | 
--------------------------------------------------------------------------------
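Typical data-preparation invocations for jfda/prepare.py above, inferred from its argparse definitions rather than from the README (a sketch; adjust GPU id and worker count to taste):

python2.7 jfda/prepare.py --net p --wider --celeba --gpu 0 --worker 8
python2.7 jfda/prepare.py --net r --wider --celeba --detect --gpu 0 --worker 8
python2.7 jfda/prepare.py --net o --wider --celeba --detect --gpu 0 --worker 8

Later stages pass --detect so that the networks trained for the earlier stages (via cfg.PROPOSAL_NETS) generate the hard candidate windows.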