├── src ├── __init__.py ├── __pycache__ │ ├── nets.cpython-37.pyc │ ├── utils.cpython-37.pyc │ ├── detect.cpython-37.pyc │ ├── models.cpython-37.pyc │ ├── __init__.cpython-37.pyc │ ├── base_camera.cpython-37.pyc │ └── camera_opencv.cpython-37.pyc ├── camera_opencv.py ├── test.py ├── utils.py ├── base_camera.py ├── models.py └── detect.py ├── .gitignore ├── assets ├── yuanm.png ├── example.png ├── office1.jpg ├── office2.jpg ├── office3.jpg ├── office4.jpg └── office5.jpg ├── scripts ├── onet.npy ├── pnet.npy ├── rnet.npy ├── test.py └── caffemodel_to_pytorchmodel.py ├── weights ├── onet.npy ├── pnet.npy └── rnet.npy ├── caffe_models ├── det1.caffemodel ├── det2.caffemodel ├── det3.caffemodel ├── det4.caffemodel ├── det1.prototxt ├── det2.prototxt ├── det3.prototxt └── det4.prototxt └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /assets/yuanm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/assets/yuanm.png -------------------------------------------------------------------------------- /scripts/onet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/scripts/onet.npy -------------------------------------------------------------------------------- /scripts/pnet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/scripts/pnet.npy -------------------------------------------------------------------------------- /scripts/rnet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/scripts/rnet.npy -------------------------------------------------------------------------------- /weights/onet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/weights/onet.npy -------------------------------------------------------------------------------- /weights/pnet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/weights/pnet.npy -------------------------------------------------------------------------------- /weights/rnet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/weights/rnet.npy -------------------------------------------------------------------------------- /assets/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/assets/example.png -------------------------------------------------------------------------------- /assets/office1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/assets/office1.jpg -------------------------------------------------------------------------------- /assets/office2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/assets/office2.jpg -------------------------------------------------------------------------------- /assets/office3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/assets/office3.jpg -------------------------------------------------------------------------------- /assets/office4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/assets/office4.jpg -------------------------------------------------------------------------------- /assets/office5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/assets/office5.jpg -------------------------------------------------------------------------------- /caffe_models/det1.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/caffe_models/det1.caffemodel -------------------------------------------------------------------------------- /caffe_models/det2.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/caffe_models/det2.caffemodel -------------------------------------------------------------------------------- /caffe_models/det3.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/caffe_models/det3.caffemodel -------------------------------------------------------------------------------- /caffe_models/det4.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/caffe_models/det4.caffemodel -------------------------------------------------------------------------------- /src/__pycache__/nets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/src/__pycache__/nets.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/src/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/detect.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/src/__pycache__/detect.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/models.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/src/__pycache__/models.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/src/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/base_camera.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/src/__pycache__/base_camera.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/camera_opencv.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mayuanjason/MTCNN_face_detection_alignment_pytorch/HEAD/src/__pycache__/camera_opencv.cpython-37.pyc -------------------------------------------------------------------------------- /scripts/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | weights = np.load('./pnet.npy', encoding='bytes', allow_pickle=True)[()] 4 | doc = open('pnet.txt', 'a') 5 | print(weights, file=doc) -------------------------------------------------------------------------------- /src/camera_opencv.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from base_camera import BaseCamera 3 | 4 | 5 | class Camera(BaseCamera): 6 | video_source = 0 7 | cap = None 8 | 9 | @staticmethod 10 | def set_video_source(source): 11 | Camera.video_source = source 12 | 13 | @staticmethod 14 | def frames(): 15 | Camera.cap = cv2.VideoCapture(Camera.video_source) 16 | if not Camera.cap.isOpened(): 17 | raise RuntimeError('Could not start camera.') 18 | 19 | while True: 20 | # read current frame 21 | _, img = Camera.cap.read() 22 | 23 | # encode as a jpeg image and return it 24 | # yield cv2.imencode('.jpg', img)[1].tobytes() 25 | yield img 26 | 27 | @staticmethod 28 | def close(): 29 | print('1release camera resource') 30 | if Camera.cap: 31 | print('2release camera resource') 32 | Camera.cap.release() -------------------------------------------------------------------------------- /src/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | sys.path.append(os.pardir) 5 | from importlib import import_module 6 | import cv2 7 | from src.detect import FaceDetector 8 | 9 | # import camera driver 10 | if os.environ.get('CAMERA'): 11 | Camera = import_module('camera_' + os.environ['CAMERA']).Camera 12 | else: 13 | from camera import Camera 14 | 15 | 16 | if __name__ == "__main__": 17 | detector = FaceDetector() 18 | 19 | while True: 20 | frame = Camera().get_frame() 21 | image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 22 | bounding_boxes = detector.detect(image) 23 | 24 | for i in 
range(len(bounding_boxes)): 25 | cv2.rectangle(frame, (int(bounding_boxes[i][0]), int(bounding_boxes[i][1])), 26 | (int(bounding_boxes[i][2]), int(bounding_boxes[i][3])), (255, 0, 0), 2) 27 | 28 | cv2.imshow('capture', frame) 29 | key = cv2.waitKey(1) 30 | if key & 0xFF == ord('q'): 31 | break -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from IPython import display 4 | import matplotlib.pyplot as plt 5 | import torch 6 | from PIL import ImageDraw 7 | 8 | 9 | def use_svg_display(): 10 | """用矢量图显示 11 | """ 12 | 13 | display.set_matplotlib_formats('svg') 14 | 15 | 16 | def set_figsize(figsize=(3.5, 2.5)): 17 | """Set matplotlib figure size. 18 | 19 | Keyword Arguments: 20 | figsize {tuple} -- [description] (default: {(3.5, 2.5)}) 21 | """ 22 | 23 | use_svg_display() 24 | plt.rcParams['figure.figsize'] = figsize 25 | 26 | 27 | def try_gpu(): 28 | use_cuda = torch.cuda.is_available() 29 | return torch.device("cuda" if use_cuda else "cpu") 30 | 31 | 32 | def show_bboxes(img, bboxes, facial_landmarks=[]): 33 | """Draw bounding boxes and facial landmarks. 34 | 35 | Arguments: 36 | img {[type]} -- an instance of PIL.Image. 37 | bboxes {[type]} -- a float numpy array of shape [n, 5]. 38 | 39 | Keyword Arguments: 40 | facial_landmarks {list} -- a float numpy array of shape [n, 10]. (default: {[]}) 41 | 42 | Returns: 43 | [type] -- an instance of PIL.Image. 44 | """ 45 | 46 | img_copy = img.copy() 47 | draw = ImageDraw.Draw(img_copy) 48 | 49 | for b in bboxes: 50 | draw.rectangle([ 51 | (b[0], b[1]), (b[2], b[3]) 52 | ], outline='white') 53 | 54 | for p in facial_landmarks: 55 | for i in range(5): 56 | draw.ellipse([ 57 | (p[i] - 1.0, p[i + 5] - 1.0), 58 | (p[i] + 1.0, p[i + 5] + 1.0) 59 | ], outline='blue') 60 | 61 | return img_copy 62 | -------------------------------------------------------------------------------- /caffe_models/det1.prototxt: -------------------------------------------------------------------------------- 1 | name: "PNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 12 6 | input_dim: 12 7 | 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 10 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "PReLU1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 2 48 | stride: 2 49 | } 50 | } 51 | 52 | layer { 53 | name: "conv2" 54 | type: "Convolution" 55 | bottom: "pool1" 56 | top: "conv2" 57 | param { 58 | lr_mult: 1 59 | decay_mult: 1 60 | } 61 | param { 62 | lr_mult: 2 63 | decay_mult: 0 64 | } 65 | convolution_param { 66 | num_output: 16 67 | kernel_size: 3 68 | stride: 1 69 | weight_filler { 70 | type: "xavier" 71 | } 72 | bias_filler { 73 | type: "constant" 74 | value: 0 75 | } 76 | } 77 | } 78 | layer { 79 | name: "PReLU2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | 85 | layer { 86 | name: "conv3" 87 | type: "Convolution" 88 | bottom: "conv2" 89 | top: "conv3" 90 | param { 91 | lr_mult: 1 92 | 
decay_mult: 1 93 | } 94 | param { 95 | lr_mult: 2 96 | decay_mult: 0 97 | } 98 | convolution_param { 99 | num_output: 32 100 | kernel_size: 3 101 | stride: 1 102 | weight_filler { 103 | type: "xavier" 104 | } 105 | bias_filler { 106 | type: "constant" 107 | value: 0 108 | } 109 | } 110 | } 111 | layer { 112 | name: "PReLU3" 113 | type: "PReLU" 114 | bottom: "conv3" 115 | top: "conv3" 116 | } 117 | 118 | 119 | layer { 120 | name: "conv4-1" 121 | type: "Convolution" 122 | bottom: "conv3" 123 | top: "conv4-1" 124 | param { 125 | lr_mult: 1 126 | decay_mult: 1 127 | } 128 | param { 129 | lr_mult: 2 130 | decay_mult: 0 131 | } 132 | convolution_param { 133 | num_output: 2 134 | kernel_size: 1 135 | stride: 1 136 | weight_filler { 137 | type: "xavier" 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | 146 | layer { 147 | name: "conv4-2" 148 | type: "Convolution" 149 | bottom: "conv3" 150 | top: "conv4-2" 151 | param { 152 | lr_mult: 1 153 | decay_mult: 1 154 | } 155 | param { 156 | lr_mult: 2 157 | decay_mult: 0 158 | } 159 | convolution_param { 160 | num_output: 4 161 | kernel_size: 1 162 | stride: 1 163 | weight_filler { 164 | type: "xavier" 165 | } 166 | bias_filler { 167 | type: "constant" 168 | value: 0 169 | } 170 | } 171 | } 172 | layer { 173 | name: "prob1" 174 | type: "Softmax" 175 | bottom: "conv4-1" 176 | top: "prob1" 177 | } 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MTCNN 2 | 3 | `PyTorch` implementation of the **inference stage** of the face detection algorithm described in 4 | [Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks](https://arxiv.org/abs/1604.02878). 5 | 6 | ![example of a face detection](assets/example.png) 7 | 8 | ## Why this project 9 | [mtcnn-pytorch](https://github.com/TropComplique/mtcnn-pytorch) is the most popular PyTorch implementation of MTCNN, but I found some disadvantages when using it for real-time detection tasks: 10 | 11 | * It mixes torch and numpy operations, which results in slow inference (it cannot run on the GPU). 12 | * It is based on an old version of PyTorch (0.2). 13 | 14 | So I created this project with the following improvements: 15 | * All numpy operations are replaced with torch operations, so inference can benefit from GPU acceleration. 16 | * Runs automatically on CPU or GPU. 17 | * Based on the latest version of PyTorch (1.3). 18 | * Real-time face tracking. 19 | 20 | ## Installation 21 | 1. Create a virtual environment 22 | ``` 23 | # conda create -n face_detection 24 | ``` 25 | 2. Activate the virtual environment 26 | ``` 27 | # source activate face_detection 28 | ``` 29 | 3. [Install PyTorch](https://pytorch.org/) 30 | ``` 31 | (face_detection)# conda install pytorch torchvision cudatoolkit=10.1 -c pytorch 32 | ``` 33 | 34 | 4. Install Jupyter Notebook 35 | ``` 36 | (face_detection)# conda install jupyter notebook 37 | ``` 38 | 39 | 5.
Install `opencv` 40 | ``` 41 | (face_detection)# pip install opencv-python 42 | ``` 43 | 44 | ## How to use it 45 | Just download the repository and then do this 46 | ``` 47 | import sys 48 | import os 49 | sys.path.append(os.pardir) 50 | from importlib import import_module 51 | import cv2 52 | from src.detect import FaceDetector 53 | 54 | # import camera driver 55 | if os.environ.get('CAMERA'): 56 | Camera = import_module('camera_' + os.environ['CAMERA']).Camera 57 | else: 58 | from camera import Camera 59 | 60 | if __name__ == "__main__": 61 | detector = FaceDetector() 62 | 63 | while True: 64 | frame = Camera().get_frame() 65 | image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 66 | bounding_boxes = detector.detect(image) 67 | 68 | for i in range(len(bounding_boxes)): 69 | cv2.rectangle(frame, (int(bounding_boxes[i][0]), int(bounding_boxes[i][1])), 70 | (int(bounding_boxes[i][2]), int(bounding_boxes[i][3])), (255, 0, 0), 2) 71 | 72 | cv2.imshow('capture', frame) 73 | key = cv2.waitKey(1) 74 | if key & 0xFF == ord('q'): 75 | break 76 | ``` 77 | or just run: 78 | ``` 79 | (face_detection)# cd MTCNN_face_detection_alignment_pytorch/src/ 80 | (face_detection)# CAMERA=opencv python test.py 81 | ``` 82 | 83 | ## Tutorial 84 | [Detect step by step](./notebooks/try_mtcnn_step_by_step.ipynb) 85 | 86 | ## Credit 87 | This implementation is heavily inspired by: 88 | * [TropComplique/mtcnn-pytorch](https://github.com/TropComplique/mtcnn-pytorch) 89 | * [faciallab/FaceDetector](https://github.com/faciallab/FaceDetector) 90 | 91 | ## Citation 92 | ``` 93 | @article{7553523, 94 | author={K. Zhang and Z. Zhang and Z. Li and Y. Qiao}, 95 | journal={IEEE Signal Processing Letters}, 96 | title={Joint Face Detection and Alignment Using Multitask Cascaded Convolutional Networks}, 97 | year={2016}, 98 | volume={23}, 99 | number={10}, 100 | pages={1499-1503}, 101 | keywords={Benchmark testing;Computer architecture;Convolution;Detectors;Face;Face detection;Training;Cascaded convolutional neural network (CNN);face alignment;face detection}, 102 | doi={10.1109/LSP.2016.2603342}, 103 | ISSN={1070-9908}, 104 | month={Oct} 105 | } 106 | ``` -------------------------------------------------------------------------------- /scripts/caffemodel_to_pytorchmodel.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import torch 4 | 5 | """ 6 | # PNet 7 | # conv1.weight (10, 3, 3, 3) 8 | # conv1.bias (10,) 9 | # prelu1.weight (10,) 10 | # conv2.weight (16, 10, 3, 3) 11 | # conv2.bias (16,) 12 | # prelu2.weight (16,) 13 | # conv3.weight (32, 16, 3, 3) 14 | # conv3.bias (32,) 15 | # prelu3.weight (32,) 16 | # conv4-1.weight (2, 32, 1, 1) 17 | # conv4-1.bias (2,) 18 | # conv4-2.weight (4, 32, 1, 1) 19 | # conv4-2.bias (4,) 20 | 21 | # RNet 22 | # conv1.weight (28, 3, 3, 3) 23 | # conv1.bias (28,) 24 | # prelu1.weight (28,) 25 | # conv2.weight (48, 28, 3, 3) 26 | # conv2.bias (48,) 27 | # prelu2.weight (48,) 28 | # conv3.weight (64, 48, 2, 2) 29 | # conv3.bias (64,) 30 | # prelu3.weight (64,) 31 | # conv4.weight (128, 576) 32 | # conv4.bias (128,) 33 | # prelu4.weight (128,) 34 | # conv5-1.weight (2, 128) 35 | # conv5-1.bias (2,) 36 | # conv5-2.weight (4, 128) 37 | # conv5-2.bias (4,) 38 | 39 | # ONet 40 | # conv1.weight (32, 3, 3, 3) 41 | # conv1.bias (32,) 42 | # prelu1.weight (32,) 43 | # conv2.weight (64, 32, 3, 3) 44 | # conv2.bias (64,) 45 | # prelu2.weight (64,) 46 | # conv3.weight (64, 64, 3, 3) 47 | # conv3.bias (64,) 48 | # prelu3.weight (64,) 49 
| # conv4.weight (128, 64, 2, 2) 50 | # conv4.bias (128,) 51 | # prelu4.weight (128,) 52 | # conv5.weight (256, 1152) 53 | # conv5.bias (256,) 54 | # prelu5.weight (256,) 55 | # conv6-1.weight (2, 256) 56 | # conv6-1.bias (2,) 57 | # conv6-2.weight (4, 256) 58 | # conv6-2.bias (4,) 59 | # conv6-3.weight (10, 256) 60 | # conv6-3.bias (10,) 61 | """ 62 | 63 | def dump_layer(net): 64 | for param in net.params.keys(): 65 | print(param.lower() + '.weight', net.params[param][0].data.shape) 66 | if len(net.params[param]) == 2: 67 | print(param.lower() + '.bias', net.params[param][1].data.shape) 68 | 69 | def convert_to_pytorch_model(net, **net_info): 70 | model_state = {} 71 | 72 | for param in net.params.keys(): 73 | if net_info['cls_prob'] in param: 74 | prefix = 'cls_prob.' + param.lower().replace('-', '_') 75 | elif net_info['bbox_offset'] in param: 76 | prefix = 'bbox_offset.' + param.lower().replace('-', '_') 77 | elif net_info['landmarks'] is not None and net_info['landmarks'] in param: 78 | prefix = 'landmarks.' + param.lower().replace('-', '_') 79 | else: 80 | prefix = 'backend.' + param.lower() 81 | 82 | if 'prelu' in prefix: 83 | model_state[prefix + '.weight'] = torch.tensor(net.params[param][0].data) 84 | else: 85 | if len(net.params[param][0].data.shape) == 4: 86 | model_state[prefix + '.weight'] = torch.tensor(net.params[param][0].data.transpose((0, 1, 3, 2))) 87 | else: 88 | model_state[prefix + '.weight'] = torch.tensor(net.params[param][0].data) 89 | 90 | model_state[prefix + '.bias'] = torch.tensor(net.params[param][1].data) 91 | 92 | return model_state 93 | 94 | 95 | def covnver_pnet(): 96 | net = caffe.Net('../caffe_models/det1.prototxt', '../caffe_models/det1.caffemodel', caffe.TEST) 97 | # dump_layer(net) 98 | p = convert_to_pytorch_model(net, cls_prob='conv4-1', bbox_offset='conv4-2', landmarks=None) 99 | np.save('pnet.npy', p, allow_pickle=True) 100 | 101 | def covnver_rnet(): 102 | net = caffe.Net('../caffe_models/det2.prototxt', '../caffe_models/det2.caffemodel', caffe.TEST) 103 | # dump_layer(net) 104 | p = convert_to_pytorch_model(net, cls_prob='conv5-1', bbox_offset='conv5-2', landmarks=None) 105 | np.save('rnet.npy', p, allow_pickle=True) 106 | 107 | def covnver_onet(): 108 | net = caffe.Net('../caffe_models/det3.prototxt', '../caffe_models/det3.caffemodel', caffe.TEST) 109 | # dump_layer(net) 110 | p = convert_to_pytorch_model(net, cls_prob='conv6-1', bbox_offset='conv6-2', landmarks='conv6-3') 111 | np.save('onet.npy', p, allow_pickle=True) 112 | 113 | if __name__ == "__main__": 114 | covnver_pnet() 115 | covnver_rnet() 116 | covnver_onet() 117 | -------------------------------------------------------------------------------- /src/base_camera.py: -------------------------------------------------------------------------------- 1 | import time 2 | import threading 3 | try: 4 | from greenlet import getcurrent as get_ident 5 | except ImportError: 6 | try: 7 | from thread import get_ident 8 | except ImportError: 9 | from _thread import get_ident 10 | 11 | 12 | class CameraEvent(object): 13 | """An Event-like class that signals all active clients when a new frame is 14 | available. 
15 | """ 16 | def __init__(self): 17 | self.events = {} 18 | 19 | def wait(self): 20 | """Invoked from each client's thread to wait for the next frame.""" 21 | ident = get_ident() 22 | if ident not in self.events: 23 | # this is a new client 24 | # add an entry for it in the self.events dict 25 | # each entry has two elements, a threading.Event() and a timestamp 26 | self.events[ident] = [threading.Event(), time.time()] 27 | return self.events[ident][0].wait() 28 | 29 | def set(self): 30 | """Invoked by the camera thread when a new frame is available.""" 31 | now = time.time() 32 | remove = None 33 | for ident, event in self.events.items(): 34 | if not event[0].isSet(): 35 | # if this client's event is not set, then set it 36 | # also update the last set timestamp to now 37 | event[0].set() 38 | event[1] = now 39 | else: 40 | # if the client's event is already set, it means the client 41 | # did not process a previous frame 42 | # if the event stays set for more than 5 seconds, then assume 43 | # the client is gone and remove it 44 | if now - event[1] > 5: 45 | remove = ident 46 | if remove: 47 | del self.events[remove] 48 | 49 | def clear(self): 50 | """Invoked from each client's thread after a frame was processed.""" 51 | self.events[get_ident()][0].clear() 52 | 53 | 54 | class BaseCamera(object): 55 | thread = None # background thread that reads frames from camera 56 | frame = None # current frame is stored here by background thread 57 | last_access = 0 # time of last client access to the camera 58 | event = CameraEvent() 59 | 60 | def __init__(self): 61 | """Start the background camera thread if it isn't running yet.""" 62 | if BaseCamera.thread is None: 63 | BaseCamera.last_access = time.time() 64 | 65 | # start background frame thread 66 | BaseCamera.thread = threading.Thread(target=self._thread) 67 | BaseCamera.thread.start() 68 | 69 | # wait until frames are available 70 | while self.get_frame() is None: 71 | time.sleep(0) 72 | 73 | def get_frame(self): 74 | """Return the current camera frame.""" 75 | BaseCamera.last_access = time.time() 76 | 77 | # wait for a signal from the camera thread 78 | BaseCamera.event.wait() 79 | BaseCamera.event.clear() 80 | 81 | return BaseCamera.frame 82 | 83 | @staticmethod 84 | def frames(): 85 | """"Generator that returns frames from the camera.""" 86 | raise RuntimeError('Must be implemented by subclasses.') 87 | 88 | @staticmethod 89 | def close(): 90 | raise RuntimeError('Must be implemented by subclasses.') 91 | 92 | @classmethod 93 | def _thread(cls): 94 | """Camera background thread.""" 95 | print('Starting camera thread.') 96 | frames_iterator = cls.frames() 97 | for frame in frames_iterator: 98 | BaseCamera.frame = frame 99 | BaseCamera.event.set() # send signal to clients 100 | time.sleep(0) 101 | 102 | # if there hasn't been any clients asking for frames in 103 | # the last 10 seconds then stop the thread 104 | if time.time() - BaseCamera.last_access > 10: 105 | frames_iterator.close() 106 | cls.close() 107 | print('Stopping camera thread due to inactivity.') 108 | break 109 | BaseCamera.thread = None 110 | -------------------------------------------------------------------------------- /caffe_models/det2.prototxt: -------------------------------------------------------------------------------- 1 | name: "RNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 24 6 | input_dim: 24 7 | 8 | 9 | ########################## 10 | ###################### 11 | layer { 12 | name: "conv1" 13 | type: "Convolution" 14 | bottom: "data" 
15 | top: "conv1" 16 | param { 17 | lr_mult: 0 18 | decay_mult: 0 19 | } 20 | param { 21 | lr_mult: 0 22 | decay_mult: 0 23 | } 24 | convolution_param { 25 | num_output: 28 26 | kernel_size: 3 27 | stride: 1 28 | weight_filler { 29 | type: "xavier" 30 | } 31 | bias_filler { 32 | type: "constant" 33 | value: 0 34 | } 35 | } 36 | } 37 | layer { 38 | name: "prelu1" 39 | type: "PReLU" 40 | bottom: "conv1" 41 | top: "conv1" 42 | propagate_down: true 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "conv1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 52 | stride: 2 53 | } 54 | } 55 | 56 | layer { 57 | name: "conv2" 58 | type: "Convolution" 59 | bottom: "pool1" 60 | top: "conv2" 61 | param { 62 | lr_mult: 0 63 | decay_mult: 0 64 | } 65 | param { 66 | lr_mult: 0 67 | decay_mult: 0 68 | } 69 | convolution_param { 70 | num_output: 48 71 | kernel_size: 3 72 | stride: 1 73 | weight_filler { 74 | type: "xavier" 75 | } 76 | bias_filler { 77 | type: "constant" 78 | value: 0 79 | } 80 | } 81 | } 82 | layer { 83 | name: "prelu2" 84 | type: "PReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | propagate_down: true 88 | } 89 | layer { 90 | name: "pool2" 91 | type: "Pooling" 92 | bottom: "conv2" 93 | top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | stride: 2 98 | } 99 | } 100 | #################################### 101 | 102 | ################################## 103 | layer { 104 | name: "conv3" 105 | type: "Convolution" 106 | bottom: "pool2" 107 | top: "conv3" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 64 118 | kernel_size: 2 119 | stride: 1 120 | weight_filler { 121 | type: "xavier" 122 | } 123 | bias_filler { 124 | type: "constant" 125 | value: 0 126 | } 127 | } 128 | } 129 | layer { 130 | name: "prelu3" 131 | type: "PReLU" 132 | bottom: "conv3" 133 | top: "conv3" 134 | propagate_down: true 135 | } 136 | ############################### 137 | 138 | ############################### 139 | 140 | layer { 141 | name: "conv4" 142 | type: "InnerProduct" 143 | bottom: "conv3" 144 | top: "conv4" 145 | param { 146 | lr_mult: 0 147 | decay_mult: 0 148 | } 149 | param { 150 | lr_mult: 0 151 | decay_mult: 0 152 | } 153 | inner_product_param { 154 | num_output: 128 155 | weight_filler { 156 | type: "xavier" 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0 161 | } 162 | } 163 | } 164 | layer { 165 | name: "prelu4" 166 | type: "PReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | 171 | layer { 172 | name: "conv5-1" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5-1" 176 | param { 177 | lr_mult: 0 178 | decay_mult: 0 179 | } 180 | param { 181 | lr_mult: 0 182 | decay_mult: 0 183 | } 184 | inner_product_param { 185 | num_output: 2 186 | #kernel_size: 1 187 | #stride: 1 188 | weight_filler { 189 | type: "xavier" 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0 194 | } 195 | } 196 | } 197 | layer { 198 | name: "conv5-2" 199 | type: "InnerProduct" 200 | bottom: "conv4" 201 | top: "conv5-2" 202 | param { 203 | lr_mult: 1 204 | decay_mult: 1 205 | } 206 | param { 207 | lr_mult: 2 208 | decay_mult: 1 209 | } 210 | inner_product_param { 211 | num_output: 4 212 | #kernel_size: 1 213 | #stride: 1 214 | weight_filler { 215 | type: "xavier" 216 | } 217 | bias_filler { 218 | type: "constant" 219 | value: 0 220 | } 221 | } 222 | } 223 | layer { 224 | name: "prob1" 225 | type: "Softmax" 226 
| bottom: "conv5-1" 227 | top: "prob1" 228 | } -------------------------------------------------------------------------------- /caffe_models/det3.prototxt: -------------------------------------------------------------------------------- 1 | name: "ONet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 48 6 | input_dim: 48 7 | ################################## 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 1 20 | } 21 | convolution_param { 22 | num_output: 32 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "prelu1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | stride: 2 49 | } 50 | } 51 | layer { 52 | name: "conv2" 53 | type: "Convolution" 54 | bottom: "pool1" 55 | top: "conv2" 56 | param { 57 | lr_mult: 1 58 | decay_mult: 1 59 | } 60 | param { 61 | lr_mult: 2 62 | decay_mult: 1 63 | } 64 | convolution_param { 65 | num_output: 64 66 | kernel_size: 3 67 | stride: 1 68 | weight_filler { 69 | type: "xavier" 70 | } 71 | bias_filler { 72 | type: "constant" 73 | value: 0 74 | } 75 | } 76 | } 77 | 78 | layer { 79 | name: "prelu2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "conv2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 92 | stride: 2 93 | } 94 | } 95 | 96 | layer { 97 | name: "conv3" 98 | type: "Convolution" 99 | bottom: "pool2" 100 | top: "conv3" 101 | param { 102 | lr_mult: 1 103 | decay_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | decay_mult: 1 108 | } 109 | convolution_param { 110 | num_output: 64 111 | kernel_size: 3 112 | weight_filler { 113 | type: "xavier" 114 | } 115 | bias_filler { 116 | type: "constant" 117 | value: 0 118 | } 119 | } 120 | } 121 | layer { 122 | name: "prelu3" 123 | type: "PReLU" 124 | bottom: "conv3" 125 | top: "conv3" 126 | } 127 | layer { 128 | name: "pool3" 129 | type: "Pooling" 130 | bottom: "conv3" 131 | top: "pool3" 132 | pooling_param { 133 | pool: MAX 134 | kernel_size: 2 135 | stride: 2 136 | } 137 | } 138 | layer { 139 | name: "conv4" 140 | type: "Convolution" 141 | bottom: "pool3" 142 | top: "conv4" 143 | param { 144 | lr_mult: 1 145 | decay_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | decay_mult: 1 150 | } 151 | convolution_param { 152 | num_output: 128 153 | kernel_size: 2 154 | weight_filler { 155 | type: "xavier" 156 | } 157 | bias_filler { 158 | type: "constant" 159 | value: 0 160 | } 161 | } 162 | } 163 | layer { 164 | name: "prelu4" 165 | type: "PReLU" 166 | bottom: "conv4" 167 | top: "conv4" 168 | } 169 | 170 | 171 | layer { 172 | name: "conv5" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5" 176 | param { 177 | lr_mult: 1 178 | decay_mult: 1 179 | } 180 | param { 181 | lr_mult: 2 182 | decay_mult: 1 183 | } 184 | inner_product_param { 185 | #kernel_size: 3 186 | num_output: 256 187 | weight_filler { 188 | type: "xavier" 189 | } 190 | bias_filler { 191 | type: "constant" 192 | value: 0 193 | } 194 | } 195 | } 196 | 197 | layer { 198 | name: "drop5" 199 | type: "Dropout" 200 | bottom: "conv5" 201 | top: "conv5" 202 | dropout_param { 
203 | dropout_ratio: 0.25 204 | } 205 | } 206 | layer { 207 | name: "prelu5" 208 | type: "PReLU" 209 | bottom: "conv5" 210 | top: "conv5" 211 | } 212 | 213 | 214 | layer { 215 | name: "conv6-1" 216 | type: "InnerProduct" 217 | bottom: "conv5" 218 | top: "conv6-1" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 1 226 | } 227 | inner_product_param { 228 | #kernel_size: 1 229 | num_output: 2 230 | weight_filler { 231 | type: "xavier" 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0 236 | } 237 | } 238 | } 239 | layer { 240 | name: "conv6-2" 241 | type: "InnerProduct" 242 | bottom: "conv5" 243 | top: "conv6-2" 244 | param { 245 | lr_mult: 1 246 | decay_mult: 1 247 | } 248 | param { 249 | lr_mult: 2 250 | decay_mult: 1 251 | } 252 | inner_product_param { 253 | #kernel_size: 1 254 | num_output: 4 255 | weight_filler { 256 | type: "xavier" 257 | } 258 | bias_filler { 259 | type: "constant" 260 | value: 0 261 | } 262 | } 263 | } 264 | layer { 265 | name: "conv6-3" 266 | type: "InnerProduct" 267 | bottom: "conv5" 268 | top: "conv6-3" 269 | param { 270 | lr_mult: 1 271 | decay_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | decay_mult: 1 276 | } 277 | inner_product_param { 278 | #kernel_size: 1 279 | num_output: 10 280 | weight_filler { 281 | type: "xavier" 282 | } 283 | bias_filler { 284 | type: "constant" 285 | value: 0 286 | } 287 | } 288 | } 289 | layer { 290 | name: "prob1" 291 | type: "Softmax" 292 | bottom: "conv6-1" 293 | top: "prob1" 294 | } 295 | -------------------------------------------------------------------------------- /src/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from collections import OrderedDict 6 | from src.utils import try_gpu 7 | 8 | 9 | class Flatten(nn.Module): 10 | def __init__(self): 11 | super(Flatten, self).__init__() 12 | 13 | def forward(self, x): 14 | """[summary] 15 | 16 | Arguments: 17 | x {[type]} -- a float tensor with shape [batch_size, c, h, w]. 18 | 19 | Returns: 20 | [type] -- a float tensor with shape [batch_size, c*h*w]. 
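        Example (illustrative sketch, not part of the original code): the transpose in the body
        swaps H and W before flattening, matching the W/H ordering used when the Caffe weights
        were converted (see scripts/caffemodel_to_pytorchmodel.py, which transposes conv kernels):

            >>> flatten = Flatten()
            >>> flatten(torch.zeros(2, 64, 3, 3)).shape
            torch.Size([2, 576])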
21 | """ 22 | 23 | # without this pretrained model isn't working 24 | x = x.transpose(3, 2).contiguous() 25 | 26 | # "flatten" the C * H * W values into a single vector per image 27 | return x.view(x.size(0), -1) 28 | 29 | 30 | class _Net(nn.Module): 31 | def __init__(self, is_training=False, device=try_gpu()): 32 | super(_Net, self).__init__() 33 | 34 | self._init_net() 35 | 36 | # Move tensor to target device 37 | self.to(device) 38 | 39 | self.train(is_training) 40 | 41 | def _init_net(self): 42 | raise NotImplementedError 43 | 44 | def load(self, model_path): 45 | states_to_load = np.load(model_path, allow_pickle=True)[()] 46 | model_state = self.state_dict() 47 | model_state.update(states_to_load) 48 | self.load_state_dict(model_state) 49 | 50 | 51 | class PNet(_Net): 52 | """ 53 | Model's state_dict: 54 | backend.conv1.weight torch.Size([10, 3, 3, 3]) 55 | backend.conv1.bias torch.Size([10]) 56 | backend.prelu1.weight torch.Size([10]) 57 | backend.conv2.weight torch.Size([16, 10, 3, 3]) 58 | backend.conv2.bias torch.Size([16]) 59 | backend.prelu2.weight torch.Size([16]) 60 | backend.conv3.weight torch.Size([32, 16, 3, 3]) 61 | backend.conv3.bias torch.Size([32]) 62 | backend.prelu3.weight torch.Size([32]) 63 | cls_prob.conv4_1.weight torch.Size([2, 32, 1, 1]) 64 | cls_prob.conv4_1.bias torch.Size([2]) 65 | bbox_offset.conv4_2.weight torch.Size([4, 32, 1, 1]) 66 | bbox_offset.conv4_2.bias torch.Size([4]) 67 | """ 68 | 69 | def __init__(self, **kwargs): 70 | super(PNet, self).__init__(**kwargs) 71 | 72 | def _init_net(self): 73 | self.backend = nn.Sequential(OrderedDict([ 74 | ('conv1', nn.Conv2d(3, 10, kernel_size=3, stride=1)), 75 | ('prelu1', nn.PReLU(10)), 76 | ('pool1', nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)), 77 | 78 | ('conv2', nn.Conv2d(10, 16, kernel_size=3, stride=1)), 79 | ('prelu2', nn.PReLU(16)), 80 | 81 | ('conv3', nn.Conv2d(16, 32, kernel_size=3, stride=1)), 82 | ('prelu3', nn.PReLU(32)) 83 | ])) 84 | 85 | self.cls_prob = nn.Sequential(OrderedDict([ 86 | ('conv4_1', nn.Conv2d(32, 2, 1, 1)), 87 | ('softmax', nn.Softmax(dim=1)) 88 | ])) 89 | 90 | self.bbox_offset = nn.Sequential(OrderedDict([ 91 | ('conv4_2', nn.Conv2d(32, 4, 1, 1)) 92 | ])) 93 | 94 | def forward(self, x): 95 | """[summary] 96 | 97 | Arguments: 98 | x {torch.float32} -- a float tensor with shape [batch_size, 3, h, w]. 99 | 100 | Returns: 101 | cls_probs {torch.float32} -- a float tensor with shape [batch_size, 2, h, w]. 102 | offsets {torch.float32} -- a float tensor with shape [batch_size, 4, h, w]. 
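        Example (illustrative only; the weights path is the one used in src/detect.py): P-Net is
        fully convolutional, so the smallest valid input of 12x12 gives 1x1 output maps, and
        larger inputs give proportionally larger [h, w] maps:

            >>> net = PNet()
            >>> net.load('../weights/pnet.npy')
            >>> x = torch.randn(1, 3, 12, 12, device=try_gpu())
            >>> cls_probs, offsets = net(x)
            >>> cls_probs.shape, offsets.shape
            (torch.Size([1, 2, 1, 1]), torch.Size([1, 4, 1, 1]))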
103 | """ 104 | 105 | feature_map = self.backend(x) 106 | 107 | # face classification 108 | cls_probs = self.cls_prob(feature_map) 109 | 110 | # bounding box regression 111 | offsets = self.bbox_offset(feature_map) 112 | 113 | return cls_probs, offsets 114 | 115 | 116 | class RNet(_Net): 117 | """ 118 | Model's state_dict: 119 | backend.conv1.weight torch.Size([28, 3, 3, 3]) 120 | backend.conv1.bias torch.Size([28]) 121 | backend.prelu1.weight torch.Size([28]) 122 | backend.conv2.weight torch.Size([48, 28, 3, 3]) 123 | backend.conv2.bias torch.Size([48]) 124 | backend.prelu2.weight torch.Size([48]) 125 | backend.conv3.weight torch.Size([64, 48, 2, 2]) 126 | backend.conv3.bias torch.Size([64]) 127 | backend.prelu3.weight torch.Size([64]) 128 | backend.conv4.weight torch.Size([128, 576]) 129 | backend.conv4.bias torch.Size([128]) 130 | backend.prelu4.weight torch.Size([128]) 131 | cls_prob.conv5_1.weight torch.Size([2, 128]) 132 | cls_prob.conv5_1.bias torch.Size([2]) 133 | bbox_offset.conv5_2.weight torch.Size([4, 128]) 134 | bbox_offset.conv5_2.bias torch.Size([4]) 135 | """ 136 | 137 | def __init__(self, **kwargs): 138 | super(RNet, self).__init__(**kwargs) 139 | 140 | def _init_net(self): 141 | self.backend = nn.Sequential(OrderedDict([ 142 | ('conv1', nn.Conv2d(3, 28, 3, 1)), 143 | ('prelu1', nn.PReLU(28)), 144 | ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), 145 | 146 | ('conv2', nn.Conv2d(28, 48, 3, 1)), 147 | ('prelu2', nn.PReLU(48)), 148 | ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), 149 | 150 | ('conv3', nn.Conv2d(48, 64, 2, 1)), 151 | ('prelu3', nn.PReLU(64)), 152 | 153 | ('flatten', Flatten()), 154 | # Linear(in_features, out_features, bias=True) 155 | ('conv4', nn.Linear(576, 128)), 156 | ('prelu4', nn.PReLU(128)) 157 | ])) 158 | 159 | self.cls_prob = nn.Sequential(OrderedDict([ 160 | ('conv5_1', nn.Linear(128, 2)), 161 | ('softmax', nn.Softmax(dim=1)) 162 | ])) 163 | 164 | self.bbox_offset = nn.Sequential(OrderedDict([ 165 | ('conv5_2', nn.Linear(128, 4)) 166 | ])) 167 | 168 | def forward(self, x): 169 | """[summary] 170 | 171 | Arguments: 172 | x {torch.float32} -- a float tensor with shape [batch_size, 3, h, w]. 173 | 174 | Returns: 175 | cls_probs {torch.float32} -- a float tensor with shape [batch_size, 2]. 176 | offsets {torch.float32} -- a float tensor with shape [batch_size, 4]. 
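        Example (illustrative only; the weights path is the one used in src/detect.py): R-Net
        expects 24x24 crops, e.g. a batch of 5 crops:

            >>> net = RNet()
            >>> net.load('../weights/rnet.npy')
            >>> x = torch.randn(5, 3, 24, 24, device=try_gpu())
            >>> cls_probs, offsets = net(x)
            >>> cls_probs.shape, offsets.shape
            (torch.Size([5, 2]), torch.Size([5, 4]))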
177 | """ 178 | 179 | feature_map = self.backend(x) 180 | 181 | # face classification 182 | cls_probs = self.cls_prob(feature_map) 183 | 184 | # bounding box regression 185 | offsets = self.bbox_offset(feature_map) 186 | 187 | return cls_probs, offsets 188 | 189 | 190 | class ONet(_Net): 191 | """ 192 | Model's state_dict: 193 | backend.conv1.weight torch.Size([32, 3, 3, 3]) 194 | backend.conv1.bias torch.Size([32]) 195 | backend.prelu1.weight torch.Size([32]) 196 | backend.conv2.weight torch.Size([64, 32, 3, 3]) 197 | backend.conv2.bias torch.Size([64]) 198 | backend.prelu2.weight torch.Size([64]) 199 | backend.conv3.weight torch.Size([64, 64, 3, 3]) 200 | backend.conv3.bias torch.Size([64]) 201 | backend.prelu3.weight torch.Size([64]) 202 | backend.conv4.weight torch.Size([128, 64, 2, 2]) 203 | backend.conv4.bias torch.Size([128]) 204 | backend.prelu4.weight torch.Size([128]) 205 | backend.conv5.weight torch.Size([256, 1152]) 206 | backend.conv5.bias torch.Size([256]) 207 | backend.prelu5.weight torch.Size([256]) 208 | cls_prob.conv6_1.weight torch.Size([2, 256]) 209 | cls_prob.conv6_1.bias torch.Size([2]) 210 | bbox_offset.conv6_2.weight torch.Size([4, 256]) 211 | bbox_offset.conv6_2.bias torch.Size([4]) 212 | landmarks.conv6_3.weight torch.Size([10, 256]) 213 | landmarks.conv6_3.bias torch.Size([10]) 214 | """ 215 | 216 | def __init__(self, **kwargs): 217 | super(ONet, self).__init__(**kwargs) 218 | 219 | def _init_net(self): 220 | self.backend = nn.Sequential(OrderedDict([ 221 | ('conv1', nn.Conv2d(3, 32, 3, 1)), 222 | ('prelu1', nn.PReLU(32)), 223 | ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), 224 | 225 | ('conv2', nn.Conv2d(32, 64, 3, 1)), 226 | ('prelu2', nn.PReLU(64)), 227 | ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), 228 | 229 | ('conv3', nn.Conv2d(64, 64, 3, 1)), 230 | ('prelu3', nn.PReLU(64)), 231 | ('pool3', nn.MaxPool2d(2, 2, ceil_mode=True)), 232 | 233 | ('conv4', nn.Conv2d(64, 128, 2, 1)), 234 | ('prelu4', nn.PReLU(128)), 235 | 236 | ('flatten', Flatten()), 237 | ('conv5', nn.Linear(1152, 256)), 238 | ('drop5', nn.Dropout(0.25)), 239 | ('prelu5', nn.PReLU(256)) 240 | ])) 241 | 242 | self.cls_prob = nn.Sequential(OrderedDict([ 243 | ('conv6_1', nn.Linear(256, 2)), 244 | ('softmax', nn.Softmax(dim=1)) 245 | ])) 246 | 247 | self.bbox_offset = nn.Sequential(OrderedDict([ 248 | ('conv6_2', nn.Linear(256, 4)) 249 | ])) 250 | 251 | self.landmarks = nn.Sequential(OrderedDict([ 252 | ('conv6_3', nn.Linear(256, 10)) 253 | ])) 254 | 255 | def forward(self, x): 256 | """[summary] 257 | 258 | Arguments: 259 | x {torch.float32} -- a float tensor with shape [batch_size, 3, h, w]. 260 | 261 | Returns: 262 | cls_probs {torch.float32} -- a float tensor with shape [batch_size, 2]. 263 | offsets {torch.float32} -- a float tensor with shape [batch_size, 4]. 264 | landmarks {torch.float32} -- a float tensor with shape [batch_size, 10]. 
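        Example (illustrative only; the weights path is the one used in src/detect.py): O-Net
        expects 48x48 crops and additionally outputs landmark coordinates:

            >>> net = ONet()
            >>> net.load('../weights/onet.npy')
            >>> x = torch.randn(5, 3, 48, 48, device=try_gpu())
            >>> cls_probs, offsets, landmarks = net(x)
            >>> cls_probs.shape, offsets.shape, landmarks.shape
            (torch.Size([5, 2]), torch.Size([5, 4]), torch.Size([5, 10]))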
265 | """ 266 | 267 | feature_map = self.backend(x) 268 | 269 | # face classification 270 | cls_probs = self.cls_prob(feature_map) 271 | 272 | # bounding box regression 273 | offsets = self.bbox_offset(feature_map) 274 | 275 | # Ficial landmark localization 276 | landmarks = self.landmarks(feature_map) 277 | 278 | return cls_probs, offsets, landmarks 279 | -------------------------------------------------------------------------------- /src/detect.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.pardir) 4 | 5 | from src.utils import try_gpu, set_figsize, show_bboxes 6 | from src.models import PNet, RNet, ONet 7 | import math 8 | from PIL import Image 9 | import torchvision.transforms as transforms 10 | import torchvision 11 | import torch.nn.functional as F 12 | import torch 13 | 14 | 15 | def _no_grad(func): 16 | def wrapper(*args, **kwargs): 17 | with torch.no_grad(): 18 | return func(*args, **kwargs) 19 | 20 | return wrapper 21 | 22 | class FaceDetector(): 23 | 24 | def __init__(self): 25 | self.device = try_gpu() 26 | 27 | # LOAD MODELS 28 | self.pnet = PNet() 29 | self.rnet = RNet() 30 | self.onet = ONet() 31 | 32 | self.pnet.load('../weights/pnet.npy') 33 | self.rnet.load('../weights/rnet.npy') 34 | # TBD need to check if weight is on GPU 35 | self.onet.load('../weights/onet.npy') 36 | 37 | def _preprocess(self, img): 38 | """Preprocessing step before feeding the network. 39 | 40 | Arguments: 41 | img {PIL.Image} -- an instance of PIL.Image. 42 | or an image path 43 | 44 | Returns: 45 | {torch.float32} -- a float tensor of shape [1, C, H, W] in the range [-1.0, 1.0] 46 | """ 47 | 48 | if isinstance(img, str): 49 | img = Image.open(img) 50 | 51 | # The output of torchvision datasets are PILImage images of range [0, 1]. We transform them to Tensors of normalized range [-1, 1]. 52 | transform = transforms.Compose([ 53 | # Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] 54 | transforms.ToTensor(), 55 | # Normalize a tensor image with mean and standard deviation 56 | # input[channel] = (input[channel] - mean[channel]) / std[channel] 57 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) 58 | ]) 59 | 60 | img = transform(img).to(self.device) 61 | img = img.unsqueeze(0) 62 | 63 | return img 64 | 65 | def detect(self, img, min_face_size=20.0, threshold=[0.6, 0.7, 0.8], factor=0.707, nms_threshold=[0.7, 0.7, 0.7]): 66 | """[summary] 67 | 68 | Arguments: 69 | img {[type]} -- an instance of PIL.Image. 70 | 71 | Keyword Arguments: 72 | min_face_size {float} -- a float number. (default: {20.0}) 73 | threshold {list} -- a list of length 3 (default: {[0.6, 0.7, 0.8]}) 74 | factor {float} -- [description] (default: {0.707}) 75 | nms_threshold {list} -- a list of length 3. 
(default: {[0.7, 0.7, 0.7]}) 76 | 77 | Returns: 78 | [type] -- [description] 79 | """ 80 | 81 | img = self._preprocess(img) 82 | 83 | scales = self.create_image_pyramid(img, min_face_size, factor) 84 | 85 | bounding_boxes = self.stage_one( 86 | img, scales, threshold[0], nms_threshold[0]) 87 | bounding_boxes = self.stage_two( 88 | img, bounding_boxes, threshold[1], nms_threshold[1]) 89 | bounding_boxes, _ = self.stage_three( 90 | img, bounding_boxes, threshold[2], nms_threshold[2]) 91 | 92 | return bounding_boxes 93 | 94 | def create_image_pyramid(self, img, min_face_size, factor): 95 | """BUILD AN IMAGE PYRAMID 96 | 97 | Arguments: 98 | img {torch.float32} -- a float tensor of shape [1, C, H, W] in the range [-1.0, 1.0] 99 | min_face_size {float} -- [description] 100 | factor {float} -- [description] 101 | 102 | Returns: 103 | {list} -- [description] 104 | """ 105 | _, _, height, width = img.shape 106 | min_length = min(height, width) 107 | 108 | min_detection_size = 12 109 | 110 | # scales for scaling the image 111 | scales = [] 112 | 113 | # scales the image so that 114 | # minimum size that we can detect equals to 115 | # minimum face size that we want to detect 116 | m = min_detection_size/min_face_size 117 | min_length *= m 118 | 119 | factor_count = 0 120 | while min_length > min_detection_size: 121 | scales.append(m*factor**factor_count) # TBD need to optimize here 122 | min_length *= factor 123 | factor_count += 1 124 | 125 | return scales 126 | 127 | def _generate_bboxes(self, cls_probs, offsets, scale, threshold): 128 | """Generate bounding boxes at places 129 | 130 | Arguments: 131 | cls_probs {[type]} -- a float tensor of shape [1, 2, n, m]. 132 | offsets {[type]} -- a float tensor of shape [1, 4, n, m]. 133 | scale {[type]} -- a float number, 134 | width and height of the image were scaled by this number. 135 | threshold {[type]} -- a float number. 136 | 137 | Returns: 138 | bounding_boxes {} -- a float tensor of shape [n_boxes, 4] 139 | scores {} -- a float tensor of shape [n_boxes] 140 | offsets {} -- a float tensor of shape [n_boxes, 4] 141 | """ 142 | 143 | # applying P-Net is equivalent, in some sense, to 144 | # moving 12x12 window with stride 2 145 | stride = 2 146 | cell_size = 12 147 | 148 | # extract positive probability and resize it as [n, m] dim tensor. 
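        # (cls_probs arrives with shape [1, 2, n, m]; channel 1 along dim 1 is the
        #  softmax "face" probability from P-Net's cls_prob head, so the slice below
        #  keeps an [n, m] map with one face probability per 12x12 window position)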
149 | cls_probs = cls_probs[0, 1, :, :] 150 | 151 | # indices of boxes where there is probably a face 152 | inds = (cls_probs > threshold).nonzero() 153 | 154 | if inds.shape[0] == 0: 155 | return torch.empty((0, 4), device=self.device), torch.empty((0), device=self.device), torch.empty((0, 4), device=self.device) 156 | 157 | # transformations of bounding boxes 158 | tx1, ty1, tx2, ty2 = [ 159 | offsets[0, i, inds[:, 0], inds[:, 1]] for i in range(4)] 160 | # they are defined as: 161 | # x1 = x * stride / scale 162 | # y1 = y * stride / scale 163 | # x2 = (x * stride + 12) / scale 164 | # y2 = (y * stride + 12) / scale 165 | # w = x2 - x1 + 1 166 | # h = y2 - y1 + 1 167 | # x1_true = x1 + tx1 * w 168 | # x2_true = x2 + tx2 * w 169 | # y1_true = y1 + ty1 * h 170 | # y2_true = y2 + ty2 * h 171 | 172 | offsets = torch.stack([tx1, ty1, tx2, ty2], dim=1) 173 | scores = cls_probs[inds[:, 0], inds[:, 1]] 174 | 175 | # P-Net is applied to scaled images 176 | # so we need to rescale bounding boxes back 177 | bounding_boxes = torch.stack([ 178 | (stride * inds[:, 1] + 1.0), 179 | (stride * inds[:, 0] + 1.0), 180 | (stride * inds[:, 1] + 1.0 + cell_size), 181 | (stride * inds[:, 0] + 1.0 + cell_size), 182 | ]).transpose(0, 1).float() 183 | # why one is added? 184 | bounding_boxes = bounding_boxes / scale 185 | 186 | return bounding_boxes, scores, offsets 187 | 188 | def _refine_boxes(self, bboxes, height, width): 189 | bboxes = torch.max(torch.zeros_like( 190 | bboxes, device=self.device), bboxes) 191 | sizes = torch.tensor([[width, height, width, height]] * 192 | bboxes.shape[0], dtype=torch.float32, device=self.device) 193 | bboxes = torch.min(bboxes, sizes) 194 | 195 | return bboxes 196 | 197 | def _get_image_boxes(self, bboxes, img, size=24): 198 | """[summary] 199 | 200 | Arguments: 201 | bboxes {torch.float32} -- a float tensor of shape [n, 4]. 202 | img {torch.float32} -- a float tensor of shape [1, C, H, W] in the range [-1.0, 1.0] 203 | 204 | Keyword Arguments: 205 | size {int} -- an integer, size of cutouts. (default: {24}) 206 | 207 | Returns: 208 | {torch.float32} -- a float tensor of shape [n, 3, size, size]. 209 | """ 210 | 211 | _, _, height, width = img.shape 212 | bboxes = self._refine_boxes(bboxes, height, width) 213 | 214 | img_boxes = [] 215 | 216 | for box in bboxes: 217 | im = img[:, :, box[1].int(): box[3].int(), 218 | box[0].int(): box[2].int()] 219 | im = F.interpolate(im, size=(size, size), 220 | mode='bilinear', align_corners=False) 221 | img_boxes.append(im) 222 | 223 | img_boxes = torch.cat(img_boxes, 0) 224 | 225 | return img_boxes 226 | 227 | def _convert_to_square(self, bboxes): 228 | """Convert bounding boxes to a square form. 229 | 230 | Arguments: 231 | bboxes {torch.float32} -- a float tensor of shape [n, 4] 232 | 233 | Returns: 234 | square_bboxes {torch.float32} -- a float tensor of shape [n, 4], 235 | squared bounding boxes. 
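        Example (hand-worked, illustrative): a box [0, 0, 9, 19] has w = 10 and h = 20,
        so max_side = 20 and the squared box becomes [-5, 0, 14, 19]; the box centre is
        preserved, and out-of-image coordinates are clipped later by _refine_boxes.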
236 | """ 237 | square_bboxes = torch.zeros_like(bboxes, device=self.device) 238 | x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] 239 | h = y2 - y1 + 1.0 240 | w = x2 - x1 + 1.0 241 | max_side = torch.max(h, w) 242 | square_bboxes[:, 0] = x1 + w*0.5 - max_side*0.5 243 | square_bboxes[:, 1] = y1 + h*0.5 - max_side*0.5 244 | square_bboxes[:, 2] = square_bboxes[:, 0] + max_side - 1.0 245 | square_bboxes[:, 3] = square_bboxes[:, 1] + max_side - 1.0 246 | square_bboxes = torch.round(square_bboxes) 247 | 248 | return square_bboxes 249 | 250 | def _calibrate_box(self, bboxes, offsets): 251 | """Transform bounding boxes to be more like true bounding boxes. 252 | 'offsets' is one of the outputs of the nets. 253 | 254 | Arguments: 255 | bboxes {torch.float32} -- a float tensor of shape [n, 4]. 256 | offsets {torch.float32} -- a float tensor of shape [n, 4]. 257 | 258 | Returns: 259 | {torch.float32} -- a float tensor of shape [n, 4]. 260 | """ 261 | x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] 262 | w = x2 - x1 + 1.0 263 | h = y2 - y1 + 1.0 264 | w = torch.unsqueeze(w, 1) 265 | h = torch.unsqueeze(h, 1) 266 | 267 | # this is what happening here: 268 | # tx1, ty1, tx2, ty2 = [offsets[:, i] for i in range(4)] 269 | # x1_true = x1 + tx1 * w 270 | # y1_true = y1 + ty1 * h 271 | # x2_true = x2 + tx2 * w 272 | # y2_true = y2 + ty2 * h 273 | # below is just more compact form of this 274 | 275 | # are offsets always such that 276 | # x1 < x2 and y1 < y2 ? 277 | translation = torch.cat([w, h, w, h], dim=1) * offsets 278 | bboxes = bboxes + translation 279 | 280 | return bboxes 281 | 282 | @_no_grad 283 | def stage_one(self, img, scales, threshold, nms_threshold): 284 | """Run P-Net, generate bounding boxes, and do NMS. 285 | 286 | Arguments: 287 | img {torch.float32} -- a float tensor of shape [1, C, H, W] in the range [-1.0, 1.0] 288 | scales {list} -- a float list, 289 | scale width and height of the image by this number. 290 | threshold {float} -- a float number, 291 | threshold on the probability of a face when generating 292 | bounding boxes from predictions of the net. 
293 | nms_threshold {float} -- [description] 294 | 295 | Returns: 296 | candidate_boxes {torch.float32} -- a float tensor of shape [n_boxes, 4] 297 | """ 298 | 299 | candidate_boxes = torch.empty((0, 4), device=self.device) 300 | candidate_scores = torch.empty((0), device=self.device) 301 | candidate_offsets = torch.empty((0, 4), device=self.device) 302 | 303 | # scale the image 304 | for scale in scales: 305 | _, _, height, width = img.shape 306 | sh, sw = math.ceil(height * scale), math.ceil(width * scale) 307 | resize_img = F.interpolate(img, size=( 308 | sh, sw), mode='bilinear', align_corners=False) 309 | 310 | # cls_probs: probability of a face at each sliding window 311 | # offsets: transformations to true bounding boxes 312 | cls_probs, offsets = self.pnet(resize_img) 313 | 314 | bboxes, scores, offsets = self._generate_bboxes( 315 | cls_probs, offsets, scale, threshold) 316 | 317 | candidate_boxes = torch.cat((candidate_boxes, bboxes)) 318 | candidate_scores = torch.cat((candidate_scores, scores)) 319 | candidate_offsets = torch.cat((candidate_offsets, offsets)) 320 | 321 | keep = torchvision.ops.nms( 322 | candidate_boxes, candidate_scores, iou_threshold=nms_threshold) 323 | candidate_boxes = candidate_boxes[keep] 324 | candidate_scores = candidate_scores[keep] 325 | candidate_offsets = candidate_offsets[keep] 326 | 327 | # use offsets predicted by pnet to transform bounding boxes 328 | candidate_boxes = self._calibrate_box( 329 | candidate_boxes, candidate_offsets) 330 | 331 | candidate_boxes = self._convert_to_square(candidate_boxes) 332 | 333 | return candidate_boxes 334 | 335 | @_no_grad 336 | def stage_two(self, img, bboxes, threshold, nms_threshold): 337 | """Run R-Net, generate bounding boxes, and do NMS. 338 | 339 | Arguments: 340 | img {torch.float32} -- a float tensor of shape [1, C, H, W] in the range [-1.0, 1.0] 341 | bboxes {torch.float32} -- [description] 342 | threshold {float} -- [description] 343 | nms_threshold {float} -- [description] 344 | 345 | Returns: 346 | {torch.float32} -- [description] 347 | """ 348 | 349 | # no candidate face found. 350 | if bboxes.shape[0] == 0: 351 | return bboxes 352 | 353 | img_boxes = self._get_image_boxes(bboxes, img, size=24) 354 | 355 | cls_probs, offsets = self.rnet(img_boxes) 356 | 357 | scores = cls_probs[:, 1] 358 | keep = (scores > threshold) 359 | bboxes = bboxes[keep] 360 | offsets = offsets[keep] 361 | scores = scores[keep] 362 | 363 | if bboxes.shape[0] == 0: # TBD return value need to be check 364 | return bboxes 365 | 366 | keep = torchvision.ops.nms(bboxes, scores, iou_threshold=nms_threshold) 367 | bboxes = bboxes[keep] 368 | offsets = offsets[keep] 369 | 370 | bboxes = self._calibrate_box(bboxes, offsets) 371 | bboxes = self._convert_to_square(bboxes) 372 | 373 | return bboxes 374 | 375 | @_no_grad 376 | def stage_three(self, img, bboxes, threshold, nms_threshold): 377 | """Run O-Net, generate bounding boxes, and do NMS. 
378 | 379 | Arguments: 380 | img {torch.float32} -- a float tensor of shape [1, C, H, W] in the range [-1.0, 1.0] 381 | bboxes {torch.float32} -- a float tensor of shape [n, 4], candidate boxes from stage two. 382 | threshold {float} -- threshold on the face probability predicted by O-Net. 383 | nms_threshold {float} -- IoU threshold for non-maximum suppression. 384 | 385 | Returns: 386 | {torch.float32} -- a float tensor of shape [n_boxes, 4], final bounding boxes, plus a placeholder tensor for facial landmarks (currently empty, TBD). 387 | """ 388 | if bboxes.shape[0] == 0: 389 | return bboxes, torch.empty(0, device=self.device) 390 | 391 | img_boxes = self._get_image_boxes(bboxes, img, size=48) 392 | cls_probs, offsets, landmarks = self.onet(img_boxes) 393 | 394 | scores = cls_probs[:, 1] 395 | keep = (scores > threshold) 396 | bboxes = bboxes[keep] 397 | offsets = offsets[keep] 398 | scores = scores[keep] 399 | landmarks = landmarks[keep] 400 | 401 | if bboxes.shape[0] == 0: 402 | return bboxes, torch.empty(0, device=self.device) # TBD: landmark output not implemented yet 403 | 404 | # compute landmark points from the O-Net landmark output 405 | # TBD -- the landmark predictions are currently discarded 406 | 407 | bboxes = self._calibrate_box(bboxes, offsets) 408 | keep = torchvision.ops.nms(bboxes, scores, iou_threshold=nms_threshold) 409 | bboxes = bboxes[keep] 410 | offsets = offsets[keep] 411 | 412 | return bboxes, torch.empty(0, device=self.device) 413 | 414 | if __name__ == '__main__': 415 | img = Image.open('../assets/office1.jpg') 416 | detector = FaceDetector() 417 | bounding_boxes = detector.detect(img) -------------------------------------------------------------------------------- /caffe_models/det4.prototxt: -------------------------------------------------------------------------------- 1 | name: "LNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 15 5 | input_dim: 24 6 | input_dim: 24 7 | 8 | layer { 9 | name: "slicer_data" 10 | type: "Slice" 11 | bottom: "data" 12 | top: "data241" 13 | top: "data242" 14 | top: "data243" 15 | top: "data244" 16 | top: "data245" 17 | slice_param { 18 | axis: 1 19 | slice_point: 3 20 | slice_point: 6 21 | slice_point: 9 22 | slice_point: 12 23 | } 24 | } 25 | layer { 26 | name: "conv1_1" 27 | type: "Convolution" 28 | bottom: "data241" 29 | top: "conv1_1" 30 | param { 31 | lr_mult: 1 32 | decay_mult: 1 33 | } 34 | param { 35 | lr_mult: 2 36 | decay_mult: 1 37 | } 38 | convolution_param { 39 | num_output: 28 40 | kernel_size: 3 41 | stride: 1 42 | weight_filler { 43 | type: "xavier" 44 | } 45 | bias_filler { 46 | type: "constant" 47 | value: 0 48 | } 49 | } 50 | 51 | } 52 | layer { 53 | name: "prelu1_1" 54 | type: "PReLU" 55 | bottom: "conv1_1" 56 | top: "conv1_1" 57 | 58 | } 59 | layer { 60 | name: "pool1_1" 61 | type: "Pooling" 62 | bottom: "conv1_1" 63 | top: "pool1_1" 64 | pooling_param { 65 | pool: MAX 66 | kernel_size: 3 67 | stride: 2 68 | } 69 | } 70 | 71 | layer { 72 | name: "conv2_1" 73 | type: "Convolution" 74 | bottom: "pool1_1" 75 | top: "conv2_1" 76 | param { 77 | lr_mult: 1 78 | decay_mult: 1 79 | } 80 | param { 81 | lr_mult: 2 82 | decay_mult: 1 83 | } 84 | convolution_param { 85 | num_output: 48 86 | kernel_size: 3 87 | stride: 1 88 | weight_filler { 89 | type: "xavier" 90 | } 91 | bias_filler { 92 | type: "constant" 93 | value: 0 94 | } 95 | } 96 | 97 | } 98 | layer { 99 | name: "prelu2_1" 100 | type: "PReLU" 101 | bottom: "conv2_1" 102 | top: "conv2_1" 103 | } 104 | layer { 105 | name: "pool2_1" 106 | type: "Pooling" 107 | bottom: "conv2_1" 108 | top: "pool2_1" 109 | pooling_param { 110 | pool: MAX 111 | kernel_size: 3 112 | stride: 2 113 | } 114 | 115 | } 116 | layer { 117 | name: "conv3_1" 118 | type: "Convolution" 119 | bottom: "pool2_1" 120 | top: "conv3_1" 121 | param { 122 | lr_mult: 1 123 | decay_mult: 1 124 | } 125 | param { 126 | lr_mult: 2 127 | decay_mult:
1 128 | } 129 | convolution_param { 130 | num_output: 64 131 | kernel_size: 2 132 | stride: 1 133 | weight_filler { 134 | type: "xavier" 135 | } 136 | bias_filler { 137 | type: "constant" 138 | value: 0 139 | } 140 | } 141 | 142 | } 143 | layer { 144 | name: "prelu3_1" 145 | type: "PReLU" 146 | bottom: "conv3_1" 147 | top: "conv3_1" 148 | } 149 | ########################## 150 | layer { 151 | name: "conv1_2" 152 | type: "Convolution" 153 | bottom: "data242" 154 | top: "conv1_2" 155 | param { 156 | lr_mult: 1 157 | decay_mult: 1 158 | } 159 | param { 160 | lr_mult: 2 161 | decay_mult: 1 162 | } 163 | convolution_param { 164 | num_output: 28 165 | kernel_size: 3 166 | stride: 1 167 | weight_filler { 168 | type: "xavier" 169 | } 170 | bias_filler { 171 | type: "constant" 172 | value: 0 173 | } 174 | } 175 | 176 | } 177 | layer { 178 | name: "prelu1_2" 179 | type: "PReLU" 180 | bottom: "conv1_2" 181 | top: "conv1_2" 182 | 183 | } 184 | layer { 185 | name: "pool1_2" 186 | type: "Pooling" 187 | bottom: "conv1_2" 188 | top: "pool1_2" 189 | pooling_param { 190 | pool: MAX 191 | kernel_size: 3 192 | stride: 2 193 | } 194 | } 195 | 196 | layer { 197 | name: "conv2_2" 198 | type: "Convolution" 199 | bottom: "pool1_2" 200 | top: "conv2_2" 201 | param { 202 | lr_mult: 1 203 | decay_mult: 1 204 | } 205 | param { 206 | lr_mult: 2 207 | decay_mult: 1 208 | } 209 | convolution_param { 210 | num_output: 48 211 | kernel_size: 3 212 | stride: 1 213 | weight_filler { 214 | type: "xavier" 215 | } 216 | bias_filler { 217 | type: "constant" 218 | value: 0 219 | } 220 | } 221 | 222 | } 223 | layer { 224 | name: "prelu2_2" 225 | type: "PReLU" 226 | bottom: "conv2_2" 227 | top: "conv2_2" 228 | } 229 | layer { 230 | name: "pool2_2" 231 | type: "Pooling" 232 | bottom: "conv2_2" 233 | top: "pool2_2" 234 | pooling_param { 235 | pool: MAX 236 | kernel_size: 3 237 | stride: 2 238 | } 239 | 240 | } 241 | layer { 242 | name: "conv3_2" 243 | type: "Convolution" 244 | bottom: "pool2_2" 245 | top: "conv3_2" 246 | param { 247 | lr_mult: 1 248 | decay_mult: 1 249 | } 250 | param { 251 | lr_mult: 2 252 | decay_mult: 1 253 | } 254 | convolution_param { 255 | num_output: 64 256 | kernel_size: 2 257 | stride: 1 258 | weight_filler { 259 | type: "xavier" 260 | } 261 | bias_filler { 262 | type: "constant" 263 | value: 0 264 | } 265 | } 266 | 267 | } 268 | layer { 269 | name: "prelu3_2" 270 | type: "PReLU" 271 | bottom: "conv3_2" 272 | top: "conv3_2" 273 | } 274 | ########################## 275 | ########################## 276 | layer { 277 | name: "conv1_3" 278 | type: "Convolution" 279 | bottom: "data243" 280 | top: "conv1_3" 281 | param { 282 | lr_mult: 1 283 | decay_mult: 1 284 | } 285 | param { 286 | lr_mult: 2 287 | decay_mult: 1 288 | } 289 | convolution_param { 290 | num_output: 28 291 | kernel_size: 3 292 | stride: 1 293 | weight_filler { 294 | type: "xavier" 295 | } 296 | bias_filler { 297 | type: "constant" 298 | value: 0 299 | } 300 | } 301 | 302 | } 303 | layer { 304 | name: "prelu1_3" 305 | type: "PReLU" 306 | bottom: "conv1_3" 307 | top: "conv1_3" 308 | 309 | } 310 | layer { 311 | name: "pool1_3" 312 | type: "Pooling" 313 | bottom: "conv1_3" 314 | top: "pool1_3" 315 | pooling_param { 316 | pool: MAX 317 | kernel_size: 3 318 | stride: 2 319 | } 320 | } 321 | 322 | layer { 323 | name: "conv2_3" 324 | type: "Convolution" 325 | bottom: "pool1_3" 326 | top: "conv2_3" 327 | param { 328 | lr_mult: 1 329 | decay_mult: 1 330 | } 331 | param { 332 | lr_mult: 2 333 | decay_mult: 1 334 | } 335 | convolution_param { 336 | num_output: 
48 337 | kernel_size: 3 338 | stride: 1 339 | weight_filler { 340 | type: "xavier" 341 | } 342 | bias_filler { 343 | type: "constant" 344 | value: 0 345 | } 346 | } 347 | 348 | } 349 | layer { 350 | name: "prelu2_3" 351 | type: "PReLU" 352 | bottom: "conv2_3" 353 | top: "conv2_3" 354 | } 355 | layer { 356 | name: "pool2_3" 357 | type: "Pooling" 358 | bottom: "conv2_3" 359 | top: "pool2_3" 360 | pooling_param { 361 | pool: MAX 362 | kernel_size: 3 363 | stride: 2 364 | } 365 | 366 | } 367 | layer { 368 | name: "conv3_3" 369 | type: "Convolution" 370 | bottom: "pool2_3" 371 | top: "conv3_3" 372 | param { 373 | lr_mult: 1 374 | decay_mult: 1 375 | } 376 | param { 377 | lr_mult: 2 378 | decay_mult: 1 379 | } 380 | convolution_param { 381 | num_output: 64 382 | kernel_size: 2 383 | stride: 1 384 | weight_filler { 385 | type: "xavier" 386 | } 387 | bias_filler { 388 | type: "constant" 389 | value: 0 390 | } 391 | } 392 | 393 | } 394 | layer { 395 | name: "prelu3_3" 396 | type: "PReLU" 397 | bottom: "conv3_3" 398 | top: "conv3_3" 399 | } 400 | ########################## 401 | ########################## 402 | layer { 403 | name: "conv1_4" 404 | type: "Convolution" 405 | bottom: "data244" 406 | top: "conv1_4" 407 | param { 408 | lr_mult: 1 409 | decay_mult: 1 410 | } 411 | param { 412 | lr_mult: 2 413 | decay_mult: 1 414 | } 415 | convolution_param { 416 | num_output: 28 417 | kernel_size: 3 418 | stride: 1 419 | weight_filler { 420 | type: "xavier" 421 | } 422 | bias_filler { 423 | type: "constant" 424 | value: 0 425 | } 426 | } 427 | 428 | } 429 | layer { 430 | name: "prelu1_4" 431 | type: "PReLU" 432 | bottom: "conv1_4" 433 | top: "conv1_4" 434 | 435 | } 436 | layer { 437 | name: "pool1_4" 438 | type: "Pooling" 439 | bottom: "conv1_4" 440 | top: "pool1_4" 441 | pooling_param { 442 | pool: MAX 443 | kernel_size: 3 444 | stride: 2 445 | } 446 | } 447 | 448 | layer { 449 | name: "conv2_4" 450 | type: "Convolution" 451 | bottom: "pool1_4" 452 | top: "conv2_4" 453 | param { 454 | lr_mult: 1 455 | decay_mult: 1 456 | } 457 | param { 458 | lr_mult: 2 459 | decay_mult: 1 460 | } 461 | convolution_param { 462 | num_output: 48 463 | kernel_size: 3 464 | stride: 1 465 | weight_filler { 466 | type: "xavier" 467 | } 468 | bias_filler { 469 | type: "constant" 470 | value: 0 471 | } 472 | } 473 | 474 | } 475 | layer { 476 | name: "prelu2_4" 477 | type: "PReLU" 478 | bottom: "conv2_4" 479 | top: "conv2_4" 480 | } 481 | layer { 482 | name: "pool2_4" 483 | type: "Pooling" 484 | bottom: "conv2_4" 485 | top: "pool2_4" 486 | pooling_param { 487 | pool: MAX 488 | kernel_size: 3 489 | stride: 2 490 | } 491 | 492 | } 493 | layer { 494 | name: "conv3_4" 495 | type: "Convolution" 496 | bottom: "pool2_4" 497 | top: "conv3_4" 498 | param { 499 | lr_mult: 1 500 | decay_mult: 1 501 | } 502 | param { 503 | lr_mult: 2 504 | decay_mult: 1 505 | } 506 | convolution_param { 507 | num_output: 64 508 | kernel_size: 2 509 | stride: 1 510 | weight_filler { 511 | type: "xavier" 512 | } 513 | bias_filler { 514 | type: "constant" 515 | value: 0 516 | } 517 | } 518 | 519 | } 520 | layer { 521 | name: "prelu3_4" 522 | type: "PReLU" 523 | bottom: "conv3_4" 524 | top: "conv3_4" 525 | } 526 | ########################## 527 | ########################## 528 | layer { 529 | name: "conv1_5" 530 | type: "Convolution" 531 | bottom: "data245" 532 | top: "conv1_5" 533 | param { 534 | lr_mult: 1 535 | decay_mult: 1 536 | } 537 | param { 538 | lr_mult: 2 539 | decay_mult: 1 540 | } 541 | convolution_param { 542 | num_output: 28 543 | kernel_size: 3 
544 | stride: 1 545 | weight_filler { 546 | type: "xavier" 547 | } 548 | bias_filler { 549 | type: "constant" 550 | value: 0 551 | } 552 | } 553 | 554 | } 555 | layer { 556 | name: "prelu1_5" 557 | type: "PReLU" 558 | bottom: "conv1_5" 559 | top: "conv1_5" 560 | 561 | } 562 | layer { 563 | name: "pool1_5" 564 | type: "Pooling" 565 | bottom: "conv1_5" 566 | top: "pool1_5" 567 | pooling_param { 568 | pool: MAX 569 | kernel_size: 3 570 | stride: 2 571 | } 572 | } 573 | 574 | layer { 575 | name: "conv2_5" 576 | type: "Convolution" 577 | bottom: "pool1_5" 578 | top: "conv2_5" 579 | param { 580 | lr_mult: 1 581 | decay_mult: 1 582 | } 583 | param { 584 | lr_mult: 2 585 | decay_mult: 1 586 | } 587 | convolution_param { 588 | num_output: 48 589 | kernel_size: 3 590 | stride: 1 591 | weight_filler { 592 | type: "xavier" 593 | } 594 | bias_filler { 595 | type: "constant" 596 | value: 0 597 | } 598 | } 599 | 600 | } 601 | layer { 602 | name: "prelu2_5" 603 | type: "PReLU" 604 | bottom: "conv2_5" 605 | top: "conv2_5" 606 | } 607 | layer { 608 | name: "pool2_5" 609 | type: "Pooling" 610 | bottom: "conv2_5" 611 | top: "pool2_5" 612 | pooling_param { 613 | pool: MAX 614 | kernel_size: 3 615 | stride: 2 616 | } 617 | 618 | } 619 | layer { 620 | name: "conv3_5" 621 | type: "Convolution" 622 | bottom: "pool2_5" 623 | top: "conv3_5" 624 | param { 625 | lr_mult: 1 626 | decay_mult: 1 627 | } 628 | param { 629 | lr_mult: 2 630 | decay_mult: 1 631 | } 632 | convolution_param { 633 | num_output: 64 634 | kernel_size: 2 635 | stride: 1 636 | weight_filler { 637 | type: "xavier" 638 | } 639 | bias_filler { 640 | type: "constant" 641 | value: 0 642 | } 643 | } 644 | 645 | } 646 | layer { 647 | name: "prelu3_5" 648 | type: "PReLU" 649 | bottom: "conv3_5" 650 | top: "conv3_5" 651 | } 652 | ########################## 653 | layer { 654 | name: "concat" 655 | bottom: "conv3_1" 656 | bottom: "conv3_2" 657 | bottom: "conv3_3" 658 | bottom: "conv3_4" 659 | bottom: "conv3_5" 660 | top: "conv3" 661 | type: "Concat" 662 | concat_param { 663 | axis: 1 664 | } 665 | } 666 | ########################## 667 | layer { 668 | name: "fc4" 669 | type: "InnerProduct" 670 | bottom: "conv3" 671 | top: "fc4" 672 | param { 673 | lr_mult: 1 674 | decay_mult: 1 675 | } 676 | param { 677 | lr_mult: 2 678 | decay_mult: 1 679 | } 680 | inner_product_param { 681 | num_output: 256 682 | weight_filler { 683 | type: "xavier" 684 | } 685 | bias_filler { 686 | type: "constant" 687 | value: 0 688 | } 689 | } 690 | 691 | } 692 | layer { 693 | name: "prelu4" 694 | type: "PReLU" 695 | bottom: "fc4" 696 | top: "fc4" 697 | } 698 | ############################ 699 | layer { 700 | name: "fc4_1" 701 | type: "InnerProduct" 702 | bottom: "fc4" 703 | top: "fc4_1" 704 | param { 705 | lr_mult: 1 706 | decay_mult: 1 707 | } 708 | param { 709 | lr_mult: 2 710 | decay_mult: 1 711 | } 712 | inner_product_param { 713 | num_output: 64 714 | weight_filler { 715 | type: "xavier" 716 | } 717 | bias_filler { 718 | type: "constant" 719 | value: 0 720 | } 721 | } 722 | 723 | } 724 | layer { 725 | name: "prelu4_1" 726 | type: "PReLU" 727 | bottom: "fc4_1" 728 | top: "fc4_1" 729 | } 730 | layer { 731 | name: "fc5_1" 732 | type: "InnerProduct" 733 | bottom: "fc4_1" 734 | top: "fc5_1" 735 | param { 736 | lr_mult: 1 737 | decay_mult: 1 738 | } 739 | param { 740 | lr_mult: 2 741 | decay_mult: 1 742 | } 743 | inner_product_param { 744 | num_output: 2 745 | weight_filler { 746 | type: "xavier" 747 | #type: "constant" 748 | #value: 0 749 | } 750 | bias_filler { 751 | type: "constant" 
752 | value: 0 753 | } 754 | } 755 | } 756 | 757 | 758 | ######################### 759 | layer { 760 | name: "fc4_2" 761 | type: "InnerProduct" 762 | bottom: "fc4" 763 | top: "fc4_2" 764 | param { 765 | lr_mult: 1 766 | decay_mult: 1 767 | } 768 | param { 769 | lr_mult: 2 770 | decay_mult: 1 771 | } 772 | inner_product_param { 773 | num_output: 64 774 | weight_filler { 775 | type: "xavier" 776 | } 777 | bias_filler { 778 | type: "constant" 779 | value: 0 780 | } 781 | } 782 | 783 | } 784 | layer { 785 | name: "prelu4_2" 786 | type: "PReLU" 787 | bottom: "fc4_2" 788 | top: "fc4_2" 789 | } 790 | layer { 791 | name: "fc5_2" 792 | type: "InnerProduct" 793 | bottom: "fc4_2" 794 | top: "fc5_2" 795 | param { 796 | lr_mult: 1 797 | decay_mult: 1 798 | } 799 | param { 800 | lr_mult: 2 801 | decay_mult: 1 802 | } 803 | inner_product_param { 804 | num_output: 2 805 | weight_filler { 806 | type: "xavier" 807 | #type: "constant" 808 | #value: 0 809 | } 810 | bias_filler { 811 | type: "constant" 812 | value: 0 813 | } 814 | } 815 | } 816 | 817 | ######################### 818 | layer { 819 | name: "fc4_3" 820 | type: "InnerProduct" 821 | bottom: "fc4" 822 | top: "fc4_3" 823 | param { 824 | lr_mult: 1 825 | decay_mult: 1 826 | } 827 | param { 828 | lr_mult: 2 829 | decay_mult: 1 830 | } 831 | inner_product_param { 832 | num_output: 64 833 | weight_filler { 834 | type: "xavier" 835 | } 836 | bias_filler { 837 | type: "constant" 838 | value: 0 839 | } 840 | } 841 | 842 | } 843 | layer { 844 | name: "prelu4_3" 845 | type: "PReLU" 846 | bottom: "fc4_3" 847 | top: "fc4_3" 848 | } 849 | layer { 850 | name: "fc5_3" 851 | type: "InnerProduct" 852 | bottom: "fc4_3" 853 | top: "fc5_3" 854 | param { 855 | lr_mult: 1 856 | decay_mult: 1 857 | } 858 | param { 859 | lr_mult: 2 860 | decay_mult: 1 861 | } 862 | inner_product_param { 863 | num_output: 2 864 | weight_filler { 865 | type: "xavier" 866 | #type: "constant" 867 | #value: 0 868 | } 869 | bias_filler { 870 | type: "constant" 871 | value: 0 872 | } 873 | } 874 | } 875 | 876 | ######################### 877 | layer { 878 | name: "fc4_4" 879 | type: "InnerProduct" 880 | bottom: "fc4" 881 | top: "fc4_4" 882 | param { 883 | lr_mult: 1 884 | decay_mult: 1 885 | } 886 | param { 887 | lr_mult: 2 888 | decay_mult: 1 889 | } 890 | inner_product_param { 891 | num_output: 64 892 | weight_filler { 893 | type: "xavier" 894 | } 895 | bias_filler { 896 | type: "constant" 897 | value: 0 898 | } 899 | } 900 | 901 | } 902 | layer { 903 | name: "prelu4_4" 904 | type: "PReLU" 905 | bottom: "fc4_4" 906 | top: "fc4_4" 907 | } 908 | layer { 909 | name: "fc5_4" 910 | type: "InnerProduct" 911 | bottom: "fc4_4" 912 | top: "fc5_4" 913 | param { 914 | lr_mult: 1 915 | decay_mult: 1 916 | } 917 | param { 918 | lr_mult: 2 919 | decay_mult: 1 920 | } 921 | inner_product_param { 922 | num_output: 2 923 | weight_filler { 924 | type: "xavier" 925 | #type: "constant" 926 | #value: 0 927 | } 928 | bias_filler { 929 | type: "constant" 930 | value: 0 931 | } 932 | } 933 | } 934 | 935 | ######################### 936 | layer { 937 | name: "fc4_5" 938 | type: "InnerProduct" 939 | bottom: "fc4" 940 | top: "fc4_5" 941 | param { 942 | lr_mult: 1 943 | decay_mult: 1 944 | } 945 | param { 946 | lr_mult: 2 947 | decay_mult: 1 948 | } 949 | inner_product_param { 950 | num_output: 64 951 | weight_filler { 952 | type: "xavier" 953 | } 954 | bias_filler { 955 | type: "constant" 956 | value: 0 957 | } 958 | } 959 | 960 | } 961 | layer { 962 | name: "prelu4_5" 963 | type: "PReLU" 964 | bottom: "fc4_5" 965 | top: 
"fc4_5" 966 | } 967 | layer { 968 | name: "fc5_5" 969 | type: "InnerProduct" 970 | bottom: "fc4_5" 971 | top: "fc5_5" 972 | param { 973 | lr_mult: 1 974 | decay_mult: 1 975 | } 976 | param { 977 | lr_mult: 2 978 | decay_mult: 1 979 | } 980 | inner_product_param { 981 | num_output: 2 982 | weight_filler { 983 | type: "xavier" 984 | #type: "constant" 985 | #value: 0 986 | } 987 | bias_filler { 988 | type: "constant" 989 | value: 0 990 | } 991 | } 992 | } 993 | 994 | ######################### 995 | 996 | --------------------------------------------------------------------------------