├── .gitignore
├── LICENSE
├── README.md
├── caffe_models
│   ├── det1.caffemodel
│   ├── det1.prototxt
│   ├── det2.caffemodel
│   ├── det2.prototxt
│   ├── det3.caffemodel
│   ├── det3.prototxt
│   ├── det4.caffemodel
│   └── det4.prototxt
├── extract_weights_from_caffe_models.py
├── images
│   ├── example.png
│   ├── office1.jpg
│   ├── office2.jpg
│   ├── office3.jpg
│   ├── office4.jpg
│   └── office5.jpg
├── src
│   ├── __init__.py
│   ├── box_utils.py
│   ├── detector.py
│   ├── first_stage.py
│   ├── get_nets.py
│   ├── visualization_utils.py
│   └── weights
│       ├── onet.npy
│       ├── pnet.npy
│       └── rnet.npy
├── test_on_images.ipynb
└── try_mtcnn_step_by_step.ipynb

/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | __pycache__
3 | 
4 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Dan Antoshchenko
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MTCNN
2 | 
3 | A `pytorch` implementation of the **inference stage** of the face detection algorithm described in
4 | [Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks](https://arxiv.org/abs/1604.02878).
5 | 
6 | ## Example
7 | ![example of a face detection](images/example.png)
8 | 
9 | ## How to use it
10 | Just download the repository and then do this:
11 | ```python
12 | from src import detect_faces
13 | from PIL import Image
14 | 
15 | image = Image.open('image.jpg')
16 | bounding_boxes, landmarks = detect_faces(image)
17 | ```
18 | For examples see `test_on_images.ipynb`.
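To draw the detections you can use `show_bboxes` from `src/visualization_utils.py` (it is re-exported from `src`). A minimal sketch, assuming the snippet above has already run; the output filename `result.jpg` is just an example:
```python
from src import show_bboxes

# returns a copy of the image with white boxes and blue landmark dots drawn on it
image_with_boxes = show_bboxes(image, bounding_boxes, landmarks)
image_with_boxes.save('result.jpg')
```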
19 | 20 | ## Requirements 21 | * pytorch 0.2 22 | * Pillow, numpy 23 | 24 | ## Credit 25 | This implementation is heavily inspired by: 26 | * [pangyupo/mxnet_mtcnn_face_detection](https://github.com/pangyupo/mxnet_mtcnn_face_detection) 27 | -------------------------------------------------------------------------------- /caffe_models/det1.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/caffe_models/det1.caffemodel -------------------------------------------------------------------------------- /caffe_models/det1.prototxt: -------------------------------------------------------------------------------- 1 | name: "PNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 12 6 | input_dim: 12 7 | 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 10 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "PReLU1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 2 48 | stride: 2 49 | } 50 | } 51 | 52 | layer { 53 | name: "conv2" 54 | type: "Convolution" 55 | bottom: "pool1" 56 | top: "conv2" 57 | param { 58 | lr_mult: 1 59 | decay_mult: 1 60 | } 61 | param { 62 | lr_mult: 2 63 | decay_mult: 0 64 | } 65 | convolution_param { 66 | num_output: 16 67 | kernel_size: 3 68 | stride: 1 69 | weight_filler { 70 | type: "xavier" 71 | } 72 | bias_filler { 73 | type: "constant" 74 | value: 0 75 | } 76 | } 77 | } 78 | layer { 79 | name: "PReLU2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | 85 | layer { 86 | name: "conv3" 87 | type: "Convolution" 88 | bottom: "conv2" 89 | top: "conv3" 90 | param { 91 | lr_mult: 1 92 | decay_mult: 1 93 | } 94 | param { 95 | lr_mult: 2 96 | decay_mult: 0 97 | } 98 | convolution_param { 99 | num_output: 32 100 | kernel_size: 3 101 | stride: 1 102 | weight_filler { 103 | type: "xavier" 104 | } 105 | bias_filler { 106 | type: "constant" 107 | value: 0 108 | } 109 | } 110 | } 111 | layer { 112 | name: "PReLU3" 113 | type: "PReLU" 114 | bottom: "conv3" 115 | top: "conv3" 116 | } 117 | 118 | 119 | layer { 120 | name: "conv4-1" 121 | type: "Convolution" 122 | bottom: "conv3" 123 | top: "conv4-1" 124 | param { 125 | lr_mult: 1 126 | decay_mult: 1 127 | } 128 | param { 129 | lr_mult: 2 130 | decay_mult: 0 131 | } 132 | convolution_param { 133 | num_output: 2 134 | kernel_size: 1 135 | stride: 1 136 | weight_filler { 137 | type: "xavier" 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | 146 | layer { 147 | name: "conv4-2" 148 | type: "Convolution" 149 | bottom: "conv3" 150 | top: "conv4-2" 151 | param { 152 | lr_mult: 1 153 | decay_mult: 1 154 | } 155 | param { 156 | lr_mult: 2 157 | decay_mult: 0 158 | } 159 | convolution_param { 160 | num_output: 4 161 | kernel_size: 1 162 | stride: 1 163 | weight_filler { 164 | type: "xavier" 165 | } 166 | bias_filler { 167 | type: "constant" 168 | value: 0 169 | } 170 | } 171 | } 172 | layer { 173 | name: "prob1" 174 | type: "Softmax" 175 | 
bottom: "conv4-1" 176 | top: "prob1" 177 | } 178 | -------------------------------------------------------------------------------- /caffe_models/det2.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/caffe_models/det2.caffemodel -------------------------------------------------------------------------------- /caffe_models/det2.prototxt: -------------------------------------------------------------------------------- 1 | name: "RNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 24 6 | input_dim: 24 7 | 8 | 9 | ########################## 10 | ###################### 11 | layer { 12 | name: "conv1" 13 | type: "Convolution" 14 | bottom: "data" 15 | top: "conv1" 16 | param { 17 | lr_mult: 0 18 | decay_mult: 0 19 | } 20 | param { 21 | lr_mult: 0 22 | decay_mult: 0 23 | } 24 | convolution_param { 25 | num_output: 28 26 | kernel_size: 3 27 | stride: 1 28 | weight_filler { 29 | type: "xavier" 30 | } 31 | bias_filler { 32 | type: "constant" 33 | value: 0 34 | } 35 | } 36 | } 37 | layer { 38 | name: "prelu1" 39 | type: "PReLU" 40 | bottom: "conv1" 41 | top: "conv1" 42 | propagate_down: true 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "conv1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 52 | stride: 2 53 | } 54 | } 55 | 56 | layer { 57 | name: "conv2" 58 | type: "Convolution" 59 | bottom: "pool1" 60 | top: "conv2" 61 | param { 62 | lr_mult: 0 63 | decay_mult: 0 64 | } 65 | param { 66 | lr_mult: 0 67 | decay_mult: 0 68 | } 69 | convolution_param { 70 | num_output: 48 71 | kernel_size: 3 72 | stride: 1 73 | weight_filler { 74 | type: "xavier" 75 | } 76 | bias_filler { 77 | type: "constant" 78 | value: 0 79 | } 80 | } 81 | } 82 | layer { 83 | name: "prelu2" 84 | type: "PReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | propagate_down: true 88 | } 89 | layer { 90 | name: "pool2" 91 | type: "Pooling" 92 | bottom: "conv2" 93 | top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | stride: 2 98 | } 99 | } 100 | #################################### 101 | 102 | ################################## 103 | layer { 104 | name: "conv3" 105 | type: "Convolution" 106 | bottom: "pool2" 107 | top: "conv3" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 64 118 | kernel_size: 2 119 | stride: 1 120 | weight_filler { 121 | type: "xavier" 122 | } 123 | bias_filler { 124 | type: "constant" 125 | value: 0 126 | } 127 | } 128 | } 129 | layer { 130 | name: "prelu3" 131 | type: "PReLU" 132 | bottom: "conv3" 133 | top: "conv3" 134 | propagate_down: true 135 | } 136 | ############################### 137 | 138 | ############################### 139 | 140 | layer { 141 | name: "conv4" 142 | type: "InnerProduct" 143 | bottom: "conv3" 144 | top: "conv4" 145 | param { 146 | lr_mult: 0 147 | decay_mult: 0 148 | } 149 | param { 150 | lr_mult: 0 151 | decay_mult: 0 152 | } 153 | inner_product_param { 154 | num_output: 128 155 | weight_filler { 156 | type: "xavier" 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0 161 | } 162 | } 163 | } 164 | layer { 165 | name: "prelu4" 166 | type: "PReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | 171 | layer { 172 | name: "conv5-1" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5-1" 176 | param { 177 | 
lr_mult: 0 178 | decay_mult: 0 179 | } 180 | param { 181 | lr_mult: 0 182 | decay_mult: 0 183 | } 184 | inner_product_param { 185 | num_output: 2 186 | #kernel_size: 1 187 | #stride: 1 188 | weight_filler { 189 | type: "xavier" 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0 194 | } 195 | } 196 | } 197 | layer { 198 | name: "conv5-2" 199 | type: "InnerProduct" 200 | bottom: "conv4" 201 | top: "conv5-2" 202 | param { 203 | lr_mult: 1 204 | decay_mult: 1 205 | } 206 | param { 207 | lr_mult: 2 208 | decay_mult: 1 209 | } 210 | inner_product_param { 211 | num_output: 4 212 | #kernel_size: 1 213 | #stride: 1 214 | weight_filler { 215 | type: "xavier" 216 | } 217 | bias_filler { 218 | type: "constant" 219 | value: 0 220 | } 221 | } 222 | } 223 | layer { 224 | name: "prob1" 225 | type: "Softmax" 226 | bottom: "conv5-1" 227 | top: "prob1" 228 | } -------------------------------------------------------------------------------- /caffe_models/det3.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/caffe_models/det3.caffemodel -------------------------------------------------------------------------------- /caffe_models/det3.prototxt: -------------------------------------------------------------------------------- 1 | name: "ONet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 48 6 | input_dim: 48 7 | ################################## 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 1 20 | } 21 | convolution_param { 22 | num_output: 32 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "prelu1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | stride: 2 49 | } 50 | } 51 | layer { 52 | name: "conv2" 53 | type: "Convolution" 54 | bottom: "pool1" 55 | top: "conv2" 56 | param { 57 | lr_mult: 1 58 | decay_mult: 1 59 | } 60 | param { 61 | lr_mult: 2 62 | decay_mult: 1 63 | } 64 | convolution_param { 65 | num_output: 64 66 | kernel_size: 3 67 | stride: 1 68 | weight_filler { 69 | type: "xavier" 70 | } 71 | bias_filler { 72 | type: "constant" 73 | value: 0 74 | } 75 | } 76 | } 77 | 78 | layer { 79 | name: "prelu2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "conv2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 92 | stride: 2 93 | } 94 | } 95 | 96 | layer { 97 | name: "conv3" 98 | type: "Convolution" 99 | bottom: "pool2" 100 | top: "conv3" 101 | param { 102 | lr_mult: 1 103 | decay_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | decay_mult: 1 108 | } 109 | convolution_param { 110 | num_output: 64 111 | kernel_size: 3 112 | weight_filler { 113 | type: "xavier" 114 | } 115 | bias_filler { 116 | type: "constant" 117 | value: 0 118 | } 119 | } 120 | } 121 | layer { 122 | name: "prelu3" 123 | type: "PReLU" 124 | bottom: "conv3" 125 | top: "conv3" 126 | } 127 | layer { 128 | name: "pool3" 129 | type: "Pooling" 130 | bottom: "conv3" 131 | top: "pool3" 132 | pooling_param { 133 | 
pool: MAX 134 | kernel_size: 2 135 | stride: 2 136 | } 137 | } 138 | layer { 139 | name: "conv4" 140 | type: "Convolution" 141 | bottom: "pool3" 142 | top: "conv4" 143 | param { 144 | lr_mult: 1 145 | decay_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | decay_mult: 1 150 | } 151 | convolution_param { 152 | num_output: 128 153 | kernel_size: 2 154 | weight_filler { 155 | type: "xavier" 156 | } 157 | bias_filler { 158 | type: "constant" 159 | value: 0 160 | } 161 | } 162 | } 163 | layer { 164 | name: "prelu4" 165 | type: "PReLU" 166 | bottom: "conv4" 167 | top: "conv4" 168 | } 169 | 170 | 171 | layer { 172 | name: "conv5" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5" 176 | param { 177 | lr_mult: 1 178 | decay_mult: 1 179 | } 180 | param { 181 | lr_mult: 2 182 | decay_mult: 1 183 | } 184 | inner_product_param { 185 | #kernel_size: 3 186 | num_output: 256 187 | weight_filler { 188 | type: "xavier" 189 | } 190 | bias_filler { 191 | type: "constant" 192 | value: 0 193 | } 194 | } 195 | } 196 | 197 | layer { 198 | name: "drop5" 199 | type: "Dropout" 200 | bottom: "conv5" 201 | top: "conv5" 202 | dropout_param { 203 | dropout_ratio: 0.25 204 | } 205 | } 206 | layer { 207 | name: "prelu5" 208 | type: "PReLU" 209 | bottom: "conv5" 210 | top: "conv5" 211 | } 212 | 213 | 214 | layer { 215 | name: "conv6-1" 216 | type: "InnerProduct" 217 | bottom: "conv5" 218 | top: "conv6-1" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 1 226 | } 227 | inner_product_param { 228 | #kernel_size: 1 229 | num_output: 2 230 | weight_filler { 231 | type: "xavier" 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0 236 | } 237 | } 238 | } 239 | layer { 240 | name: "conv6-2" 241 | type: "InnerProduct" 242 | bottom: "conv5" 243 | top: "conv6-2" 244 | param { 245 | lr_mult: 1 246 | decay_mult: 1 247 | } 248 | param { 249 | lr_mult: 2 250 | decay_mult: 1 251 | } 252 | inner_product_param { 253 | #kernel_size: 1 254 | num_output: 4 255 | weight_filler { 256 | type: "xavier" 257 | } 258 | bias_filler { 259 | type: "constant" 260 | value: 0 261 | } 262 | } 263 | } 264 | layer { 265 | name: "conv6-3" 266 | type: "InnerProduct" 267 | bottom: "conv5" 268 | top: "conv6-3" 269 | param { 270 | lr_mult: 1 271 | decay_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | decay_mult: 1 276 | } 277 | inner_product_param { 278 | #kernel_size: 1 279 | num_output: 10 280 | weight_filler { 281 | type: "xavier" 282 | } 283 | bias_filler { 284 | type: "constant" 285 | value: 0 286 | } 287 | } 288 | } 289 | layer { 290 | name: "prob1" 291 | type: "Softmax" 292 | bottom: "conv6-1" 293 | top: "prob1" 294 | } 295 | -------------------------------------------------------------------------------- /caffe_models/det4.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/caffe_models/det4.caffemodel -------------------------------------------------------------------------------- /caffe_models/det4.prototxt: -------------------------------------------------------------------------------- 1 | name: "LNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 15 5 | input_dim: 24 6 | input_dim: 24 7 | 8 | layer { 9 | name: "slicer_data" 10 | type: "Slice" 11 | bottom: "data" 12 | top: "data241" 13 | top: "data242" 14 | top: "data243" 15 | top: "data244" 16 | top: "data245" 17 | slice_param { 18 | axis: 1 19 | slice_point: 
3 20 | slice_point: 6 21 | slice_point: 9 22 | slice_point: 12 23 | } 24 | } 25 | layer { 26 | name: "conv1_1" 27 | type: "Convolution" 28 | bottom: "data241" 29 | top: "conv1_1" 30 | param { 31 | lr_mult: 1 32 | decay_mult: 1 33 | } 34 | param { 35 | lr_mult: 2 36 | decay_mult: 1 37 | } 38 | convolution_param { 39 | num_output: 28 40 | kernel_size: 3 41 | stride: 1 42 | weight_filler { 43 | type: "xavier" 44 | } 45 | bias_filler { 46 | type: "constant" 47 | value: 0 48 | } 49 | } 50 | 51 | } 52 | layer { 53 | name: "prelu1_1" 54 | type: "PReLU" 55 | bottom: "conv1_1" 56 | top: "conv1_1" 57 | 58 | } 59 | layer { 60 | name: "pool1_1" 61 | type: "Pooling" 62 | bottom: "conv1_1" 63 | top: "pool1_1" 64 | pooling_param { 65 | pool: MAX 66 | kernel_size: 3 67 | stride: 2 68 | } 69 | } 70 | 71 | layer { 72 | name: "conv2_1" 73 | type: "Convolution" 74 | bottom: "pool1_1" 75 | top: "conv2_1" 76 | param { 77 | lr_mult: 1 78 | decay_mult: 1 79 | } 80 | param { 81 | lr_mult: 2 82 | decay_mult: 1 83 | } 84 | convolution_param { 85 | num_output: 48 86 | kernel_size: 3 87 | stride: 1 88 | weight_filler { 89 | type: "xavier" 90 | } 91 | bias_filler { 92 | type: "constant" 93 | value: 0 94 | } 95 | } 96 | 97 | } 98 | layer { 99 | name: "prelu2_1" 100 | type: "PReLU" 101 | bottom: "conv2_1" 102 | top: "conv2_1" 103 | } 104 | layer { 105 | name: "pool2_1" 106 | type: "Pooling" 107 | bottom: "conv2_1" 108 | top: "pool2_1" 109 | pooling_param { 110 | pool: MAX 111 | kernel_size: 3 112 | stride: 2 113 | } 114 | 115 | } 116 | layer { 117 | name: "conv3_1" 118 | type: "Convolution" 119 | bottom: "pool2_1" 120 | top: "conv3_1" 121 | param { 122 | lr_mult: 1 123 | decay_mult: 1 124 | } 125 | param { 126 | lr_mult: 2 127 | decay_mult: 1 128 | } 129 | convolution_param { 130 | num_output: 64 131 | kernel_size: 2 132 | stride: 1 133 | weight_filler { 134 | type: "xavier" 135 | } 136 | bias_filler { 137 | type: "constant" 138 | value: 0 139 | } 140 | } 141 | 142 | } 143 | layer { 144 | name: "prelu3_1" 145 | type: "PReLU" 146 | bottom: "conv3_1" 147 | top: "conv3_1" 148 | } 149 | ########################## 150 | layer { 151 | name: "conv1_2" 152 | type: "Convolution" 153 | bottom: "data242" 154 | top: "conv1_2" 155 | param { 156 | lr_mult: 1 157 | decay_mult: 1 158 | } 159 | param { 160 | lr_mult: 2 161 | decay_mult: 1 162 | } 163 | convolution_param { 164 | num_output: 28 165 | kernel_size: 3 166 | stride: 1 167 | weight_filler { 168 | type: "xavier" 169 | } 170 | bias_filler { 171 | type: "constant" 172 | value: 0 173 | } 174 | } 175 | 176 | } 177 | layer { 178 | name: "prelu1_2" 179 | type: "PReLU" 180 | bottom: "conv1_2" 181 | top: "conv1_2" 182 | 183 | } 184 | layer { 185 | name: "pool1_2" 186 | type: "Pooling" 187 | bottom: "conv1_2" 188 | top: "pool1_2" 189 | pooling_param { 190 | pool: MAX 191 | kernel_size: 3 192 | stride: 2 193 | } 194 | } 195 | 196 | layer { 197 | name: "conv2_2" 198 | type: "Convolution" 199 | bottom: "pool1_2" 200 | top: "conv2_2" 201 | param { 202 | lr_mult: 1 203 | decay_mult: 1 204 | } 205 | param { 206 | lr_mult: 2 207 | decay_mult: 1 208 | } 209 | convolution_param { 210 | num_output: 48 211 | kernel_size: 3 212 | stride: 1 213 | weight_filler { 214 | type: "xavier" 215 | } 216 | bias_filler { 217 | type: "constant" 218 | value: 0 219 | } 220 | } 221 | 222 | } 223 | layer { 224 | name: "prelu2_2" 225 | type: "PReLU" 226 | bottom: "conv2_2" 227 | top: "conv2_2" 228 | } 229 | layer { 230 | name: "pool2_2" 231 | type: "Pooling" 232 | bottom: "conv2_2" 233 | top: "pool2_2" 234 | 
pooling_param { 235 | pool: MAX 236 | kernel_size: 3 237 | stride: 2 238 | } 239 | 240 | } 241 | layer { 242 | name: "conv3_2" 243 | type: "Convolution" 244 | bottom: "pool2_2" 245 | top: "conv3_2" 246 | param { 247 | lr_mult: 1 248 | decay_mult: 1 249 | } 250 | param { 251 | lr_mult: 2 252 | decay_mult: 1 253 | } 254 | convolution_param { 255 | num_output: 64 256 | kernel_size: 2 257 | stride: 1 258 | weight_filler { 259 | type: "xavier" 260 | } 261 | bias_filler { 262 | type: "constant" 263 | value: 0 264 | } 265 | } 266 | 267 | } 268 | layer { 269 | name: "prelu3_2" 270 | type: "PReLU" 271 | bottom: "conv3_2" 272 | top: "conv3_2" 273 | } 274 | ########################## 275 | ########################## 276 | layer { 277 | name: "conv1_3" 278 | type: "Convolution" 279 | bottom: "data243" 280 | top: "conv1_3" 281 | param { 282 | lr_mult: 1 283 | decay_mult: 1 284 | } 285 | param { 286 | lr_mult: 2 287 | decay_mult: 1 288 | } 289 | convolution_param { 290 | num_output: 28 291 | kernel_size: 3 292 | stride: 1 293 | weight_filler { 294 | type: "xavier" 295 | } 296 | bias_filler { 297 | type: "constant" 298 | value: 0 299 | } 300 | } 301 | 302 | } 303 | layer { 304 | name: "prelu1_3" 305 | type: "PReLU" 306 | bottom: "conv1_3" 307 | top: "conv1_3" 308 | 309 | } 310 | layer { 311 | name: "pool1_3" 312 | type: "Pooling" 313 | bottom: "conv1_3" 314 | top: "pool1_3" 315 | pooling_param { 316 | pool: MAX 317 | kernel_size: 3 318 | stride: 2 319 | } 320 | } 321 | 322 | layer { 323 | name: "conv2_3" 324 | type: "Convolution" 325 | bottom: "pool1_3" 326 | top: "conv2_3" 327 | param { 328 | lr_mult: 1 329 | decay_mult: 1 330 | } 331 | param { 332 | lr_mult: 2 333 | decay_mult: 1 334 | } 335 | convolution_param { 336 | num_output: 48 337 | kernel_size: 3 338 | stride: 1 339 | weight_filler { 340 | type: "xavier" 341 | } 342 | bias_filler { 343 | type: "constant" 344 | value: 0 345 | } 346 | } 347 | 348 | } 349 | layer { 350 | name: "prelu2_3" 351 | type: "PReLU" 352 | bottom: "conv2_3" 353 | top: "conv2_3" 354 | } 355 | layer { 356 | name: "pool2_3" 357 | type: "Pooling" 358 | bottom: "conv2_3" 359 | top: "pool2_3" 360 | pooling_param { 361 | pool: MAX 362 | kernel_size: 3 363 | stride: 2 364 | } 365 | 366 | } 367 | layer { 368 | name: "conv3_3" 369 | type: "Convolution" 370 | bottom: "pool2_3" 371 | top: "conv3_3" 372 | param { 373 | lr_mult: 1 374 | decay_mult: 1 375 | } 376 | param { 377 | lr_mult: 2 378 | decay_mult: 1 379 | } 380 | convolution_param { 381 | num_output: 64 382 | kernel_size: 2 383 | stride: 1 384 | weight_filler { 385 | type: "xavier" 386 | } 387 | bias_filler { 388 | type: "constant" 389 | value: 0 390 | } 391 | } 392 | 393 | } 394 | layer { 395 | name: "prelu3_3" 396 | type: "PReLU" 397 | bottom: "conv3_3" 398 | top: "conv3_3" 399 | } 400 | ########################## 401 | ########################## 402 | layer { 403 | name: "conv1_4" 404 | type: "Convolution" 405 | bottom: "data244" 406 | top: "conv1_4" 407 | param { 408 | lr_mult: 1 409 | decay_mult: 1 410 | } 411 | param { 412 | lr_mult: 2 413 | decay_mult: 1 414 | } 415 | convolution_param { 416 | num_output: 28 417 | kernel_size: 3 418 | stride: 1 419 | weight_filler { 420 | type: "xavier" 421 | } 422 | bias_filler { 423 | type: "constant" 424 | value: 0 425 | } 426 | } 427 | 428 | } 429 | layer { 430 | name: "prelu1_4" 431 | type: "PReLU" 432 | bottom: "conv1_4" 433 | top: "conv1_4" 434 | 435 | } 436 | layer { 437 | name: "pool1_4" 438 | type: "Pooling" 439 | bottom: "conv1_4" 440 | top: "pool1_4" 441 | pooling_param { 442 
| pool: MAX 443 | kernel_size: 3 444 | stride: 2 445 | } 446 | } 447 | 448 | layer { 449 | name: "conv2_4" 450 | type: "Convolution" 451 | bottom: "pool1_4" 452 | top: "conv2_4" 453 | param { 454 | lr_mult: 1 455 | decay_mult: 1 456 | } 457 | param { 458 | lr_mult: 2 459 | decay_mult: 1 460 | } 461 | convolution_param { 462 | num_output: 48 463 | kernel_size: 3 464 | stride: 1 465 | weight_filler { 466 | type: "xavier" 467 | } 468 | bias_filler { 469 | type: "constant" 470 | value: 0 471 | } 472 | } 473 | 474 | } 475 | layer { 476 | name: "prelu2_4" 477 | type: "PReLU" 478 | bottom: "conv2_4" 479 | top: "conv2_4" 480 | } 481 | layer { 482 | name: "pool2_4" 483 | type: "Pooling" 484 | bottom: "conv2_4" 485 | top: "pool2_4" 486 | pooling_param { 487 | pool: MAX 488 | kernel_size: 3 489 | stride: 2 490 | } 491 | 492 | } 493 | layer { 494 | name: "conv3_4" 495 | type: "Convolution" 496 | bottom: "pool2_4" 497 | top: "conv3_4" 498 | param { 499 | lr_mult: 1 500 | decay_mult: 1 501 | } 502 | param { 503 | lr_mult: 2 504 | decay_mult: 1 505 | } 506 | convolution_param { 507 | num_output: 64 508 | kernel_size: 2 509 | stride: 1 510 | weight_filler { 511 | type: "xavier" 512 | } 513 | bias_filler { 514 | type: "constant" 515 | value: 0 516 | } 517 | } 518 | 519 | } 520 | layer { 521 | name: "prelu3_4" 522 | type: "PReLU" 523 | bottom: "conv3_4" 524 | top: "conv3_4" 525 | } 526 | ########################## 527 | ########################## 528 | layer { 529 | name: "conv1_5" 530 | type: "Convolution" 531 | bottom: "data245" 532 | top: "conv1_5" 533 | param { 534 | lr_mult: 1 535 | decay_mult: 1 536 | } 537 | param { 538 | lr_mult: 2 539 | decay_mult: 1 540 | } 541 | convolution_param { 542 | num_output: 28 543 | kernel_size: 3 544 | stride: 1 545 | weight_filler { 546 | type: "xavier" 547 | } 548 | bias_filler { 549 | type: "constant" 550 | value: 0 551 | } 552 | } 553 | 554 | } 555 | layer { 556 | name: "prelu1_5" 557 | type: "PReLU" 558 | bottom: "conv1_5" 559 | top: "conv1_5" 560 | 561 | } 562 | layer { 563 | name: "pool1_5" 564 | type: "Pooling" 565 | bottom: "conv1_5" 566 | top: "pool1_5" 567 | pooling_param { 568 | pool: MAX 569 | kernel_size: 3 570 | stride: 2 571 | } 572 | } 573 | 574 | layer { 575 | name: "conv2_5" 576 | type: "Convolution" 577 | bottom: "pool1_5" 578 | top: "conv2_5" 579 | param { 580 | lr_mult: 1 581 | decay_mult: 1 582 | } 583 | param { 584 | lr_mult: 2 585 | decay_mult: 1 586 | } 587 | convolution_param { 588 | num_output: 48 589 | kernel_size: 3 590 | stride: 1 591 | weight_filler { 592 | type: "xavier" 593 | } 594 | bias_filler { 595 | type: "constant" 596 | value: 0 597 | } 598 | } 599 | 600 | } 601 | layer { 602 | name: "prelu2_5" 603 | type: "PReLU" 604 | bottom: "conv2_5" 605 | top: "conv2_5" 606 | } 607 | layer { 608 | name: "pool2_5" 609 | type: "Pooling" 610 | bottom: "conv2_5" 611 | top: "pool2_5" 612 | pooling_param { 613 | pool: MAX 614 | kernel_size: 3 615 | stride: 2 616 | } 617 | 618 | } 619 | layer { 620 | name: "conv3_5" 621 | type: "Convolution" 622 | bottom: "pool2_5" 623 | top: "conv3_5" 624 | param { 625 | lr_mult: 1 626 | decay_mult: 1 627 | } 628 | param { 629 | lr_mult: 2 630 | decay_mult: 1 631 | } 632 | convolution_param { 633 | num_output: 64 634 | kernel_size: 2 635 | stride: 1 636 | weight_filler { 637 | type: "xavier" 638 | } 639 | bias_filler { 640 | type: "constant" 641 | value: 0 642 | } 643 | } 644 | 645 | } 646 | layer { 647 | name: "prelu3_5" 648 | type: "PReLU" 649 | bottom: "conv3_5" 650 | top: "conv3_5" 651 | } 652 | 
########################## 653 | layer { 654 | name: "concat" 655 | bottom: "conv3_1" 656 | bottom: "conv3_2" 657 | bottom: "conv3_3" 658 | bottom: "conv3_4" 659 | bottom: "conv3_5" 660 | top: "conv3" 661 | type: "Concat" 662 | concat_param { 663 | axis: 1 664 | } 665 | } 666 | ########################## 667 | layer { 668 | name: "fc4" 669 | type: "InnerProduct" 670 | bottom: "conv3" 671 | top: "fc4" 672 | param { 673 | lr_mult: 1 674 | decay_mult: 1 675 | } 676 | param { 677 | lr_mult: 2 678 | decay_mult: 1 679 | } 680 | inner_product_param { 681 | num_output: 256 682 | weight_filler { 683 | type: "xavier" 684 | } 685 | bias_filler { 686 | type: "constant" 687 | value: 0 688 | } 689 | } 690 | 691 | } 692 | layer { 693 | name: "prelu4" 694 | type: "PReLU" 695 | bottom: "fc4" 696 | top: "fc4" 697 | } 698 | ############################ 699 | layer { 700 | name: "fc4_1" 701 | type: "InnerProduct" 702 | bottom: "fc4" 703 | top: "fc4_1" 704 | param { 705 | lr_mult: 1 706 | decay_mult: 1 707 | } 708 | param { 709 | lr_mult: 2 710 | decay_mult: 1 711 | } 712 | inner_product_param { 713 | num_output: 64 714 | weight_filler { 715 | type: "xavier" 716 | } 717 | bias_filler { 718 | type: "constant" 719 | value: 0 720 | } 721 | } 722 | 723 | } 724 | layer { 725 | name: "prelu4_1" 726 | type: "PReLU" 727 | bottom: "fc4_1" 728 | top: "fc4_1" 729 | } 730 | layer { 731 | name: "fc5_1" 732 | type: "InnerProduct" 733 | bottom: "fc4_1" 734 | top: "fc5_1" 735 | param { 736 | lr_mult: 1 737 | decay_mult: 1 738 | } 739 | param { 740 | lr_mult: 2 741 | decay_mult: 1 742 | } 743 | inner_product_param { 744 | num_output: 2 745 | weight_filler { 746 | type: "xavier" 747 | #type: "constant" 748 | #value: 0 749 | } 750 | bias_filler { 751 | type: "constant" 752 | value: 0 753 | } 754 | } 755 | } 756 | 757 | 758 | ######################### 759 | layer { 760 | name: "fc4_2" 761 | type: "InnerProduct" 762 | bottom: "fc4" 763 | top: "fc4_2" 764 | param { 765 | lr_mult: 1 766 | decay_mult: 1 767 | } 768 | param { 769 | lr_mult: 2 770 | decay_mult: 1 771 | } 772 | inner_product_param { 773 | num_output: 64 774 | weight_filler { 775 | type: "xavier" 776 | } 777 | bias_filler { 778 | type: "constant" 779 | value: 0 780 | } 781 | } 782 | 783 | } 784 | layer { 785 | name: "prelu4_2" 786 | type: "PReLU" 787 | bottom: "fc4_2" 788 | top: "fc4_2" 789 | } 790 | layer { 791 | name: "fc5_2" 792 | type: "InnerProduct" 793 | bottom: "fc4_2" 794 | top: "fc5_2" 795 | param { 796 | lr_mult: 1 797 | decay_mult: 1 798 | } 799 | param { 800 | lr_mult: 2 801 | decay_mult: 1 802 | } 803 | inner_product_param { 804 | num_output: 2 805 | weight_filler { 806 | type: "xavier" 807 | #type: "constant" 808 | #value: 0 809 | } 810 | bias_filler { 811 | type: "constant" 812 | value: 0 813 | } 814 | } 815 | } 816 | 817 | ######################### 818 | layer { 819 | name: "fc4_3" 820 | type: "InnerProduct" 821 | bottom: "fc4" 822 | top: "fc4_3" 823 | param { 824 | lr_mult: 1 825 | decay_mult: 1 826 | } 827 | param { 828 | lr_mult: 2 829 | decay_mult: 1 830 | } 831 | inner_product_param { 832 | num_output: 64 833 | weight_filler { 834 | type: "xavier" 835 | } 836 | bias_filler { 837 | type: "constant" 838 | value: 0 839 | } 840 | } 841 | 842 | } 843 | layer { 844 | name: "prelu4_3" 845 | type: "PReLU" 846 | bottom: "fc4_3" 847 | top: "fc4_3" 848 | } 849 | layer { 850 | name: "fc5_3" 851 | type: "InnerProduct" 852 | bottom: "fc4_3" 853 | top: "fc5_3" 854 | param { 855 | lr_mult: 1 856 | decay_mult: 1 857 | } 858 | param { 859 | lr_mult: 2 860 | 
decay_mult: 1 861 | } 862 | inner_product_param { 863 | num_output: 2 864 | weight_filler { 865 | type: "xavier" 866 | #type: "constant" 867 | #value: 0 868 | } 869 | bias_filler { 870 | type: "constant" 871 | value: 0 872 | } 873 | } 874 | } 875 | 876 | ######################### 877 | layer { 878 | name: "fc4_4" 879 | type: "InnerProduct" 880 | bottom: "fc4" 881 | top: "fc4_4" 882 | param { 883 | lr_mult: 1 884 | decay_mult: 1 885 | } 886 | param { 887 | lr_mult: 2 888 | decay_mult: 1 889 | } 890 | inner_product_param { 891 | num_output: 64 892 | weight_filler { 893 | type: "xavier" 894 | } 895 | bias_filler { 896 | type: "constant" 897 | value: 0 898 | } 899 | } 900 | 901 | } 902 | layer { 903 | name: "prelu4_4" 904 | type: "PReLU" 905 | bottom: "fc4_4" 906 | top: "fc4_4" 907 | } 908 | layer { 909 | name: "fc5_4" 910 | type: "InnerProduct" 911 | bottom: "fc4_4" 912 | top: "fc5_4" 913 | param { 914 | lr_mult: 1 915 | decay_mult: 1 916 | } 917 | param { 918 | lr_mult: 2 919 | decay_mult: 1 920 | } 921 | inner_product_param { 922 | num_output: 2 923 | weight_filler { 924 | type: "xavier" 925 | #type: "constant" 926 | #value: 0 927 | } 928 | bias_filler { 929 | type: "constant" 930 | value: 0 931 | } 932 | } 933 | } 934 | 935 | ######################### 936 | layer { 937 | name: "fc4_5" 938 | type: "InnerProduct" 939 | bottom: "fc4" 940 | top: "fc4_5" 941 | param { 942 | lr_mult: 1 943 | decay_mult: 1 944 | } 945 | param { 946 | lr_mult: 2 947 | decay_mult: 1 948 | } 949 | inner_product_param { 950 | num_output: 64 951 | weight_filler { 952 | type: "xavier" 953 | } 954 | bias_filler { 955 | type: "constant" 956 | value: 0 957 | } 958 | } 959 | 960 | } 961 | layer { 962 | name: "prelu4_5" 963 | type: "PReLU" 964 | bottom: "fc4_5" 965 | top: "fc4_5" 966 | } 967 | layer { 968 | name: "fc5_5" 969 | type: "InnerProduct" 970 | bottom: "fc4_5" 971 | top: "fc5_5" 972 | param { 973 | lr_mult: 1 974 | decay_mult: 1 975 | } 976 | param { 977 | lr_mult: 2 978 | decay_mult: 1 979 | } 980 | inner_product_param { 981 | num_output: 2 982 | weight_filler { 983 | type: "xavier" 984 | #type: "constant" 985 | #value: 0 986 | } 987 | bias_filler { 988 | type: "constant" 989 | value: 0 990 | } 991 | } 992 | } 993 | 994 | ######################### 995 | 996 | -------------------------------------------------------------------------------- /extract_weights_from_caffe_models.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | 4 | """ 5 | The purpose of this script is to convert pretrained weights taken from 6 | official implementation here: 7 | https://github.com/kpzhang93/MTCNN_face_detection_alignment/tree/master/code/codes/MTCNNv2 8 | to required format. 9 | 10 | In a nutshell, it just renames and transposes some of the weights. 11 | You don't have to use this script because weights are already in `src/weights`. 12 | """ 13 | 14 | 15 | def get_all_weights(net): 16 | all_weights = {} 17 | for p in net.params: 18 | if 'conv' in p: 19 | name = 'features.' 
+ p 20 | if '-' in p: 21 | s = list(p) 22 | s[-2] = '_' 23 | s = ''.join(s) 24 | all_weights[s + '.weight'] = net.params[p][0].data 25 | all_weights[s + '.bias'] = net.params[p][1].data 26 | elif len(net.params[p][0].data.shape) == 4: 27 | all_weights[name + '.weight'] = net.params[p][0].data.transpose((0, 1, 3, 2)) 28 | all_weights[name + '.bias'] = net.params[p][1].data 29 | else: 30 | all_weights[name + '.weight'] = net.params[p][0].data 31 | all_weights[name + '.bias'] = net.params[p][1].data 32 | elif 'prelu' in p.lower(): 33 | all_weights['features.' + p.lower() + '.weight'] = net.params[p][0].data 34 | return all_weights 35 | 36 | 37 | # P-Net 38 | net = caffe.Net('caffe_models/det1.prototxt', 'caffe_models/det1.caffemodel', caffe.TEST) 39 | np.save('src/weights/pnet.npy', get_all_weights(net)) 40 | 41 | # R-Net 42 | net = caffe.Net('caffe_models/det2.prototxt', 'caffe_models/det2.caffemodel', caffe.TEST) 43 | np.save('src/weights/rnet.npy', get_all_weights(net)) 44 | 45 | # O-Net 46 | net = caffe.Net('caffe_models/det3.prototxt', 'caffe_models/det3.caffemodel', caffe.TEST) 47 | np.save('src/weights/onet.npy', get_all_weights(net)) 48 | -------------------------------------------------------------------------------- /images/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/example.png -------------------------------------------------------------------------------- /images/office1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office1.jpg -------------------------------------------------------------------------------- /images/office2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office2.jpg -------------------------------------------------------------------------------- /images/office3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office3.jpg -------------------------------------------------------------------------------- /images/office4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office4.jpg -------------------------------------------------------------------------------- /images/office5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office5.jpg -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from .visualization_utils import show_bboxes 2 | from .detector import detect_faces 3 | -------------------------------------------------------------------------------- /src/box_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | 5 | def nms(boxes, overlap_threshold=0.5, 
mode='union'): 6 | """Non-maximum suppression. 7 | 8 | Arguments: 9 | boxes: a float numpy array of shape [n, 5], 10 | where each row is (xmin, ymin, xmax, ymax, score). 11 | overlap_threshold: a float number. 12 | mode: 'union' or 'min'. 13 | 14 | Returns: 15 | list with indices of the selected boxes 16 | """ 17 | 18 | # if there are no boxes, return the empty list 19 | if len(boxes) == 0: 20 | return [] 21 | 22 | # list of picked indices 23 | pick = [] 24 | 25 | # grab the coordinates of the bounding boxes 26 | x1, y1, x2, y2, score = [boxes[:, i] for i in range(5)] 27 | 28 | area = (x2 - x1 + 1.0)*(y2 - y1 + 1.0) 29 | ids = np.argsort(score) # in increasing order 30 | 31 | while len(ids) > 0: 32 | 33 | # grab index of the largest value 34 | last = len(ids) - 1 35 | i = ids[last] 36 | pick.append(i) 37 | 38 | # compute intersections 39 | # of the box with the largest score 40 | # with the rest of boxes 41 | 42 | # left top corner of intersection boxes 43 | ix1 = np.maximum(x1[i], x1[ids[:last]]) 44 | iy1 = np.maximum(y1[i], y1[ids[:last]]) 45 | 46 | # right bottom corner of intersection boxes 47 | ix2 = np.minimum(x2[i], x2[ids[:last]]) 48 | iy2 = np.minimum(y2[i], y2[ids[:last]]) 49 | 50 | # width and height of intersection boxes 51 | w = np.maximum(0.0, ix2 - ix1 + 1.0) 52 | h = np.maximum(0.0, iy2 - iy1 + 1.0) 53 | 54 | # intersections' areas 55 | inter = w * h 56 | if mode == 'min': 57 | overlap = inter/np.minimum(area[i], area[ids[:last]]) 58 | elif mode == 'union': 59 | # intersection over union (IoU) 60 | overlap = inter/(area[i] + area[ids[:last]] - inter) 61 | 62 | # delete all boxes where overlap is too big 63 | ids = np.delete( 64 | ids, 65 | np.concatenate([[last], np.where(overlap > overlap_threshold)[0]]) 66 | ) 67 | 68 | return pick 69 | 70 | 71 | def convert_to_square(bboxes): 72 | """Convert bounding boxes to a square form. 73 | 74 | Arguments: 75 | bboxes: a float numpy array of shape [n, 5]. 76 | 77 | Returns: 78 | a float numpy array of shape [n, 5], 79 | squared bounding boxes. 80 | """ 81 | 82 | square_bboxes = np.zeros_like(bboxes) 83 | x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] 84 | h = y2 - y1 + 1.0 85 | w = x2 - x1 + 1.0 86 | max_side = np.maximum(h, w) 87 | square_bboxes[:, 0] = x1 + w*0.5 - max_side*0.5 88 | square_bboxes[:, 1] = y1 + h*0.5 - max_side*0.5 89 | square_bboxes[:, 2] = square_bboxes[:, 0] + max_side - 1.0 90 | square_bboxes[:, 3] = square_bboxes[:, 1] + max_side - 1.0 91 | return square_bboxes 92 | 93 | 94 | def calibrate_box(bboxes, offsets): 95 | """Transform bounding boxes to be more like true bounding boxes. 96 | 'offsets' is one of the outputs of the nets. 97 | 98 | Arguments: 99 | bboxes: a float numpy array of shape [n, 5]. 100 | offsets: a float numpy array of shape [n, 4]. 101 | 102 | Returns: 103 | a float numpy array of shape [n, 5]. 104 | """ 105 | x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] 106 | w = x2 - x1 + 1.0 107 | h = y2 - y1 + 1.0 108 | w = np.expand_dims(w, 1) 109 | h = np.expand_dims(h, 1) 110 | 111 | # this is what happening here: 112 | # tx1, ty1, tx2, ty2 = [offsets[:, i] for i in range(4)] 113 | # x1_true = x1 + tx1*w 114 | # y1_true = y1 + ty1*h 115 | # x2_true = x2 + tx2*w 116 | # y2_true = y2 + ty2*h 117 | # below is just more compact form of this 118 | 119 | # are offsets always such that 120 | # x1 < x2 and y1 < y2 ? 
121 | 122 | translation = np.hstack([w, h, w, h])*offsets 123 | bboxes[:, 0:4] = bboxes[:, 0:4] + translation 124 | return bboxes 125 | 126 | 127 | def get_image_boxes(bounding_boxes, img, size=24): 128 | """Cut out boxes from the image. 129 | 130 | Arguments: 131 | bounding_boxes: a float numpy array of shape [n, 5]. 132 | img: an instance of PIL.Image. 133 | size: an integer, size of cutouts. 134 | 135 | Returns: 136 | a float numpy array of shape [n, 3, size, size]. 137 | """ 138 | 139 | num_boxes = len(bounding_boxes) 140 | width, height = img.size 141 | 142 | [dy, edy, dx, edx, y, ey, x, ex, w, h] = correct_bboxes(bounding_boxes, width, height) 143 | img_boxes = np.zeros((num_boxes, 3, size, size), 'float32') 144 | 145 | for i in range(num_boxes): 146 | img_box = np.zeros((h[i], w[i], 3), 'uint8') 147 | 148 | img_array = np.asarray(img, 'uint8') 149 | img_box[dy[i]:(edy[i] + 1), dx[i]:(edx[i] + 1), :] =\ 150 | img_array[y[i]:(ey[i] + 1), x[i]:(ex[i] + 1), :] 151 | 152 | # resize 153 | img_box = Image.fromarray(img_box) 154 | img_box = img_box.resize((size, size), Image.BILINEAR) 155 | img_box = np.asarray(img_box, 'float32') 156 | 157 | img_boxes[i, :, :, :] = _preprocess(img_box) 158 | 159 | return img_boxes 160 | 161 | 162 | def correct_bboxes(bboxes, width, height): 163 | """Crop boxes that are too big and get coordinates 164 | with respect to cutouts. 165 | 166 | Arguments: 167 | bboxes: a float numpy array of shape [n, 5], 168 | where each row is (xmin, ymin, xmax, ymax, score). 169 | width: a float number. 170 | height: a float number. 171 | 172 | Returns: 173 | dy, dx, edy, edx: a int numpy arrays of shape [n], 174 | coordinates of the boxes with respect to the cutouts. 175 | y, x, ey, ex: a int numpy arrays of shape [n], 176 | corrected ymin, xmin, ymax, xmax. 177 | h, w: a int numpy arrays of shape [n], 178 | just heights and widths of boxes. 179 | 180 | in the following order: 181 | [dy, edy, dx, edx, y, ey, x, ex, w, h]. 182 | """ 183 | 184 | x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] 185 | w, h = x2 - x1 + 1.0, y2 - y1 + 1.0 186 | num_boxes = bboxes.shape[0] 187 | 188 | # 'e' stands for end 189 | # (x, y) -> (ex, ey) 190 | x, y, ex, ey = x1, y1, x2, y2 191 | 192 | # we need to cut out a box from the image. 193 | # (x, y, ex, ey) are corrected coordinates of the box 194 | # in the image. 195 | # (dx, dy, edx, edy) are coordinates of the box in the cutout 196 | # from the image. 197 | dx, dy = np.zeros((num_boxes,)), np.zeros((num_boxes,)) 198 | edx, edy = w.copy() - 1.0, h.copy() - 1.0 199 | 200 | # if box's bottom right corner is too far right 201 | ind = np.where(ex > width - 1.0)[0] 202 | edx[ind] = w[ind] + width - 2.0 - ex[ind] 203 | ex[ind] = width - 1.0 204 | 205 | # if box's bottom right corner is too low 206 | ind = np.where(ey > height - 1.0)[0] 207 | edy[ind] = h[ind] + height - 2.0 - ey[ind] 208 | ey[ind] = height - 1.0 209 | 210 | # if box's top left corner is too far left 211 | ind = np.where(x < 0.0)[0] 212 | dx[ind] = 0.0 - x[ind] 213 | x[ind] = 0.0 214 | 215 | # if box's top left corner is too high 216 | ind = np.where(y < 0.0)[0] 217 | dy[ind] = 0.0 - y[ind] 218 | y[ind] = 0.0 219 | 220 | return_list = [dy, edy, dx, edx, y, ey, x, ex, w, h] 221 | return_list = [i.astype('int32') for i in return_list] 222 | 223 | return return_list 224 | 225 | 226 | def _preprocess(img): 227 | """Preprocessing step before feeding the network. 228 | 229 | Arguments: 230 | img: a float numpy array of shape [h, w, c]. 
231 | 232 | Returns: 233 | a float numpy array of shape [1, c, h, w]. 234 | """ 235 | img = img.transpose((2, 0, 1)) 236 | img = np.expand_dims(img, 0) 237 | img = (img - 127.5)*0.0078125 238 | return img 239 | -------------------------------------------------------------------------------- /src/detector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import Variable 4 | from .get_nets import PNet, RNet, ONet 5 | from .box_utils import nms, calibrate_box, get_image_boxes, convert_to_square 6 | from .first_stage import run_first_stage 7 | 8 | 9 | def detect_faces(image, min_face_size=20.0, 10 | thresholds=[0.6, 0.7, 0.8], 11 | nms_thresholds=[0.7, 0.7, 0.7]): 12 | """ 13 | Arguments: 14 | image: an instance of PIL.Image. 15 | min_face_size: a float number. 16 | thresholds: a list of length 3. 17 | nms_thresholds: a list of length 3. 18 | 19 | Returns: 20 | two float numpy arrays of shapes [n_boxes, 4] and [n_boxes, 10], 21 | bounding boxes and facial landmarks. 22 | """ 23 | 24 | # LOAD MODELS 25 | pnet = PNet() 26 | rnet = RNet() 27 | onet = ONet() 28 | onet.eval() 29 | 30 | # BUILD AN IMAGE PYRAMID 31 | width, height = image.size 32 | min_length = min(height, width) 33 | 34 | min_detection_size = 12 35 | factor = 0.707 # sqrt(0.5) 36 | 37 | # scales for scaling the image 38 | scales = [] 39 | 40 | # scales the image so that 41 | # minimum size that we can detect equals to 42 | # minimum face size that we want to detect 43 | m = min_detection_size/min_face_size 44 | min_length *= m 45 | 46 | factor_count = 0 47 | while min_length > min_detection_size: 48 | scales.append(m*factor**factor_count) 49 | min_length *= factor 50 | factor_count += 1 51 | 52 | # STAGE 1 53 | 54 | # it will be returned 55 | bounding_boxes = [] 56 | 57 | # run P-Net on different scales 58 | for s in scales: 59 | boxes = run_first_stage(image, pnet, scale=s, threshold=thresholds[0]) 60 | bounding_boxes.append(boxes) 61 | 62 | # collect boxes (and offsets, and scores) from different scales 63 | bounding_boxes = [i for i in bounding_boxes if i is not None] 64 | bounding_boxes = np.vstack(bounding_boxes) 65 | 66 | keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0]) 67 | bounding_boxes = bounding_boxes[keep] 68 | 69 | # use offsets predicted by pnet to transform bounding boxes 70 | bounding_boxes = calibrate_box(bounding_boxes[:, 0:5], bounding_boxes[:, 5:]) 71 | # shape [n_boxes, 5] 72 | 73 | bounding_boxes = convert_to_square(bounding_boxes) 74 | bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) 75 | 76 | # STAGE 2 77 | 78 | img_boxes = get_image_boxes(bounding_boxes, image, size=24) 79 | img_boxes = Variable(torch.FloatTensor(img_boxes), volatile=True) 80 | output = rnet(img_boxes) 81 | offsets = output[0].data.numpy() # shape [n_boxes, 4] 82 | probs = output[1].data.numpy() # shape [n_boxes, 2] 83 | 84 | keep = np.where(probs[:, 1] > thresholds[1])[0] 85 | bounding_boxes = bounding_boxes[keep] 86 | bounding_boxes[:, 4] = probs[keep, 1].reshape((-1,)) 87 | offsets = offsets[keep] 88 | 89 | keep = nms(bounding_boxes, nms_thresholds[1]) 90 | bounding_boxes = bounding_boxes[keep] 91 | bounding_boxes = calibrate_box(bounding_boxes, offsets[keep]) 92 | bounding_boxes = convert_to_square(bounding_boxes) 93 | bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) 94 | 95 | # STAGE 3 96 | 97 | img_boxes = get_image_boxes(bounding_boxes, image, size=48) 98 | if len(img_boxes) == 0: 99 | return [], [] 100 | img_boxes 
= Variable(torch.FloatTensor(img_boxes), volatile=True) 101 | output = onet(img_boxes) 102 | landmarks = output[0].data.numpy() # shape [n_boxes, 10] 103 | offsets = output[1].data.numpy() # shape [n_boxes, 4] 104 | probs = output[2].data.numpy() # shape [n_boxes, 2] 105 | 106 | keep = np.where(probs[:, 1] > thresholds[2])[0] 107 | bounding_boxes = bounding_boxes[keep] 108 | bounding_boxes[:, 4] = probs[keep, 1].reshape((-1,)) 109 | offsets = offsets[keep] 110 | landmarks = landmarks[keep] 111 | 112 | # compute landmark points 113 | width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0 114 | height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0 115 | xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1] 116 | landmarks[:, 0:5] = np.expand_dims(xmin, 1) + np.expand_dims(width, 1)*landmarks[:, 0:5] 117 | landmarks[:, 5:10] = np.expand_dims(ymin, 1) + np.expand_dims(height, 1)*landmarks[:, 5:10] 118 | 119 | bounding_boxes = calibrate_box(bounding_boxes, offsets) 120 | keep = nms(bounding_boxes, nms_thresholds[2], mode='min') 121 | bounding_boxes = bounding_boxes[keep] 122 | landmarks = landmarks[keep] 123 | 124 | return bounding_boxes, landmarks 125 | -------------------------------------------------------------------------------- /src/first_stage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import math 4 | from PIL import Image 5 | import numpy as np 6 | from .box_utils import nms, _preprocess 7 | 8 | 9 | def run_first_stage(image, net, scale, threshold): 10 | """Run P-Net, generate bounding boxes, and do NMS. 11 | 12 | Arguments: 13 | image: an instance of PIL.Image. 14 | net: an instance of pytorch's nn.Module, P-Net. 15 | scale: a float number, 16 | scale width and height of the image by this number. 17 | threshold: a float number, 18 | threshold on the probability of a face when generating 19 | bounding boxes from predictions of the net. 20 | 21 | Returns: 22 | a float numpy array of shape [n_boxes, 9], 23 | bounding boxes with scores and offsets (4 + 1 + 4). 24 | """ 25 | 26 | # scale the image and convert it to a float array 27 | width, height = image.size 28 | sw, sh = math.ceil(width*scale), math.ceil(height*scale) 29 | img = image.resize((sw, sh), Image.BILINEAR) 30 | img = np.asarray(img, 'float32') 31 | 32 | img = Variable(torch.FloatTensor(_preprocess(img)), volatile=True) 33 | output = net(img) 34 | probs = output[1].data.numpy()[0, 1, :, :] 35 | offsets = output[0].data.numpy() 36 | # probs: probability of a face at each sliding window 37 | # offsets: transformations to true bounding boxes 38 | 39 | boxes = _generate_bboxes(probs, offsets, scale, threshold) 40 | if len(boxes) == 0: 41 | return None 42 | 43 | keep = nms(boxes[:, 0:5], overlap_threshold=0.5) 44 | return boxes[keep] 45 | 46 | 47 | def _generate_bboxes(probs, offsets, scale, threshold): 48 | """Generate bounding boxes at places 49 | where there is probably a face. 50 | 51 | Arguments: 52 | probs: a float numpy array of shape [n, m]. 53 | offsets: a float numpy array of shape [1, 4, n, m]. 54 | scale: a float number, 55 | width and height of the image were scaled by this number. 56 | threshold: a float number. 
57 | 58 | Returns: 59 | a float numpy array of shape [n_boxes, 9] 60 | """ 61 | 62 | # applying P-Net is equivalent, in some sense, to 63 | # moving 12x12 window with stride 2 64 | stride = 2 65 | cell_size = 12 66 | 67 | # indices of boxes where there is probably a face 68 | inds = np.where(probs > threshold) 69 | 70 | if inds[0].size == 0: 71 | return np.array([]) 72 | 73 | # transformations of bounding boxes 74 | tx1, ty1, tx2, ty2 = [offsets[0, i, inds[0], inds[1]] for i in range(4)] 75 | # they are defined as: 76 | # w = x2 - x1 + 1 77 | # h = y2 - y1 + 1 78 | # x1_true = x1 + tx1*w 79 | # x2_true = x2 + tx2*w 80 | # y1_true = y1 + ty1*h 81 | # y2_true = y2 + ty2*h 82 | 83 | offsets = np.array([tx1, ty1, tx2, ty2]) 84 | score = probs[inds[0], inds[1]] 85 | 86 | # P-Net is applied to scaled images 87 | # so we need to rescale bounding boxes back 88 | bounding_boxes = np.vstack([ 89 | np.round((stride*inds[1] + 1.0)/scale), 90 | np.round((stride*inds[0] + 1.0)/scale), 91 | np.round((stride*inds[1] + 1.0 + cell_size)/scale), 92 | np.round((stride*inds[0] + 1.0 + cell_size)/scale), 93 | score, offsets 94 | ]) 95 | # why one is added? 96 | 97 | return bounding_boxes.T 98 | -------------------------------------------------------------------------------- /src/get_nets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import numpy as np 6 | 7 | 8 | class Flatten(nn.Module): 9 | 10 | def __init__(self): 11 | super(Flatten, self).__init__() 12 | 13 | def forward(self, x): 14 | """ 15 | Arguments: 16 | x: a float tensor with shape [batch_size, c, h, w]. 17 | Returns: 18 | a float tensor with shape [batch_size, c*h*w]. 19 | """ 20 | 21 | # without this pretrained model isn't working 22 | x = x.transpose(3, 2).contiguous() 23 | 24 | return x.view(x.size(0), -1) 25 | 26 | 27 | class PNet(nn.Module): 28 | 29 | def __init__(self): 30 | 31 | super(PNet, self).__init__() 32 | 33 | # suppose we have input with size HxW, then 34 | # after first layer: H - 2, 35 | # after pool: ceil((H - 2)/2), 36 | # after second conv: ceil((H - 2)/2) - 2, 37 | # after last conv: ceil((H - 2)/2) - 4, 38 | # and the same for W 39 | 40 | self.features = nn.Sequential(OrderedDict([ 41 | ('conv1', nn.Conv2d(3, 10, 3, 1)), 42 | ('prelu1', nn.PReLU(10)), 43 | ('pool1', nn.MaxPool2d(2, 2, ceil_mode=True)), 44 | 45 | ('conv2', nn.Conv2d(10, 16, 3, 1)), 46 | ('prelu2', nn.PReLU(16)), 47 | 48 | ('conv3', nn.Conv2d(16, 32, 3, 1)), 49 | ('prelu3', nn.PReLU(32)) 50 | ])) 51 | 52 | self.conv4_1 = nn.Conv2d(32, 2, 1, 1) 53 | self.conv4_2 = nn.Conv2d(32, 4, 1, 1) 54 | 55 | weights = np.load('src/weights/pnet.npy')[()] 56 | for n, p in self.named_parameters(): 57 | p.data = torch.FloatTensor(weights[n]) 58 | 59 | def forward(self, x): 60 | """ 61 | Arguments: 62 | x: a float tensor with shape [batch_size, 3, h, w]. 63 | Returns: 64 | b: a float tensor with shape [batch_size, 4, h', w']. 65 | a: a float tensor with shape [batch_size, 2, h', w']. 
66 | """ 67 | x = self.features(x) 68 | a = self.conv4_1(x) 69 | b = self.conv4_2(x) 70 | a = F.softmax(a) 71 | return b, a 72 | 73 | 74 | class RNet(nn.Module): 75 | 76 | def __init__(self): 77 | 78 | super(RNet, self).__init__() 79 | 80 | self.features = nn.Sequential(OrderedDict([ 81 | ('conv1', nn.Conv2d(3, 28, 3, 1)), 82 | ('prelu1', nn.PReLU(28)), 83 | ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), 84 | 85 | ('conv2', nn.Conv2d(28, 48, 3, 1)), 86 | ('prelu2', nn.PReLU(48)), 87 | ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), 88 | 89 | ('conv3', nn.Conv2d(48, 64, 2, 1)), 90 | ('prelu3', nn.PReLU(64)), 91 | 92 | ('flatten', Flatten()), 93 | ('conv4', nn.Linear(576, 128)), 94 | ('prelu4', nn.PReLU(128)) 95 | ])) 96 | 97 | self.conv5_1 = nn.Linear(128, 2) 98 | self.conv5_2 = nn.Linear(128, 4) 99 | 100 | weights = np.load('src/weights/rnet.npy')[()] 101 | for n, p in self.named_parameters(): 102 | p.data = torch.FloatTensor(weights[n]) 103 | 104 | def forward(self, x): 105 | """ 106 | Arguments: 107 | x: a float tensor with shape [batch_size, 3, h, w]. 108 | Returns: 109 | b: a float tensor with shape [batch_size, 4]. 110 | a: a float tensor with shape [batch_size, 2]. 111 | """ 112 | x = self.features(x) 113 | a = self.conv5_1(x) 114 | b = self.conv5_2(x) 115 | a = F.softmax(a) 116 | return b, a 117 | 118 | 119 | class ONet(nn.Module): 120 | 121 | def __init__(self): 122 | 123 | super(ONet, self).__init__() 124 | 125 | self.features = nn.Sequential(OrderedDict([ 126 | ('conv1', nn.Conv2d(3, 32, 3, 1)), 127 | ('prelu1', nn.PReLU(32)), 128 | ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), 129 | 130 | ('conv2', nn.Conv2d(32, 64, 3, 1)), 131 | ('prelu2', nn.PReLU(64)), 132 | ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), 133 | 134 | ('conv3', nn.Conv2d(64, 64, 3, 1)), 135 | ('prelu3', nn.PReLU(64)), 136 | ('pool3', nn.MaxPool2d(2, 2, ceil_mode=True)), 137 | 138 | ('conv4', nn.Conv2d(64, 128, 2, 1)), 139 | ('prelu4', nn.PReLU(128)), 140 | 141 | ('flatten', Flatten()), 142 | ('conv5', nn.Linear(1152, 256)), 143 | ('drop5', nn.Dropout(0.25)), 144 | ('prelu5', nn.PReLU(256)), 145 | ])) 146 | 147 | self.conv6_1 = nn.Linear(256, 2) 148 | self.conv6_2 = nn.Linear(256, 4) 149 | self.conv6_3 = nn.Linear(256, 10) 150 | 151 | weights = np.load('src/weights/onet.npy')[()] 152 | for n, p in self.named_parameters(): 153 | p.data = torch.FloatTensor(weights[n]) 154 | 155 | def forward(self, x): 156 | """ 157 | Arguments: 158 | x: a float tensor with shape [batch_size, 3, h, w]. 159 | Returns: 160 | c: a float tensor with shape [batch_size, 10]. 161 | b: a float tensor with shape [batch_size, 4]. 162 | a: a float tensor with shape [batch_size, 2]. 163 | """ 164 | x = self.features(x) 165 | a = self.conv6_1(x) 166 | b = self.conv6_2(x) 167 | c = self.conv6_3(x) 168 | a = F.softmax(a) 169 | return c, b, a 170 | -------------------------------------------------------------------------------- /src/visualization_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import ImageDraw 2 | 3 | 4 | def show_bboxes(img, bounding_boxes, facial_landmarks=[]): 5 | """Draw bounding boxes and facial landmarks. 6 | 7 | Arguments: 8 | img: an instance of PIL.Image. 9 | bounding_boxes: a float numpy array of shape [n, 5]. 10 | facial_landmarks: a float numpy array of shape [n, 10]. 11 | 12 | Returns: 13 | an instance of PIL.Image. 
14 | """ 15 | 16 | img_copy = img.copy() 17 | draw = ImageDraw.Draw(img_copy) 18 | 19 | for b in bounding_boxes: 20 | draw.rectangle([ 21 | (b[0], b[1]), (b[2], b[3]) 22 | ], outline='white') 23 | 24 | for p in facial_landmarks: 25 | for i in range(5): 26 | draw.ellipse([ 27 | (p[i] - 1.0, p[i + 5] - 1.0), 28 | (p[i] + 1.0, p[i + 5] + 1.0) 29 | ], outline='blue') 30 | 31 | return img_copy 32 | -------------------------------------------------------------------------------- /src/weights/onet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/src/weights/onet.npy -------------------------------------------------------------------------------- /src/weights/pnet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/src/weights/pnet.npy -------------------------------------------------------------------------------- /src/weights/rnet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/src/weights/rnet.npy --------------------------------------------------------------------------------