├── .gitignore
├── LICENSE
├── README.md
├── caffe_models
│   ├── det1.caffemodel
│   ├── det1.prototxt
│   ├── det2.caffemodel
│   ├── det2.prototxt
│   ├── det3.caffemodel
│   ├── det3.prototxt
│   ├── det4.caffemodel
│   └── det4.prototxt
├── extract_weights_from_caffe_models.py
├── images
│   ├── example.png
│   ├── office1.jpg
│   ├── office2.jpg
│   ├── office3.jpg
│   ├── office4.jpg
│   └── office5.jpg
├── src
│   ├── __init__.py
│   ├── box_utils.py
│   ├── detector.py
│   ├── first_stage.py
│   ├── get_nets.py
│   ├── visualization_utils.py
│   └── weights
│       ├── onet.npy
│       ├── pnet.npy
│       └── rnet.npy
├── test_on_images.ipynb
└── try_mtcnn_step_by_step.ipynb

/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | __pycache__
3 | 
4 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Dan Antoshchenko
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MTCNN
2 | 
3 | A `pytorch` implementation of the **inference stage** of the face detection algorithm described in
4 | [Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks](https://arxiv.org/abs/1604.02878).
5 | 
6 | ## Example
7 | ![example of a face detection](images/example.png)
8 | 
9 | ## How to use it
10 | Just download the repository and then do this:
11 | ```python
12 | from src import detect_faces
13 | from PIL import Image
14 | 
15 | image = Image.open('image.jpg')
16 | bounding_boxes, landmarks = detect_faces(image)
17 | ```
18 | For examples see `test_on_images.ipynb`.
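To draw the detections you can use `show_bboxes` from `src/visualization_utils.py` (it is re-exported from `src`). A minimal sketch, assuming the snippet above has already run; the output filename `result.jpg` is just an example:
```python
from src import show_bboxes

# returns a copy of the image with white boxes and blue landmark dots drawn on it
image_with_boxes = show_bboxes(image, bounding_boxes, landmarks)
image_with_boxes.save('result.jpg')
```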
19 | 20 | ## Requirements 21 | * pytorch 0.2 22 | * Pillow, numpy 23 | 24 | ## Credit 25 | This implementation is heavily inspired by: 26 | * [pangyupo/mxnet_mtcnn_face_detection](https://github.com/pangyupo/mxnet_mtcnn_face_detection) 27 | -------------------------------------------------------------------------------- /caffe_models/det1.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/caffe_models/det1.caffemodel -------------------------------------------------------------------------------- /caffe_models/det1.prototxt: -------------------------------------------------------------------------------- 1 | name: "PNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 12 6 | input_dim: 12 7 | 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 10 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "PReLU1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 2 48 | stride: 2 49 | } 50 | } 51 | 52 | layer { 53 | name: "conv2" 54 | type: "Convolution" 55 | bottom: "pool1" 56 | top: "conv2" 57 | param { 58 | lr_mult: 1 59 | decay_mult: 1 60 | } 61 | param { 62 | lr_mult: 2 63 | decay_mult: 0 64 | } 65 | convolution_param { 66 | num_output: 16 67 | kernel_size: 3 68 | stride: 1 69 | weight_filler { 70 | type: "xavier" 71 | } 72 | bias_filler { 73 | type: "constant" 74 | value: 0 75 | } 76 | } 77 | } 78 | layer { 79 | name: "PReLU2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | 85 | layer { 86 | name: "conv3" 87 | type: "Convolution" 88 | bottom: "conv2" 89 | top: "conv3" 90 | param { 91 | lr_mult: 1 92 | decay_mult: 1 93 | } 94 | param { 95 | lr_mult: 2 96 | decay_mult: 0 97 | } 98 | convolution_param { 99 | num_output: 32 100 | kernel_size: 3 101 | stride: 1 102 | weight_filler { 103 | type: "xavier" 104 | } 105 | bias_filler { 106 | type: "constant" 107 | value: 0 108 | } 109 | } 110 | } 111 | layer { 112 | name: "PReLU3" 113 | type: "PReLU" 114 | bottom: "conv3" 115 | top: "conv3" 116 | } 117 | 118 | 119 | layer { 120 | name: "conv4-1" 121 | type: "Convolution" 122 | bottom: "conv3" 123 | top: "conv4-1" 124 | param { 125 | lr_mult: 1 126 | decay_mult: 1 127 | } 128 | param { 129 | lr_mult: 2 130 | decay_mult: 0 131 | } 132 | convolution_param { 133 | num_output: 2 134 | kernel_size: 1 135 | stride: 1 136 | weight_filler { 137 | type: "xavier" 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | 146 | layer { 147 | name: "conv4-2" 148 | type: "Convolution" 149 | bottom: "conv3" 150 | top: "conv4-2" 151 | param { 152 | lr_mult: 1 153 | decay_mult: 1 154 | } 155 | param { 156 | lr_mult: 2 157 | decay_mult: 0 158 | } 159 | convolution_param { 160 | num_output: 4 161 | kernel_size: 1 162 | stride: 1 163 | weight_filler { 164 | type: "xavier" 165 | } 166 | bias_filler { 167 | type: "constant" 168 | value: 0 169 | } 170 | } 171 | } 172 | layer { 173 | name: "prob1" 174 | type: "Softmax" 175 | 
bottom: "conv4-1" 176 | top: "prob1" 177 | } 178 | -------------------------------------------------------------------------------- /caffe_models/det2.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/caffe_models/det2.caffemodel -------------------------------------------------------------------------------- /caffe_models/det2.prototxt: -------------------------------------------------------------------------------- 1 | name: "RNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 24 6 | input_dim: 24 7 | 8 | 9 | ########################## 10 | ###################### 11 | layer { 12 | name: "conv1" 13 | type: "Convolution" 14 | bottom: "data" 15 | top: "conv1" 16 | param { 17 | lr_mult: 0 18 | decay_mult: 0 19 | } 20 | param { 21 | lr_mult: 0 22 | decay_mult: 0 23 | } 24 | convolution_param { 25 | num_output: 28 26 | kernel_size: 3 27 | stride: 1 28 | weight_filler { 29 | type: "xavier" 30 | } 31 | bias_filler { 32 | type: "constant" 33 | value: 0 34 | } 35 | } 36 | } 37 | layer { 38 | name: "prelu1" 39 | type: "PReLU" 40 | bottom: "conv1" 41 | top: "conv1" 42 | propagate_down: true 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "conv1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 52 | stride: 2 53 | } 54 | } 55 | 56 | layer { 57 | name: "conv2" 58 | type: "Convolution" 59 | bottom: "pool1" 60 | top: "conv2" 61 | param { 62 | lr_mult: 0 63 | decay_mult: 0 64 | } 65 | param { 66 | lr_mult: 0 67 | decay_mult: 0 68 | } 69 | convolution_param { 70 | num_output: 48 71 | kernel_size: 3 72 | stride: 1 73 | weight_filler { 74 | type: "xavier" 75 | } 76 | bias_filler { 77 | type: "constant" 78 | value: 0 79 | } 80 | } 81 | } 82 | layer { 83 | name: "prelu2" 84 | type: "PReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | propagate_down: true 88 | } 89 | layer { 90 | name: "pool2" 91 | type: "Pooling" 92 | bottom: "conv2" 93 | top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | stride: 2 98 | } 99 | } 100 | #################################### 101 | 102 | ################################## 103 | layer { 104 | name: "conv3" 105 | type: "Convolution" 106 | bottom: "pool2" 107 | top: "conv3" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 64 118 | kernel_size: 2 119 | stride: 1 120 | weight_filler { 121 | type: "xavier" 122 | } 123 | bias_filler { 124 | type: "constant" 125 | value: 0 126 | } 127 | } 128 | } 129 | layer { 130 | name: "prelu3" 131 | type: "PReLU" 132 | bottom: "conv3" 133 | top: "conv3" 134 | propagate_down: true 135 | } 136 | ############################### 137 | 138 | ############################### 139 | 140 | layer { 141 | name: "conv4" 142 | type: "InnerProduct" 143 | bottom: "conv3" 144 | top: "conv4" 145 | param { 146 | lr_mult: 0 147 | decay_mult: 0 148 | } 149 | param { 150 | lr_mult: 0 151 | decay_mult: 0 152 | } 153 | inner_product_param { 154 | num_output: 128 155 | weight_filler { 156 | type: "xavier" 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0 161 | } 162 | } 163 | } 164 | layer { 165 | name: "prelu4" 166 | type: "PReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | 171 | layer { 172 | name: "conv5-1" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5-1" 176 | param { 177 | 
lr_mult: 0 178 | decay_mult: 0 179 | } 180 | param { 181 | lr_mult: 0 182 | decay_mult: 0 183 | } 184 | inner_product_param { 185 | num_output: 2 186 | #kernel_size: 1 187 | #stride: 1 188 | weight_filler { 189 | type: "xavier" 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0 194 | } 195 | } 196 | } 197 | layer { 198 | name: "conv5-2" 199 | type: "InnerProduct" 200 | bottom: "conv4" 201 | top: "conv5-2" 202 | param { 203 | lr_mult: 1 204 | decay_mult: 1 205 | } 206 | param { 207 | lr_mult: 2 208 | decay_mult: 1 209 | } 210 | inner_product_param { 211 | num_output: 4 212 | #kernel_size: 1 213 | #stride: 1 214 | weight_filler { 215 | type: "xavier" 216 | } 217 | bias_filler { 218 | type: "constant" 219 | value: 0 220 | } 221 | } 222 | } 223 | layer { 224 | name: "prob1" 225 | type: "Softmax" 226 | bottom: "conv5-1" 227 | top: "prob1" 228 | } -------------------------------------------------------------------------------- /caffe_models/det3.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/caffe_models/det3.caffemodel -------------------------------------------------------------------------------- /caffe_models/det3.prototxt: -------------------------------------------------------------------------------- 1 | name: "ONet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 48 6 | input_dim: 48 7 | ################################## 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 1 20 | } 21 | convolution_param { 22 | num_output: 32 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "prelu1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | stride: 2 49 | } 50 | } 51 | layer { 52 | name: "conv2" 53 | type: "Convolution" 54 | bottom: "pool1" 55 | top: "conv2" 56 | param { 57 | lr_mult: 1 58 | decay_mult: 1 59 | } 60 | param { 61 | lr_mult: 2 62 | decay_mult: 1 63 | } 64 | convolution_param { 65 | num_output: 64 66 | kernel_size: 3 67 | stride: 1 68 | weight_filler { 69 | type: "xavier" 70 | } 71 | bias_filler { 72 | type: "constant" 73 | value: 0 74 | } 75 | } 76 | } 77 | 78 | layer { 79 | name: "prelu2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "conv2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 92 | stride: 2 93 | } 94 | } 95 | 96 | layer { 97 | name: "conv3" 98 | type: "Convolution" 99 | bottom: "pool2" 100 | top: "conv3" 101 | param { 102 | lr_mult: 1 103 | decay_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | decay_mult: 1 108 | } 109 | convolution_param { 110 | num_output: 64 111 | kernel_size: 3 112 | weight_filler { 113 | type: "xavier" 114 | } 115 | bias_filler { 116 | type: "constant" 117 | value: 0 118 | } 119 | } 120 | } 121 | layer { 122 | name: "prelu3" 123 | type: "PReLU" 124 | bottom: "conv3" 125 | top: "conv3" 126 | } 127 | layer { 128 | name: "pool3" 129 | type: "Pooling" 130 | bottom: "conv3" 131 | top: "pool3" 132 | pooling_param { 133 | 
pool: MAX 134 | kernel_size: 2 135 | stride: 2 136 | } 137 | } 138 | layer { 139 | name: "conv4" 140 | type: "Convolution" 141 | bottom: "pool3" 142 | top: "conv4" 143 | param { 144 | lr_mult: 1 145 | decay_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | decay_mult: 1 150 | } 151 | convolution_param { 152 | num_output: 128 153 | kernel_size: 2 154 | weight_filler { 155 | type: "xavier" 156 | } 157 | bias_filler { 158 | type: "constant" 159 | value: 0 160 | } 161 | } 162 | } 163 | layer { 164 | name: "prelu4" 165 | type: "PReLU" 166 | bottom: "conv4" 167 | top: "conv4" 168 | } 169 | 170 | 171 | layer { 172 | name: "conv5" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5" 176 | param { 177 | lr_mult: 1 178 | decay_mult: 1 179 | } 180 | param { 181 | lr_mult: 2 182 | decay_mult: 1 183 | } 184 | inner_product_param { 185 | #kernel_size: 3 186 | num_output: 256 187 | weight_filler { 188 | type: "xavier" 189 | } 190 | bias_filler { 191 | type: "constant" 192 | value: 0 193 | } 194 | } 195 | } 196 | 197 | layer { 198 | name: "drop5" 199 | type: "Dropout" 200 | bottom: "conv5" 201 | top: "conv5" 202 | dropout_param { 203 | dropout_ratio: 0.25 204 | } 205 | } 206 | layer { 207 | name: "prelu5" 208 | type: "PReLU" 209 | bottom: "conv5" 210 | top: "conv5" 211 | } 212 | 213 | 214 | layer { 215 | name: "conv6-1" 216 | type: "InnerProduct" 217 | bottom: "conv5" 218 | top: "conv6-1" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 1 226 | } 227 | inner_product_param { 228 | #kernel_size: 1 229 | num_output: 2 230 | weight_filler { 231 | type: "xavier" 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0 236 | } 237 | } 238 | } 239 | layer { 240 | name: "conv6-2" 241 | type: "InnerProduct" 242 | bottom: "conv5" 243 | top: "conv6-2" 244 | param { 245 | lr_mult: 1 246 | decay_mult: 1 247 | } 248 | param { 249 | lr_mult: 2 250 | decay_mult: 1 251 | } 252 | inner_product_param { 253 | #kernel_size: 1 254 | num_output: 4 255 | weight_filler { 256 | type: "xavier" 257 | } 258 | bias_filler { 259 | type: "constant" 260 | value: 0 261 | } 262 | } 263 | } 264 | layer { 265 | name: "conv6-3" 266 | type: "InnerProduct" 267 | bottom: "conv5" 268 | top: "conv6-3" 269 | param { 270 | lr_mult: 1 271 | decay_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | decay_mult: 1 276 | } 277 | inner_product_param { 278 | #kernel_size: 1 279 | num_output: 10 280 | weight_filler { 281 | type: "xavier" 282 | } 283 | bias_filler { 284 | type: "constant" 285 | value: 0 286 | } 287 | } 288 | } 289 | layer { 290 | name: "prob1" 291 | type: "Softmax" 292 | bottom: "conv6-1" 293 | top: "prob1" 294 | } 295 | -------------------------------------------------------------------------------- /caffe_models/det4.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/caffe_models/det4.caffemodel -------------------------------------------------------------------------------- /caffe_models/det4.prototxt: -------------------------------------------------------------------------------- 1 | name: "LNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 15 5 | input_dim: 24 6 | input_dim: 24 7 | 8 | layer { 9 | name: "slicer_data" 10 | type: "Slice" 11 | bottom: "data" 12 | top: "data241" 13 | top: "data242" 14 | top: "data243" 15 | top: "data244" 16 | top: "data245" 17 | slice_param { 18 | axis: 1 19 | slice_point: 
3 20 | slice_point: 6 21 | slice_point: 9 22 | slice_point: 12 23 | } 24 | } 25 | layer { 26 | name: "conv1_1" 27 | type: "Convolution" 28 | bottom: "data241" 29 | top: "conv1_1" 30 | param { 31 | lr_mult: 1 32 | decay_mult: 1 33 | } 34 | param { 35 | lr_mult: 2 36 | decay_mult: 1 37 | } 38 | convolution_param { 39 | num_output: 28 40 | kernel_size: 3 41 | stride: 1 42 | weight_filler { 43 | type: "xavier" 44 | } 45 | bias_filler { 46 | type: "constant" 47 | value: 0 48 | } 49 | } 50 | 51 | } 52 | layer { 53 | name: "prelu1_1" 54 | type: "PReLU" 55 | bottom: "conv1_1" 56 | top: "conv1_1" 57 | 58 | } 59 | layer { 60 | name: "pool1_1" 61 | type: "Pooling" 62 | bottom: "conv1_1" 63 | top: "pool1_1" 64 | pooling_param { 65 | pool: MAX 66 | kernel_size: 3 67 | stride: 2 68 | } 69 | } 70 | 71 | layer { 72 | name: "conv2_1" 73 | type: "Convolution" 74 | bottom: "pool1_1" 75 | top: "conv2_1" 76 | param { 77 | lr_mult: 1 78 | decay_mult: 1 79 | } 80 | param { 81 | lr_mult: 2 82 | decay_mult: 1 83 | } 84 | convolution_param { 85 | num_output: 48 86 | kernel_size: 3 87 | stride: 1 88 | weight_filler { 89 | type: "xavier" 90 | } 91 | bias_filler { 92 | type: "constant" 93 | value: 0 94 | } 95 | } 96 | 97 | } 98 | layer { 99 | name: "prelu2_1" 100 | type: "PReLU" 101 | bottom: "conv2_1" 102 | top: "conv2_1" 103 | } 104 | layer { 105 | name: "pool2_1" 106 | type: "Pooling" 107 | bottom: "conv2_1" 108 | top: "pool2_1" 109 | pooling_param { 110 | pool: MAX 111 | kernel_size: 3 112 | stride: 2 113 | } 114 | 115 | } 116 | layer { 117 | name: "conv3_1" 118 | type: "Convolution" 119 | bottom: "pool2_1" 120 | top: "conv3_1" 121 | param { 122 | lr_mult: 1 123 | decay_mult: 1 124 | } 125 | param { 126 | lr_mult: 2 127 | decay_mult: 1 128 | } 129 | convolution_param { 130 | num_output: 64 131 | kernel_size: 2 132 | stride: 1 133 | weight_filler { 134 | type: "xavier" 135 | } 136 | bias_filler { 137 | type: "constant" 138 | value: 0 139 | } 140 | } 141 | 142 | } 143 | layer { 144 | name: "prelu3_1" 145 | type: "PReLU" 146 | bottom: "conv3_1" 147 | top: "conv3_1" 148 | } 149 | ########################## 150 | layer { 151 | name: "conv1_2" 152 | type: "Convolution" 153 | bottom: "data242" 154 | top: "conv1_2" 155 | param { 156 | lr_mult: 1 157 | decay_mult: 1 158 | } 159 | param { 160 | lr_mult: 2 161 | decay_mult: 1 162 | } 163 | convolution_param { 164 | num_output: 28 165 | kernel_size: 3 166 | stride: 1 167 | weight_filler { 168 | type: "xavier" 169 | } 170 | bias_filler { 171 | type: "constant" 172 | value: 0 173 | } 174 | } 175 | 176 | } 177 | layer { 178 | name: "prelu1_2" 179 | type: "PReLU" 180 | bottom: "conv1_2" 181 | top: "conv1_2" 182 | 183 | } 184 | layer { 185 | name: "pool1_2" 186 | type: "Pooling" 187 | bottom: "conv1_2" 188 | top: "pool1_2" 189 | pooling_param { 190 | pool: MAX 191 | kernel_size: 3 192 | stride: 2 193 | } 194 | } 195 | 196 | layer { 197 | name: "conv2_2" 198 | type: "Convolution" 199 | bottom: "pool1_2" 200 | top: "conv2_2" 201 | param { 202 | lr_mult: 1 203 | decay_mult: 1 204 | } 205 | param { 206 | lr_mult: 2 207 | decay_mult: 1 208 | } 209 | convolution_param { 210 | num_output: 48 211 | kernel_size: 3 212 | stride: 1 213 | weight_filler { 214 | type: "xavier" 215 | } 216 | bias_filler { 217 | type: "constant" 218 | value: 0 219 | } 220 | } 221 | 222 | } 223 | layer { 224 | name: "prelu2_2" 225 | type: "PReLU" 226 | bottom: "conv2_2" 227 | top: "conv2_2" 228 | } 229 | layer { 230 | name: "pool2_2" 231 | type: "Pooling" 232 | bottom: "conv2_2" 233 | top: "pool2_2" 234 | 
pooling_param { 235 | pool: MAX 236 | kernel_size: 3 237 | stride: 2 238 | } 239 | 240 | } 241 | layer { 242 | name: "conv3_2" 243 | type: "Convolution" 244 | bottom: "pool2_2" 245 | top: "conv3_2" 246 | param { 247 | lr_mult: 1 248 | decay_mult: 1 249 | } 250 | param { 251 | lr_mult: 2 252 | decay_mult: 1 253 | } 254 | convolution_param { 255 | num_output: 64 256 | kernel_size: 2 257 | stride: 1 258 | weight_filler { 259 | type: "xavier" 260 | } 261 | bias_filler { 262 | type: "constant" 263 | value: 0 264 | } 265 | } 266 | 267 | } 268 | layer { 269 | name: "prelu3_2" 270 | type: "PReLU" 271 | bottom: "conv3_2" 272 | top: "conv3_2" 273 | } 274 | ########################## 275 | ########################## 276 | layer { 277 | name: "conv1_3" 278 | type: "Convolution" 279 | bottom: "data243" 280 | top: "conv1_3" 281 | param { 282 | lr_mult: 1 283 | decay_mult: 1 284 | } 285 | param { 286 | lr_mult: 2 287 | decay_mult: 1 288 | } 289 | convolution_param { 290 | num_output: 28 291 | kernel_size: 3 292 | stride: 1 293 | weight_filler { 294 | type: "xavier" 295 | } 296 | bias_filler { 297 | type: "constant" 298 | value: 0 299 | } 300 | } 301 | 302 | } 303 | layer { 304 | name: "prelu1_3" 305 | type: "PReLU" 306 | bottom: "conv1_3" 307 | top: "conv1_3" 308 | 309 | } 310 | layer { 311 | name: "pool1_3" 312 | type: "Pooling" 313 | bottom: "conv1_3" 314 | top: "pool1_3" 315 | pooling_param { 316 | pool: MAX 317 | kernel_size: 3 318 | stride: 2 319 | } 320 | } 321 | 322 | layer { 323 | name: "conv2_3" 324 | type: "Convolution" 325 | bottom: "pool1_3" 326 | top: "conv2_3" 327 | param { 328 | lr_mult: 1 329 | decay_mult: 1 330 | } 331 | param { 332 | lr_mult: 2 333 | decay_mult: 1 334 | } 335 | convolution_param { 336 | num_output: 48 337 | kernel_size: 3 338 | stride: 1 339 | weight_filler { 340 | type: "xavier" 341 | } 342 | bias_filler { 343 | type: "constant" 344 | value: 0 345 | } 346 | } 347 | 348 | } 349 | layer { 350 | name: "prelu2_3" 351 | type: "PReLU" 352 | bottom: "conv2_3" 353 | top: "conv2_3" 354 | } 355 | layer { 356 | name: "pool2_3" 357 | type: "Pooling" 358 | bottom: "conv2_3" 359 | top: "pool2_3" 360 | pooling_param { 361 | pool: MAX 362 | kernel_size: 3 363 | stride: 2 364 | } 365 | 366 | } 367 | layer { 368 | name: "conv3_3" 369 | type: "Convolution" 370 | bottom: "pool2_3" 371 | top: "conv3_3" 372 | param { 373 | lr_mult: 1 374 | decay_mult: 1 375 | } 376 | param { 377 | lr_mult: 2 378 | decay_mult: 1 379 | } 380 | convolution_param { 381 | num_output: 64 382 | kernel_size: 2 383 | stride: 1 384 | weight_filler { 385 | type: "xavier" 386 | } 387 | bias_filler { 388 | type: "constant" 389 | value: 0 390 | } 391 | } 392 | 393 | } 394 | layer { 395 | name: "prelu3_3" 396 | type: "PReLU" 397 | bottom: "conv3_3" 398 | top: "conv3_3" 399 | } 400 | ########################## 401 | ########################## 402 | layer { 403 | name: "conv1_4" 404 | type: "Convolution" 405 | bottom: "data244" 406 | top: "conv1_4" 407 | param { 408 | lr_mult: 1 409 | decay_mult: 1 410 | } 411 | param { 412 | lr_mult: 2 413 | decay_mult: 1 414 | } 415 | convolution_param { 416 | num_output: 28 417 | kernel_size: 3 418 | stride: 1 419 | weight_filler { 420 | type: "xavier" 421 | } 422 | bias_filler { 423 | type: "constant" 424 | value: 0 425 | } 426 | } 427 | 428 | } 429 | layer { 430 | name: "prelu1_4" 431 | type: "PReLU" 432 | bottom: "conv1_4" 433 | top: "conv1_4" 434 | 435 | } 436 | layer { 437 | name: "pool1_4" 438 | type: "Pooling" 439 | bottom: "conv1_4" 440 | top: "pool1_4" 441 | pooling_param { 442 
| pool: MAX 443 | kernel_size: 3 444 | stride: 2 445 | } 446 | } 447 | 448 | layer { 449 | name: "conv2_4" 450 | type: "Convolution" 451 | bottom: "pool1_4" 452 | top: "conv2_4" 453 | param { 454 | lr_mult: 1 455 | decay_mult: 1 456 | } 457 | param { 458 | lr_mult: 2 459 | decay_mult: 1 460 | } 461 | convolution_param { 462 | num_output: 48 463 | kernel_size: 3 464 | stride: 1 465 | weight_filler { 466 | type: "xavier" 467 | } 468 | bias_filler { 469 | type: "constant" 470 | value: 0 471 | } 472 | } 473 | 474 | } 475 | layer { 476 | name: "prelu2_4" 477 | type: "PReLU" 478 | bottom: "conv2_4" 479 | top: "conv2_4" 480 | } 481 | layer { 482 | name: "pool2_4" 483 | type: "Pooling" 484 | bottom: "conv2_4" 485 | top: "pool2_4" 486 | pooling_param { 487 | pool: MAX 488 | kernel_size: 3 489 | stride: 2 490 | } 491 | 492 | } 493 | layer { 494 | name: "conv3_4" 495 | type: "Convolution" 496 | bottom: "pool2_4" 497 | top: "conv3_4" 498 | param { 499 | lr_mult: 1 500 | decay_mult: 1 501 | } 502 | param { 503 | lr_mult: 2 504 | decay_mult: 1 505 | } 506 | convolution_param { 507 | num_output: 64 508 | kernel_size: 2 509 | stride: 1 510 | weight_filler { 511 | type: "xavier" 512 | } 513 | bias_filler { 514 | type: "constant" 515 | value: 0 516 | } 517 | } 518 | 519 | } 520 | layer { 521 | name: "prelu3_4" 522 | type: "PReLU" 523 | bottom: "conv3_4" 524 | top: "conv3_4" 525 | } 526 | ########################## 527 | ########################## 528 | layer { 529 | name: "conv1_5" 530 | type: "Convolution" 531 | bottom: "data245" 532 | top: "conv1_5" 533 | param { 534 | lr_mult: 1 535 | decay_mult: 1 536 | } 537 | param { 538 | lr_mult: 2 539 | decay_mult: 1 540 | } 541 | convolution_param { 542 | num_output: 28 543 | kernel_size: 3 544 | stride: 1 545 | weight_filler { 546 | type: "xavier" 547 | } 548 | bias_filler { 549 | type: "constant" 550 | value: 0 551 | } 552 | } 553 | 554 | } 555 | layer { 556 | name: "prelu1_5" 557 | type: "PReLU" 558 | bottom: "conv1_5" 559 | top: "conv1_5" 560 | 561 | } 562 | layer { 563 | name: "pool1_5" 564 | type: "Pooling" 565 | bottom: "conv1_5" 566 | top: "pool1_5" 567 | pooling_param { 568 | pool: MAX 569 | kernel_size: 3 570 | stride: 2 571 | } 572 | } 573 | 574 | layer { 575 | name: "conv2_5" 576 | type: "Convolution" 577 | bottom: "pool1_5" 578 | top: "conv2_5" 579 | param { 580 | lr_mult: 1 581 | decay_mult: 1 582 | } 583 | param { 584 | lr_mult: 2 585 | decay_mult: 1 586 | } 587 | convolution_param { 588 | num_output: 48 589 | kernel_size: 3 590 | stride: 1 591 | weight_filler { 592 | type: "xavier" 593 | } 594 | bias_filler { 595 | type: "constant" 596 | value: 0 597 | } 598 | } 599 | 600 | } 601 | layer { 602 | name: "prelu2_5" 603 | type: "PReLU" 604 | bottom: "conv2_5" 605 | top: "conv2_5" 606 | } 607 | layer { 608 | name: "pool2_5" 609 | type: "Pooling" 610 | bottom: "conv2_5" 611 | top: "pool2_5" 612 | pooling_param { 613 | pool: MAX 614 | kernel_size: 3 615 | stride: 2 616 | } 617 | 618 | } 619 | layer { 620 | name: "conv3_5" 621 | type: "Convolution" 622 | bottom: "pool2_5" 623 | top: "conv3_5" 624 | param { 625 | lr_mult: 1 626 | decay_mult: 1 627 | } 628 | param { 629 | lr_mult: 2 630 | decay_mult: 1 631 | } 632 | convolution_param { 633 | num_output: 64 634 | kernel_size: 2 635 | stride: 1 636 | weight_filler { 637 | type: "xavier" 638 | } 639 | bias_filler { 640 | type: "constant" 641 | value: 0 642 | } 643 | } 644 | 645 | } 646 | layer { 647 | name: "prelu3_5" 648 | type: "PReLU" 649 | bottom: "conv3_5" 650 | top: "conv3_5" 651 | } 652 | 
########################## 653 | layer { 654 | name: "concat" 655 | bottom: "conv3_1" 656 | bottom: "conv3_2" 657 | bottom: "conv3_3" 658 | bottom: "conv3_4" 659 | bottom: "conv3_5" 660 | top: "conv3" 661 | type: "Concat" 662 | concat_param { 663 | axis: 1 664 | } 665 | } 666 | ########################## 667 | layer { 668 | name: "fc4" 669 | type: "InnerProduct" 670 | bottom: "conv3" 671 | top: "fc4" 672 | param { 673 | lr_mult: 1 674 | decay_mult: 1 675 | } 676 | param { 677 | lr_mult: 2 678 | decay_mult: 1 679 | } 680 | inner_product_param { 681 | num_output: 256 682 | weight_filler { 683 | type: "xavier" 684 | } 685 | bias_filler { 686 | type: "constant" 687 | value: 0 688 | } 689 | } 690 | 691 | } 692 | layer { 693 | name: "prelu4" 694 | type: "PReLU" 695 | bottom: "fc4" 696 | top: "fc4" 697 | } 698 | ############################ 699 | layer { 700 | name: "fc4_1" 701 | type: "InnerProduct" 702 | bottom: "fc4" 703 | top: "fc4_1" 704 | param { 705 | lr_mult: 1 706 | decay_mult: 1 707 | } 708 | param { 709 | lr_mult: 2 710 | decay_mult: 1 711 | } 712 | inner_product_param { 713 | num_output: 64 714 | weight_filler { 715 | type: "xavier" 716 | } 717 | bias_filler { 718 | type: "constant" 719 | value: 0 720 | } 721 | } 722 | 723 | } 724 | layer { 725 | name: "prelu4_1" 726 | type: "PReLU" 727 | bottom: "fc4_1" 728 | top: "fc4_1" 729 | } 730 | layer { 731 | name: "fc5_1" 732 | type: "InnerProduct" 733 | bottom: "fc4_1" 734 | top: "fc5_1" 735 | param { 736 | lr_mult: 1 737 | decay_mult: 1 738 | } 739 | param { 740 | lr_mult: 2 741 | decay_mult: 1 742 | } 743 | inner_product_param { 744 | num_output: 2 745 | weight_filler { 746 | type: "xavier" 747 | #type: "constant" 748 | #value: 0 749 | } 750 | bias_filler { 751 | type: "constant" 752 | value: 0 753 | } 754 | } 755 | } 756 | 757 | 758 | ######################### 759 | layer { 760 | name: "fc4_2" 761 | type: "InnerProduct" 762 | bottom: "fc4" 763 | top: "fc4_2" 764 | param { 765 | lr_mult: 1 766 | decay_mult: 1 767 | } 768 | param { 769 | lr_mult: 2 770 | decay_mult: 1 771 | } 772 | inner_product_param { 773 | num_output: 64 774 | weight_filler { 775 | type: "xavier" 776 | } 777 | bias_filler { 778 | type: "constant" 779 | value: 0 780 | } 781 | } 782 | 783 | } 784 | layer { 785 | name: "prelu4_2" 786 | type: "PReLU" 787 | bottom: "fc4_2" 788 | top: "fc4_2" 789 | } 790 | layer { 791 | name: "fc5_2" 792 | type: "InnerProduct" 793 | bottom: "fc4_2" 794 | top: "fc5_2" 795 | param { 796 | lr_mult: 1 797 | decay_mult: 1 798 | } 799 | param { 800 | lr_mult: 2 801 | decay_mult: 1 802 | } 803 | inner_product_param { 804 | num_output: 2 805 | weight_filler { 806 | type: "xavier" 807 | #type: "constant" 808 | #value: 0 809 | } 810 | bias_filler { 811 | type: "constant" 812 | value: 0 813 | } 814 | } 815 | } 816 | 817 | ######################### 818 | layer { 819 | name: "fc4_3" 820 | type: "InnerProduct" 821 | bottom: "fc4" 822 | top: "fc4_3" 823 | param { 824 | lr_mult: 1 825 | decay_mult: 1 826 | } 827 | param { 828 | lr_mult: 2 829 | decay_mult: 1 830 | } 831 | inner_product_param { 832 | num_output: 64 833 | weight_filler { 834 | type: "xavier" 835 | } 836 | bias_filler { 837 | type: "constant" 838 | value: 0 839 | } 840 | } 841 | 842 | } 843 | layer { 844 | name: "prelu4_3" 845 | type: "PReLU" 846 | bottom: "fc4_3" 847 | top: "fc4_3" 848 | } 849 | layer { 850 | name: "fc5_3" 851 | type: "InnerProduct" 852 | bottom: "fc4_3" 853 | top: "fc5_3" 854 | param { 855 | lr_mult: 1 856 | decay_mult: 1 857 | } 858 | param { 859 | lr_mult: 2 860 | 
decay_mult: 1 861 | } 862 | inner_product_param { 863 | num_output: 2 864 | weight_filler { 865 | type: "xavier" 866 | #type: "constant" 867 | #value: 0 868 | } 869 | bias_filler { 870 | type: "constant" 871 | value: 0 872 | } 873 | } 874 | } 875 | 876 | ######################### 877 | layer { 878 | name: "fc4_4" 879 | type: "InnerProduct" 880 | bottom: "fc4" 881 | top: "fc4_4" 882 | param { 883 | lr_mult: 1 884 | decay_mult: 1 885 | } 886 | param { 887 | lr_mult: 2 888 | decay_mult: 1 889 | } 890 | inner_product_param { 891 | num_output: 64 892 | weight_filler { 893 | type: "xavier" 894 | } 895 | bias_filler { 896 | type: "constant" 897 | value: 0 898 | } 899 | } 900 | 901 | } 902 | layer { 903 | name: "prelu4_4" 904 | type: "PReLU" 905 | bottom: "fc4_4" 906 | top: "fc4_4" 907 | } 908 | layer { 909 | name: "fc5_4" 910 | type: "InnerProduct" 911 | bottom: "fc4_4" 912 | top: "fc5_4" 913 | param { 914 | lr_mult: 1 915 | decay_mult: 1 916 | } 917 | param { 918 | lr_mult: 2 919 | decay_mult: 1 920 | } 921 | inner_product_param { 922 | num_output: 2 923 | weight_filler { 924 | type: "xavier" 925 | #type: "constant" 926 | #value: 0 927 | } 928 | bias_filler { 929 | type: "constant" 930 | value: 0 931 | } 932 | } 933 | } 934 | 935 | ######################### 936 | layer { 937 | name: "fc4_5" 938 | type: "InnerProduct" 939 | bottom: "fc4" 940 | top: "fc4_5" 941 | param { 942 | lr_mult: 1 943 | decay_mult: 1 944 | } 945 | param { 946 | lr_mult: 2 947 | decay_mult: 1 948 | } 949 | inner_product_param { 950 | num_output: 64 951 | weight_filler { 952 | type: "xavier" 953 | } 954 | bias_filler { 955 | type: "constant" 956 | value: 0 957 | } 958 | } 959 | 960 | } 961 | layer { 962 | name: "prelu4_5" 963 | type: "PReLU" 964 | bottom: "fc4_5" 965 | top: "fc4_5" 966 | } 967 | layer { 968 | name: "fc5_5" 969 | type: "InnerProduct" 970 | bottom: "fc4_5" 971 | top: "fc5_5" 972 | param { 973 | lr_mult: 1 974 | decay_mult: 1 975 | } 976 | param { 977 | lr_mult: 2 978 | decay_mult: 1 979 | } 980 | inner_product_param { 981 | num_output: 2 982 | weight_filler { 983 | type: "xavier" 984 | #type: "constant" 985 | #value: 0 986 | } 987 | bias_filler { 988 | type: "constant" 989 | value: 0 990 | } 991 | } 992 | } 993 | 994 | ######################### 995 | 996 | -------------------------------------------------------------------------------- /extract_weights_from_caffe_models.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | 4 | """ 5 | The purpose of this script is to convert pretrained weights taken from 6 | official implementation here: 7 | https://github.com/kpzhang93/MTCNN_face_detection_alignment/tree/master/code/codes/MTCNNv2 8 | to required format. 9 | 10 | In a nutshell, it just renames and transposes some of the weights. 11 | You don't have to use this script because weights are already in `src/weights`. 12 | """ 13 | 14 | 15 | def get_all_weights(net): 16 | all_weights = {} 17 | for p in net.params: 18 | if 'conv' in p: 19 | name = 'features.' 
+ p 20 | if '-' in p: 21 | s = list(p) 22 | s[-2] = '_' 23 | s = ''.join(s) 24 | all_weights[s + '.weight'] = net.params[p][0].data 25 | all_weights[s + '.bias'] = net.params[p][1].data 26 | elif len(net.params[p][0].data.shape) == 4: 27 | all_weights[name + '.weight'] = net.params[p][0].data.transpose((0, 1, 3, 2)) 28 | all_weights[name + '.bias'] = net.params[p][1].data 29 | else: 30 | all_weights[name + '.weight'] = net.params[p][0].data 31 | all_weights[name + '.bias'] = net.params[p][1].data 32 | elif 'prelu' in p.lower(): 33 | all_weights['features.' + p.lower() + '.weight'] = net.params[p][0].data 34 | return all_weights 35 | 36 | 37 | # P-Net 38 | net = caffe.Net('caffe_models/det1.prototxt', 'caffe_models/det1.caffemodel', caffe.TEST) 39 | np.save('src/weights/pnet.npy', get_all_weights(net)) 40 | 41 | # R-Net 42 | net = caffe.Net('caffe_models/det2.prototxt', 'caffe_models/det2.caffemodel', caffe.TEST) 43 | np.save('src/weights/rnet.npy', get_all_weights(net)) 44 | 45 | # O-Net 46 | net = caffe.Net('caffe_models/det3.prototxt', 'caffe_models/det3.caffemodel', caffe.TEST) 47 | np.save('src/weights/onet.npy', get_all_weights(net)) 48 | -------------------------------------------------------------------------------- /images/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/example.png -------------------------------------------------------------------------------- /images/office1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office1.jpg -------------------------------------------------------------------------------- /images/office2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office2.jpg -------------------------------------------------------------------------------- /images/office3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office3.jpg -------------------------------------------------------------------------------- /images/office4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office4.jpg -------------------------------------------------------------------------------- /images/office5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/images/office5.jpg -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from .visualization_utils import show_bboxes 2 | from .detector import detect_faces 3 | -------------------------------------------------------------------------------- /src/box_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | 5 | def nms(boxes, overlap_threshold=0.5, 
mode='union'): 6 | """Non-maximum suppression. 7 | 8 | Arguments: 9 | boxes: a float numpy array of shape [n, 5], 10 | where each row is (xmin, ymin, xmax, ymax, score). 11 | overlap_threshold: a float number. 12 | mode: 'union' or 'min'. 13 | 14 | Returns: 15 | list with indices of the selected boxes 16 | """ 17 | 18 | # if there are no boxes, return the empty list 19 | if len(boxes) == 0: 20 | return [] 21 | 22 | # list of picked indices 23 | pick = [] 24 | 25 | # grab the coordinates of the bounding boxes 26 | x1, y1, x2, y2, score = [boxes[:, i] for i in range(5)] 27 | 28 | area = (x2 - x1 + 1.0)*(y2 - y1 + 1.0) 29 | ids = np.argsort(score) # in increasing order 30 | 31 | while len(ids) > 0: 32 | 33 | # grab index of the largest value 34 | last = len(ids) - 1 35 | i = ids[last] 36 | pick.append(i) 37 | 38 | # compute intersections 39 | # of the box with the largest score 40 | # with the rest of boxes 41 | 42 | # left top corner of intersection boxes 43 | ix1 = np.maximum(x1[i], x1[ids[:last]]) 44 | iy1 = np.maximum(y1[i], y1[ids[:last]]) 45 | 46 | # right bottom corner of intersection boxes 47 | ix2 = np.minimum(x2[i], x2[ids[:last]]) 48 | iy2 = np.minimum(y2[i], y2[ids[:last]]) 49 | 50 | # width and height of intersection boxes 51 | w = np.maximum(0.0, ix2 - ix1 + 1.0) 52 | h = np.maximum(0.0, iy2 - iy1 + 1.0) 53 | 54 | # intersections' areas 55 | inter = w * h 56 | if mode == 'min': 57 | overlap = inter/np.minimum(area[i], area[ids[:last]]) 58 | elif mode == 'union': 59 | # intersection over union (IoU) 60 | overlap = inter/(area[i] + area[ids[:last]] - inter) 61 | 62 | # delete all boxes where overlap is too big 63 | ids = np.delete( 64 | ids, 65 | np.concatenate([[last], np.where(overlap > overlap_threshold)[0]]) 66 | ) 67 | 68 | return pick 69 | 70 | 71 | def convert_to_square(bboxes): 72 | """Convert bounding boxes to a square form. 73 | 74 | Arguments: 75 | bboxes: a float numpy array of shape [n, 5]. 76 | 77 | Returns: 78 | a float numpy array of shape [n, 5], 79 | squared bounding boxes. 80 | """ 81 | 82 | square_bboxes = np.zeros_like(bboxes) 83 | x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] 84 | h = y2 - y1 + 1.0 85 | w = x2 - x1 + 1.0 86 | max_side = np.maximum(h, w) 87 | square_bboxes[:, 0] = x1 + w*0.5 - max_side*0.5 88 | square_bboxes[:, 1] = y1 + h*0.5 - max_side*0.5 89 | square_bboxes[:, 2] = square_bboxes[:, 0] + max_side - 1.0 90 | square_bboxes[:, 3] = square_bboxes[:, 1] + max_side - 1.0 91 | return square_bboxes 92 | 93 | 94 | def calibrate_box(bboxes, offsets): 95 | """Transform bounding boxes to be more like true bounding boxes. 96 | 'offsets' is one of the outputs of the nets. 97 | 98 | Arguments: 99 | bboxes: a float numpy array of shape [n, 5]. 100 | offsets: a float numpy array of shape [n, 4]. 101 | 102 | Returns: 103 | a float numpy array of shape [n, 5]. 104 | """ 105 | x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] 106 | w = x2 - x1 + 1.0 107 | h = y2 - y1 + 1.0 108 | w = np.expand_dims(w, 1) 109 | h = np.expand_dims(h, 1) 110 | 111 | # this is what happening here: 112 | # tx1, ty1, tx2, ty2 = [offsets[:, i] for i in range(4)] 113 | # x1_true = x1 + tx1*w 114 | # y1_true = y1 + ty1*h 115 | # x2_true = x2 + tx2*w 116 | # y2_true = y2 + ty2*h 117 | # below is just more compact form of this 118 | 119 | # are offsets always such that 120 | # x1 < x2 and y1 < y2 ? 
121 | 122 | translation = np.hstack([w, h, w, h])*offsets 123 | bboxes[:, 0:4] = bboxes[:, 0:4] + translation 124 | return bboxes 125 | 126 | 127 | def get_image_boxes(bounding_boxes, img, size=24): 128 | """Cut out boxes from the image. 129 | 130 | Arguments: 131 | bounding_boxes: a float numpy array of shape [n, 5]. 132 | img: an instance of PIL.Image. 133 | size: an integer, size of cutouts. 134 | 135 | Returns: 136 | a float numpy array of shape [n, 3, size, size]. 137 | """ 138 | 139 | num_boxes = len(bounding_boxes) 140 | width, height = img.size 141 | 142 | [dy, edy, dx, edx, y, ey, x, ex, w, h] = correct_bboxes(bounding_boxes, width, height) 143 | img_boxes = np.zeros((num_boxes, 3, size, size), 'float32') 144 | 145 | for i in range(num_boxes): 146 | img_box = np.zeros((h[i], w[i], 3), 'uint8') 147 | 148 | img_array = np.asarray(img, 'uint8') 149 | img_box[dy[i]:(edy[i] + 1), dx[i]:(edx[i] + 1), :] =\ 150 | img_array[y[i]:(ey[i] + 1), x[i]:(ex[i] + 1), :] 151 | 152 | # resize 153 | img_box = Image.fromarray(img_box) 154 | img_box = img_box.resize((size, size), Image.BILINEAR) 155 | img_box = np.asarray(img_box, 'float32') 156 | 157 | img_boxes[i, :, :, :] = _preprocess(img_box) 158 | 159 | return img_boxes 160 | 161 | 162 | def correct_bboxes(bboxes, width, height): 163 | """Crop boxes that are too big and get coordinates 164 | with respect to cutouts. 165 | 166 | Arguments: 167 | bboxes: a float numpy array of shape [n, 5], 168 | where each row is (xmin, ymin, xmax, ymax, score). 169 | width: a float number. 170 | height: a float number. 171 | 172 | Returns: 173 | dy, dx, edy, edx: a int numpy arrays of shape [n], 174 | coordinates of the boxes with respect to the cutouts. 175 | y, x, ey, ex: a int numpy arrays of shape [n], 176 | corrected ymin, xmin, ymax, xmax. 177 | h, w: a int numpy arrays of shape [n], 178 | just heights and widths of boxes. 179 | 180 | in the following order: 181 | [dy, edy, dx, edx, y, ey, x, ex, w, h]. 182 | """ 183 | 184 | x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] 185 | w, h = x2 - x1 + 1.0, y2 - y1 + 1.0 186 | num_boxes = bboxes.shape[0] 187 | 188 | # 'e' stands for end 189 | # (x, y) -> (ex, ey) 190 | x, y, ex, ey = x1, y1, x2, y2 191 | 192 | # we need to cut out a box from the image. 193 | # (x, y, ex, ey) are corrected coordinates of the box 194 | # in the image. 195 | # (dx, dy, edx, edy) are coordinates of the box in the cutout 196 | # from the image. 197 | dx, dy = np.zeros((num_boxes,)), np.zeros((num_boxes,)) 198 | edx, edy = w.copy() - 1.0, h.copy() - 1.0 199 | 200 | # if box's bottom right corner is too far right 201 | ind = np.where(ex > width - 1.0)[0] 202 | edx[ind] = w[ind] + width - 2.0 - ex[ind] 203 | ex[ind] = width - 1.0 204 | 205 | # if box's bottom right corner is too low 206 | ind = np.where(ey > height - 1.0)[0] 207 | edy[ind] = h[ind] + height - 2.0 - ey[ind] 208 | ey[ind] = height - 1.0 209 | 210 | # if box's top left corner is too far left 211 | ind = np.where(x < 0.0)[0] 212 | dx[ind] = 0.0 - x[ind] 213 | x[ind] = 0.0 214 | 215 | # if box's top left corner is too high 216 | ind = np.where(y < 0.0)[0] 217 | dy[ind] = 0.0 - y[ind] 218 | y[ind] = 0.0 219 | 220 | return_list = [dy, edy, dx, edx, y, ey, x, ex, w, h] 221 | return_list = [i.astype('int32') for i in return_list] 222 | 223 | return return_list 224 | 225 | 226 | def _preprocess(img): 227 | """Preprocessing step before feeding the network. 228 | 229 | Arguments: 230 | img: a float numpy array of shape [h, w, c]. 
231 | 232 | Returns: 233 | a float numpy array of shape [1, c, h, w]. 234 | """ 235 | img = img.transpose((2, 0, 1)) 236 | img = np.expand_dims(img, 0) 237 | img = (img - 127.5)*0.0078125 238 | return img 239 | -------------------------------------------------------------------------------- /src/detector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import Variable 4 | from .get_nets import PNet, RNet, ONet 5 | from .box_utils import nms, calibrate_box, get_image_boxes, convert_to_square 6 | from .first_stage import run_first_stage 7 | 8 | 9 | def detect_faces(image, min_face_size=20.0, 10 | thresholds=[0.6, 0.7, 0.8], 11 | nms_thresholds=[0.7, 0.7, 0.7]): 12 | """ 13 | Arguments: 14 | image: an instance of PIL.Image. 15 | min_face_size: a float number. 16 | thresholds: a list of length 3. 17 | nms_thresholds: a list of length 3. 18 | 19 | Returns: 20 | two float numpy arrays of shapes [n_boxes, 4] and [n_boxes, 10], 21 | bounding boxes and facial landmarks. 22 | """ 23 | 24 | # LOAD MODELS 25 | pnet = PNet() 26 | rnet = RNet() 27 | onet = ONet() 28 | onet.eval() 29 | 30 | # BUILD AN IMAGE PYRAMID 31 | width, height = image.size 32 | min_length = min(height, width) 33 | 34 | min_detection_size = 12 35 | factor = 0.707 # sqrt(0.5) 36 | 37 | # scales for scaling the image 38 | scales = [] 39 | 40 | # scales the image so that 41 | # minimum size that we can detect equals to 42 | # minimum face size that we want to detect 43 | m = min_detection_size/min_face_size 44 | min_length *= m 45 | 46 | factor_count = 0 47 | while min_length > min_detection_size: 48 | scales.append(m*factor**factor_count) 49 | min_length *= factor 50 | factor_count += 1 51 | 52 | # STAGE 1 53 | 54 | # it will be returned 55 | bounding_boxes = [] 56 | 57 | # run P-Net on different scales 58 | for s in scales: 59 | boxes = run_first_stage(image, pnet, scale=s, threshold=thresholds[0]) 60 | bounding_boxes.append(boxes) 61 | 62 | # collect boxes (and offsets, and scores) from different scales 63 | bounding_boxes = [i for i in bounding_boxes if i is not None] 64 | bounding_boxes = np.vstack(bounding_boxes) 65 | 66 | keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0]) 67 | bounding_boxes = bounding_boxes[keep] 68 | 69 | # use offsets predicted by pnet to transform bounding boxes 70 | bounding_boxes = calibrate_box(bounding_boxes[:, 0:5], bounding_boxes[:, 5:]) 71 | # shape [n_boxes, 5] 72 | 73 | bounding_boxes = convert_to_square(bounding_boxes) 74 | bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) 75 | 76 | # STAGE 2 77 | 78 | img_boxes = get_image_boxes(bounding_boxes, image, size=24) 79 | img_boxes = Variable(torch.FloatTensor(img_boxes), volatile=True) 80 | output = rnet(img_boxes) 81 | offsets = output[0].data.numpy() # shape [n_boxes, 4] 82 | probs = output[1].data.numpy() # shape [n_boxes, 2] 83 | 84 | keep = np.where(probs[:, 1] > thresholds[1])[0] 85 | bounding_boxes = bounding_boxes[keep] 86 | bounding_boxes[:, 4] = probs[keep, 1].reshape((-1,)) 87 | offsets = offsets[keep] 88 | 89 | keep = nms(bounding_boxes, nms_thresholds[1]) 90 | bounding_boxes = bounding_boxes[keep] 91 | bounding_boxes = calibrate_box(bounding_boxes, offsets[keep]) 92 | bounding_boxes = convert_to_square(bounding_boxes) 93 | bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) 94 | 95 | # STAGE 3 96 | 97 | img_boxes = get_image_boxes(bounding_boxes, image, size=48) 98 | if len(img_boxes) == 0: 99 | return [], [] 100 | img_boxes 
= Variable(torch.FloatTensor(img_boxes), volatile=True) 101 | output = onet(img_boxes) 102 | landmarks = output[0].data.numpy() # shape [n_boxes, 10] 103 | offsets = output[1].data.numpy() # shape [n_boxes, 4] 104 | probs = output[2].data.numpy() # shape [n_boxes, 2] 105 | 106 | keep = np.where(probs[:, 1] > thresholds[2])[0] 107 | bounding_boxes = bounding_boxes[keep] 108 | bounding_boxes[:, 4] = probs[keep, 1].reshape((-1,)) 109 | offsets = offsets[keep] 110 | landmarks = landmarks[keep] 111 | 112 | # compute landmark points 113 | width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0 114 | height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0 115 | xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1] 116 | landmarks[:, 0:5] = np.expand_dims(xmin, 1) + np.expand_dims(width, 1)*landmarks[:, 0:5] 117 | landmarks[:, 5:10] = np.expand_dims(ymin, 1) + np.expand_dims(height, 1)*landmarks[:, 5:10] 118 | 119 | bounding_boxes = calibrate_box(bounding_boxes, offsets) 120 | keep = nms(bounding_boxes, nms_thresholds[2], mode='min') 121 | bounding_boxes = bounding_boxes[keep] 122 | landmarks = landmarks[keep] 123 | 124 | return bounding_boxes, landmarks 125 | -------------------------------------------------------------------------------- /src/first_stage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import math 4 | from PIL import Image 5 | import numpy as np 6 | from .box_utils import nms, _preprocess 7 | 8 | 9 | def run_first_stage(image, net, scale, threshold): 10 | """Run P-Net, generate bounding boxes, and do NMS. 11 | 12 | Arguments: 13 | image: an instance of PIL.Image. 14 | net: an instance of pytorch's nn.Module, P-Net. 15 | scale: a float number, 16 | scale width and height of the image by this number. 17 | threshold: a float number, 18 | threshold on the probability of a face when generating 19 | bounding boxes from predictions of the net. 20 | 21 | Returns: 22 | a float numpy array of shape [n_boxes, 9], 23 | bounding boxes with scores and offsets (4 + 1 + 4). 24 | """ 25 | 26 | # scale the image and convert it to a float array 27 | width, height = image.size 28 | sw, sh = math.ceil(width*scale), math.ceil(height*scale) 29 | img = image.resize((sw, sh), Image.BILINEAR) 30 | img = np.asarray(img, 'float32') 31 | 32 | img = Variable(torch.FloatTensor(_preprocess(img)), volatile=True) 33 | output = net(img) 34 | probs = output[1].data.numpy()[0, 1, :, :] 35 | offsets = output[0].data.numpy() 36 | # probs: probability of a face at each sliding window 37 | # offsets: transformations to true bounding boxes 38 | 39 | boxes = _generate_bboxes(probs, offsets, scale, threshold) 40 | if len(boxes) == 0: 41 | return None 42 | 43 | keep = nms(boxes[:, 0:5], overlap_threshold=0.5) 44 | return boxes[keep] 45 | 46 | 47 | def _generate_bboxes(probs, offsets, scale, threshold): 48 | """Generate bounding boxes at places 49 | where there is probably a face. 50 | 51 | Arguments: 52 | probs: a float numpy array of shape [n, m]. 53 | offsets: a float numpy array of shape [1, 4, n, m]. 54 | scale: a float number, 55 | width and height of the image were scaled by this number. 56 | threshold: a float number. 
57 | 58 | Returns: 59 | a float numpy array of shape [n_boxes, 9] 60 | """ 61 | 62 | # applying P-Net is equivalent, in some sense, to 63 | # moving 12x12 window with stride 2 64 | stride = 2 65 | cell_size = 12 66 | 67 | # indices of boxes where there is probably a face 68 | inds = np.where(probs > threshold) 69 | 70 | if inds[0].size == 0: 71 | return np.array([]) 72 | 73 | # transformations of bounding boxes 74 | tx1, ty1, tx2, ty2 = [offsets[0, i, inds[0], inds[1]] for i in range(4)] 75 | # they are defined as: 76 | # w = x2 - x1 + 1 77 | # h = y2 - y1 + 1 78 | # x1_true = x1 + tx1*w 79 | # x2_true = x2 + tx2*w 80 | # y1_true = y1 + ty1*h 81 | # y2_true = y2 + ty2*h 82 | 83 | offsets = np.array([tx1, ty1, tx2, ty2]) 84 | score = probs[inds[0], inds[1]] 85 | 86 | # P-Net is applied to scaled images 87 | # so we need to rescale bounding boxes back 88 | bounding_boxes = np.vstack([ 89 | np.round((stride*inds[1] + 1.0)/scale), 90 | np.round((stride*inds[0] + 1.0)/scale), 91 | np.round((stride*inds[1] + 1.0 + cell_size)/scale), 92 | np.round((stride*inds[0] + 1.0 + cell_size)/scale), 93 | score, offsets 94 | ]) 95 | # why one is added? 96 | 97 | return bounding_boxes.T 98 | -------------------------------------------------------------------------------- /src/get_nets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import numpy as np 6 | 7 | 8 | class Flatten(nn.Module): 9 | 10 | def __init__(self): 11 | super(Flatten, self).__init__() 12 | 13 | def forward(self, x): 14 | """ 15 | Arguments: 16 | x: a float tensor with shape [batch_size, c, h, w]. 17 | Returns: 18 | a float tensor with shape [batch_size, c*h*w]. 19 | """ 20 | 21 | # without this pretrained model isn't working 22 | x = x.transpose(3, 2).contiguous() 23 | 24 | return x.view(x.size(0), -1) 25 | 26 | 27 | class PNet(nn.Module): 28 | 29 | def __init__(self): 30 | 31 | super(PNet, self).__init__() 32 | 33 | # suppose we have input with size HxW, then 34 | # after first layer: H - 2, 35 | # after pool: ceil((H - 2)/2), 36 | # after second conv: ceil((H - 2)/2) - 2, 37 | # after last conv: ceil((H - 2)/2) - 4, 38 | # and the same for W 39 | 40 | self.features = nn.Sequential(OrderedDict([ 41 | ('conv1', nn.Conv2d(3, 10, 3, 1)), 42 | ('prelu1', nn.PReLU(10)), 43 | ('pool1', nn.MaxPool2d(2, 2, ceil_mode=True)), 44 | 45 | ('conv2', nn.Conv2d(10, 16, 3, 1)), 46 | ('prelu2', nn.PReLU(16)), 47 | 48 | ('conv3', nn.Conv2d(16, 32, 3, 1)), 49 | ('prelu3', nn.PReLU(32)) 50 | ])) 51 | 52 | self.conv4_1 = nn.Conv2d(32, 2, 1, 1) 53 | self.conv4_2 = nn.Conv2d(32, 4, 1, 1) 54 | 55 | weights = np.load('src/weights/pnet.npy')[()] 56 | for n, p in self.named_parameters(): 57 | p.data = torch.FloatTensor(weights[n]) 58 | 59 | def forward(self, x): 60 | """ 61 | Arguments: 62 | x: a float tensor with shape [batch_size, 3, h, w]. 63 | Returns: 64 | b: a float tensor with shape [batch_size, 4, h', w']. 65 | a: a float tensor with shape [batch_size, 2, h', w']. 
66 | """ 67 | x = self.features(x) 68 | a = self.conv4_1(x) 69 | b = self.conv4_2(x) 70 | a = F.softmax(a) 71 | return b, a 72 | 73 | 74 | class RNet(nn.Module): 75 | 76 | def __init__(self): 77 | 78 | super(RNet, self).__init__() 79 | 80 | self.features = nn.Sequential(OrderedDict([ 81 | ('conv1', nn.Conv2d(3, 28, 3, 1)), 82 | ('prelu1', nn.PReLU(28)), 83 | ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), 84 | 85 | ('conv2', nn.Conv2d(28, 48, 3, 1)), 86 | ('prelu2', nn.PReLU(48)), 87 | ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), 88 | 89 | ('conv3', nn.Conv2d(48, 64, 2, 1)), 90 | ('prelu3', nn.PReLU(64)), 91 | 92 | ('flatten', Flatten()), 93 | ('conv4', nn.Linear(576, 128)), 94 | ('prelu4', nn.PReLU(128)) 95 | ])) 96 | 97 | self.conv5_1 = nn.Linear(128, 2) 98 | self.conv5_2 = nn.Linear(128, 4) 99 | 100 | weights = np.load('src/weights/rnet.npy')[()] 101 | for n, p in self.named_parameters(): 102 | p.data = torch.FloatTensor(weights[n]) 103 | 104 | def forward(self, x): 105 | """ 106 | Arguments: 107 | x: a float tensor with shape [batch_size, 3, h, w]. 108 | Returns: 109 | b: a float tensor with shape [batch_size, 4]. 110 | a: a float tensor with shape [batch_size, 2]. 111 | """ 112 | x = self.features(x) 113 | a = self.conv5_1(x) 114 | b = self.conv5_2(x) 115 | a = F.softmax(a) 116 | return b, a 117 | 118 | 119 | class ONet(nn.Module): 120 | 121 | def __init__(self): 122 | 123 | super(ONet, self).__init__() 124 | 125 | self.features = nn.Sequential(OrderedDict([ 126 | ('conv1', nn.Conv2d(3, 32, 3, 1)), 127 | ('prelu1', nn.PReLU(32)), 128 | ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), 129 | 130 | ('conv2', nn.Conv2d(32, 64, 3, 1)), 131 | ('prelu2', nn.PReLU(64)), 132 | ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), 133 | 134 | ('conv3', nn.Conv2d(64, 64, 3, 1)), 135 | ('prelu3', nn.PReLU(64)), 136 | ('pool3', nn.MaxPool2d(2, 2, ceil_mode=True)), 137 | 138 | ('conv4', nn.Conv2d(64, 128, 2, 1)), 139 | ('prelu4', nn.PReLU(128)), 140 | 141 | ('flatten', Flatten()), 142 | ('conv5', nn.Linear(1152, 256)), 143 | ('drop5', nn.Dropout(0.25)), 144 | ('prelu5', nn.PReLU(256)), 145 | ])) 146 | 147 | self.conv6_1 = nn.Linear(256, 2) 148 | self.conv6_2 = nn.Linear(256, 4) 149 | self.conv6_3 = nn.Linear(256, 10) 150 | 151 | weights = np.load('src/weights/onet.npy')[()] 152 | for n, p in self.named_parameters(): 153 | p.data = torch.FloatTensor(weights[n]) 154 | 155 | def forward(self, x): 156 | """ 157 | Arguments: 158 | x: a float tensor with shape [batch_size, 3, h, w]. 159 | Returns: 160 | c: a float tensor with shape [batch_size, 10]. 161 | b: a float tensor with shape [batch_size, 4]. 162 | a: a float tensor with shape [batch_size, 2]. 163 | """ 164 | x = self.features(x) 165 | a = self.conv6_1(x) 166 | b = self.conv6_2(x) 167 | c = self.conv6_3(x) 168 | a = F.softmax(a) 169 | return c, b, a 170 | -------------------------------------------------------------------------------- /src/visualization_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import ImageDraw 2 | 3 | 4 | def show_bboxes(img, bounding_boxes, facial_landmarks=[]): 5 | """Draw bounding boxes and facial landmarks. 6 | 7 | Arguments: 8 | img: an instance of PIL.Image. 9 | bounding_boxes: a float numpy array of shape [n, 5]. 10 | facial_landmarks: a float numpy array of shape [n, 10]. 11 | 12 | Returns: 13 | an instance of PIL.Image. 
14 | """ 15 | 16 | img_copy = img.copy() 17 | draw = ImageDraw.Draw(img_copy) 18 | 19 | for b in bounding_boxes: 20 | draw.rectangle([ 21 | (b[0], b[1]), (b[2], b[3]) 22 | ], outline='white') 23 | 24 | for p in facial_landmarks: 25 | for i in range(5): 26 | draw.ellipse([ 27 | (p[i] - 1.0, p[i + 5] - 1.0), 28 | (p[i] + 1.0, p[i + 5] + 1.0) 29 | ], outline='blue') 30 | 31 | return img_copy 32 | -------------------------------------------------------------------------------- /src/weights/onet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/src/weights/onet.npy -------------------------------------------------------------------------------- /src/weights/pnet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/src/weights/pnet.npy -------------------------------------------------------------------------------- /src/weights/rnet.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/mtcnn-pytorch/45b34462fc995e6b8dbd17545b799e8c8a30026b/src/weights/rnet.npy --------------------------------------------------------------------------------