├── .idea
│ ├── facemask.iml
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ └── modules.xml
├── README.md
├── detect_mask_video.py
├── face_detector
│ ├── deploy.prototxt
│ └── res10_300x300_ssd_iter_140000.caffemodel
├── mask_detector.model
├── plot.png
├── requirements.txt
└── train_mask_detector.py
/.idea/facemask.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Facemask Detection using MobileNet
2 | 
3 | Real-time face mask detection with OpenCV and TensorFlow/Keras. `train_mask_detector.py` fine-tunes a MobileNetV2 classifier on a two-class (with_mask / without_mask) image dataset, and `detect_mask_video.py` combines the trained classifier with a pre-trained SSD face detector to label faces in a live webcam stream.
4 |
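5 | ## Dataset
6 |
7 | `train_mask_detector.py` expects a `dataset/` directory (the path is hard-coded near the top of the script) with one sub-folder per class:
8 |
9 | ```
10 | dataset/
11 | ├── with_mask/
12 | └── without_mask/
13 | ```
14 |
15 | ## Usage
16 |
17 | Install the pinned dependencies with `pip install -r requirements.txt`, train with `python train_mask_detector.py` (this writes `mask_detector.model` and `plot.png`), then start the webcam detector with `python detect_mask_video.py`; press `q` to quit. Both scripts are run with no arguments.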
--------------------------------------------------------------------------------
/detect_mask_video.py:
--------------------------------------------------------------------------------
1 | # import the required packages
2 | from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
3 | from tensorflow.keras.preprocessing.image import img_to_array
4 | from tensorflow.keras.models import load_model
5 | from imutils.video import VideoStream
6 | import numpy as np
7 | import imutils
8 | import cv2
9 |
10 |
11 | def detect_and_predict_mask(frame, faceNet, maskNet):
12 | # grab the frame dimensions and build a blob for the face detector
13 | (h, w) = frame.shape[:2]
14 | blob = cv2.dnn.blobFromImage(frame, 1.0, (224, 224),
15 | (104.0, 177.0, 123.0))  # per-channel (BGR) mean values subtracted by blobFromImage
16 | faceNet.setInput(blob)
17 | detections = faceNet.forward()
18 | print(detections.shape)  # debug: (1, 1, N, 7); each row is [image_id, label, conf, startX, startY, endX, endY], box coords normalized
19 |
20 | # initialize the lists of faces, their locations, and predictions
21 | faces = []
22 | local_points = []
23 | prediction = []
24 |
25 | # loop over the detections
26 | for i in range(0, detections.shape[2]):
27 | # extract the confidence (probability) associated with the detection
28 | confidence = detections[0, 0, i, 2]
29 |
30 | # filter out weak detections
31 | if confidence > 0.5:
32 | # compute the (x, y) coordinates of the bounding box
33 | bounding_box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
34 | (start_x, start_y, end_x, end_y) = bounding_box.astype("int")
35 |
36 | # clamp the bounding box to the frame dimensions
37 | (start_x, start_y) = (max(0, start_x), max(0, start_y))
38 | (end_x, end_y) = (min(w - 1, end_x), min(h - 1, end_y))
39 |
40 | # extract the face ROI, convert BGR to RGB, resize to 224x224, and preprocess
41 | face = frame[start_y:end_y, start_x:end_x]
42 | face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
43 | face = cv2.resize(face, (224, 224))
44 | face = img_to_array(face)
45 | face = preprocess_input(face)
46 |
47 | # add the face and its bounding box to their respective lists
48 | faces.append(face)
49 | local_points.append((start_x, start_y, end_x, end_y))
50 |
51 | # only make predictions if at least one face was detected
52 | if len(faces) > 0:
53 | # for faster inference we'll make batch predictions on *all*
54 | # faces at the same time rather than one-by-one predictions
55 | # in the above `for` loop
56 | faces = np.array(faces, dtype="float32")
57 | prediction = maskNet.predict(faces, batch_size=32)
58 |
59 | # return a 2-tuple of the face locations and their corresponding predictions
60 | return local_points, prediction
61 |
62 |
63 | # load our serialized face detector model from disk
64 | prototxtPath = "face_detector/deploy.prototxt"  # forward slashes work on both Windows and POSIX
65 | weightsPath = "face_detector/res10_300x300_ssd_iter_140000.caffemodel"
66 | faceNet = cv2.dnn.readNet(prototxtPath, weightsPath)
67 |
68 | # load the trained face mask classifier from disk
69 | maskNet = load_model("mask_detector.model")
70 |
71 | # starting the video stream
72 | print("starting video stream...")
73 | vs = VideoStream(src=0).start()
74 |
75 | # loop over frames from the video stream
76 | while True:
77 | # grab the frame from the threaded video stream and resize it
78 | frame = vs.read()
79 | frame = imutils.resize(frame, width=400)
80 |
81 | # detect faces in the frame and predict mask / no mask for each one
82 | (locs, preds) = detect_and_predict_mask(frame, faceNet, maskNet)
83 |
84 | # loop over the detected face locations and their corresponding predictions
85 | for (box, pred) in zip(locs, preds):
86 | (startX, startY, endX, endY) = box
87 | (mask, withoutMask) = pred
88 |
89 | # determine the class label and the color used to draw the box and text
90 | label = "Mask" if mask > withoutMask else "No Mask"
91 | color = (0, 255, 0) if label == "Mask" else (0, 0, 255)
92 |
93 | # include the probability in the label
94 | label = "{}: {:.2f}%".format(label, max(mask, withoutMask) * 100)
95 |
96 | # display the label and bounding box
97 | cv2.putText(frame, label, (startX, startY - 10),
98 | cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 2)
99 | cv2.rectangle(frame, (startX, startY), (endX, endY), color, 2)
100 |
101 | # show the output frame
102 | cv2.imshow("Frame", frame)
103 | key = cv2.waitKey(1) & 0xFF
104 |
105 | # if the 'q' key was pressed, break from the loop
106 | if key == ord("q"):
107 | break
108 |
109 | # clean up: close windows and stop the video stream
110 | cv2.destroyAllWindows()
111 | vs.stop()
112 |
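113 | # Minimal single-image sanity check (sketch; "example.jpg" is a placeholder path,
114 | # not part of the repo):
115 | #   frame = cv2.imread("example.jpg")
116 | #   (locs, preds) = detect_and_predict_mask(frame, faceNet, maskNet)
117 | #   print(locs, preds)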
--------------------------------------------------------------------------------
/face_detector/deploy.prototxt:
--------------------------------------------------------------------------------
1 | input: "data"
2 | input_shape {
3 | dim: 1
4 | dim: 3
5 | dim: 300
6 | dim: 300
7 | }
8 |
9 | layer {
10 | name: "data_bn"
11 | type: "BatchNorm"
12 | bottom: "data"
13 | top: "data_bn"
14 | param {
15 | lr_mult: 0.0
16 | }
17 | param {
18 | lr_mult: 0.0
19 | }
20 | param {
21 | lr_mult: 0.0
22 | }
23 | }
24 | layer {
25 | name: "data_scale"
26 | type: "Scale"
27 | bottom: "data_bn"
28 | top: "data_bn"
29 | param {
30 | lr_mult: 1.0
31 | decay_mult: 1.0
32 | }
33 | param {
34 | lr_mult: 2.0
35 | decay_mult: 1.0
36 | }
37 | scale_param {
38 | bias_term: true
39 | }
40 | }
41 | layer {
42 | name: "conv1_h"
43 | type: "Convolution"
44 | bottom: "data_bn"
45 | top: "conv1_h"
46 | param {
47 | lr_mult: 1.0
48 | decay_mult: 1.0
49 | }
50 | param {
51 | lr_mult: 2.0
52 | decay_mult: 1.0
53 | }
54 | convolution_param {
55 | num_output: 32
56 | pad: 3
57 | kernel_size: 7
58 | stride: 2
59 | weight_filler {
60 | type: "msra"
61 | variance_norm: FAN_OUT
62 | }
63 | bias_filler {
64 | type: "constant"
65 | value: 0.0
66 | }
67 | }
68 | }
69 | layer {
70 | name: "conv1_bn_h"
71 | type: "BatchNorm"
72 | bottom: "conv1_h"
73 | top: "conv1_h"
74 | param {
75 | lr_mult: 0.0
76 | }
77 | param {
78 | lr_mult: 0.0
79 | }
80 | param {
81 | lr_mult: 0.0
82 | }
83 | }
84 | layer {
85 | name: "conv1_scale_h"
86 | type: "Scale"
87 | bottom: "conv1_h"
88 | top: "conv1_h"
89 | param {
90 | lr_mult: 1.0
91 | decay_mult: 1.0
92 | }
93 | param {
94 | lr_mult: 2.0
95 | decay_mult: 1.0
96 | }
97 | scale_param {
98 | bias_term: true
99 | }
100 | }
101 | layer {
102 | name: "conv1_relu"
103 | type: "ReLU"
104 | bottom: "conv1_h"
105 | top: "conv1_h"
106 | }
107 | layer {
108 | name: "conv1_pool"
109 | type: "Pooling"
110 | bottom: "conv1_h"
111 | top: "conv1_pool"
112 | pooling_param {
113 | kernel_size: 3
114 | stride: 2
115 | }
116 | }
117 | layer {
118 | name: "layer_64_1_conv1_h"
119 | type: "Convolution"
120 | bottom: "conv1_pool"
121 | top: "layer_64_1_conv1_h"
122 | param {
123 | lr_mult: 1.0
124 | decay_mult: 1.0
125 | }
126 | convolution_param {
127 | num_output: 32
128 | bias_term: false
129 | pad: 1
130 | kernel_size: 3
131 | stride: 1
132 | weight_filler {
133 | type: "msra"
134 | }
135 | bias_filler {
136 | type: "constant"
137 | value: 0.0
138 | }
139 | }
140 | }
141 | layer {
142 | name: "layer_64_1_bn2_h"
143 | type: "BatchNorm"
144 | bottom: "layer_64_1_conv1_h"
145 | top: "layer_64_1_conv1_h"
146 | param {
147 | lr_mult: 0.0
148 | }
149 | param {
150 | lr_mult: 0.0
151 | }
152 | param {
153 | lr_mult: 0.0
154 | }
155 | }
156 | layer {
157 | name: "layer_64_1_scale2_h"
158 | type: "Scale"
159 | bottom: "layer_64_1_conv1_h"
160 | top: "layer_64_1_conv1_h"
161 | param {
162 | lr_mult: 1.0
163 | decay_mult: 1.0
164 | }
165 | param {
166 | lr_mult: 2.0
167 | decay_mult: 1.0
168 | }
169 | scale_param {
170 | bias_term: true
171 | }
172 | }
173 | layer {
174 | name: "layer_64_1_relu2"
175 | type: "ReLU"
176 | bottom: "layer_64_1_conv1_h"
177 | top: "layer_64_1_conv1_h"
178 | }
179 | layer {
180 | name: "layer_64_1_conv2_h"
181 | type: "Convolution"
182 | bottom: "layer_64_1_conv1_h"
183 | top: "layer_64_1_conv2_h"
184 | param {
185 | lr_mult: 1.0
186 | decay_mult: 1.0
187 | }
188 | convolution_param {
189 | num_output: 32
190 | bias_term: false
191 | pad: 1
192 | kernel_size: 3
193 | stride: 1
194 | weight_filler {
195 | type: "msra"
196 | }
197 | bias_filler {
198 | type: "constant"
199 | value: 0.0
200 | }
201 | }
202 | }
203 | layer {
204 | name: "layer_64_1_sum"
205 | type: "Eltwise"
206 | bottom: "layer_64_1_conv2_h"
207 | bottom: "conv1_pool"
208 | top: "layer_64_1_sum"
209 | }
210 | layer {
211 | name: "layer_128_1_bn1_h"
212 | type: "BatchNorm"
213 | bottom: "layer_64_1_sum"
214 | top: "layer_128_1_bn1_h"
215 | param {
216 | lr_mult: 0.0
217 | }
218 | param {
219 | lr_mult: 0.0
220 | }
221 | param {
222 | lr_mult: 0.0
223 | }
224 | }
225 | layer {
226 | name: "layer_128_1_scale1_h"
227 | type: "Scale"
228 | bottom: "layer_128_1_bn1_h"
229 | top: "layer_128_1_bn1_h"
230 | param {
231 | lr_mult: 1.0
232 | decay_mult: 1.0
233 | }
234 | param {
235 | lr_mult: 2.0
236 | decay_mult: 1.0
237 | }
238 | scale_param {
239 | bias_term: true
240 | }
241 | }
242 | layer {
243 | name: "layer_128_1_relu1"
244 | type: "ReLU"
245 | bottom: "layer_128_1_bn1_h"
246 | top: "layer_128_1_bn1_h"
247 | }
248 | layer {
249 | name: "layer_128_1_conv1_h"
250 | type: "Convolution"
251 | bottom: "layer_128_1_bn1_h"
252 | top: "layer_128_1_conv1_h"
253 | param {
254 | lr_mult: 1.0
255 | decay_mult: 1.0
256 | }
257 | convolution_param {
258 | num_output: 128
259 | bias_term: false
260 | pad: 1
261 | kernel_size: 3
262 | stride: 2
263 | weight_filler {
264 | type: "msra"
265 | }
266 | bias_filler {
267 | type: "constant"
268 | value: 0.0
269 | }
270 | }
271 | }
272 | layer {
273 | name: "layer_128_1_bn2"
274 | type: "BatchNorm"
275 | bottom: "layer_128_1_conv1_h"
276 | top: "layer_128_1_conv1_h"
277 | param {
278 | lr_mult: 0.0
279 | }
280 | param {
281 | lr_mult: 0.0
282 | }
283 | param {
284 | lr_mult: 0.0
285 | }
286 | }
287 | layer {
288 | name: "layer_128_1_scale2"
289 | type: "Scale"
290 | bottom: "layer_128_1_conv1_h"
291 | top: "layer_128_1_conv1_h"
292 | param {
293 | lr_mult: 1.0
294 | decay_mult: 1.0
295 | }
296 | param {
297 | lr_mult: 2.0
298 | decay_mult: 1.0
299 | }
300 | scale_param {
301 | bias_term: true
302 | }
303 | }
304 | layer {
305 | name: "layer_128_1_relu2"
306 | type: "ReLU"
307 | bottom: "layer_128_1_conv1_h"
308 | top: "layer_128_1_conv1_h"
309 | }
310 | layer {
311 | name: "layer_128_1_conv2"
312 | type: "Convolution"
313 | bottom: "layer_128_1_conv1_h"
314 | top: "layer_128_1_conv2"
315 | param {
316 | lr_mult: 1.0
317 | decay_mult: 1.0
318 | }
319 | convolution_param {
320 | num_output: 128
321 | bias_term: false
322 | pad: 1
323 | kernel_size: 3
324 | stride: 1
325 | weight_filler {
326 | type: "msra"
327 | }
328 | bias_filler {
329 | type: "constant"
330 | value: 0.0
331 | }
332 | }
333 | }
334 | layer {
335 | name: "layer_128_1_conv_expand_h"
336 | type: "Convolution"
337 | bottom: "layer_128_1_bn1_h"
338 | top: "layer_128_1_conv_expand_h"
339 | param {
340 | lr_mult: 1.0
341 | decay_mult: 1.0
342 | }
343 | convolution_param {
344 | num_output: 128
345 | bias_term: false
346 | pad: 0
347 | kernel_size: 1
348 | stride: 2
349 | weight_filler {
350 | type: "msra"
351 | }
352 | bias_filler {
353 | type: "constant"
354 | value: 0.0
355 | }
356 | }
357 | }
358 | layer {
359 | name: "layer_128_1_sum"
360 | type: "Eltwise"
361 | bottom: "layer_128_1_conv2"
362 | bottom: "layer_128_1_conv_expand_h"
363 | top: "layer_128_1_sum"
364 | }
365 | layer {
366 | name: "layer_256_1_bn1"
367 | type: "BatchNorm"
368 | bottom: "layer_128_1_sum"
369 | top: "layer_256_1_bn1"
370 | param {
371 | lr_mult: 0.0
372 | }
373 | param {
374 | lr_mult: 0.0
375 | }
376 | param {
377 | lr_mult: 0.0
378 | }
379 | }
380 | layer {
381 | name: "layer_256_1_scale1"
382 | type: "Scale"
383 | bottom: "layer_256_1_bn1"
384 | top: "layer_256_1_bn1"
385 | param {
386 | lr_mult: 1.0
387 | decay_mult: 1.0
388 | }
389 | param {
390 | lr_mult: 2.0
391 | decay_mult: 1.0
392 | }
393 | scale_param {
394 | bias_term: true
395 | }
396 | }
397 | layer {
398 | name: "layer_256_1_relu1"
399 | type: "ReLU"
400 | bottom: "layer_256_1_bn1"
401 | top: "layer_256_1_bn1"
402 | }
403 | layer {
404 | name: "layer_256_1_conv1"
405 | type: "Convolution"
406 | bottom: "layer_256_1_bn1"
407 | top: "layer_256_1_conv1"
408 | param {
409 | lr_mult: 1.0
410 | decay_mult: 1.0
411 | }
412 | convolution_param {
413 | num_output: 256
414 | bias_term: false
415 | pad: 1
416 | kernel_size: 3
417 | stride: 2
418 | weight_filler {
419 | type: "msra"
420 | }
421 | bias_filler {
422 | type: "constant"
423 | value: 0.0
424 | }
425 | }
426 | }
427 | layer {
428 | name: "layer_256_1_bn2"
429 | type: "BatchNorm"
430 | bottom: "layer_256_1_conv1"
431 | top: "layer_256_1_conv1"
432 | param {
433 | lr_mult: 0.0
434 | }
435 | param {
436 | lr_mult: 0.0
437 | }
438 | param {
439 | lr_mult: 0.0
440 | }
441 | }
442 | layer {
443 | name: "layer_256_1_scale2"
444 | type: "Scale"
445 | bottom: "layer_256_1_conv1"
446 | top: "layer_256_1_conv1"
447 | param {
448 | lr_mult: 1.0
449 | decay_mult: 1.0
450 | }
451 | param {
452 | lr_mult: 2.0
453 | decay_mult: 1.0
454 | }
455 | scale_param {
456 | bias_term: true
457 | }
458 | }
459 | layer {
460 | name: "layer_256_1_relu2"
461 | type: "ReLU"
462 | bottom: "layer_256_1_conv1"
463 | top: "layer_256_1_conv1"
464 | }
465 | layer {
466 | name: "layer_256_1_conv2"
467 | type: "Convolution"
468 | bottom: "layer_256_1_conv1"
469 | top: "layer_256_1_conv2"
470 | param {
471 | lr_mult: 1.0
472 | decay_mult: 1.0
473 | }
474 | convolution_param {
475 | num_output: 256
476 | bias_term: false
477 | pad: 1
478 | kernel_size: 3
479 | stride: 1
480 | weight_filler {
481 | type: "msra"
482 | }
483 | bias_filler {
484 | type: "constant"
485 | value: 0.0
486 | }
487 | }
488 | }
489 | layer {
490 | name: "layer_256_1_conv_expand"
491 | type: "Convolution"
492 | bottom: "layer_256_1_bn1"
493 | top: "layer_256_1_conv_expand"
494 | param {
495 | lr_mult: 1.0
496 | decay_mult: 1.0
497 | }
498 | convolution_param {
499 | num_output: 256
500 | bias_term: false
501 | pad: 0
502 | kernel_size: 1
503 | stride: 2
504 | weight_filler {
505 | type: "msra"
506 | }
507 | bias_filler {
508 | type: "constant"
509 | value: 0.0
510 | }
511 | }
512 | }
513 | layer {
514 | name: "layer_256_1_sum"
515 | type: "Eltwise"
516 | bottom: "layer_256_1_conv2"
517 | bottom: "layer_256_1_conv_expand"
518 | top: "layer_256_1_sum"
519 | }
520 | layer {
521 | name: "layer_512_1_bn1"
522 | type: "BatchNorm"
523 | bottom: "layer_256_1_sum"
524 | top: "layer_512_1_bn1"
525 | param {
526 | lr_mult: 0.0
527 | }
528 | param {
529 | lr_mult: 0.0
530 | }
531 | param {
532 | lr_mult: 0.0
533 | }
534 | }
535 | layer {
536 | name: "layer_512_1_scale1"
537 | type: "Scale"
538 | bottom: "layer_512_1_bn1"
539 | top: "layer_512_1_bn1"
540 | param {
541 | lr_mult: 1.0
542 | decay_mult: 1.0
543 | }
544 | param {
545 | lr_mult: 2.0
546 | decay_mult: 1.0
547 | }
548 | scale_param {
549 | bias_term: true
550 | }
551 | }
552 | layer {
553 | name: "layer_512_1_relu1"
554 | type: "ReLU"
555 | bottom: "layer_512_1_bn1"
556 | top: "layer_512_1_bn1"
557 | }
558 | layer {
559 | name: "layer_512_1_conv1_h"
560 | type: "Convolution"
561 | bottom: "layer_512_1_bn1"
562 | top: "layer_512_1_conv1_h"
563 | param {
564 | lr_mult: 1.0
565 | decay_mult: 1.0
566 | }
567 | convolution_param {
568 | num_output: 128
569 | bias_term: false
570 | pad: 1
571 | kernel_size: 3
572 | stride: 1 # 2
573 | weight_filler {
574 | type: "msra"
575 | }
576 | bias_filler {
577 | type: "constant"
578 | value: 0.0
579 | }
580 | }
581 | }
582 | layer {
583 | name: "layer_512_1_bn2_h"
584 | type: "BatchNorm"
585 | bottom: "layer_512_1_conv1_h"
586 | top: "layer_512_1_conv1_h"
587 | param {
588 | lr_mult: 0.0
589 | }
590 | param {
591 | lr_mult: 0.0
592 | }
593 | param {
594 | lr_mult: 0.0
595 | }
596 | }
597 | layer {
598 | name: "layer_512_1_scale2_h"
599 | type: "Scale"
600 | bottom: "layer_512_1_conv1_h"
601 | top: "layer_512_1_conv1_h"
602 | param {
603 | lr_mult: 1.0
604 | decay_mult: 1.0
605 | }
606 | param {
607 | lr_mult: 2.0
608 | decay_mult: 1.0
609 | }
610 | scale_param {
611 | bias_term: true
612 | }
613 | }
614 | layer {
615 | name: "layer_512_1_relu2"
616 | type: "ReLU"
617 | bottom: "layer_512_1_conv1_h"
618 | top: "layer_512_1_conv1_h"
619 | }
620 | layer {
621 | name: "layer_512_1_conv2_h"
622 | type: "Convolution"
623 | bottom: "layer_512_1_conv1_h"
624 | top: "layer_512_1_conv2_h"
625 | param {
626 | lr_mult: 1.0
627 | decay_mult: 1.0
628 | }
629 | convolution_param {
630 | num_output: 256
631 | bias_term: false
632 | pad: 2 # 1
633 | kernel_size: 3
634 | stride: 1
635 | dilation: 2
636 | weight_filler {
637 | type: "msra"
638 | }
639 | bias_filler {
640 | type: "constant"
641 | value: 0.0
642 | }
643 | }
644 | }
645 | layer {
646 | name: "layer_512_1_conv_expand_h"
647 | type: "Convolution"
648 | bottom: "layer_512_1_bn1"
649 | top: "layer_512_1_conv_expand_h"
650 | param {
651 | lr_mult: 1.0
652 | decay_mult: 1.0
653 | }
654 | convolution_param {
655 | num_output: 256
656 | bias_term: false
657 | pad: 0
658 | kernel_size: 1
659 | stride: 1 # 2
660 | weight_filler {
661 | type: "msra"
662 | }
663 | bias_filler {
664 | type: "constant"
665 | value: 0.0
666 | }
667 | }
668 | }
669 | layer {
670 | name: "layer_512_1_sum"
671 | type: "Eltwise"
672 | bottom: "layer_512_1_conv2_h"
673 | bottom: "layer_512_1_conv_expand_h"
674 | top: "layer_512_1_sum"
675 | }
676 | layer {
677 | name: "last_bn_h"
678 | type: "BatchNorm"
679 | bottom: "layer_512_1_sum"
680 | top: "layer_512_1_sum"
681 | param {
682 | lr_mult: 0.0
683 | }
684 | param {
685 | lr_mult: 0.0
686 | }
687 | param {
688 | lr_mult: 0.0
689 | }
690 | }
691 | layer {
692 | name: "last_scale_h"
693 | type: "Scale"
694 | bottom: "layer_512_1_sum"
695 | top: "layer_512_1_sum"
696 | param {
697 | lr_mult: 1.0
698 | decay_mult: 1.0
699 | }
700 | param {
701 | lr_mult: 2.0
702 | decay_mult: 1.0
703 | }
704 | scale_param {
705 | bias_term: true
706 | }
707 | }
708 | layer {
709 | name: "last_relu"
710 | type: "ReLU"
711 | bottom: "layer_512_1_sum"
712 | top: "fc7"
713 | }
714 |
715 | layer {
716 | name: "conv6_1_h"
717 | type: "Convolution"
718 | bottom: "fc7"
719 | top: "conv6_1_h"
720 | param {
721 | lr_mult: 1
722 | decay_mult: 1
723 | }
724 | param {
725 | lr_mult: 2
726 | decay_mult: 0
727 | }
728 | convolution_param {
729 | num_output: 128
730 | pad: 0
731 | kernel_size: 1
732 | stride: 1
733 | weight_filler {
734 | type: "xavier"
735 | }
736 | bias_filler {
737 | type: "constant"
738 | value: 0
739 | }
740 | }
741 | }
742 | layer {
743 | name: "conv6_1_relu"
744 | type: "ReLU"
745 | bottom: "conv6_1_h"
746 | top: "conv6_1_h"
747 | }
748 | layer {
749 | name: "conv6_2_h"
750 | type: "Convolution"
751 | bottom: "conv6_1_h"
752 | top: "conv6_2_h"
753 | param {
754 | lr_mult: 1
755 | decay_mult: 1
756 | }
757 | param {
758 | lr_mult: 2
759 | decay_mult: 0
760 | }
761 | convolution_param {
762 | num_output: 256
763 | pad: 1
764 | kernel_size: 3
765 | stride: 2
766 | weight_filler {
767 | type: "xavier"
768 | }
769 | bias_filler {
770 | type: "constant"
771 | value: 0
772 | }
773 | }
774 | }
775 | layer {
776 | name: "conv6_2_relu"
777 | type: "ReLU"
778 | bottom: "conv6_2_h"
779 | top: "conv6_2_h"
780 | }
781 | layer {
782 | name: "conv7_1_h"
783 | type: "Convolution"
784 | bottom: "conv6_2_h"
785 | top: "conv7_1_h"
786 | param {
787 | lr_mult: 1
788 | decay_mult: 1
789 | }
790 | param {
791 | lr_mult: 2
792 | decay_mult: 0
793 | }
794 | convolution_param {
795 | num_output: 64
796 | pad: 0
797 | kernel_size: 1
798 | stride: 1
799 | weight_filler {
800 | type: "xavier"
801 | }
802 | bias_filler {
803 | type: "constant"
804 | value: 0
805 | }
806 | }
807 | }
808 | layer {
809 | name: "conv7_1_relu"
810 | type: "ReLU"
811 | bottom: "conv7_1_h"
812 | top: "conv7_1_h"
813 | }
814 | layer {
815 | name: "conv7_2_h"
816 | type: "Convolution"
817 | bottom: "conv7_1_h"
818 | top: "conv7_2_h"
819 | param {
820 | lr_mult: 1
821 | decay_mult: 1
822 | }
823 | param {
824 | lr_mult: 2
825 | decay_mult: 0
826 | }
827 | convolution_param {
828 | num_output: 128
829 | pad: 1
830 | kernel_size: 3
831 | stride: 2
832 | weight_filler {
833 | type: "xavier"
834 | }
835 | bias_filler {
836 | type: "constant"
837 | value: 0
838 | }
839 | }
840 | }
841 | layer {
842 | name: "conv7_2_relu"
843 | type: "ReLU"
844 | bottom: "conv7_2_h"
845 | top: "conv7_2_h"
846 | }
847 | layer {
848 | name: "conv8_1_h"
849 | type: "Convolution"
850 | bottom: "conv7_2_h"
851 | top: "conv8_1_h"
852 | param {
853 | lr_mult: 1
854 | decay_mult: 1
855 | }
856 | param {
857 | lr_mult: 2
858 | decay_mult: 0
859 | }
860 | convolution_param {
861 | num_output: 64
862 | pad: 0
863 | kernel_size: 1
864 | stride: 1
865 | weight_filler {
866 | type: "xavier"
867 | }
868 | bias_filler {
869 | type: "constant"
870 | value: 0
871 | }
872 | }
873 | }
874 | layer {
875 | name: "conv8_1_relu"
876 | type: "ReLU"
877 | bottom: "conv8_1_h"
878 | top: "conv8_1_h"
879 | }
880 | layer {
881 | name: "conv8_2_h"
882 | type: "Convolution"
883 | bottom: "conv8_1_h"
884 | top: "conv8_2_h"
885 | param {
886 | lr_mult: 1
887 | decay_mult: 1
888 | }
889 | param {
890 | lr_mult: 2
891 | decay_mult: 0
892 | }
893 | convolution_param {
894 | num_output: 128
895 | pad: 1
896 | kernel_size: 3
897 | stride: 1
898 | weight_filler {
899 | type: "xavier"
900 | }
901 | bias_filler {
902 | type: "constant"
903 | value: 0
904 | }
905 | }
906 | }
907 | layer {
908 | name: "conv8_2_relu"
909 | type: "ReLU"
910 | bottom: "conv8_2_h"
911 | top: "conv8_2_h"
912 | }
913 | layer {
914 | name: "conv9_1_h"
915 | type: "Convolution"
916 | bottom: "conv8_2_h"
917 | top: "conv9_1_h"
918 | param {
919 | lr_mult: 1
920 | decay_mult: 1
921 | }
922 | param {
923 | lr_mult: 2
924 | decay_mult: 0
925 | }
926 | convolution_param {
927 | num_output: 64
928 | pad: 0
929 | kernel_size: 1
930 | stride: 1
931 | weight_filler {
932 | type: "xavier"
933 | }
934 | bias_filler {
935 | type: "constant"
936 | value: 0
937 | }
938 | }
939 | }
940 | layer {
941 | name: "conv9_1_relu"
942 | type: "ReLU"
943 | bottom: "conv9_1_h"
944 | top: "conv9_1_h"
945 | }
946 | layer {
947 | name: "conv9_2_h"
948 | type: "Convolution"
949 | bottom: "conv9_1_h"
950 | top: "conv9_2_h"
951 | param {
952 | lr_mult: 1
953 | decay_mult: 1
954 | }
955 | param {
956 | lr_mult: 2
957 | decay_mult: 0
958 | }
959 | convolution_param {
960 | num_output: 128
961 | pad: 1
962 | kernel_size: 3
963 | stride: 1
964 | weight_filler {
965 | type: "xavier"
966 | }
967 | bias_filler {
968 | type: "constant"
969 | value: 0
970 | }
971 | }
972 | }
973 | layer {
974 | name: "conv9_2_relu"
975 | type: "ReLU"
976 | bottom: "conv9_2_h"
977 | top: "conv9_2_h"
978 | }
979 | layer {
980 | name: "conv4_3_norm"
981 | type: "Normalize"
982 | bottom: "layer_256_1_bn1"
983 | top: "conv4_3_norm"
984 | norm_param {
985 | across_spatial: false
986 | scale_filler {
987 | type: "constant"
988 | value: 20
989 | }
990 | channel_shared: false
991 | }
992 | }
993 | layer {
994 | name: "conv4_3_norm_mbox_loc"
995 | type: "Convolution"
996 | bottom: "conv4_3_norm"
997 | top: "conv4_3_norm_mbox_loc"
998 | param {
999 | lr_mult: 1
1000 | decay_mult: 1
1001 | }
1002 | param {
1003 | lr_mult: 2
1004 | decay_mult: 0
1005 | }
1006 | convolution_param {
1007 | num_output: 16
1008 | pad: 1
1009 | kernel_size: 3
1010 | stride: 1
1011 | weight_filler {
1012 | type: "xavier"
1013 | }
1014 | bias_filler {
1015 | type: "constant"
1016 | value: 0
1017 | }
1018 | }
1019 | }
1020 | layer {
1021 | name: "conv4_3_norm_mbox_loc_perm"
1022 | type: "Permute"
1023 | bottom: "conv4_3_norm_mbox_loc"
1024 | top: "conv4_3_norm_mbox_loc_perm"
1025 | permute_param {
1026 | order: 0
1027 | order: 2
1028 | order: 3
1029 | order: 1
1030 | }
1031 | }
1032 | layer {
1033 | name: "conv4_3_norm_mbox_loc_flat"
1034 | type: "Flatten"
1035 | bottom: "conv4_3_norm_mbox_loc_perm"
1036 | top: "conv4_3_norm_mbox_loc_flat"
1037 | flatten_param {
1038 | axis: 1
1039 | }
1040 | }
1041 | layer {
1042 | name: "conv4_3_norm_mbox_conf"
1043 | type: "Convolution"
1044 | bottom: "conv4_3_norm"
1045 | top: "conv4_3_norm_mbox_conf"
1046 | param {
1047 | lr_mult: 1
1048 | decay_mult: 1
1049 | }
1050 | param {
1051 | lr_mult: 2
1052 | decay_mult: 0
1053 | }
1054 | convolution_param {
1055 | num_output: 8 # 84
1056 | pad: 1
1057 | kernel_size: 3
1058 | stride: 1
1059 | weight_filler {
1060 | type: "xavier"
1061 | }
1062 | bias_filler {
1063 | type: "constant"
1064 | value: 0
1065 | }
1066 | }
1067 | }
1068 | layer {
1069 | name: "conv4_3_norm_mbox_conf_perm"
1070 | type: "Permute"
1071 | bottom: "conv4_3_norm_mbox_conf"
1072 | top: "conv4_3_norm_mbox_conf_perm"
1073 | permute_param {
1074 | order: 0
1075 | order: 2
1076 | order: 3
1077 | order: 1
1078 | }
1079 | }
1080 | layer {
1081 | name: "conv4_3_norm_mbox_conf_flat"
1082 | type: "Flatten"
1083 | bottom: "conv4_3_norm_mbox_conf_perm"
1084 | top: "conv4_3_norm_mbox_conf_flat"
1085 | flatten_param {
1086 | axis: 1
1087 | }
1088 | }
1089 | layer {
1090 | name: "conv4_3_norm_mbox_priorbox"
1091 | type: "PriorBox"
1092 | bottom: "conv4_3_norm"
1093 | bottom: "data"
1094 | top: "conv4_3_norm_mbox_priorbox"
1095 | prior_box_param {
1096 | min_size: 30.0
1097 | max_size: 60.0
1098 | aspect_ratio: 2
1099 | flip: true
1100 | clip: false
1101 | variance: 0.1
1102 | variance: 0.1
1103 | variance: 0.2
1104 | variance: 0.2
1105 | step: 8
1106 | offset: 0.5
1107 | }
1108 | }
1109 | layer {
1110 | name: "fc7_mbox_loc"
1111 | type: "Convolution"
1112 | bottom: "fc7"
1113 | top: "fc7_mbox_loc"
1114 | param {
1115 | lr_mult: 1
1116 | decay_mult: 1
1117 | }
1118 | param {
1119 | lr_mult: 2
1120 | decay_mult: 0
1121 | }
1122 | convolution_param {
1123 | num_output: 24
1124 | pad: 1
1125 | kernel_size: 3
1126 | stride: 1
1127 | weight_filler {
1128 | type: "xavier"
1129 | }
1130 | bias_filler {
1131 | type: "constant"
1132 | value: 0
1133 | }
1134 | }
1135 | }
1136 | layer {
1137 | name: "fc7_mbox_loc_perm"
1138 | type: "Permute"
1139 | bottom: "fc7_mbox_loc"
1140 | top: "fc7_mbox_loc_perm"
1141 | permute_param {
1142 | order: 0
1143 | order: 2
1144 | order: 3
1145 | order: 1
1146 | }
1147 | }
1148 | layer {
1149 | name: "fc7_mbox_loc_flat"
1150 | type: "Flatten"
1151 | bottom: "fc7_mbox_loc_perm"
1152 | top: "fc7_mbox_loc_flat"
1153 | flatten_param {
1154 | axis: 1
1155 | }
1156 | }
1157 | layer {
1158 | name: "fc7_mbox_conf"
1159 | type: "Convolution"
1160 | bottom: "fc7"
1161 | top: "fc7_mbox_conf"
1162 | param {
1163 | lr_mult: 1
1164 | decay_mult: 1
1165 | }
1166 | param {
1167 | lr_mult: 2
1168 | decay_mult: 0
1169 | }
1170 | convolution_param {
1171 | num_output: 12 # 126
1172 | pad: 1
1173 | kernel_size: 3
1174 | stride: 1
1175 | weight_filler {
1176 | type: "xavier"
1177 | }
1178 | bias_filler {
1179 | type: "constant"
1180 | value: 0
1181 | }
1182 | }
1183 | }
1184 | layer {
1185 | name: "fc7_mbox_conf_perm"
1186 | type: "Permute"
1187 | bottom: "fc7_mbox_conf"
1188 | top: "fc7_mbox_conf_perm"
1189 | permute_param {
1190 | order: 0
1191 | order: 2
1192 | order: 3
1193 | order: 1
1194 | }
1195 | }
1196 | layer {
1197 | name: "fc7_mbox_conf_flat"
1198 | type: "Flatten"
1199 | bottom: "fc7_mbox_conf_perm"
1200 | top: "fc7_mbox_conf_flat"
1201 | flatten_param {
1202 | axis: 1
1203 | }
1204 | }
1205 | layer {
1206 | name: "fc7_mbox_priorbox"
1207 | type: "PriorBox"
1208 | bottom: "fc7"
1209 | bottom: "data"
1210 | top: "fc7_mbox_priorbox"
1211 | prior_box_param {
1212 | min_size: 60.0
1213 | max_size: 111.0
1214 | aspect_ratio: 2
1215 | aspect_ratio: 3
1216 | flip: true
1217 | clip: false
1218 | variance: 0.1
1219 | variance: 0.1
1220 | variance: 0.2
1221 | variance: 0.2
1222 | step: 16
1223 | offset: 0.5
1224 | }
1225 | }
1226 | layer {
1227 | name: "conv6_2_mbox_loc"
1228 | type: "Convolution"
1229 | bottom: "conv6_2_h"
1230 | top: "conv6_2_mbox_loc"
1231 | param {
1232 | lr_mult: 1
1233 | decay_mult: 1
1234 | }
1235 | param {
1236 | lr_mult: 2
1237 | decay_mult: 0
1238 | }
1239 | convolution_param {
1240 | num_output: 24
1241 | pad: 1
1242 | kernel_size: 3
1243 | stride: 1
1244 | weight_filler {
1245 | type: "xavier"
1246 | }
1247 | bias_filler {
1248 | type: "constant"
1249 | value: 0
1250 | }
1251 | }
1252 | }
1253 | layer {
1254 | name: "conv6_2_mbox_loc_perm"
1255 | type: "Permute"
1256 | bottom: "conv6_2_mbox_loc"
1257 | top: "conv6_2_mbox_loc_perm"
1258 | permute_param {
1259 | order: 0
1260 | order: 2
1261 | order: 3
1262 | order: 1
1263 | }
1264 | }
1265 | layer {
1266 | name: "conv6_2_mbox_loc_flat"
1267 | type: "Flatten"
1268 | bottom: "conv6_2_mbox_loc_perm"
1269 | top: "conv6_2_mbox_loc_flat"
1270 | flatten_param {
1271 | axis: 1
1272 | }
1273 | }
1274 | layer {
1275 | name: "conv6_2_mbox_conf"
1276 | type: "Convolution"
1277 | bottom: "conv6_2_h"
1278 | top: "conv6_2_mbox_conf"
1279 | param {
1280 | lr_mult: 1
1281 | decay_mult: 1
1282 | }
1283 | param {
1284 | lr_mult: 2
1285 | decay_mult: 0
1286 | }
1287 | convolution_param {
1288 | num_output: 12 # 126
1289 | pad: 1
1290 | kernel_size: 3
1291 | stride: 1
1292 | weight_filler {
1293 | type: "xavier"
1294 | }
1295 | bias_filler {
1296 | type: "constant"
1297 | value: 0
1298 | }
1299 | }
1300 | }
1301 | layer {
1302 | name: "conv6_2_mbox_conf_perm"
1303 | type: "Permute"
1304 | bottom: "conv6_2_mbox_conf"
1305 | top: "conv6_2_mbox_conf_perm"
1306 | permute_param {
1307 | order: 0
1308 | order: 2
1309 | order: 3
1310 | order: 1
1311 | }
1312 | }
1313 | layer {
1314 | name: "conv6_2_mbox_conf_flat"
1315 | type: "Flatten"
1316 | bottom: "conv6_2_mbox_conf_perm"
1317 | top: "conv6_2_mbox_conf_flat"
1318 | flatten_param {
1319 | axis: 1
1320 | }
1321 | }
1322 | layer {
1323 | name: "conv6_2_mbox_priorbox"
1324 | type: "PriorBox"
1325 | bottom: "conv6_2_h"
1326 | bottom: "data"
1327 | top: "conv6_2_mbox_priorbox"
1328 | prior_box_param {
1329 | min_size: 111.0
1330 | max_size: 162.0
1331 | aspect_ratio: 2
1332 | aspect_ratio: 3
1333 | flip: true
1334 | clip: false
1335 | variance: 0.1
1336 | variance: 0.1
1337 | variance: 0.2
1338 | variance: 0.2
1339 | step: 32
1340 | offset: 0.5
1341 | }
1342 | }
1343 | layer {
1344 | name: "conv7_2_mbox_loc"
1345 | type: "Convolution"
1346 | bottom: "conv7_2_h"
1347 | top: "conv7_2_mbox_loc"
1348 | param {
1349 | lr_mult: 1
1350 | decay_mult: 1
1351 | }
1352 | param {
1353 | lr_mult: 2
1354 | decay_mult: 0
1355 | }
1356 | convolution_param {
1357 | num_output: 24
1358 | pad: 1
1359 | kernel_size: 3
1360 | stride: 1
1361 | weight_filler {
1362 | type: "xavier"
1363 | }
1364 | bias_filler {
1365 | type: "constant"
1366 | value: 0
1367 | }
1368 | }
1369 | }
1370 | layer {
1371 | name: "conv7_2_mbox_loc_perm"
1372 | type: "Permute"
1373 | bottom: "conv7_2_mbox_loc"
1374 | top: "conv7_2_mbox_loc_perm"
1375 | permute_param {
1376 | order: 0
1377 | order: 2
1378 | order: 3
1379 | order: 1
1380 | }
1381 | }
1382 | layer {
1383 | name: "conv7_2_mbox_loc_flat"
1384 | type: "Flatten"
1385 | bottom: "conv7_2_mbox_loc_perm"
1386 | top: "conv7_2_mbox_loc_flat"
1387 | flatten_param {
1388 | axis: 1
1389 | }
1390 | }
1391 | layer {
1392 | name: "conv7_2_mbox_conf"
1393 | type: "Convolution"
1394 | bottom: "conv7_2_h"
1395 | top: "conv7_2_mbox_conf"
1396 | param {
1397 | lr_mult: 1
1398 | decay_mult: 1
1399 | }
1400 | param {
1401 | lr_mult: 2
1402 | decay_mult: 0
1403 | }
1404 | convolution_param {
1405 | num_output: 12 # 126
1406 | pad: 1
1407 | kernel_size: 3
1408 | stride: 1
1409 | weight_filler {
1410 | type: "xavier"
1411 | }
1412 | bias_filler {
1413 | type: "constant"
1414 | value: 0
1415 | }
1416 | }
1417 | }
1418 | layer {
1419 | name: "conv7_2_mbox_conf_perm"
1420 | type: "Permute"
1421 | bottom: "conv7_2_mbox_conf"
1422 | top: "conv7_2_mbox_conf_perm"
1423 | permute_param {
1424 | order: 0
1425 | order: 2
1426 | order: 3
1427 | order: 1
1428 | }
1429 | }
1430 | layer {
1431 | name: "conv7_2_mbox_conf_flat"
1432 | type: "Flatten"
1433 | bottom: "conv7_2_mbox_conf_perm"
1434 | top: "conv7_2_mbox_conf_flat"
1435 | flatten_param {
1436 | axis: 1
1437 | }
1438 | }
1439 | layer {
1440 | name: "conv7_2_mbox_priorbox"
1441 | type: "PriorBox"
1442 | bottom: "conv7_2_h"
1443 | bottom: "data"
1444 | top: "conv7_2_mbox_priorbox"
1445 | prior_box_param {
1446 | min_size: 162.0
1447 | max_size: 213.0
1448 | aspect_ratio: 2
1449 | aspect_ratio: 3
1450 | flip: true
1451 | clip: false
1452 | variance: 0.1
1453 | variance: 0.1
1454 | variance: 0.2
1455 | variance: 0.2
1456 | step: 64
1457 | offset: 0.5
1458 | }
1459 | }
1460 | layer {
1461 | name: "conv8_2_mbox_loc"
1462 | type: "Convolution"
1463 | bottom: "conv8_2_h"
1464 | top: "conv8_2_mbox_loc"
1465 | param {
1466 | lr_mult: 1
1467 | decay_mult: 1
1468 | }
1469 | param {
1470 | lr_mult: 2
1471 | decay_mult: 0
1472 | }
1473 | convolution_param {
1474 | num_output: 16
1475 | pad: 1
1476 | kernel_size: 3
1477 | stride: 1
1478 | weight_filler {
1479 | type: "xavier"
1480 | }
1481 | bias_filler {
1482 | type: "constant"
1483 | value: 0
1484 | }
1485 | }
1486 | }
1487 | layer {
1488 | name: "conv8_2_mbox_loc_perm"
1489 | type: "Permute"
1490 | bottom: "conv8_2_mbox_loc"
1491 | top: "conv8_2_mbox_loc_perm"
1492 | permute_param {
1493 | order: 0
1494 | order: 2
1495 | order: 3
1496 | order: 1
1497 | }
1498 | }
1499 | layer {
1500 | name: "conv8_2_mbox_loc_flat"
1501 | type: "Flatten"
1502 | bottom: "conv8_2_mbox_loc_perm"
1503 | top: "conv8_2_mbox_loc_flat"
1504 | flatten_param {
1505 | axis: 1
1506 | }
1507 | }
1508 | layer {
1509 | name: "conv8_2_mbox_conf"
1510 | type: "Convolution"
1511 | bottom: "conv8_2_h"
1512 | top: "conv8_2_mbox_conf"
1513 | param {
1514 | lr_mult: 1
1515 | decay_mult: 1
1516 | }
1517 | param {
1518 | lr_mult: 2
1519 | decay_mult: 0
1520 | }
1521 | convolution_param {
1522 | num_output: 8 # 84
1523 | pad: 1
1524 | kernel_size: 3
1525 | stride: 1
1526 | weight_filler {
1527 | type: "xavier"
1528 | }
1529 | bias_filler {
1530 | type: "constant"
1531 | value: 0
1532 | }
1533 | }
1534 | }
1535 | layer {
1536 | name: "conv8_2_mbox_conf_perm"
1537 | type: "Permute"
1538 | bottom: "conv8_2_mbox_conf"
1539 | top: "conv8_2_mbox_conf_perm"
1540 | permute_param {
1541 | order: 0
1542 | order: 2
1543 | order: 3
1544 | order: 1
1545 | }
1546 | }
1547 | layer {
1548 | name: "conv8_2_mbox_conf_flat"
1549 | type: "Flatten"
1550 | bottom: "conv8_2_mbox_conf_perm"
1551 | top: "conv8_2_mbox_conf_flat"
1552 | flatten_param {
1553 | axis: 1
1554 | }
1555 | }
1556 | layer {
1557 | name: "conv8_2_mbox_priorbox"
1558 | type: "PriorBox"
1559 | bottom: "conv8_2_h"
1560 | bottom: "data"
1561 | top: "conv8_2_mbox_priorbox"
1562 | prior_box_param {
1563 | min_size: 213.0
1564 | max_size: 264.0
1565 | aspect_ratio: 2
1566 | flip: true
1567 | clip: false
1568 | variance: 0.1
1569 | variance: 0.1
1570 | variance: 0.2
1571 | variance: 0.2
1572 | step: 100
1573 | offset: 0.5
1574 | }
1575 | }
1576 | layer {
1577 | name: "conv9_2_mbox_loc"
1578 | type: "Convolution"
1579 | bottom: "conv9_2_h"
1580 | top: "conv9_2_mbox_loc"
1581 | param {
1582 | lr_mult: 1
1583 | decay_mult: 1
1584 | }
1585 | param {
1586 | lr_mult: 2
1587 | decay_mult: 0
1588 | }
1589 | convolution_param {
1590 | num_output: 16
1591 | pad: 1
1592 | kernel_size: 3
1593 | stride: 1
1594 | weight_filler {
1595 | type: "xavier"
1596 | }
1597 | bias_filler {
1598 | type: "constant"
1599 | value: 0
1600 | }
1601 | }
1602 | }
1603 | layer {
1604 | name: "conv9_2_mbox_loc_perm"
1605 | type: "Permute"
1606 | bottom: "conv9_2_mbox_loc"
1607 | top: "conv9_2_mbox_loc_perm"
1608 | permute_param {
1609 | order: 0
1610 | order: 2
1611 | order: 3
1612 | order: 1
1613 | }
1614 | }
1615 | layer {
1616 | name: "conv9_2_mbox_loc_flat"
1617 | type: "Flatten"
1618 | bottom: "conv9_2_mbox_loc_perm"
1619 | top: "conv9_2_mbox_loc_flat"
1620 | flatten_param {
1621 | axis: 1
1622 | }
1623 | }
1624 | layer {
1625 | name: "conv9_2_mbox_conf"
1626 | type: "Convolution"
1627 | bottom: "conv9_2_h"
1628 | top: "conv9_2_mbox_conf"
1629 | param {
1630 | lr_mult: 1
1631 | decay_mult: 1
1632 | }
1633 | param {
1634 | lr_mult: 2
1635 | decay_mult: 0
1636 | }
1637 | convolution_param {
1638 | num_output: 8 # 84
1639 | pad: 1
1640 | kernel_size: 3
1641 | stride: 1
1642 | weight_filler {
1643 | type: "xavier"
1644 | }
1645 | bias_filler {
1646 | type: "constant"
1647 | value: 0
1648 | }
1649 | }
1650 | }
1651 | layer {
1652 | name: "conv9_2_mbox_conf_perm"
1653 | type: "Permute"
1654 | bottom: "conv9_2_mbox_conf"
1655 | top: "conv9_2_mbox_conf_perm"
1656 | permute_param {
1657 | order: 0
1658 | order: 2
1659 | order: 3
1660 | order: 1
1661 | }
1662 | }
1663 | layer {
1664 | name: "conv9_2_mbox_conf_flat"
1665 | type: "Flatten"
1666 | bottom: "conv9_2_mbox_conf_perm"
1667 | top: "conv9_2_mbox_conf_flat"
1668 | flatten_param {
1669 | axis: 1
1670 | }
1671 | }
1672 | layer {
1673 | name: "conv9_2_mbox_priorbox"
1674 | type: "PriorBox"
1675 | bottom: "conv9_2_h"
1676 | bottom: "data"
1677 | top: "conv9_2_mbox_priorbox"
1678 | prior_box_param {
1679 | min_size: 264.0
1680 | max_size: 315.0
1681 | aspect_ratio: 2
1682 | flip: true
1683 | clip: false
1684 | variance: 0.1
1685 | variance: 0.1
1686 | variance: 0.2
1687 | variance: 0.2
1688 | step: 300
1689 | offset: 0.5
1690 | }
1691 | }
1692 | layer {
1693 | name: "mbox_loc"
1694 | type: "Concat"
1695 | bottom: "conv4_3_norm_mbox_loc_flat"
1696 | bottom: "fc7_mbox_loc_flat"
1697 | bottom: "conv6_2_mbox_loc_flat"
1698 | bottom: "conv7_2_mbox_loc_flat"
1699 | bottom: "conv8_2_mbox_loc_flat"
1700 | bottom: "conv9_2_mbox_loc_flat"
1701 | top: "mbox_loc"
1702 | concat_param {
1703 | axis: 1
1704 | }
1705 | }
1706 | layer {
1707 | name: "mbox_conf"
1708 | type: "Concat"
1709 | bottom: "conv4_3_norm_mbox_conf_flat"
1710 | bottom: "fc7_mbox_conf_flat"
1711 | bottom: "conv6_2_mbox_conf_flat"
1712 | bottom: "conv7_2_mbox_conf_flat"
1713 | bottom: "conv8_2_mbox_conf_flat"
1714 | bottom: "conv9_2_mbox_conf_flat"
1715 | top: "mbox_conf"
1716 | concat_param {
1717 | axis: 1
1718 | }
1719 | }
1720 | layer {
1721 | name: "mbox_priorbox"
1722 | type: "Concat"
1723 | bottom: "conv4_3_norm_mbox_priorbox"
1724 | bottom: "fc7_mbox_priorbox"
1725 | bottom: "conv6_2_mbox_priorbox"
1726 | bottom: "conv7_2_mbox_priorbox"
1727 | bottom: "conv8_2_mbox_priorbox"
1728 | bottom: "conv9_2_mbox_priorbox"
1729 | top: "mbox_priorbox"
1730 | concat_param {
1731 | axis: 2
1732 | }
1733 | }
1734 |
1735 | layer {
1736 | name: "mbox_conf_reshape"
1737 | type: "Reshape"
1738 | bottom: "mbox_conf"
1739 | top: "mbox_conf_reshape"
1740 | reshape_param {
1741 | shape {
1742 | dim: 0
1743 | dim: -1
1744 | dim: 2
1745 | }
1746 | }
1747 | }
1748 | layer {
1749 | name: "mbox_conf_softmax"
1750 | type: "Softmax"
1751 | bottom: "mbox_conf_reshape"
1752 | top: "mbox_conf_softmax"
1753 | softmax_param {
1754 | axis: 2
1755 | }
1756 | }
1757 | layer {
1758 | name: "mbox_conf_flatten"
1759 | type: "Flatten"
1760 | bottom: "mbox_conf_softmax"
1761 | top: "mbox_conf_flatten"
1762 | flatten_param {
1763 | axis: 1
1764 | }
1765 | }
1766 |
1767 | layer {
1768 | name: "detection_out"
1769 | type: "DetectionOutput"
1770 | bottom: "mbox_loc"
1771 | bottom: "mbox_conf_flatten"
1772 | bottom: "mbox_priorbox"
1773 | top: "detection_out"
1774 | include {
1775 | phase: TEST
1776 | }
1777 | detection_output_param {
1778 | num_classes: 2
1779 | share_location: true
1780 | background_label_id: 0
1781 | nms_param {
1782 | nms_threshold: 0.45
1783 | top_k: 400
1784 | }
1785 | code_type: CENTER_SIZE
1786 | keep_top_k: 200
1787 | confidence_threshold: 0.01
1788 | }
1789 | }
1790 |
--------------------------------------------------------------------------------
/face_detector/res10_300x300_ssd_iter_140000.caffemodel:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aiwithqasim/facemask_detection/4ce00c02a6fe390e5240edb8deb86f1a4374324c/face_detector/res10_300x300_ssd_iter_140000.caffemodel
--------------------------------------------------------------------------------
/mask_detector.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aiwithqasim/facemask_detection/4ce00c02a6fe390e5240edb8deb86f1a4374324c/mask_detector.model
--------------------------------------------------------------------------------
/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aiwithqasim/facemask_detection/4ce00c02a6fe390e5240edb8deb86f1a4374324c/plot.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=1.15.2
2 | keras==2.3.1
3 | imutils==0.5.3
4 | numpy==1.18.2
5 | opencv-python==4.2.0.*
6 | matplotlib==3.2.1
7 | scipy==1.4.1
8 | scikit-learn
9 |
--------------------------------------------------------------------------------
/train_mask_detector.py:
--------------------------------------------------------------------------------
1 | # import the required packages
2 | from tensorflow.keras.preprocessing.image import ImageDataGenerator
3 | from tensorflow.keras.applications import MobileNetV2
4 | from tensorflow.keras.layers import AveragePooling2D
5 | from tensorflow.keras.layers import Dropout
6 | from tensorflow.keras.layers import Flatten
7 | from tensorflow.keras.layers import Dense
8 | from tensorflow.keras.layers import Input
9 | from tensorflow.keras.models import Model
10 | from tensorflow.keras.optimizers import Adam
11 | from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
12 | from tensorflow.keras.preprocessing.image import img_to_array
13 | from tensorflow.keras.preprocessing.image import load_img
14 | from tensorflow.keras.utils import to_categorical
15 | from sklearn.preprocessing import LabelBinarizer
16 | from sklearn.model_selection import train_test_split
17 | from sklearn.metrics import classification_report
18 | import matplotlib.pyplot as plt
19 | import numpy as np
20 | import os
21 |
22 | # initialize the learning rate, number of epochs, and batch size
23 | learning_rate = 1e-4
24 | epochs = 20
25 | batch_size = 32
26 |
27 | # base directory of the dataset and its two class sub-folders
28 | data_set = r"C:\Users\Public\facemask\dataset"  # change this to your local dataset path
29 | classes = ["with_mask", "without_mask"]
30 |
31 | # load the images and collect their class labels
32 | print("loading images...")
33 |
34 | data = []
35 | labels = []
36 |
37 | for class_temp in classes:
38 | directory = os.path.join(data_set, class_temp)
39 | for pic in os.listdir(directory):
40 | image_path = os.path.join(directory, pic)
41 | img = load_img(image_path, target_size=(224, 224))
42 | img = img_to_array(img)
43 | img = preprocess_input(img)
44 |
45 | data.append(img)
46 | labels.append(class_temp)
47 |
48 | # one-hot encode the labels
49 | lb = LabelBinarizer()
50 | labels = lb.fit_transform(labels)
51 | labels = to_categorical(labels)
52 |
53 | data = np.array(data, dtype="float32")
54 | labels = np.array(labels)
55 |
56 | (trainX, testX, trainY, testY) = train_test_split(data, labels,
57 | test_size=0.20, stratify=labels, random_state=42)
58 |
59 | # image generator for data augmentation
60 | aug = ImageDataGenerator(
61 | rotation_range=20,
62 | zoom_range=0.15,
63 | width_shift_range=0.2,
64 | height_shift_range=0.2,
65 | shear_range=0.15,
66 | horizontal_flip=True,
67 | fill_mode="nearest")
68 |
69 | # load MobileNetV2 pre-trained on ImageNet, without its classification head
70 | base_model = MobileNetV2(weights="imagenet", include_top=False,
71 | input_tensor=Input(shape=(224, 224, 3)))
72 |
73 | # construct the classification head to be placed on top of the base model
74 | head_model = base_model.output
75 | head_model = AveragePooling2D(pool_size=(7, 7))(head_model)  # MobileNetV2 output for 224x224 input is 7x7
76 | head_model = Flatten(name="flatten")(head_model)
77 | head_model = Dense(128, activation="relu")(head_model)
78 | head_model = Dropout(0.5)(head_model)
79 | head_model = Dense(2, activation="softmax")(head_model)
80 |
81 | # place the head on top of the base model; this is the model we will actually train
82 | model = Model(inputs=base_model.input, outputs=head_model)
83 |
84 | # freeze the base layers so only the new head is updated during training
85 | for layer in base_model.layers:
86 | layer.trainable = False
87 |
88 | # compile our model
89 | print("compiling model...")
90 | optimize = Adam(lr=learning_rate, decay=learning_rate / epochs)  # 'lr'/'decay' match the pinned Keras 2.3.1; newer TF uses 'learning_rate'
91 | model.compile(loss="binary_crossentropy", optimizer=optimize,
92 | metrics=["accuracy"])
93 |
94 | # train the head of the network
95 | print("training head...")
96 | H = model.fit(
97 | aug.flow(trainX, trainY, batch_size=batch_size),
98 | steps_per_epoch=len(trainX) // batch_size,
99 | validation_data=(testX, testY),
100 | validation_steps=len(testX) // batch_size,
101 | epochs=epochs)
102 |
103 | # make predictions on the testing set
104 | print("evaluating network...")
105 | predIdxs = model.predict(testX, batch_size=batch_size)
106 | predIdxs = np.argmax(predIdxs, axis=1)
107 |
108 | # show a nicely formatted classification report
109 | print(classification_report(testY.argmax(axis=1), predIdxs,
110 | target_names=lb.classes_))
111 |
112 | # serialize the model to disk in HDF5 format
113 | print("saving mask detector model...")
114 | model.save("mask_detector.model", save_format="h5")
115 |
116 | # plotting the training loss and accuracy
117 | N = epochs
118 | plt.style.use("ggplot")
119 | plt.figure()
120 | plt.plot(np.arange(0, N), H.history["loss"], label="train_loss")
121 | plt.plot(np.arange(0, N), H.history["val_loss"], label="val_loss")
122 | plt.plot(np.arange(0, N), H.history["accuracy"], label="train_acc")
123 | plt.plot(np.arange(0, N), H.history["val_accuracy"], label="val_acc")
124 | plt.title("Training Loss and Accuracy")
125 | plt.xlabel("Epoch #")
126 | plt.ylabel("Loss/Accuracy")
127 | plt.legend(loc="lower left")
128 | plt.savefig("plot.png")
129 |
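130 | # Quick sanity check of the saved model (sketch; "example.jpg" is a placeholder path):
131 | #   from tensorflow.keras.models import load_model
132 | #   model = load_model("mask_detector.model")
133 | #   img = preprocess_input(img_to_array(load_img("example.jpg", target_size=(224, 224))))
134 | #   (p_with_mask, p_without_mask) = model.predict(np.expand_dims(img, axis=0))[0]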
--------------------------------------------------------------------------------