├── .idea
│   ├── DNN_Object_Detection.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── other.xml
├── MobileNetSSD_deploy.caffemodel
├── MobileNetSSD_deploy.prototxt.txt
├── README.md
├── dnn_object_detection.py
├── object_detection_classes_pascal_voc.txt
├── output.gif
├── requeriments.txt
└── sample.jpeg
/.idea/DNN_Object_Detection.iml:
--------------------------------------------------------------------------------
(IDE configuration XML; content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE configuration XML; content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE configuration XML; content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/other.xml:
--------------------------------------------------------------------------------
(IDE configuration XML; content not captured in this dump)
--------------------------------------------------------------------------------
/MobileNetSSD_deploy.caffemodel:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheNsBhasin/DNN_Object_Detection/7680652f963748c28108800450c5c88ddf3964ad/MobileNetSSD_deploy.caffemodel
--------------------------------------------------------------------------------
/MobileNetSSD_deploy.prototxt.txt:
--------------------------------------------------------------------------------
1 | name: "MobileNet-SSD"
2 | input: "data"
3 | input_shape {
4 | dim: 1
5 | dim: 3
6 | dim: 300
7 | dim: 300
8 | }
9 | layer {
10 | name: "conv0"
11 | type: "Convolution"
12 | bottom: "data"
13 | top: "conv0"
14 | param {
15 | lr_mult: 1.0
16 | decay_mult: 1.0
17 | }
18 | param {
19 | lr_mult: 2.0
20 | decay_mult: 0.0
21 | }
22 | convolution_param {
23 | num_output: 32
24 | pad: 1
25 | kernel_size: 3
26 | stride: 2
27 | weight_filler {
28 | type: "msra"
29 | }
30 | bias_filler {
31 | type: "constant"
32 | value: 0.0
33 | }
34 | }
35 | }
36 | layer {
37 | name: "conv0/relu"
38 | type: "ReLU"
39 | bottom: "conv0"
40 | top: "conv0"
41 | }
42 | layer {
43 | name: "conv1/dw"
44 | type: "Convolution"
45 | bottom: "conv0"
46 | top: "conv1/dw"
47 | param {
48 | lr_mult: 1.0
49 | decay_mult: 1.0
50 | }
51 | param {
52 | lr_mult: 2.0
53 | decay_mult: 0.0
54 | }
55 | convolution_param {
56 | num_output: 32
57 | pad: 1
58 | kernel_size: 3
59 | group: 32
60 | engine: CAFFE
61 | weight_filler {
62 | type: "msra"
63 | }
64 | bias_filler {
65 | type: "constant"
66 | value: 0.0
67 | }
68 | }
69 | }
70 | layer {
71 | name: "conv1/dw/relu"
72 | type: "ReLU"
73 | bottom: "conv1/dw"
74 | top: "conv1/dw"
75 | }
76 | layer {
77 | name: "conv1"
78 | type: "Convolution"
79 | bottom: "conv1/dw"
80 | top: "conv1"
81 | param {
82 | lr_mult: 1.0
83 | decay_mult: 1.0
84 | }
85 | param {
86 | lr_mult: 2.0
87 | decay_mult: 0.0
88 | }
89 | convolution_param {
90 | num_output: 64
91 | kernel_size: 1
92 | weight_filler {
93 | type: "msra"
94 | }
95 | bias_filler {
96 | type: "constant"
97 | value: 0.0
98 | }
99 | }
100 | }
101 | layer {
102 | name: "conv1/relu"
103 | type: "ReLU"
104 | bottom: "conv1"
105 | top: "conv1"
106 | }
107 | layer {
108 | name: "conv2/dw"
109 | type: "Convolution"
110 | bottom: "conv1"
111 | top: "conv2/dw"
112 | param {
113 | lr_mult: 1.0
114 | decay_mult: 1.0
115 | }
116 | param {
117 | lr_mult: 2.0
118 | decay_mult: 0.0
119 | }
120 | convolution_param {
121 | num_output: 64
122 | pad: 1
123 | kernel_size: 3
124 | stride: 2
125 | group: 64
126 | engine: CAFFE
127 | weight_filler {
128 | type: "msra"
129 | }
130 | bias_filler {
131 | type: "constant"
132 | value: 0.0
133 | }
134 | }
135 | }
136 | layer {
137 | name: "conv2/dw/relu"
138 | type: "ReLU"
139 | bottom: "conv2/dw"
140 | top: "conv2/dw"
141 | }
142 | layer {
143 | name: "conv2"
144 | type: "Convolution"
145 | bottom: "conv2/dw"
146 | top: "conv2"
147 | param {
148 | lr_mult: 1.0
149 | decay_mult: 1.0
150 | }
151 | param {
152 | lr_mult: 2.0
153 | decay_mult: 0.0
154 | }
155 | convolution_param {
156 | num_output: 128
157 | kernel_size: 1
158 | weight_filler {
159 | type: "msra"
160 | }
161 | bias_filler {
162 | type: "constant"
163 | value: 0.0
164 | }
165 | }
166 | }
167 | layer {
168 | name: "conv2/relu"
169 | type: "ReLU"
170 | bottom: "conv2"
171 | top: "conv2"
172 | }
173 | layer {
174 | name: "conv3/dw"
175 | type: "Convolution"
176 | bottom: "conv2"
177 | top: "conv3/dw"
178 | param {
179 | lr_mult: 1.0
180 | decay_mult: 1.0
181 | }
182 | param {
183 | lr_mult: 2.0
184 | decay_mult: 0.0
185 | }
186 | convolution_param {
187 | num_output: 128
188 | pad: 1
189 | kernel_size: 3
190 | group: 128
191 | engine: CAFFE
192 | weight_filler {
193 | type: "msra"
194 | }
195 | bias_filler {
196 | type: "constant"
197 | value: 0.0
198 | }
199 | }
200 | }
201 | layer {
202 | name: "conv3/dw/relu"
203 | type: "ReLU"
204 | bottom: "conv3/dw"
205 | top: "conv3/dw"
206 | }
207 | layer {
208 | name: "conv3"
209 | type: "Convolution"
210 | bottom: "conv3/dw"
211 | top: "conv3"
212 | param {
213 | lr_mult: 1.0
214 | decay_mult: 1.0
215 | }
216 | param {
217 | lr_mult: 2.0
218 | decay_mult: 0.0
219 | }
220 | convolution_param {
221 | num_output: 128
222 | kernel_size: 1
223 | weight_filler {
224 | type: "msra"
225 | }
226 | bias_filler {
227 | type: "constant"
228 | value: 0.0
229 | }
230 | }
231 | }
232 | layer {
233 | name: "conv3/relu"
234 | type: "ReLU"
235 | bottom: "conv3"
236 | top: "conv3"
237 | }
238 | layer {
239 | name: "conv4/dw"
240 | type: "Convolution"
241 | bottom: "conv3"
242 | top: "conv4/dw"
243 | param {
244 | lr_mult: 1.0
245 | decay_mult: 1.0
246 | }
247 | param {
248 | lr_mult: 2.0
249 | decay_mult: 0.0
250 | }
251 | convolution_param {
252 | num_output: 128
253 | pad: 1
254 | kernel_size: 3
255 | stride: 2
256 | group: 128
257 | engine: CAFFE
258 | weight_filler {
259 | type: "msra"
260 | }
261 | bias_filler {
262 | type: "constant"
263 | value: 0.0
264 | }
265 | }
266 | }
267 | layer {
268 | name: "conv4/dw/relu"
269 | type: "ReLU"
270 | bottom: "conv4/dw"
271 | top: "conv4/dw"
272 | }
273 | layer {
274 | name: "conv4"
275 | type: "Convolution"
276 | bottom: "conv4/dw"
277 | top: "conv4"
278 | param {
279 | lr_mult: 1.0
280 | decay_mult: 1.0
281 | }
282 | param {
283 | lr_mult: 2.0
284 | decay_mult: 0.0
285 | }
286 | convolution_param {
287 | num_output: 256
288 | kernel_size: 1
289 | weight_filler {
290 | type: "msra"
291 | }
292 | bias_filler {
293 | type: "constant"
294 | value: 0.0
295 | }
296 | }
297 | }
298 | layer {
299 | name: "conv4/relu"
300 | type: "ReLU"
301 | bottom: "conv4"
302 | top: "conv4"
303 | }
304 | layer {
305 | name: "conv5/dw"
306 | type: "Convolution"
307 | bottom: "conv4"
308 | top: "conv5/dw"
309 | param {
310 | lr_mult: 1.0
311 | decay_mult: 1.0
312 | }
313 | param {
314 | lr_mult: 2.0
315 | decay_mult: 0.0
316 | }
317 | convolution_param {
318 | num_output: 256
319 | pad: 1
320 | kernel_size: 3
321 | group: 256
322 | engine: CAFFE
323 | weight_filler {
324 | type: "msra"
325 | }
326 | bias_filler {
327 | type: "constant"
328 | value: 0.0
329 | }
330 | }
331 | }
332 | layer {
333 | name: "conv5/dw/relu"
334 | type: "ReLU"
335 | bottom: "conv5/dw"
336 | top: "conv5/dw"
337 | }
338 | layer {
339 | name: "conv5"
340 | type: "Convolution"
341 | bottom: "conv5/dw"
342 | top: "conv5"
343 | param {
344 | lr_mult: 1.0
345 | decay_mult: 1.0
346 | }
347 | param {
348 | lr_mult: 2.0
349 | decay_mult: 0.0
350 | }
351 | convolution_param {
352 | num_output: 256
353 | kernel_size: 1
354 | weight_filler {
355 | type: "msra"
356 | }
357 | bias_filler {
358 | type: "constant"
359 | value: 0.0
360 | }
361 | }
362 | }
363 | layer {
364 | name: "conv5/relu"
365 | type: "ReLU"
366 | bottom: "conv5"
367 | top: "conv5"
368 | }
369 | layer {
370 | name: "conv6/dw"
371 | type: "Convolution"
372 | bottom: "conv5"
373 | top: "conv6/dw"
374 | param {
375 | lr_mult: 1.0
376 | decay_mult: 1.0
377 | }
378 | param {
379 | lr_mult: 2.0
380 | decay_mult: 0.0
381 | }
382 | convolution_param {
383 | num_output: 256
384 | pad: 1
385 | kernel_size: 3
386 | stride: 2
387 | group: 256
388 | engine: CAFFE
389 | weight_filler {
390 | type: "msra"
391 | }
392 | bias_filler {
393 | type: "constant"
394 | value: 0.0
395 | }
396 | }
397 | }
398 | layer {
399 | name: "conv6/dw/relu"
400 | type: "ReLU"
401 | bottom: "conv6/dw"
402 | top: "conv6/dw"
403 | }
404 | layer {
405 | name: "conv6"
406 | type: "Convolution"
407 | bottom: "conv6/dw"
408 | top: "conv6"
409 | param {
410 | lr_mult: 1.0
411 | decay_mult: 1.0
412 | }
413 | param {
414 | lr_mult: 2.0
415 | decay_mult: 0.0
416 | }
417 | convolution_param {
418 | num_output: 512
419 | kernel_size: 1
420 | weight_filler {
421 | type: "msra"
422 | }
423 | bias_filler {
424 | type: "constant"
425 | value: 0.0
426 | }
427 | }
428 | }
429 | layer {
430 | name: "conv6/relu"
431 | type: "ReLU"
432 | bottom: "conv6"
433 | top: "conv6"
434 | }
435 | layer {
436 | name: "conv7/dw"
437 | type: "Convolution"
438 | bottom: "conv6"
439 | top: "conv7/dw"
440 | param {
441 | lr_mult: 1.0
442 | decay_mult: 1.0
443 | }
444 | param {
445 | lr_mult: 2.0
446 | decay_mult: 0.0
447 | }
448 | convolution_param {
449 | num_output: 512
450 | pad: 1
451 | kernel_size: 3
452 | group: 512
453 | engine: CAFFE
454 | weight_filler {
455 | type: "msra"
456 | }
457 | bias_filler {
458 | type: "constant"
459 | value: 0.0
460 | }
461 | }
462 | }
463 | layer {
464 | name: "conv7/dw/relu"
465 | type: "ReLU"
466 | bottom: "conv7/dw"
467 | top: "conv7/dw"
468 | }
469 | layer {
470 | name: "conv7"
471 | type: "Convolution"
472 | bottom: "conv7/dw"
473 | top: "conv7"
474 | param {
475 | lr_mult: 1.0
476 | decay_mult: 1.0
477 | }
478 | param {
479 | lr_mult: 2.0
480 | decay_mult: 0.0
481 | }
482 | convolution_param {
483 | num_output: 512
484 | kernel_size: 1
485 | weight_filler {
486 | type: "msra"
487 | }
488 | bias_filler {
489 | type: "constant"
490 | value: 0.0
491 | }
492 | }
493 | }
494 | layer {
495 | name: "conv7/relu"
496 | type: "ReLU"
497 | bottom: "conv7"
498 | top: "conv7"
499 | }
500 | layer {
501 | name: "conv8/dw"
502 | type: "Convolution"
503 | bottom: "conv7"
504 | top: "conv8/dw"
505 | param {
506 | lr_mult: 1.0
507 | decay_mult: 1.0
508 | }
509 | param {
510 | lr_mult: 2.0
511 | decay_mult: 0.0
512 | }
513 | convolution_param {
514 | num_output: 512
515 | pad: 1
516 | kernel_size: 3
517 | group: 512
518 | engine: CAFFE
519 | weight_filler {
520 | type: "msra"
521 | }
522 | bias_filler {
523 | type: "constant"
524 | value: 0.0
525 | }
526 | }
527 | }
528 | layer {
529 | name: "conv8/dw/relu"
530 | type: "ReLU"
531 | bottom: "conv8/dw"
532 | top: "conv8/dw"
533 | }
534 | layer {
535 | name: "conv8"
536 | type: "Convolution"
537 | bottom: "conv8/dw"
538 | top: "conv8"
539 | param {
540 | lr_mult: 1.0
541 | decay_mult: 1.0
542 | }
543 | param {
544 | lr_mult: 2.0
545 | decay_mult: 0.0
546 | }
547 | convolution_param {
548 | num_output: 512
549 | kernel_size: 1
550 | weight_filler {
551 | type: "msra"
552 | }
553 | bias_filler {
554 | type: "constant"
555 | value: 0.0
556 | }
557 | }
558 | }
559 | layer {
560 | name: "conv8/relu"
561 | type: "ReLU"
562 | bottom: "conv8"
563 | top: "conv8"
564 | }
565 | layer {
566 | name: "conv9/dw"
567 | type: "Convolution"
568 | bottom: "conv8"
569 | top: "conv9/dw"
570 | param {
571 | lr_mult: 1.0
572 | decay_mult: 1.0
573 | }
574 | param {
575 | lr_mult: 2.0
576 | decay_mult: 0.0
577 | }
578 | convolution_param {
579 | num_output: 512
580 | pad: 1
581 | kernel_size: 3
582 | group: 512
583 | engine: CAFFE
584 | weight_filler {
585 | type: "msra"
586 | }
587 | bias_filler {
588 | type: "constant"
589 | value: 0.0
590 | }
591 | }
592 | }
593 | layer {
594 | name: "conv9/dw/relu"
595 | type: "ReLU"
596 | bottom: "conv9/dw"
597 | top: "conv9/dw"
598 | }
599 | layer {
600 | name: "conv9"
601 | type: "Convolution"
602 | bottom: "conv9/dw"
603 | top: "conv9"
604 | param {
605 | lr_mult: 1.0
606 | decay_mult: 1.0
607 | }
608 | param {
609 | lr_mult: 2.0
610 | decay_mult: 0.0
611 | }
612 | convolution_param {
613 | num_output: 512
614 | kernel_size: 1
615 | weight_filler {
616 | type: "msra"
617 | }
618 | bias_filler {
619 | type: "constant"
620 | value: 0.0
621 | }
622 | }
623 | }
624 | layer {
625 | name: "conv9/relu"
626 | type: "ReLU"
627 | bottom: "conv9"
628 | top: "conv9"
629 | }
630 | layer {
631 | name: "conv10/dw"
632 | type: "Convolution"
633 | bottom: "conv9"
634 | top: "conv10/dw"
635 | param {
636 | lr_mult: 1.0
637 | decay_mult: 1.0
638 | }
639 | param {
640 | lr_mult: 2.0
641 | decay_mult: 0.0
642 | }
643 | convolution_param {
644 | num_output: 512
645 | pad: 1
646 | kernel_size: 3
647 | group: 512
648 | engine: CAFFE
649 | weight_filler {
650 | type: "msra"
651 | }
652 | bias_filler {
653 | type: "constant"
654 | value: 0.0
655 | }
656 | }
657 | }
658 | layer {
659 | name: "conv10/dw/relu"
660 | type: "ReLU"
661 | bottom: "conv10/dw"
662 | top: "conv10/dw"
663 | }
664 | layer {
665 | name: "conv10"
666 | type: "Convolution"
667 | bottom: "conv10/dw"
668 | top: "conv10"
669 | param {
670 | lr_mult: 1.0
671 | decay_mult: 1.0
672 | }
673 | param {
674 | lr_mult: 2.0
675 | decay_mult: 0.0
676 | }
677 | convolution_param {
678 | num_output: 512
679 | kernel_size: 1
680 | weight_filler {
681 | type: "msra"
682 | }
683 | bias_filler {
684 | type: "constant"
685 | value: 0.0
686 | }
687 | }
688 | }
689 | layer {
690 | name: "conv10/relu"
691 | type: "ReLU"
692 | bottom: "conv10"
693 | top: "conv10"
694 | }
695 | layer {
696 | name: "conv11/dw"
697 | type: "Convolution"
698 | bottom: "conv10"
699 | top: "conv11/dw"
700 | param {
701 | lr_mult: 1.0
702 | decay_mult: 1.0
703 | }
704 | param {
705 | lr_mult: 2.0
706 | decay_mult: 0.0
707 | }
708 | convolution_param {
709 | num_output: 512
710 | pad: 1
711 | kernel_size: 3
712 | group: 512
713 | engine: CAFFE
714 | weight_filler {
715 | type: "msra"
716 | }
717 | bias_filler {
718 | type: "constant"
719 | value: 0.0
720 | }
721 | }
722 | }
723 | layer {
724 | name: "conv11/dw/relu"
725 | type: "ReLU"
726 | bottom: "conv11/dw"
727 | top: "conv11/dw"
728 | }
729 | layer {
730 | name: "conv11"
731 | type: "Convolution"
732 | bottom: "conv11/dw"
733 | top: "conv11"
734 | param {
735 | lr_mult: 1.0
736 | decay_mult: 1.0
737 | }
738 | param {
739 | lr_mult: 2.0
740 | decay_mult: 0.0
741 | }
742 | convolution_param {
743 | num_output: 512
744 | kernel_size: 1
745 | weight_filler {
746 | type: "msra"
747 | }
748 | bias_filler {
749 | type: "constant"
750 | value: 0.0
751 | }
752 | }
753 | }
754 | layer {
755 | name: "conv11/relu"
756 | type: "ReLU"
757 | bottom: "conv11"
758 | top: "conv11"
759 | }
760 | layer {
761 | name: "conv12/dw"
762 | type: "Convolution"
763 | bottom: "conv11"
764 | top: "conv12/dw"
765 | param {
766 | lr_mult: 1.0
767 | decay_mult: 1.0
768 | }
769 | param {
770 | lr_mult: 2.0
771 | decay_mult: 0.0
772 | }
773 | convolution_param {
774 | num_output: 512
775 | pad: 1
776 | kernel_size: 3
777 | stride: 2
778 | group: 512
779 | engine: CAFFE
780 | weight_filler {
781 | type: "msra"
782 | }
783 | bias_filler {
784 | type: "constant"
785 | value: 0.0
786 | }
787 | }
788 | }
789 | layer {
790 | name: "conv12/dw/relu"
791 | type: "ReLU"
792 | bottom: "conv12/dw"
793 | top: "conv12/dw"
794 | }
795 | layer {
796 | name: "conv12"
797 | type: "Convolution"
798 | bottom: "conv12/dw"
799 | top: "conv12"
800 | param {
801 | lr_mult: 1.0
802 | decay_mult: 1.0
803 | }
804 | param {
805 | lr_mult: 2.0
806 | decay_mult: 0.0
807 | }
808 | convolution_param {
809 | num_output: 1024
810 | kernel_size: 1
811 | weight_filler {
812 | type: "msra"
813 | }
814 | bias_filler {
815 | type: "constant"
816 | value: 0.0
817 | }
818 | }
819 | }
820 | layer {
821 | name: "conv12/relu"
822 | type: "ReLU"
823 | bottom: "conv12"
824 | top: "conv12"
825 | }
826 | layer {
827 | name: "conv13/dw"
828 | type: "Convolution"
829 | bottom: "conv12"
830 | top: "conv13/dw"
831 | param {
832 | lr_mult: 1.0
833 | decay_mult: 1.0
834 | }
835 | param {
836 | lr_mult: 2.0
837 | decay_mult: 0.0
838 | }
839 | convolution_param {
840 | num_output: 1024
841 | pad: 1
842 | kernel_size: 3
843 | group: 1024
844 | engine: CAFFE
845 | weight_filler {
846 | type: "msra"
847 | }
848 | bias_filler {
849 | type: "constant"
850 | value: 0.0
851 | }
852 | }
853 | }
854 | layer {
855 | name: "conv13/dw/relu"
856 | type: "ReLU"
857 | bottom: "conv13/dw"
858 | top: "conv13/dw"
859 | }
860 | layer {
861 | name: "conv13"
862 | type: "Convolution"
863 | bottom: "conv13/dw"
864 | top: "conv13"
865 | param {
866 | lr_mult: 1.0
867 | decay_mult: 1.0
868 | }
869 | param {
870 | lr_mult: 2.0
871 | decay_mult: 0.0
872 | }
873 | convolution_param {
874 | num_output: 1024
875 | kernel_size: 1
876 | weight_filler {
877 | type: "msra"
878 | }
879 | bias_filler {
880 | type: "constant"
881 | value: 0.0
882 | }
883 | }
884 | }
885 | layer {
886 | name: "conv13/relu"
887 | type: "ReLU"
888 | bottom: "conv13"
889 | top: "conv13"
890 | }
891 | layer {
892 | name: "conv14_1"
893 | type: "Convolution"
894 | bottom: "conv13"
895 | top: "conv14_1"
896 | param {
897 | lr_mult: 1.0
898 | decay_mult: 1.0
899 | }
900 | param {
901 | lr_mult: 2.0
902 | decay_mult: 0.0
903 | }
904 | convolution_param {
905 | num_output: 256
906 | kernel_size: 1
907 | weight_filler {
908 | type: "msra"
909 | }
910 | bias_filler {
911 | type: "constant"
912 | value: 0.0
913 | }
914 | }
915 | }
916 | layer {
917 | name: "conv14_1/relu"
918 | type: "ReLU"
919 | bottom: "conv14_1"
920 | top: "conv14_1"
921 | }
922 | layer {
923 | name: "conv14_2"
924 | type: "Convolution"
925 | bottom: "conv14_1"
926 | top: "conv14_2"
927 | param {
928 | lr_mult: 1.0
929 | decay_mult: 1.0
930 | }
931 | param {
932 | lr_mult: 2.0
933 | decay_mult: 0.0
934 | }
935 | convolution_param {
936 | num_output: 512
937 | pad: 1
938 | kernel_size: 3
939 | stride: 2
940 | weight_filler {
941 | type: "msra"
942 | }
943 | bias_filler {
944 | type: "constant"
945 | value: 0.0
946 | }
947 | }
948 | }
949 | layer {
950 | name: "conv14_2/relu"
951 | type: "ReLU"
952 | bottom: "conv14_2"
953 | top: "conv14_2"
954 | }
955 | layer {
956 | name: "conv15_1"
957 | type: "Convolution"
958 | bottom: "conv14_2"
959 | top: "conv15_1"
960 | param {
961 | lr_mult: 1.0
962 | decay_mult: 1.0
963 | }
964 | param {
965 | lr_mult: 2.0
966 | decay_mult: 0.0
967 | }
968 | convolution_param {
969 | num_output: 128
970 | kernel_size: 1
971 | weight_filler {
972 | type: "msra"
973 | }
974 | bias_filler {
975 | type: "constant"
976 | value: 0.0
977 | }
978 | }
979 | }
980 | layer {
981 | name: "conv15_1/relu"
982 | type: "ReLU"
983 | bottom: "conv15_1"
984 | top: "conv15_1"
985 | }
986 | layer {
987 | name: "conv15_2"
988 | type: "Convolution"
989 | bottom: "conv15_1"
990 | top: "conv15_2"
991 | param {
992 | lr_mult: 1.0
993 | decay_mult: 1.0
994 | }
995 | param {
996 | lr_mult: 2.0
997 | decay_mult: 0.0
998 | }
999 | convolution_param {
1000 | num_output: 256
1001 | pad: 1
1002 | kernel_size: 3
1003 | stride: 2
1004 | weight_filler {
1005 | type: "msra"
1006 | }
1007 | bias_filler {
1008 | type: "constant"
1009 | value: 0.0
1010 | }
1011 | }
1012 | }
1013 | layer {
1014 | name: "conv15_2/relu"
1015 | type: "ReLU"
1016 | bottom: "conv15_2"
1017 | top: "conv15_2"
1018 | }
1019 | layer {
1020 | name: "conv16_1"
1021 | type: "Convolution"
1022 | bottom: "conv15_2"
1023 | top: "conv16_1"
1024 | param {
1025 | lr_mult: 1.0
1026 | decay_mult: 1.0
1027 | }
1028 | param {
1029 | lr_mult: 2.0
1030 | decay_mult: 0.0
1031 | }
1032 | convolution_param {
1033 | num_output: 128
1034 | kernel_size: 1
1035 | weight_filler {
1036 | type: "msra"
1037 | }
1038 | bias_filler {
1039 | type: "constant"
1040 | value: 0.0
1041 | }
1042 | }
1043 | }
1044 | layer {
1045 | name: "conv16_1/relu"
1046 | type: "ReLU"
1047 | bottom: "conv16_1"
1048 | top: "conv16_1"
1049 | }
1050 | layer {
1051 | name: "conv16_2"
1052 | type: "Convolution"
1053 | bottom: "conv16_1"
1054 | top: "conv16_2"
1055 | param {
1056 | lr_mult: 1.0
1057 | decay_mult: 1.0
1058 | }
1059 | param {
1060 | lr_mult: 2.0
1061 | decay_mult: 0.0
1062 | }
1063 | convolution_param {
1064 | num_output: 256
1065 | pad: 1
1066 | kernel_size: 3
1067 | stride: 2
1068 | weight_filler {
1069 | type: "msra"
1070 | }
1071 | bias_filler {
1072 | type: "constant"
1073 | value: 0.0
1074 | }
1075 | }
1076 | }
1077 | layer {
1078 | name: "conv16_2/relu"
1079 | type: "ReLU"
1080 | bottom: "conv16_2"
1081 | top: "conv16_2"
1082 | }
1083 | layer {
1084 | name: "conv17_1"
1085 | type: "Convolution"
1086 | bottom: "conv16_2"
1087 | top: "conv17_1"
1088 | param {
1089 | lr_mult: 1.0
1090 | decay_mult: 1.0
1091 | }
1092 | param {
1093 | lr_mult: 2.0
1094 | decay_mult: 0.0
1095 | }
1096 | convolution_param {
1097 | num_output: 64
1098 | kernel_size: 1
1099 | weight_filler {
1100 | type: "msra"
1101 | }
1102 | bias_filler {
1103 | type: "constant"
1104 | value: 0.0
1105 | }
1106 | }
1107 | }
1108 | layer {
1109 | name: "conv17_1/relu"
1110 | type: "ReLU"
1111 | bottom: "conv17_1"
1112 | top: "conv17_1"
1113 | }
1114 | layer {
1115 | name: "conv17_2"
1116 | type: "Convolution"
1117 | bottom: "conv17_1"
1118 | top: "conv17_2"
1119 | param {
1120 | lr_mult: 1.0
1121 | decay_mult: 1.0
1122 | }
1123 | param {
1124 | lr_mult: 2.0
1125 | decay_mult: 0.0
1126 | }
1127 | convolution_param {
1128 | num_output: 128
1129 | pad: 1
1130 | kernel_size: 3
1131 | stride: 2
1132 | weight_filler {
1133 | type: "msra"
1134 | }
1135 | bias_filler {
1136 | type: "constant"
1137 | value: 0.0
1138 | }
1139 | }
1140 | }
1141 | layer {
1142 | name: "conv17_2/relu"
1143 | type: "ReLU"
1144 | bottom: "conv17_2"
1145 | top: "conv17_2"
1146 | }
1147 | layer {
1148 | name: "conv11_mbox_loc"
1149 | type: "Convolution"
1150 | bottom: "conv11"
1151 | top: "conv11_mbox_loc"
1152 | param {
1153 | lr_mult: 1.0
1154 | decay_mult: 1.0
1155 | }
1156 | param {
1157 | lr_mult: 2.0
1158 | decay_mult: 0.0
1159 | }
1160 | convolution_param {
1161 | num_output: 12
1162 | kernel_size: 1
1163 | weight_filler {
1164 | type: "msra"
1165 | }
1166 | bias_filler {
1167 | type: "constant"
1168 | value: 0.0
1169 | }
1170 | }
1171 | }
1172 | layer {
1173 | name: "conv11_mbox_loc_perm"
1174 | type: "Permute"
1175 | bottom: "conv11_mbox_loc"
1176 | top: "conv11_mbox_loc_perm"
1177 | permute_param {
1178 | order: 0
1179 | order: 2
1180 | order: 3
1181 | order: 1
1182 | }
1183 | }
1184 | layer {
1185 | name: "conv11_mbox_loc_flat"
1186 | type: "Flatten"
1187 | bottom: "conv11_mbox_loc_perm"
1188 | top: "conv11_mbox_loc_flat"
1189 | flatten_param {
1190 | axis: 1
1191 | }
1192 | }
1193 | layer {
1194 | name: "conv11_mbox_conf"
1195 | type: "Convolution"
1196 | bottom: "conv11"
1197 | top: "conv11_mbox_conf"
1198 | param {
1199 | lr_mult: 1.0
1200 | decay_mult: 1.0
1201 | }
1202 | param {
1203 | lr_mult: 2.0
1204 | decay_mult: 0.0
1205 | }
1206 | convolution_param {
1207 | num_output: 63
1208 | kernel_size: 1
1209 | weight_filler {
1210 | type: "msra"
1211 | }
1212 | bias_filler {
1213 | type: "constant"
1214 | value: 0.0
1215 | }
1216 | }
1217 | }
1218 | layer {
1219 | name: "conv11_mbox_conf_perm"
1220 | type: "Permute"
1221 | bottom: "conv11_mbox_conf"
1222 | top: "conv11_mbox_conf_perm"
1223 | permute_param {
1224 | order: 0
1225 | order: 2
1226 | order: 3
1227 | order: 1
1228 | }
1229 | }
1230 | layer {
1231 | name: "conv11_mbox_conf_flat"
1232 | type: "Flatten"
1233 | bottom: "conv11_mbox_conf_perm"
1234 | top: "conv11_mbox_conf_flat"
1235 | flatten_param {
1236 | axis: 1
1237 | }
1238 | }
1239 | layer {
1240 | name: "conv11_mbox_priorbox"
1241 | type: "PriorBox"
1242 | bottom: "conv11"
1243 | bottom: "data"
1244 | top: "conv11_mbox_priorbox"
1245 | prior_box_param {
1246 | min_size: 60.0
1247 | aspect_ratio: 2.0
1248 | flip: true
1249 | clip: false
1250 | variance: 0.1
1251 | variance: 0.1
1252 | variance: 0.2
1253 | variance: 0.2
1254 | offset: 0.5
1255 | }
1256 | }
1257 | layer {
1258 | name: "conv13_mbox_loc"
1259 | type: "Convolution"
1260 | bottom: "conv13"
1261 | top: "conv13_mbox_loc"
1262 | param {
1263 | lr_mult: 1.0
1264 | decay_mult: 1.0
1265 | }
1266 | param {
1267 | lr_mult: 2.0
1268 | decay_mult: 0.0
1269 | }
1270 | convolution_param {
1271 | num_output: 24
1272 | kernel_size: 1
1273 | weight_filler {
1274 | type: "msra"
1275 | }
1276 | bias_filler {
1277 | type: "constant"
1278 | value: 0.0
1279 | }
1280 | }
1281 | }
1282 | layer {
1283 | name: "conv13_mbox_loc_perm"
1284 | type: "Permute"
1285 | bottom: "conv13_mbox_loc"
1286 | top: "conv13_mbox_loc_perm"
1287 | permute_param {
1288 | order: 0
1289 | order: 2
1290 | order: 3
1291 | order: 1
1292 | }
1293 | }
1294 | layer {
1295 | name: "conv13_mbox_loc_flat"
1296 | type: "Flatten"
1297 | bottom: "conv13_mbox_loc_perm"
1298 | top: "conv13_mbox_loc_flat"
1299 | flatten_param {
1300 | axis: 1
1301 | }
1302 | }
1303 | layer {
1304 | name: "conv13_mbox_conf"
1305 | type: "Convolution"
1306 | bottom: "conv13"
1307 | top: "conv13_mbox_conf"
1308 | param {
1309 | lr_mult: 1.0
1310 | decay_mult: 1.0
1311 | }
1312 | param {
1313 | lr_mult: 2.0
1314 | decay_mult: 0.0
1315 | }
1316 | convolution_param {
1317 | num_output: 126
1318 | kernel_size: 1
1319 | weight_filler {
1320 | type: "msra"
1321 | }
1322 | bias_filler {
1323 | type: "constant"
1324 | value: 0.0
1325 | }
1326 | }
1327 | }
1328 | layer {
1329 | name: "conv13_mbox_conf_perm"
1330 | type: "Permute"
1331 | bottom: "conv13_mbox_conf"
1332 | top: "conv13_mbox_conf_perm"
1333 | permute_param {
1334 | order: 0
1335 | order: 2
1336 | order: 3
1337 | order: 1
1338 | }
1339 | }
1340 | layer {
1341 | name: "conv13_mbox_conf_flat"
1342 | type: "Flatten"
1343 | bottom: "conv13_mbox_conf_perm"
1344 | top: "conv13_mbox_conf_flat"
1345 | flatten_param {
1346 | axis: 1
1347 | }
1348 | }
1349 | layer {
1350 | name: "conv13_mbox_priorbox"
1351 | type: "PriorBox"
1352 | bottom: "conv13"
1353 | bottom: "data"
1354 | top: "conv13_mbox_priorbox"
1355 | prior_box_param {
1356 | min_size: 105.0
1357 | max_size: 150.0
1358 | aspect_ratio: 2.0
1359 | aspect_ratio: 3.0
1360 | flip: true
1361 | clip: false
1362 | variance: 0.1
1363 | variance: 0.1
1364 | variance: 0.2
1365 | variance: 0.2
1366 | offset: 0.5
1367 | }
1368 | }
1369 | layer {
1370 | name: "conv14_2_mbox_loc"
1371 | type: "Convolution"
1372 | bottom: "conv14_2"
1373 | top: "conv14_2_mbox_loc"
1374 | param {
1375 | lr_mult: 1.0
1376 | decay_mult: 1.0
1377 | }
1378 | param {
1379 | lr_mult: 2.0
1380 | decay_mult: 0.0
1381 | }
1382 | convolution_param {
1383 | num_output: 24
1384 | kernel_size: 1
1385 | weight_filler {
1386 | type: "msra"
1387 | }
1388 | bias_filler {
1389 | type: "constant"
1390 | value: 0.0
1391 | }
1392 | }
1393 | }
1394 | layer {
1395 | name: "conv14_2_mbox_loc_perm"
1396 | type: "Permute"
1397 | bottom: "conv14_2_mbox_loc"
1398 | top: "conv14_2_mbox_loc_perm"
1399 | permute_param {
1400 | order: 0
1401 | order: 2
1402 | order: 3
1403 | order: 1
1404 | }
1405 | }
1406 | layer {
1407 | name: "conv14_2_mbox_loc_flat"
1408 | type: "Flatten"
1409 | bottom: "conv14_2_mbox_loc_perm"
1410 | top: "conv14_2_mbox_loc_flat"
1411 | flatten_param {
1412 | axis: 1
1413 | }
1414 | }
1415 | layer {
1416 | name: "conv14_2_mbox_conf"
1417 | type: "Convolution"
1418 | bottom: "conv14_2"
1419 | top: "conv14_2_mbox_conf"
1420 | param {
1421 | lr_mult: 1.0
1422 | decay_mult: 1.0
1423 | }
1424 | param {
1425 | lr_mult: 2.0
1426 | decay_mult: 0.0
1427 | }
1428 | convolution_param {
1429 | num_output: 126
1430 | kernel_size: 1
1431 | weight_filler {
1432 | type: "msra"
1433 | }
1434 | bias_filler {
1435 | type: "constant"
1436 | value: 0.0
1437 | }
1438 | }
1439 | }
1440 | layer {
1441 | name: "conv14_2_mbox_conf_perm"
1442 | type: "Permute"
1443 | bottom: "conv14_2_mbox_conf"
1444 | top: "conv14_2_mbox_conf_perm"
1445 | permute_param {
1446 | order: 0
1447 | order: 2
1448 | order: 3
1449 | order: 1
1450 | }
1451 | }
1452 | layer {
1453 | name: "conv14_2_mbox_conf_flat"
1454 | type: "Flatten"
1455 | bottom: "conv14_2_mbox_conf_perm"
1456 | top: "conv14_2_mbox_conf_flat"
1457 | flatten_param {
1458 | axis: 1
1459 | }
1460 | }
1461 | layer {
1462 | name: "conv14_2_mbox_priorbox"
1463 | type: "PriorBox"
1464 | bottom: "conv14_2"
1465 | bottom: "data"
1466 | top: "conv14_2_mbox_priorbox"
1467 | prior_box_param {
1468 | min_size: 150.0
1469 | max_size: 195.0
1470 | aspect_ratio: 2.0
1471 | aspect_ratio: 3.0
1472 | flip: true
1473 | clip: false
1474 | variance: 0.1
1475 | variance: 0.1
1476 | variance: 0.2
1477 | variance: 0.2
1478 | offset: 0.5
1479 | }
1480 | }
1481 | layer {
1482 | name: "conv15_2_mbox_loc"
1483 | type: "Convolution"
1484 | bottom: "conv15_2"
1485 | top: "conv15_2_mbox_loc"
1486 | param {
1487 | lr_mult: 1.0
1488 | decay_mult: 1.0
1489 | }
1490 | param {
1491 | lr_mult: 2.0
1492 | decay_mult: 0.0
1493 | }
1494 | convolution_param {
1495 | num_output: 24
1496 | kernel_size: 1
1497 | weight_filler {
1498 | type: "msra"
1499 | }
1500 | bias_filler {
1501 | type: "constant"
1502 | value: 0.0
1503 | }
1504 | }
1505 | }
1506 | layer {
1507 | name: "conv15_2_mbox_loc_perm"
1508 | type: "Permute"
1509 | bottom: "conv15_2_mbox_loc"
1510 | top: "conv15_2_mbox_loc_perm"
1511 | permute_param {
1512 | order: 0
1513 | order: 2
1514 | order: 3
1515 | order: 1
1516 | }
1517 | }
1518 | layer {
1519 | name: "conv15_2_mbox_loc_flat"
1520 | type: "Flatten"
1521 | bottom: "conv15_2_mbox_loc_perm"
1522 | top: "conv15_2_mbox_loc_flat"
1523 | flatten_param {
1524 | axis: 1
1525 | }
1526 | }
1527 | layer {
1528 | name: "conv15_2_mbox_conf"
1529 | type: "Convolution"
1530 | bottom: "conv15_2"
1531 | top: "conv15_2_mbox_conf"
1532 | param {
1533 | lr_mult: 1.0
1534 | decay_mult: 1.0
1535 | }
1536 | param {
1537 | lr_mult: 2.0
1538 | decay_mult: 0.0
1539 | }
1540 | convolution_param {
1541 | num_output: 126
1542 | kernel_size: 1
1543 | weight_filler {
1544 | type: "msra"
1545 | }
1546 | bias_filler {
1547 | type: "constant"
1548 | value: 0.0
1549 | }
1550 | }
1551 | }
1552 | layer {
1553 | name: "conv15_2_mbox_conf_perm"
1554 | type: "Permute"
1555 | bottom: "conv15_2_mbox_conf"
1556 | top: "conv15_2_mbox_conf_perm"
1557 | permute_param {
1558 | order: 0
1559 | order: 2
1560 | order: 3
1561 | order: 1
1562 | }
1563 | }
1564 | layer {
1565 | name: "conv15_2_mbox_conf_flat"
1566 | type: "Flatten"
1567 | bottom: "conv15_2_mbox_conf_perm"
1568 | top: "conv15_2_mbox_conf_flat"
1569 | flatten_param {
1570 | axis: 1
1571 | }
1572 | }
1573 | layer {
1574 | name: "conv15_2_mbox_priorbox"
1575 | type: "PriorBox"
1576 | bottom: "conv15_2"
1577 | bottom: "data"
1578 | top: "conv15_2_mbox_priorbox"
1579 | prior_box_param {
1580 | min_size: 195.0
1581 | max_size: 240.0
1582 | aspect_ratio: 2.0
1583 | aspect_ratio: 3.0
1584 | flip: true
1585 | clip: false
1586 | variance: 0.1
1587 | variance: 0.1
1588 | variance: 0.2
1589 | variance: 0.2
1590 | offset: 0.5
1591 | }
1592 | }
1593 | layer {
1594 | name: "conv16_2_mbox_loc"
1595 | type: "Convolution"
1596 | bottom: "conv16_2"
1597 | top: "conv16_2_mbox_loc"
1598 | param {
1599 | lr_mult: 1.0
1600 | decay_mult: 1.0
1601 | }
1602 | param {
1603 | lr_mult: 2.0
1604 | decay_mult: 0.0
1605 | }
1606 | convolution_param {
1607 | num_output: 24
1608 | kernel_size: 1
1609 | weight_filler {
1610 | type: "msra"
1611 | }
1612 | bias_filler {
1613 | type: "constant"
1614 | value: 0.0
1615 | }
1616 | }
1617 | }
1618 | layer {
1619 | name: "conv16_2_mbox_loc_perm"
1620 | type: "Permute"
1621 | bottom: "conv16_2_mbox_loc"
1622 | top: "conv16_2_mbox_loc_perm"
1623 | permute_param {
1624 | order: 0
1625 | order: 2
1626 | order: 3
1627 | order: 1
1628 | }
1629 | }
1630 | layer {
1631 | name: "conv16_2_mbox_loc_flat"
1632 | type: "Flatten"
1633 | bottom: "conv16_2_mbox_loc_perm"
1634 | top: "conv16_2_mbox_loc_flat"
1635 | flatten_param {
1636 | axis: 1
1637 | }
1638 | }
1639 | layer {
1640 | name: "conv16_2_mbox_conf"
1641 | type: "Convolution"
1642 | bottom: "conv16_2"
1643 | top: "conv16_2_mbox_conf"
1644 | param {
1645 | lr_mult: 1.0
1646 | decay_mult: 1.0
1647 | }
1648 | param {
1649 | lr_mult: 2.0
1650 | decay_mult: 0.0
1651 | }
1652 | convolution_param {
1653 | num_output: 126
1654 | kernel_size: 1
1655 | weight_filler {
1656 | type: "msra"
1657 | }
1658 | bias_filler {
1659 | type: "constant"
1660 | value: 0.0
1661 | }
1662 | }
1663 | }
1664 | layer {
1665 | name: "conv16_2_mbox_conf_perm"
1666 | type: "Permute"
1667 | bottom: "conv16_2_mbox_conf"
1668 | top: "conv16_2_mbox_conf_perm"
1669 | permute_param {
1670 | order: 0
1671 | order: 2
1672 | order: 3
1673 | order: 1
1674 | }
1675 | }
1676 | layer {
1677 | name: "conv16_2_mbox_conf_flat"
1678 | type: "Flatten"
1679 | bottom: "conv16_2_mbox_conf_perm"
1680 | top: "conv16_2_mbox_conf_flat"
1681 | flatten_param {
1682 | axis: 1
1683 | }
1684 | }
1685 | layer {
1686 | name: "conv16_2_mbox_priorbox"
1687 | type: "PriorBox"
1688 | bottom: "conv16_2"
1689 | bottom: "data"
1690 | top: "conv16_2_mbox_priorbox"
1691 | prior_box_param {
1692 | min_size: 240.0
1693 | max_size: 285.0
1694 | aspect_ratio: 2.0
1695 | aspect_ratio: 3.0
1696 | flip: true
1697 | clip: false
1698 | variance: 0.1
1699 | variance: 0.1
1700 | variance: 0.2
1701 | variance: 0.2
1702 | offset: 0.5
1703 | }
1704 | }
1705 | layer {
1706 | name: "conv17_2_mbox_loc"
1707 | type: "Convolution"
1708 | bottom: "conv17_2"
1709 | top: "conv17_2_mbox_loc"
1710 | param {
1711 | lr_mult: 1.0
1712 | decay_mult: 1.0
1713 | }
1714 | param {
1715 | lr_mult: 2.0
1716 | decay_mult: 0.0
1717 | }
1718 | convolution_param {
1719 | num_output: 24
1720 | kernel_size: 1
1721 | weight_filler {
1722 | type: "msra"
1723 | }
1724 | bias_filler {
1725 | type: "constant"
1726 | value: 0.0
1727 | }
1728 | }
1729 | }
1730 | layer {
1731 | name: "conv17_2_mbox_loc_perm"
1732 | type: "Permute"
1733 | bottom: "conv17_2_mbox_loc"
1734 | top: "conv17_2_mbox_loc_perm"
1735 | permute_param {
1736 | order: 0
1737 | order: 2
1738 | order: 3
1739 | order: 1
1740 | }
1741 | }
1742 | layer {
1743 | name: "conv17_2_mbox_loc_flat"
1744 | type: "Flatten"
1745 | bottom: "conv17_2_mbox_loc_perm"
1746 | top: "conv17_2_mbox_loc_flat"
1747 | flatten_param {
1748 | axis: 1
1749 | }
1750 | }
1751 | layer {
1752 | name: "conv17_2_mbox_conf"
1753 | type: "Convolution"
1754 | bottom: "conv17_2"
1755 | top: "conv17_2_mbox_conf"
1756 | param {
1757 | lr_mult: 1.0
1758 | decay_mult: 1.0
1759 | }
1760 | param {
1761 | lr_mult: 2.0
1762 | decay_mult: 0.0
1763 | }
1764 | convolution_param {
1765 | num_output: 126
1766 | kernel_size: 1
1767 | weight_filler {
1768 | type: "msra"
1769 | }
1770 | bias_filler {
1771 | type: "constant"
1772 | value: 0.0
1773 | }
1774 | }
1775 | }
1776 | layer {
1777 | name: "conv17_2_mbox_conf_perm"
1778 | type: "Permute"
1779 | bottom: "conv17_2_mbox_conf"
1780 | top: "conv17_2_mbox_conf_perm"
1781 | permute_param {
1782 | order: 0
1783 | order: 2
1784 | order: 3
1785 | order: 1
1786 | }
1787 | }
1788 | layer {
1789 | name: "conv17_2_mbox_conf_flat"
1790 | type: "Flatten"
1791 | bottom: "conv17_2_mbox_conf_perm"
1792 | top: "conv17_2_mbox_conf_flat"
1793 | flatten_param {
1794 | axis: 1
1795 | }
1796 | }
1797 | layer {
1798 | name: "conv17_2_mbox_priorbox"
1799 | type: "PriorBox"
1800 | bottom: "conv17_2"
1801 | bottom: "data"
1802 | top: "conv17_2_mbox_priorbox"
1803 | prior_box_param {
1804 | min_size: 285.0
1805 | max_size: 300.0
1806 | aspect_ratio: 2.0
1807 | aspect_ratio: 3.0
1808 | flip: true
1809 | clip: false
1810 | variance: 0.1
1811 | variance: 0.1
1812 | variance: 0.2
1813 | variance: 0.2
1814 | offset: 0.5
1815 | }
1816 | }
1817 | layer {
1818 | name: "mbox_loc"
1819 | type: "Concat"
1820 | bottom: "conv11_mbox_loc_flat"
1821 | bottom: "conv13_mbox_loc_flat"
1822 | bottom: "conv14_2_mbox_loc_flat"
1823 | bottom: "conv15_2_mbox_loc_flat"
1824 | bottom: "conv16_2_mbox_loc_flat"
1825 | bottom: "conv17_2_mbox_loc_flat"
1826 | top: "mbox_loc"
1827 | concat_param {
1828 | axis: 1
1829 | }
1830 | }
1831 | layer {
1832 | name: "mbox_conf"
1833 | type: "Concat"
1834 | bottom: "conv11_mbox_conf_flat"
1835 | bottom: "conv13_mbox_conf_flat"
1836 | bottom: "conv14_2_mbox_conf_flat"
1837 | bottom: "conv15_2_mbox_conf_flat"
1838 | bottom: "conv16_2_mbox_conf_flat"
1839 | bottom: "conv17_2_mbox_conf_flat"
1840 | top: "mbox_conf"
1841 | concat_param {
1842 | axis: 1
1843 | }
1844 | }
1845 | layer {
1846 | name: "mbox_priorbox"
1847 | type: "Concat"
1848 | bottom: "conv11_mbox_priorbox"
1849 | bottom: "conv13_mbox_priorbox"
1850 | bottom: "conv14_2_mbox_priorbox"
1851 | bottom: "conv15_2_mbox_priorbox"
1852 | bottom: "conv16_2_mbox_priorbox"
1853 | bottom: "conv17_2_mbox_priorbox"
1854 | top: "mbox_priorbox"
1855 | concat_param {
1856 | axis: 2
1857 | }
1858 | }
1859 | layer {
1860 | name: "mbox_conf_reshape"
1861 | type: "Reshape"
1862 | bottom: "mbox_conf"
1863 | top: "mbox_conf_reshape"
1864 | reshape_param {
1865 | shape {
1866 | dim: 0
1867 | dim: -1
1868 | dim: 21
1869 | }
1870 | }
1871 | }
1872 | layer {
1873 | name: "mbox_conf_softmax"
1874 | type: "Softmax"
1875 | bottom: "mbox_conf_reshape"
1876 | top: "mbox_conf_softmax"
1877 | softmax_param {
1878 | axis: 2
1879 | }
1880 | }
1881 | layer {
1882 | name: "mbox_conf_flatten"
1883 | type: "Flatten"
1884 | bottom: "mbox_conf_softmax"
1885 | top: "mbox_conf_flatten"
1886 | flatten_param {
1887 | axis: 1
1888 | }
1889 | }
1890 | layer {
1891 | name: "detection_out"
1892 | type: "DetectionOutput"
1893 | bottom: "mbox_loc"
1894 | bottom: "mbox_conf_flatten"
1895 | bottom: "mbox_priorbox"
1896 | top: "detection_out"
1897 | include {
1898 | phase: TEST
1899 | }
1900 | detection_output_param {
1901 | num_classes: 21
1902 | share_location: true
1903 | background_label_id: 0
1904 | nms_param {
1905 | nms_threshold: 0.45
1906 | top_k: 100
1907 | }
1908 | code_type: CENTER_SIZE
1909 | keep_top_k: 100
1910 | confidence_threshold: 0.25
1911 | }
1912 | }
1913 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DNN_Object_Detection
2 |
3 | This code demonstrates usage of the OpenCV deep learning module (dnn) with the MobileNet-SSD network for object detection.
4 |
5 | ## Setup
6 |
7 | ### Dependencies
8 |
9 | ```bash
10 | pip install -r requeriments.txt
11 | ```
12 |
13 | ### Execution
14 | ```bash
15 | python dnn_object_detection.py \
16 | --prototxt MobileNetSSD_deploy.prototxt.txt \
17 | --model MobileNetSSD_deploy.caffemodel \
18 | --labels object_detection_classes_pascal_voc.txt
19 | ```
20 |
21 | ### Usage
22 |
23 | ```bash
24 | python dnn_object_detection.py [-h] -p PROTOTXT -m MODEL -l LABELS
25 |                                [-c CONFIDENCE] [-v VIDEO]
26 | ```
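
For example, to run detection on a saved clip instead of the webcam stream, keeping only detections above 50% confidence (the `--video` and `--confidence` flags come from the script; the video filename below is only illustrative):

```bash
python dnn_object_detection.py \
    --prototxt MobileNetSSD_deploy.prototxt.txt \
    --model MobileNetSSD_deploy.caffemodel \
    --labels object_detection_classes_pascal_voc.txt \
    --video my_clip.mp4 \
    --confidence 0.5
```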
27 |
28 | ## Description
29 |
30 | Object recognition is a computer vision technique for identifying objects in images or videos, and a key output of deep learning and machine learning algorithms.
31 |
32 | The deep neural network (dnn) module was officially included as part of OpenCV 3.4+. The dnn module allows loading pre-trained models from the most popular deep learning frameworks, including TensorFlow, Caffe, Darknet, and Torch. Besides MobileNet-SSD, other architectures compatible with OpenCV 3.4.1 include:
33 |
34 | * GoogLeNet
35 | * YOLO
36 | * SqueezeNet
37 | * Faster R-CNN
38 | * ResNet
39 |
40 | This API is compatible with C++ and Python.
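
The script in this repository runs the network over a video stream; as a minimal sketch of the same dnn workflow on a single image (file names as in this repo):

```python
import cv2

# Load the Caffe network definition and pre-trained weights
net = cv2.dnn.readNetFromCaffe('MobileNetSSD_deploy.prototxt.txt',
                               'MobileNetSSD_deploy.caffemodel')

# Preprocess one image into a 1x3x300x300 blob:
# resize to 300x300, scale by 0.007843 (1/127.5), subtract the 127.5 mean
image = cv2.imread('sample.jpeg')
blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)),
                             0.007843, (300, 300), 127.5)

# Forward pass; the output is a 1x1xNx7 array, one row per detection
net.setInput(blob)
detections = net.forward()
print(detections.shape)
```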
41 |
42 | ### Implementation
43 |
44 | Network used - [MobileNet-SSD](https://github.com/chuanqi305/MobileNet-SSD)
45 |
46 | The network can therefore detect 20 object classes in images (+1 for the background class): airplanes, bicycles, birds, boats, bottles, buses, cars, cats, chairs, cows, dining tables, dogs, horses, motorbikes, people, potted plants, sheep, sofas, trains, and tv monitors.
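
Each detection row has the layout `[image_id, class_id, confidence, x1, y1, x2, y2]`, with box coordinates normalized to [0, 1], so looking up a label is just an index into this class list. A sketch, reusing the `detections` array from the snippet above:

```python
# Index 0 is the background class, matching the labels file
CLASSES = [line.strip() for line in open('object_detection_classes_pascal_voc.txt')]

for i in range(detections.shape[2]):
    class_id = int(detections[0, 0, i, 1])
    confidence = detections[0, 0, i, 2]
    if confidence > 0.2:  # same default threshold as the script
        print('{}: {:.2f}%'.format(CLASSES[class_id], confidence * 100))
```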
47 |
48 | ## Results
49 |
50 | 
51 |
52 | 
53 |
54 | ## References
55 |
56 | [Real-time object detection with deep learning and OpenCV](https://www.pyimagesearch.com/2017/09/18/real-time-object-detection-with-deep-learning-and-opencv/)
57 |
58 | [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/pdf/1704.04861.pdf)
59 |
60 | [SSD: Single Shot MultiBox Detector](https://arxiv.org/pdf/1512.02325.pdf)
61 |
62 | [OpenCV deep learning module](https://github.com/opencv/opencv/tree/master/samples/dnn)
63 |
--------------------------------------------------------------------------------
/dnn_object_detection.py:
--------------------------------------------------------------------------------
1 | # Import necessary libraries
2 | import numpy as np
3 | import argparse
4 | import cv2
5 | 
6 | # Construct the argument parser and parse the arguments
7 | ap = argparse.ArgumentParser(
8 |     description='Script to run MobileNet-SSD object detection network')
9 | ap.add_argument('-v', '--video', type=str, default='',
10 |     help='Path to video file. If empty, web cam stream will be used')
11 | ap.add_argument('-p', '--prototxt', required=True,
12 |     help="Path to Caffe 'deploy' prototxt file")
13 | ap.add_argument('-m', '--model', required=True,
14 |     help='Path to weights for Caffe model')
15 | ap.add_argument('-l', '--labels', required=True,
16 |     help='Path to labels for dataset')
17 | ap.add_argument('-c', '--confidence', type=float, default=0.2,
18 |     help='Minimum probability to filter weak detections')
19 | args = vars(ap.parse_args())
20 | 
21 | # Initialize class labels of the dataset
22 | CLASSES = [line.strip() for line in open(args['labels'])]
23 | print('[INFO]', CLASSES)
24 | 
25 | # Generate random bounding box colors for each class label
26 | COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))
27 | 
28 | # Load Caffe model from disk
29 | print("[INFO] Loading model")
30 | net = cv2.dnn.readNetFromCaffe(args["prototxt"], args["model"])
31 | 
32 | # Open video capture from file or capture device
33 | print("[INFO] Starting video stream")
34 | if args['video']:
35 |     cap = cv2.VideoCapture(args['video'])
36 | else:
37 |     cap = cv2.VideoCapture(0)
38 | 
39 | while cap.isOpened():
40 |     # Capture frame-by-frame
41 |     ret, frame = cap.read()
42 | 
43 |     if not ret:
44 |         break
45 | 
46 |     (h, w) = frame.shape[:2]
47 | 
48 |     # MobileNet-SSD expects a fixed 300x300 input, so resize the frame,
49 |     # scale pixel values by 0.007843 (i.e. 1/127.5) and subtract the
50 |     # mean (127.5) to normalize the input. The resulting "blob" has
51 |     # the shape (1, 3, 300, 300).
52 |     blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)),
53 |                                  0.007843, (300, 300), 127.5)
54 | 
55 |     # Pass the blob through the network and obtain the detections
56 |     net.setInput(blob)
57 |     detections = net.forward()
58 | 
59 |     for i in range(detections.shape[2]):
60 |         # Extract the confidence (i.e., probability) associated with the prediction
61 |         confidence = detections[0, 0, i, 2]
62 | 
63 |         # Filter out weak detections by ensuring the `confidence` is
64 |         # greater than the minimum confidence
65 |         if confidence > args["confidence"]:
66 |             # Extract the index of the class label from the `detections`,
67 |             # then compute the (x, y)-coordinates of the bounding box
68 |             # for the object
69 |             class_id = int(detections[0, 0, i, 1])
70 |             box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
71 |             (startX, startY, endX, endY) = box.astype('int')
72 | 
73 |             # Draw the bounding box, label, and confidence of the
74 |             # prediction on the frame
75 |             label = "{}: {:.2f}%".format(CLASSES[class_id], confidence * 100)
76 |             print("[INFO] {}".format(label))
77 |             cv2.rectangle(frame, (startX, startY), (endX, endY),
78 |                           COLORS[class_id], 2)
79 |             y = startY - 15 if startY - 15 > 15 else startY + 15
80 |             cv2.putText(frame, label, (startX, y),
81 |                         cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[class_id], 2)
82 | 
83 |     # Show frame
84 |     cv2.imshow("Frame", frame)
85 | 
86 |     key = cv2.waitKey(1) & 0xFF
87 | 
88 |     # Press `q` to exit
89 |     if key == ord("q"):
90 |         break
91 | 
92 | # Clean-up
93 | cap.release()
94 | cv2.destroyAllWindows()
95 | 
--------------------------------------------------------------------------------
/object_detection_classes_pascal_voc.txt:
--------------------------------------------------------------------------------
1 | background
2 | aeroplane
3 | bicycle
4 | bird
5 | boat
6 | bottle
7 | bus
8 | car
9 | cat
10 | chair
11 | cow
12 | diningtable
13 | dog
14 | horse
15 | motorbike
16 | person
17 | pottedplant
18 | sheep
19 | sofa
20 | train
21 | tvmonitor
--------------------------------------------------------------------------------
/output.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheNsBhasin/DNN_Object_Detection/7680652f963748c28108800450c5c88ddf3964ad/output.gif
--------------------------------------------------------------------------------
/requeriments.txt:
--------------------------------------------------------------------------------
1 | numpy==1.14.3
2 | opencv-python==3.4.2
--------------------------------------------------------------------------------
/sample.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheNsBhasin/DNN_Object_Detection/7680652f963748c28108800450c5c88ddf3964ad/sample.jpeg
--------------------------------------------------------------------------------