├── .gitignore ├── LICENSE ├── README.md ├── cfg └── yolo.cfg ├── data └── coco_classes.txt ├── demo.py ├── images ├── res │ ├── dog.jpg │ ├── eagle.jpg │ ├── giraffe.jpg │ ├── horses.jpg │ ├── person.jpg │ └── takagaki.jpg ├── test │ ├── dog.jpg │ ├── eagle.jpg │ ├── giraffe.jpg │ ├── horses.jpg │ ├── person.jpg │ └── takagaki.jpg └── yolo.png ├── model ├── darknet53.py └── yolo_model.py ├── videos ├── res │ └── library1.mp4 └── test │ └── library1.mp4 └── yad2k.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Larry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # YOLOv3
2 | Keras (TF backend) implementation of YOLO v3 object detection.
3 | 
4 | Based on the paper [YOLOv3: An Incremental Improvement](https://pjreddie.com/media/files/papers/YOLOv3.pdf).
5 | 
6 | ## Requirement
7 | - OpenCV 3.4
8 | - Python 3.6
9 | - Tensorflow-gpu 1.5.0
10 | - Keras 2.1.3
11 | 
12 | ## Quick start
13 | 
14 | - Download the official [yolov3.weights](https://pjreddie.com/media/files/yolov3.weights) and put it in the top folder of the project.
15 | 
16 | - Run the following command to convert the Darknet weight file to a Keras h5 file. The `yad2k.py` script was modified from [allanzelener/YAD2K](https://github.com/allanzelener/YAD2K).
17 | ```
18 | python yad2k.py cfg\yolo.cfg yolov3.weights data\yolo.h5
19 | ```
20 | 
21 | - Run the following command to show the demo. The results can be found in the `images\res\` folder.
22 | ```
23 | python demo.py
24 | ```
25 | 
26 | ## Demo result
27 | 
28 | It can be seen that YOLO v3 has better classification ability than YOLO v2.
29 | 
30 | 
31 | 
32 | ## TODO
33 | 
34 | - Train the model.
35 | 
36 | ## Reference
37 | 
38 |     @article{YOLOv3,
39 |       title={YOLOv3: An Incremental Improvement},
40 |       author={Redmon, Joseph and Farhadi, Ali},
41 |       year={2018}
42 |     }
43 | 
44 | 
45 | ## Copyright
46 | See [LICENSE](LICENSE) for details.
47 | 
--------------------------------------------------------------------------------
/cfg/yolo.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=16
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 | 
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 | 
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 | 
33 | # Downsample
34 | 
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 | 
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 | 
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 | 
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 | 
63 | # Downsample
64 | 
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 | 
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 | 
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 | 
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 | 
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 | 
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 | 
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 | 
113 | # Downsample
114 | 
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 | 
123 | [convolutional]
124 | 
batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 
| [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | 
stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 
| [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .5
787 | truth_thresh = 1
788 | random=1
--------------------------------------------------------------------------------
/data/coco_classes.txt:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorbike
5 | aeroplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 | 
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | """Demo for using YOLO v3 to detect objects.
2 | """
3 | import os
4 | import time
5 | import cv2
6 | import numpy as np
7 | from model.yolo_model import YOLO
8 | 
9 | 
10 | def process_image(img):
11 |     """Resize, rescale and expand image.
12 | 
13 |     # Argument:
14 |         img: original image.
15 | 
16 |     # Returns
17 |         image: ndarray(1, 416, 416, 3), processed image.
18 |     """
19 |     image = cv2.resize(img, (416, 416),
20 |                        interpolation=cv2.INTER_CUBIC)
21 |     image = np.array(image, dtype='float32')
22 |     image /= 255.
23 |     image = np.expand_dims(image, axis=0)
24 | 
25 |     return image
26 | 
27 | 
28 | def get_classes(file):
29 |     """Get class names.
30 | 
31 |     # Argument:
32 |         file: path of the class names file.
33 | 
34 |     # Returns
35 |         class_names: List, class names.
36 | 
37 |     """
38 |     with open(file) as f:
39 |         class_names = f.readlines()
40 |     class_names = [c.strip() for c in class_names]
41 | 
42 |     return class_names
43 | 
44 | 
45 | def draw(image, boxes, scores, classes, all_classes):
46 |     """Draw the boxes on the image.
47 | 
48 |     # Argument:
49 |         image: original image.
50 |         boxes: ndarray, boxes of objects.
51 |         scores: ndarray, scores of objects.
52 |         classes: ndarray, classes of objects.
53 |         all_classes: all class names.
54 | """ 55 | for box, score, cl in zip(boxes, scores, classes): 56 | x, y, w, h = box 57 | 58 | top = max(0, np.floor(x + 0.5).astype(int)) 59 | left = max(0, np.floor(y + 0.5).astype(int)) 60 | right = min(image.shape[1], np.floor(x + w + 0.5).astype(int)) 61 | bottom = min(image.shape[0], np.floor(y + h + 0.5).astype(int)) 62 | 63 | cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 2) 64 | cv2.putText(image, '{0} {1:.2f}'.format(all_classes[cl], score), 65 | (top, left - 6), 66 | cv2.FONT_HERSHEY_SIMPLEX, 67 | 0.6, (0, 0, 255), 1, 68 | cv2.LINE_AA) 69 | 70 | print('class: {0}, score: {1:.2f}'.format(all_classes[cl], score)) 71 | print('box coordinate x,y,w,h: {0}'.format(box)) 72 | 73 | print() 74 | 75 | 76 | def detect_image(image, yolo, all_classes): 77 | """Use yolo v3 to detect images. 78 | 79 | # Argument: 80 | image: original image. 81 | yolo: YOLO, yolo model. 82 | all_classes: all classes name. 83 | 84 | # Returns: 85 | image: processed image. 86 | """ 87 | pimage = process_image(image) 88 | 89 | start = time.time() 90 | boxes, classes, scores = yolo.predict(pimage, image.shape) 91 | end = time.time() 92 | 93 | print('time: {0:.2f}s'.format(end - start)) 94 | 95 | if boxes is not None: 96 | draw(image, boxes, scores, classes, all_classes) 97 | 98 | return image 99 | 100 | 101 | def detect_video(video, yolo, all_classes): 102 | """Use yolo v3 to detect video. 103 | 104 | # Argument: 105 | video: video file. 106 | yolo: YOLO, yolo model. 107 | all_classes: all classes name. 108 | """ 109 | video_path = os.path.join("videos", "test", video) 110 | camera = cv2.VideoCapture(video_path) 111 | cv2.namedWindow("detection", cv2.WINDOW_AUTOSIZE) 112 | 113 | # Prepare for saving the detected video 114 | sz = (int(camera.get(cv2.CAP_PROP_FRAME_WIDTH)), 115 | int(camera.get(cv2.CAP_PROP_FRAME_HEIGHT))) 116 | fourcc = cv2.VideoWriter_fourcc(*'mpeg') 117 | 118 | vout = cv2.VideoWriter() 119 | vout.open(os.path.join("videos", "res", video), fourcc, 20, sz, True) 120 | 121 | while True: 122 | res, frame = camera.read() 123 | 124 | if not res: 125 | break 126 | 127 | image = detect_image(frame, yolo, all_classes) 128 | cv2.imshow("detection", image) 129 | 130 | # Save the video frame by frame 131 | vout.write(image) 132 | 133 | if cv2.waitKey(110) & 0xff == 27: 134 | break 135 | 136 | vout.release() 137 | camera.release() 138 | 139 | 140 | if __name__ == '__main__': 141 | yolo = YOLO(0.6, 0.5) 142 | file = 'data/coco_classes.txt' 143 | all_classes = get_classes(file) 144 | 145 | # detect images in test floder. 
146 | for (root, dirs, files) in os.walk('images/test'): 147 | if files: 148 | for f in files: 149 | print(f) 150 | path = os.path.join(root, f) 151 | image = cv2.imread(path) 152 | image = detect_image(image, yolo, all_classes) 153 | cv2.imwrite('images/res/' + f, image) 154 | 155 | # detect videos one at a time in videos/test folder 156 | video = 'library1.mp4' 157 | detect_video(video, yolo, all_classes) 158 | -------------------------------------------------------------------------------- /images/res/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/dog.jpg -------------------------------------------------------------------------------- /images/res/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/eagle.jpg -------------------------------------------------------------------------------- /images/res/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/giraffe.jpg -------------------------------------------------------------------------------- /images/res/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/horses.jpg -------------------------------------------------------------------------------- /images/res/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/person.jpg -------------------------------------------------------------------------------- /images/res/takagaki.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/takagaki.jpg -------------------------------------------------------------------------------- /images/test/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/dog.jpg -------------------------------------------------------------------------------- /images/test/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/eagle.jpg -------------------------------------------------------------------------------- /images/test/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/giraffe.jpg -------------------------------------------------------------------------------- /images/test/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/horses.jpg -------------------------------------------------------------------------------- /images/test/person.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/person.jpg -------------------------------------------------------------------------------- /images/test/takagaki.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/takagaki.jpg -------------------------------------------------------------------------------- /images/yolo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/yolo.png -------------------------------------------------------------------------------- /model/darknet53.py: -------------------------------------------------------------------------------- 1 | """Darknet-53 for yolo v3. 2 | """ 3 | from keras.models import Model 4 | from keras.layers import Input, Conv2D, GlobalAveragePooling2D, Dense 5 | from keras.layers import add, Activation, BatchNormalization 6 | from keras.layers.advanced_activations import LeakyReLU 7 | from keras.regularizers import l2 8 | 9 | 10 | def conv2d_unit(x, filters, kernels, strides=1): 11 | """Convolution Unit 12 | This function defines a 2D convolution operation with BN and LeakyReLU. 13 | 14 | # Arguments 15 | x: Tensor, input tensor of conv layer. 16 | filters: Integer, the dimensionality of the output space. 17 | kernels: An integer or tuple/list of 2 integers, specifying the 18 | width and height of the 2D convolution window. 19 | strides: An integer or tuple/list of 2 integers, 20 | specifying the strides of the convolution along the width and 21 | height. Can be a single integer to specify the same value for 22 | all spatial dimensions. 23 | 24 | # Returns 25 | Output tensor. 26 | """ 27 | x = Conv2D(filters, kernels, 28 | padding='same', 29 | strides=strides, 30 | activation='linear', 31 | kernel_regularizer=l2(5e-4))(x) 32 | x = BatchNormalization()(x) 33 | x = LeakyReLU(alpha=0.1)(x) 34 | 35 | return x 36 | 37 | 38 | def residual_block(inputs, filters): 39 | """Residual Block 40 | This function defines a 2D convolution operation with BN and LeakyReLU. 41 | 42 | # Arguments 43 | x: Tensor, input tensor of residual block. 44 | kernels: An integer or tuple/list of 2 integers, specifying the 45 | width and height of the 2D convolution window. 46 | 47 | # Returns 48 | Output tensor. 49 | """ 50 | x = conv2d_unit(inputs, filters, (1, 1)) 51 | x = conv2d_unit(x, 2 * filters, (3, 3)) 52 | x = add([inputs, x]) 53 | x = Activation('linear')(x) 54 | 55 | return x 56 | 57 | 58 | def stack_residual_block(inputs, filters, n): 59 | """Stacked residual Block 60 | """ 61 | x = residual_block(inputs, filters) 62 | 63 | for i in range(n - 1): 64 | x = residual_block(x, filters) 65 | 66 | return x 67 | 68 | 69 | def darknet_base(inputs): 70 | """Darknet-53 base model. 
71 | """ 72 | 73 | x = conv2d_unit(inputs, 32, (3, 3)) 74 | 75 | x = conv2d_unit(x, 64, (3, 3), strides=2) 76 | x = stack_residual_block(x, 32, n=1) 77 | 78 | x = conv2d_unit(x, 128, (3, 3), strides=2) 79 | x = stack_residual_block(x, 64, n=2) 80 | 81 | x = conv2d_unit(x, 256, (3, 3), strides=2) 82 | x = stack_residual_block(x, 128, n=8) 83 | 84 | x = conv2d_unit(x, 512, (3, 3), strides=2) 85 | x = stack_residual_block(x, 256, n=8) 86 | 87 | x = conv2d_unit(x, 1024, (3, 3), strides=2) 88 | x = stack_residual_block(x, 512, n=4) 89 | 90 | return x 91 | 92 | 93 | def darknet(): 94 | """Darknet-53 classifier. 95 | """ 96 | inputs = Input(shape=(416, 416, 3)) 97 | x = darknet_base(inputs) 98 | 99 | x = GlobalAveragePooling2D()(x) 100 | x = Dense(1000, activation='softmax')(x) 101 | 102 | model = Model(inputs, x) 103 | 104 | return model 105 | 106 | 107 | if __name__ == '__main__': 108 | model = darknet() 109 | print(model.summary()) 110 | -------------------------------------------------------------------------------- /model/yolo_model.py: -------------------------------------------------------------------------------- 1 | """YOLO v3 output 2 | """ 3 | import numpy as np 4 | import keras.backend as K 5 | from keras.models import load_model 6 | 7 | 8 | class YOLO: 9 | def __init__(self, obj_threshold, nms_threshold): 10 | """Init. 11 | 12 | # Arguments 13 | obj_threshold: Integer, threshold for object. 14 | nms_threshold: Integer, threshold for box. 15 | """ 16 | self._t1 = obj_threshold 17 | self._t2 = nms_threshold 18 | self._yolo = load_model('data/yolo.h5') 19 | 20 | def _sigmoid(self, x): 21 | """sigmoid. 22 | 23 | # Arguments 24 | x: Tensor. 25 | 26 | # Returns 27 | numpy ndarray. 28 | """ 29 | return 1 / (1 + np.exp(-x)) 30 | 31 | def _process_feats(self, out, anchors, mask): 32 | """process output features. 33 | 34 | # Arguments 35 | out: Tensor (N, N, 3, 4 + 1 +80), output feature map of yolo. 36 | anchors: List, anchors for box. 37 | mask: List, mask for anchors. 38 | 39 | # Returns 40 | boxes: ndarray (N, N, 3, 4), x,y,w,h for per box. 41 | box_confidence: ndarray (N, N, 3, 1), confidence for per box. 42 | box_class_probs: ndarray (N, N, 3, 80), class probs for per box. 43 | """ 44 | grid_h, grid_w, num_boxes = map(int, out.shape[1: 4]) 45 | 46 | anchors = [anchors[i] for i in mask] 47 | anchors_tensor = np.array(anchors).reshape(1, 1, len(anchors), 2) 48 | 49 | # Reshape to batch, height, width, num_anchors, box_params. 50 | out = out[0] 51 | box_xy = self._sigmoid(out[..., :2]) 52 | box_wh = np.exp(out[..., 2:4]) 53 | box_wh = box_wh * anchors_tensor 54 | 55 | box_confidence = self._sigmoid(out[..., 4]) 56 | box_confidence = np.expand_dims(box_confidence, axis=-1) 57 | box_class_probs = self._sigmoid(out[..., 5:]) 58 | 59 | col = np.tile(np.arange(0, grid_w), grid_w).reshape(-1, grid_w) 60 | row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_h) 61 | 62 | col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) 63 | row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) 64 | grid = np.concatenate((col, row), axis=-1) 65 | 66 | box_xy += grid 67 | box_xy /= (grid_w, grid_h) 68 | box_wh /= (416, 416) 69 | box_xy -= (box_wh / 2.) 70 | boxes = np.concatenate((box_xy, box_wh), axis=-1) 71 | 72 | return boxes, box_confidence, box_class_probs 73 | 74 | def _filter_boxes(self, boxes, box_confidences, box_class_probs): 75 | """Filter boxes with object threshold. 76 | 77 | # Arguments 78 | boxes: ndarray, boxes of objects. 79 | box_confidences: ndarray, confidences of objects. 
80 | box_class_probs: ndarray, class_probs of objects. 81 | 82 | # Returns 83 | boxes: ndarray, filtered boxes. 84 | classes: ndarray, classes for boxes. 85 | scores: ndarray, scores for boxes. 86 | """ 87 | box_scores = box_confidences * box_class_probs 88 | box_classes = np.argmax(box_scores, axis=-1) 89 | box_class_scores = np.max(box_scores, axis=-1) 90 | pos = np.where(box_class_scores >= self._t1) 91 | 92 | boxes = boxes[pos] 93 | classes = box_classes[pos] 94 | scores = box_class_scores[pos] 95 | 96 | return boxes, classes, scores 97 | 98 | def _nms_boxes(self, boxes, scores): 99 | """Suppress non-maximal boxes. 100 | 101 | # Arguments 102 | boxes: ndarray, boxes of objects. 103 | scores: ndarray, scores of objects. 104 | 105 | # Returns 106 | keep: ndarray, index of effective boxes. 107 | """ 108 | x = boxes[:, 0] 109 | y = boxes[:, 1] 110 | w = boxes[:, 2] 111 | h = boxes[:, 3] 112 | 113 | areas = w * h 114 | order = scores.argsort()[::-1] 115 | 116 | keep = [] 117 | while order.size > 0: 118 | i = order[0] 119 | keep.append(i) 120 | 121 | xx1 = np.maximum(x[i], x[order[1:]]) 122 | yy1 = np.maximum(y[i], y[order[1:]]) 123 | xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]]) 124 | yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]]) 125 | 126 | w1 = np.maximum(0.0, xx2 - xx1 + 1) 127 | h1 = np.maximum(0.0, yy2 - yy1 + 1) 128 | inter = w1 * h1 129 | 130 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 131 | inds = np.where(ovr <= self._t2)[0] 132 | order = order[inds + 1] 133 | 134 | keep = np.array(keep) 135 | 136 | return keep 137 | 138 | def _yolo_out(self, outs, shape): 139 | """Process output of yolo base net. 140 | 141 | # Argument: 142 | outs: output of yolo base net. 143 | shape: shape of original image. 144 | 145 | # Returns: 146 | boxes: ndarray, boxes of objects. 147 | classes: ndarray, classes of objects. 148 | scores: ndarray, scores of objects. 149 | """ 150 | masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] 151 | anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], 152 | [59, 119], [116, 90], [156, 198], [373, 326]] 153 | 154 | boxes, classes, scores = [], [], [] 155 | 156 | for out, mask in zip(outs, masks): 157 | b, c, s = self._process_feats(out, anchors, mask) 158 | b, c, s = self._filter_boxes(b, c, s) 159 | boxes.append(b) 160 | classes.append(c) 161 | scores.append(s) 162 | 163 | boxes = np.concatenate(boxes) 164 | classes = np.concatenate(classes) 165 | scores = np.concatenate(scores) 166 | 167 | # Scale boxes back to original image shape. 168 | width, height = shape[1], shape[0] 169 | image_dims = [width, height, width, height] 170 | boxes = boxes * image_dims 171 | 172 | nboxes, nclasses, nscores = [], [], [] 173 | for c in set(classes): 174 | inds = np.where(classes == c) 175 | b = boxes[inds] 176 | c = classes[inds] 177 | s = scores[inds] 178 | 179 | keep = self._nms_boxes(b, s) 180 | 181 | nboxes.append(b[keep]) 182 | nclasses.append(c[keep]) 183 | nscores.append(s[keep]) 184 | 185 | if not nclasses and not nscores: 186 | return None, None, None 187 | 188 | boxes = np.concatenate(nboxes) 189 | classes = np.concatenate(nclasses) 190 | scores = np.concatenate(nscores) 191 | 192 | return boxes, classes, scores 193 | 194 | def predict(self, image, shape): 195 | """Detect the objects with yolo. 196 | 197 | # Arguments 198 | image: ndarray, processed input image. 199 | shape: shape of original image. 200 | 201 | # Returns 202 | boxes: ndarray, boxes of objects. 203 | classes: ndarray, classes of objects. 
204 | scores: ndarray, scores of objects. 205 | """ 206 | 207 | outs = self._yolo.predict(image) 208 | boxes, classes, scores = self._yolo_out(outs, shape) 209 | 210 | return boxes, classes, scores 211 | -------------------------------------------------------------------------------- /videos/res/library1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/videos/res/library1.mp4 -------------------------------------------------------------------------------- /videos/test/library1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/videos/test/library1.mp4 -------------------------------------------------------------------------------- /yad2k.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Reads Darknet53 config and weights and creates Keras model with TF backend. 4 | 5 | Currently only supports layers in Darknet53 config. 6 | """ 7 | 8 | import argparse 9 | import configparser 10 | import io 11 | import os 12 | from collections import defaultdict 13 | 14 | import numpy as np 15 | from keras import backend as K 16 | from keras.layers import (Conv2D, GlobalAveragePooling2D, Input, Reshape, 17 | ZeroPadding2D, UpSampling2D, Activation, Lambda, MaxPooling2D) 18 | from keras.layers.advanced_activations import LeakyReLU 19 | from keras.layers.merge import concatenate, add 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.models import Model 22 | from keras.regularizers import l2 23 | from keras.utils.vis_utils import plot_model as plot 24 | 25 | 26 | parser = argparse.ArgumentParser( 27 | description='Yet Another Darknet To Keras Converter.') 28 | parser.add_argument('config_path', help='Path to Darknet cfg file.') 29 | parser.add_argument('weights_path', help='Path to Darknet weights file.') 30 | parser.add_argument('output_path', help='Path to output Keras model file.') 31 | parser.add_argument( 32 | '-p', 33 | '--plot_model', 34 | help='Plot generated Keras model and save as image.', 35 | action='store_true') 36 | parser.add_argument( 37 | '-flcl', 38 | '--fully_convolutional', 39 | help='Model is fully convolutional so set input shape to (None, None, 3). ' 40 | 'WARNING: This experimental option does not work properly for YOLO_v2.', 41 | action='store_true') 42 | 43 | 44 | def unique_config_sections(config_file): 45 | """Convert all config sections to have unique names. 46 | 47 | Adds unique suffixes to config sections for compability with configparser. 
48 | """ 49 | section_counters = defaultdict(int) 50 | output_stream = io.StringIO() 51 | with open(config_file) as fin: 52 | for line in fin: 53 | if line.startswith('['): 54 | section = line.strip().strip('[]') 55 | _section = section + '_' + str(section_counters[section]) 56 | section_counters[section] += 1 57 | line = line.replace(section, _section) 58 | output_stream.write(line) 59 | output_stream.seek(0) 60 | return output_stream 61 | 62 | 63 | def _main(args): 64 | config_path = os.path.expanduser(args.config_path) 65 | weights_path = os.path.expanduser(args.weights_path) 66 | assert config_path.endswith('.cfg'), '{} is not a .cfg file'.format( 67 | config_path) 68 | assert weights_path.endswith( 69 | '.weights'), '{} is not a .weights file'.format(weights_path) 70 | 71 | output_path = os.path.expanduser(args.output_path) 72 | assert output_path.endswith( 73 | '.h5'), 'output path {} is not a .h5 file'.format(output_path) 74 | output_root = os.path.splitext(output_path)[0] 75 | 76 | # Load weights and config. 77 | print('Loading weights.') 78 | weights_file = open(weights_path, 'rb') 79 | weights_header = np.ndarray( 80 | shape=(5, ), dtype='int32', buffer=weights_file.read(20)) 81 | print('Weights Header: ', weights_header) 82 | # TODO: Check transpose flag when implementing fully connected layers. 83 | # transpose = (weight_header[0] > 1000) or (weight_header[1] > 1000) 84 | 85 | print('Parsing Darknet config.') 86 | unique_config_file = unique_config_sections(config_path) 87 | cfg_parser = configparser.ConfigParser() 88 | cfg_parser.read_file(unique_config_file) 89 | 90 | print('Creating Keras model.') 91 | if args.fully_convolutional: 92 | image_height, image_width = None, None 93 | else: 94 | image_height = int(cfg_parser['net_0']['height']) 95 | image_width = int(cfg_parser['net_0']['width']) 96 | 97 | prev_layer = Input(shape=(image_height, image_width, 3)) 98 | all_layers = [prev_layer] 99 | outputs = [] 100 | 101 | weight_decay = float(cfg_parser['net_0']['decay'] 102 | ) if 'net_0' in cfg_parser.sections() else 5e-4 103 | count = 0 104 | 105 | for section in cfg_parser.sections(): 106 | print('Parsing section {}'.format(section)) 107 | if section.startswith('convolutional'): 108 | filters = int(cfg_parser[section]['filters']) 109 | size = int(cfg_parser[section]['size']) 110 | stride = int(cfg_parser[section]['stride']) 111 | pad = int(cfg_parser[section]['pad']) 112 | activation = cfg_parser[section]['activation'] 113 | batch_normalize = 'batch_normalize' in cfg_parser[section] 114 | 115 | # Setting weights. 116 | # Darknet serializes convolutional weights as: 117 | # [bias/beta, [gamma, mean, variance], conv_weights] 118 | prev_layer_shape = K.int_shape(prev_layer) 119 | 120 | # TODO: This assumes channel last dim_ordering. 121 | weights_shape = (size, size, prev_layer_shape[-1], filters) 122 | darknet_w_shape = (filters, weights_shape[2], size, size) 123 | weights_size = np.product(weights_shape) 124 | 125 | print('conv2d', 'bn' 126 | if batch_normalize else ' ', activation, weights_shape) 127 | 128 | conv_bias = np.ndarray( 129 | shape=(filters, ), 130 | dtype='float32', 131 | buffer=weights_file.read(filters * 4)) 132 | count += filters 133 | 134 | if batch_normalize: 135 | bn_weights = np.ndarray( 136 | shape=(3, filters), 137 | dtype='float32', 138 | buffer=weights_file.read(filters * 12)) 139 | count += 3 * filters 140 | 141 | # TODO: Keras BatchNormalization mistakenly refers to var 142 | # as std. 
143 | bn_weight_list = [ 144 | bn_weights[0], # scale gamma 145 | conv_bias, # shift beta 146 | bn_weights[1], # running mean 147 | bn_weights[2] # running var 148 | ] 149 | 150 | conv_weights = np.ndarray( 151 | shape=darknet_w_shape, 152 | dtype='float32', 153 | buffer=weights_file.read(weights_size * 4)) 154 | count += weights_size 155 | 156 | # DarkNet conv_weights are serialized Caffe-style: 157 | # (out_dim, in_dim, height, width) 158 | # We would like to set these to Tensorflow order: 159 | # (height, width, in_dim, out_dim) 160 | # TODO: Add check for Theano dim ordering. 161 | conv_weights = np.transpose(conv_weights, [2, 3, 1, 0]) 162 | conv_weights = [conv_weights] if batch_normalize else [ 163 | conv_weights, conv_bias 164 | ] 165 | 166 | # Handle activation. 167 | act_fn = None 168 | if activation == 'leaky': 169 | pass # Add advanced activation later. 170 | elif activation != 'linear': 171 | raise ValueError( 172 | 'Unknown activation function `{}` in section {}'.format( 173 | activation, section)) 174 | 175 | padding = 'same' if pad == 1 and stride == 1 else 'valid' 176 | # Adjust padding model for darknet. 177 | if stride == 2: 178 | prev_layer = ZeroPadding2D(((1, 0), (1, 0)))(prev_layer) 179 | 180 | # Create Conv2D layer 181 | conv_layer = (Conv2D( 182 | filters, (size, size), 183 | strides=(stride, stride), 184 | kernel_regularizer=l2(weight_decay), 185 | use_bias=not batch_normalize, 186 | weights=conv_weights, 187 | activation=act_fn, 188 | padding=padding))(prev_layer) 189 | 190 | if batch_normalize: 191 | conv_layer = (BatchNormalization( 192 | weights=bn_weight_list))(conv_layer) 193 | 194 | prev_layer = conv_layer 195 | 196 | if activation == 'linear': 197 | all_layers.append(prev_layer) 198 | elif activation == 'leaky': 199 | act_layer = LeakyReLU(alpha=0.1)(prev_layer) 200 | prev_layer = act_layer 201 | all_layers.append(prev_layer) 202 | 203 | elif section.startswith('maxpool'): 204 | size = int(cfg_parser[section]['size']) 205 | stride = int(cfg_parser[section]['stride']) 206 | all_layers.append( 207 | MaxPooling2D( 208 | padding='same', 209 | pool_size=(size, size), 210 | strides=(stride, stride))(prev_layer)) 211 | prev_layer = all_layers[-1] 212 | 213 | elif section.startswith('avgpool'): 214 | if cfg_parser.items(section) != []: 215 | raise ValueError('{} with params unsupported.'.format(section)) 216 | all_layers.append(GlobalAveragePooling2D()(prev_layer)) 217 | prev_layer = all_layers[-1] 218 | 219 | elif section.startswith('route'): 220 | ids = [int(i) for i in cfg_parser[section]['layers'].split(',')] 221 | if len(ids) == 2: 222 | for i, item in enumerate(ids): 223 | if item != -1: 224 | ids[i] = item + 1 225 | 226 | layers = [all_layers[i] for i in ids] 227 | 228 | if len(layers) > 1: 229 | print('Concatenating route layers:', layers) 230 | concatenate_layer = concatenate(layers) 231 | all_layers.append(concatenate_layer) 232 | prev_layer = concatenate_layer 233 | else: 234 | skip_layer = layers[0] # only one layer to route 235 | all_layers.append(skip_layer) 236 | prev_layer = skip_layer 237 | 238 | elif section.startswith('shortcut'): 239 | ids = [int(i) for i in cfg_parser[section]['from'].split(',')][0] 240 | activation = cfg_parser[section]['activation'] 241 | shortcut = add([all_layers[ids], prev_layer]) 242 | if activation == 'linear': 243 | shortcut = Activation('linear')(shortcut) 244 | all_layers.append(shortcut) 245 | prev_layer = all_layers[-1] 246 | 247 | elif section.startswith('upsample'): 248 | stride = 
int(cfg_parser[section]['stride']) 249 | all_layers.append( 250 | UpSampling2D( 251 | size=(stride, stride))(prev_layer)) 252 | prev_layer = all_layers[-1] 253 | 254 | elif section.startswith('yolo'): 255 | classes = int(cfg_parser[section]['classes']) 256 | # num = int(cfg_parser[section]['num']) 257 | # mask = int(cfg_parser[section]['mask']) 258 | n1, n2 = int(prev_layer.shape[1]), int(prev_layer.shape[2]) 259 | n3 = 3 260 | n4 = (4 + 1 + classes) 261 | yolo = Reshape((n1, n2, n3, n4))(prev_layer) 262 | all_layers.append(yolo) 263 | prev_layer = all_layers[-1] 264 | outputs.append(len(all_layers) - 1) 265 | 266 | elif (section.startswith('net')): 267 | pass # Configs not currently handled during model definition. 268 | else: 269 | raise ValueError( 270 | 'Unsupported section header type: {}'.format(section)) 271 | 272 | # Create and save model. 273 | model = Model(inputs=all_layers[0], 274 | outputs=[all_layers[i] for i in outputs]) 275 | print(model.summary()) 276 | model.save('{}'.format(output_path)) 277 | print('Saved Keras model to {}'.format(output_path)) 278 | # Check to see if all weights have been read. 279 | remaining_weights = len(weights_file.read()) / 4 280 | weights_file.close() 281 | print('Read {} of {} from Darknet weights.'.format(count, count + 282 | remaining_weights)) 283 | if remaining_weights > 0: 284 | print('Warning: {} unused weights'.format(remaining_weights)) 285 | 286 | plot(model, to_file='{}.png'.format(output_root), show_shapes=True) 287 | print('Saved model plot to {}.png'.format(output_root)) 288 | 289 | 290 | if __name__ == '__main__': 291 | _main(parser.parse_args()) 292 | --------------------------------------------------------------------------------
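A quick sanity check for the conversion step described in the README: the Keras model that `yad2k.py` writes should expose three outputs, one per `[yolo]` head, each reshaped to `(grid, grid, 3, 85)`, where 85 = 4 box coordinates + 1 objectness score + 80 class scores, on 13×13, 26×26 and 52×52 grids for a 416×416 input. The snippet below is a minimal sketch, not part of the repository; it assumes the README's conversion command has already produced `data/yolo.h5` and that it is run from the project root.

```python
"""Hedged sketch: verify the output shapes of the converted Keras model."""
import numpy as np
from keras.models import load_model

# Assumes `python yad2k.py cfg/yolo.cfg yolov3.weights data/yolo.h5` was run first.
model = load_model('data/yolo.h5')

# One dummy 416x416 RGB image, scaled to [0, 1] the same way demo.process_image does.
dummy = np.random.rand(1, 416, 416, 3).astype('float32')
outs = model.predict(dummy)

# Expected: (1, 13, 13, 3, 85), (1, 26, 26, 3, 85), (1, 52, 52, 3, 85),
# matching the three [yolo] heads (masks 6,7,8 / 3,4,5 / 0,1,2 in cfg order).
for out in outs:
    print(out.shape)
```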
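And a minimal single-image usage sketch assembled from the repository's own pieces (`YOLO` from `model/yolo_model.py` plus the helpers in `demo.py`), for readers who want detection without the folder walk or video loop in `demo.py`. It assumes `data/yolo.h5` exists and the script runs from the project root; the input path is one of the bundled test images, and the output filename is arbitrary.

```python
"""Hedged sketch: detect objects in a single image with the repo's own helpers."""
import cv2
from model.yolo_model import YOLO
from demo import process_image, get_classes, draw

# Same thresholds as demo.py: 0.6 object confidence, 0.5 NMS IoU.
yolo = YOLO(0.6, 0.5)
all_classes = get_classes('data/coco_classes.txt')

image = cv2.imread('images/test/dog.jpg')   # any BGR image works
pimage = process_image(image)               # (1, 416, 416, 3), values in [0, 1]
boxes, classes, scores = yolo.predict(pimage, image.shape)

if boxes is not None:
    draw(image, boxes, scores, classes, all_classes)
cv2.imwrite('images/res/dog_detect.jpg', image)   # arbitrary output path
```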