├── .gitignore ├── LICENSE ├── README.md ├── cfg └── yolo.cfg ├── data └── coco_classes.txt ├── demo.py ├── images ├── res │ ├── dog.jpg │ ├── eagle.jpg │ ├── giraffe.jpg │ ├── horses.jpg │ ├── person.jpg │ └── takagaki.jpg ├── test │ ├── dog.jpg │ ├── eagle.jpg │ ├── giraffe.jpg │ ├── horses.jpg │ ├── person.jpg │ └── takagaki.jpg └── yolo.png ├── model ├── darknet53.py └── yolo_model.py ├── videos ├── res │ └── library1.mp4 └── test │ └── library1.mp4 └── yad2k.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Larry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # YOLOv3
2 | Keras (TF backend) implementation of YOLO v3 object detection.
3 | 
4 | Based on the paper [YOLOv3: An Incremental Improvement](https://pjreddie.com/media/files/papers/YOLOv3.pdf).
5 | 
6 | ## Requirement
7 | - OpenCV 3.4
8 | - Python 3.6
9 | - Tensorflow-gpu 1.5.0
10 | - Keras 2.1.3
11 | 
12 | ## Quick start
13 | 
14 | - Download the official [yolov3.weights](https://pjreddie.com/media/files/yolov3.weights) and put it in the top folder of the project.
15 | 
16 | - Run the following command to convert the Darknet weight file to a Keras h5 file. The `yad2k.py` script was modified from [allanzelener/YAD2K](https://github.com/allanzelener/YAD2K).
17 | ```
18 | python yad2k.py cfg\yolo.cfg yolov3.weights data\yolo.h5
19 | ```
20 | 
21 | - Run the following command to show the demo. The results can be found in the `images\res\` folder.
22 | ```
23 | python demo.py
24 | ```
25 | 
26 | ## Demo result
27 | 
28 | It can be seen that YOLO v3 has better classification ability than YOLO v2.
29 | 
30 | 
31 | 
32 | ## TODO
33 | 
34 | - Train the model.
35 | 
36 | ## Reference
37 | 
38 |     @article{YOLOv3,
39 |       title={YOLOv3: An Incremental Improvement},
40 |       author={Redmon, Joseph and Farhadi, Ali},
41 |       year={2018}
42 |     }
43 | 
44 | 
45 | ## Copyright
46 | See [LICENSE](LICENSE) for details.
47 | 
--------------------------------------------------------------------------------
/cfg/yolo.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=16
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 | 
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 | 
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 | 
33 | # Downsample
34 | 
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 | 
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 | 
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 | 
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 | 
63 | # Downsample
64 | 
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 | 
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 | 
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 | 
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 | 
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 | 
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 | 
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 | 
113 | # Downsample
114 | 
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 | 
123 | [convolutional]
124 | 
batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 
| [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | 
stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 
| [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .5
787 | truth_thresh = 1
788 | random=1
--------------------------------------------------------------------------------
/data/coco_classes.txt:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorbike
5 | aeroplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 | 
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | """Demo for using YOLO v3 to detect objects.
2 | """
3 | import os
4 | import time
5 | import cv2
6 | import numpy as np
7 | from model.yolo_model import YOLO
8 | 
9 | 
10 | def process_image(img):
11 |     """Resize, rescale and expand image.
12 | 
13 |     # Argument:
14 |         img: original image.
15 | 
16 |     # Returns
17 |         image: ndarray(1, 416, 416, 3), processed image.
18 |     """
19 |     image = cv2.resize(img, (416, 416),
20 |                        interpolation=cv2.INTER_CUBIC)
21 |     image = np.array(image, dtype='float32')
22 |     image /= 255.
23 |     image = np.expand_dims(image, axis=0)
24 | 
25 |     return image
26 | 
27 | 
28 | def get_classes(file):
29 |     """Get class names.
30 | 
31 |     # Argument:
32 |         file: path of the class names file.
33 | 
34 |     # Returns
35 |         class_names: List, class names.
36 | 
37 |     """
38 |     with open(file) as f:
39 |         class_names = f.readlines()
40 |     class_names = [c.strip() for c in class_names]
41 | 
42 |     return class_names
43 | 
44 | 
45 | def draw(image, boxes, scores, classes, all_classes):
46 |     """Draw the boxes on the image.
47 | 
48 |     # Argument:
49 |         image: original image.
50 |         boxes: ndarray, boxes of objects.
51 |         scores: ndarray, scores of objects.
52 |         classes: ndarray, classes of objects.
53 |         all_classes: all class names.
54 | """ 55 | for box, score, cl in zip(boxes, scores, classes): 56 | x, y, w, h = box 57 | 58 | top = max(0, np.floor(x + 0.5).astype(int)) 59 | left = max(0, np.floor(y + 0.5).astype(int)) 60 | right = min(image.shape[1], np.floor(x + w + 0.5).astype(int)) 61 | bottom = min(image.shape[0], np.floor(y + h + 0.5).astype(int)) 62 | 63 | cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 2) 64 | cv2.putText(image, '{0} {1:.2f}'.format(all_classes[cl], score), 65 | (top, left - 6), 66 | cv2.FONT_HERSHEY_SIMPLEX, 67 | 0.6, (0, 0, 255), 1, 68 | cv2.LINE_AA) 69 | 70 | print('class: {0}, score: {1:.2f}'.format(all_classes[cl], score)) 71 | print('box coordinate x,y,w,h: {0}'.format(box)) 72 | 73 | print() 74 | 75 | 76 | def detect_image(image, yolo, all_classes): 77 | """Use yolo v3 to detect images. 78 | 79 | # Argument: 80 | image: original image. 81 | yolo: YOLO, yolo model. 82 | all_classes: all classes name. 83 | 84 | # Returns: 85 | image: processed image. 86 | """ 87 | pimage = process_image(image) 88 | 89 | start = time.time() 90 | boxes, classes, scores = yolo.predict(pimage, image.shape) 91 | end = time.time() 92 | 93 | print('time: {0:.2f}s'.format(end - start)) 94 | 95 | if boxes is not None: 96 | draw(image, boxes, scores, classes, all_classes) 97 | 98 | return image 99 | 100 | 101 | def detect_video(video, yolo, all_classes): 102 | """Use yolo v3 to detect video. 103 | 104 | # Argument: 105 | video: video file. 106 | yolo: YOLO, yolo model. 107 | all_classes: all classes name. 108 | """ 109 | video_path = os.path.join("videos", "test", video) 110 | camera = cv2.VideoCapture(video_path) 111 | cv2.namedWindow("detection", cv2.WINDOW_AUTOSIZE) 112 | 113 | # Prepare for saving the detected video 114 | sz = (int(camera.get(cv2.CAP_PROP_FRAME_WIDTH)), 115 | int(camera.get(cv2.CAP_PROP_FRAME_HEIGHT))) 116 | fourcc = cv2.VideoWriter_fourcc(*'mpeg') 117 | 118 | vout = cv2.VideoWriter() 119 | vout.open(os.path.join("videos", "res", video), fourcc, 20, sz, True) 120 | 121 | while True: 122 | res, frame = camera.read() 123 | 124 | if not res: 125 | break 126 | 127 | image = detect_image(frame, yolo, all_classes) 128 | cv2.imshow("detection", image) 129 | 130 | # Save the video frame by frame 131 | vout.write(image) 132 | 133 | if cv2.waitKey(110) & 0xff == 27: 134 | break 135 | 136 | vout.release() 137 | camera.release() 138 | 139 | 140 | if __name__ == '__main__': 141 | yolo = YOLO(0.6, 0.5) 142 | file = 'data/coco_classes.txt' 143 | all_classes = get_classes(file) 144 | 145 | # detect images in test floder. 
146 | for (root, dirs, files) in os.walk('images/test'): 147 | if files: 148 | for f in files: 149 | print(f) 150 | path = os.path.join(root, f) 151 | image = cv2.imread(path) 152 | image = detect_image(image, yolo, all_classes) 153 | cv2.imwrite('images/res/' + f, image) 154 | 155 | # detect videos one at a time in videos/test folder 156 | video = 'library1.mp4' 157 | detect_video(video, yolo, all_classes) 158 | -------------------------------------------------------------------------------- /images/res/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/dog.jpg -------------------------------------------------------------------------------- /images/res/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/eagle.jpg -------------------------------------------------------------------------------- /images/res/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/giraffe.jpg -------------------------------------------------------------------------------- /images/res/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/horses.jpg -------------------------------------------------------------------------------- /images/res/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/person.jpg -------------------------------------------------------------------------------- /images/res/takagaki.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/res/takagaki.jpg -------------------------------------------------------------------------------- /images/test/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/dog.jpg -------------------------------------------------------------------------------- /images/test/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/eagle.jpg -------------------------------------------------------------------------------- /images/test/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/giraffe.jpg -------------------------------------------------------------------------------- /images/test/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/horses.jpg -------------------------------------------------------------------------------- /images/test/person.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/person.jpg -------------------------------------------------------------------------------- /images/test/takagaki.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/test/takagaki.jpg -------------------------------------------------------------------------------- /images/yolo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/images/yolo.png -------------------------------------------------------------------------------- /model/darknet53.py: -------------------------------------------------------------------------------- 1 | """Darknet-53 for yolo v3. 2 | """ 3 | from keras.models import Model 4 | from keras.layers import Input, Conv2D, GlobalAveragePooling2D, Dense 5 | from keras.layers import add, Activation, BatchNormalization 6 | from keras.layers.advanced_activations import LeakyReLU 7 | from keras.regularizers import l2 8 | 9 | 10 | def conv2d_unit(x, filters, kernels, strides=1): 11 | """Convolution Unit 12 | This function defines a 2D convolution operation with BN and LeakyReLU. 13 | 14 | # Arguments 15 | x: Tensor, input tensor of conv layer. 16 | filters: Integer, the dimensionality of the output space. 17 | kernels: An integer or tuple/list of 2 integers, specifying the 18 | width and height of the 2D convolution window. 19 | strides: An integer or tuple/list of 2 integers, 20 | specifying the strides of the convolution along the width and 21 | height. Can be a single integer to specify the same value for 22 | all spatial dimensions. 23 | 24 | # Returns 25 | Output tensor. 26 | """ 27 | x = Conv2D(filters, kernels, 28 | padding='same', 29 | strides=strides, 30 | activation='linear', 31 | kernel_regularizer=l2(5e-4))(x) 32 | x = BatchNormalization()(x) 33 | x = LeakyReLU(alpha=0.1)(x) 34 | 35 | return x 36 | 37 | 38 | def residual_block(inputs, filters): 39 | """Residual Block 40 | This function defines a 2D convolution operation with BN and LeakyReLU. 41 | 42 | # Arguments 43 | x: Tensor, input tensor of residual block. 44 | kernels: An integer or tuple/list of 2 integers, specifying the 45 | width and height of the 2D convolution window. 46 | 47 | # Returns 48 | Output tensor. 49 | """ 50 | x = conv2d_unit(inputs, filters, (1, 1)) 51 | x = conv2d_unit(x, 2 * filters, (3, 3)) 52 | x = add([inputs, x]) 53 | x = Activation('linear')(x) 54 | 55 | return x 56 | 57 | 58 | def stack_residual_block(inputs, filters, n): 59 | """Stacked residual Block 60 | """ 61 | x = residual_block(inputs, filters) 62 | 63 | for i in range(n - 1): 64 | x = residual_block(x, filters) 65 | 66 | return x 67 | 68 | 69 | def darknet_base(inputs): 70 | """Darknet-53 base model. 
71 | """ 72 | 73 | x = conv2d_unit(inputs, 32, (3, 3)) 74 | 75 | x = conv2d_unit(x, 64, (3, 3), strides=2) 76 | x = stack_residual_block(x, 32, n=1) 77 | 78 | x = conv2d_unit(x, 128, (3, 3), strides=2) 79 | x = stack_residual_block(x, 64, n=2) 80 | 81 | x = conv2d_unit(x, 256, (3, 3), strides=2) 82 | x = stack_residual_block(x, 128, n=8) 83 | 84 | x = conv2d_unit(x, 512, (3, 3), strides=2) 85 | x = stack_residual_block(x, 256, n=8) 86 | 87 | x = conv2d_unit(x, 1024, (3, 3), strides=2) 88 | x = stack_residual_block(x, 512, n=4) 89 | 90 | return x 91 | 92 | 93 | def darknet(): 94 | """Darknet-53 classifier. 95 | """ 96 | inputs = Input(shape=(416, 416, 3)) 97 | x = darknet_base(inputs) 98 | 99 | x = GlobalAveragePooling2D()(x) 100 | x = Dense(1000, activation='softmax')(x) 101 | 102 | model = Model(inputs, x) 103 | 104 | return model 105 | 106 | 107 | if __name__ == '__main__': 108 | model = darknet() 109 | print(model.summary()) 110 | -------------------------------------------------------------------------------- /model/yolo_model.py: -------------------------------------------------------------------------------- 1 | """YOLO v3 output 2 | """ 3 | import numpy as np 4 | import keras.backend as K 5 | from keras.models import load_model 6 | 7 | 8 | class YOLO: 9 | def __init__(self, obj_threshold, nms_threshold): 10 | """Init. 11 | 12 | # Arguments 13 | obj_threshold: Integer, threshold for object. 14 | nms_threshold: Integer, threshold for box. 15 | """ 16 | self._t1 = obj_threshold 17 | self._t2 = nms_threshold 18 | self._yolo = load_model('data/yolo.h5') 19 | 20 | def _sigmoid(self, x): 21 | """sigmoid. 22 | 23 | # Arguments 24 | x: Tensor. 25 | 26 | # Returns 27 | numpy ndarray. 28 | """ 29 | return 1 / (1 + np.exp(-x)) 30 | 31 | def _process_feats(self, out, anchors, mask): 32 | """process output features. 33 | 34 | # Arguments 35 | out: Tensor (N, N, 3, 4 + 1 +80), output feature map of yolo. 36 | anchors: List, anchors for box. 37 | mask: List, mask for anchors. 38 | 39 | # Returns 40 | boxes: ndarray (N, N, 3, 4), x,y,w,h for per box. 41 | box_confidence: ndarray (N, N, 3, 1), confidence for per box. 42 | box_class_probs: ndarray (N, N, 3, 80), class probs for per box. 43 | """ 44 | grid_h, grid_w, num_boxes = map(int, out.shape[1: 4]) 45 | 46 | anchors = [anchors[i] for i in mask] 47 | anchors_tensor = np.array(anchors).reshape(1, 1, len(anchors), 2) 48 | 49 | # Reshape to batch, height, width, num_anchors, box_params. 50 | out = out[0] 51 | box_xy = self._sigmoid(out[..., :2]) 52 | box_wh = np.exp(out[..., 2:4]) 53 | box_wh = box_wh * anchors_tensor 54 | 55 | box_confidence = self._sigmoid(out[..., 4]) 56 | box_confidence = np.expand_dims(box_confidence, axis=-1) 57 | box_class_probs = self._sigmoid(out[..., 5:]) 58 | 59 | col = np.tile(np.arange(0, grid_w), grid_w).reshape(-1, grid_w) 60 | row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_h) 61 | 62 | col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) 63 | row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) 64 | grid = np.concatenate((col, row), axis=-1) 65 | 66 | box_xy += grid 67 | box_xy /= (grid_w, grid_h) 68 | box_wh /= (416, 416) 69 | box_xy -= (box_wh / 2.) 70 | boxes = np.concatenate((box_xy, box_wh), axis=-1) 71 | 72 | return boxes, box_confidence, box_class_probs 73 | 74 | def _filter_boxes(self, boxes, box_confidences, box_class_probs): 75 | """Filter boxes with object threshold. 76 | 77 | # Arguments 78 | boxes: ndarray, boxes of objects. 79 | box_confidences: ndarray, confidences of objects. 
80 | box_class_probs: ndarray, class_probs of objects. 81 | 82 | # Returns 83 | boxes: ndarray, filtered boxes. 84 | classes: ndarray, classes for boxes. 85 | scores: ndarray, scores for boxes. 86 | """ 87 | box_scores = box_confidences * box_class_probs 88 | box_classes = np.argmax(box_scores, axis=-1) 89 | box_class_scores = np.max(box_scores, axis=-1) 90 | pos = np.where(box_class_scores >= self._t1) 91 | 92 | boxes = boxes[pos] 93 | classes = box_classes[pos] 94 | scores = box_class_scores[pos] 95 | 96 | return boxes, classes, scores 97 | 98 | def _nms_boxes(self, boxes, scores): 99 | """Suppress non-maximal boxes. 100 | 101 | # Arguments 102 | boxes: ndarray, boxes of objects. 103 | scores: ndarray, scores of objects. 104 | 105 | # Returns 106 | keep: ndarray, index of effective boxes. 107 | """ 108 | x = boxes[:, 0] 109 | y = boxes[:, 1] 110 | w = boxes[:, 2] 111 | h = boxes[:, 3] 112 | 113 | areas = w * h 114 | order = scores.argsort()[::-1] 115 | 116 | keep = [] 117 | while order.size > 0: 118 | i = order[0] 119 | keep.append(i) 120 | 121 | xx1 = np.maximum(x[i], x[order[1:]]) 122 | yy1 = np.maximum(y[i], y[order[1:]]) 123 | xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]]) 124 | yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]]) 125 | 126 | w1 = np.maximum(0.0, xx2 - xx1 + 1) 127 | h1 = np.maximum(0.0, yy2 - yy1 + 1) 128 | inter = w1 * h1 129 | 130 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 131 | inds = np.where(ovr <= self._t2)[0] 132 | order = order[inds + 1] 133 | 134 | keep = np.array(keep) 135 | 136 | return keep 137 | 138 | def _yolo_out(self, outs, shape): 139 | """Process output of yolo base net. 140 | 141 | # Argument: 142 | outs: output of yolo base net. 143 | shape: shape of original image. 144 | 145 | # Returns: 146 | boxes: ndarray, boxes of objects. 147 | classes: ndarray, classes of objects. 148 | scores: ndarray, scores of objects. 149 | """ 150 | masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] 151 | anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], 152 | [59, 119], [116, 90], [156, 198], [373, 326]] 153 | 154 | boxes, classes, scores = [], [], [] 155 | 156 | for out, mask in zip(outs, masks): 157 | b, c, s = self._process_feats(out, anchors, mask) 158 | b, c, s = self._filter_boxes(b, c, s) 159 | boxes.append(b) 160 | classes.append(c) 161 | scores.append(s) 162 | 163 | boxes = np.concatenate(boxes) 164 | classes = np.concatenate(classes) 165 | scores = np.concatenate(scores) 166 | 167 | # Scale boxes back to original image shape. 168 | width, height = shape[1], shape[0] 169 | image_dims = [width, height, width, height] 170 | boxes = boxes * image_dims 171 | 172 | nboxes, nclasses, nscores = [], [], [] 173 | for c in set(classes): 174 | inds = np.where(classes == c) 175 | b = boxes[inds] 176 | c = classes[inds] 177 | s = scores[inds] 178 | 179 | keep = self._nms_boxes(b, s) 180 | 181 | nboxes.append(b[keep]) 182 | nclasses.append(c[keep]) 183 | nscores.append(s[keep]) 184 | 185 | if not nclasses and not nscores: 186 | return None, None, None 187 | 188 | boxes = np.concatenate(nboxes) 189 | classes = np.concatenate(nclasses) 190 | scores = np.concatenate(nscores) 191 | 192 | return boxes, classes, scores 193 | 194 | def predict(self, image, shape): 195 | """Detect the objects with yolo. 196 | 197 | # Arguments 198 | image: ndarray, processed input image. 199 | shape: shape of original image. 200 | 201 | # Returns 202 | boxes: ndarray, boxes of objects. 203 | classes: ndarray, classes of objects. 
204 | scores: ndarray, scores of objects. 205 | """ 206 | 207 | outs = self._yolo.predict(image) 208 | boxes, classes, scores = self._yolo_out(outs, shape) 209 | 210 | return boxes, classes, scores 211 | -------------------------------------------------------------------------------- /videos/res/library1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/videos/res/library1.mp4 -------------------------------------------------------------------------------- /videos/test/library1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaochus/YOLOv3/db63e48f501a9019eb420f77dfc7fa6f44329270/videos/test/library1.mp4 -------------------------------------------------------------------------------- /yad2k.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Reads Darknet53 config and weights and creates Keras model with TF backend. 4 | 5 | Currently only supports layers in Darknet53 config. 6 | """ 7 | 8 | import argparse 9 | import configparser 10 | import io 11 | import os 12 | from collections import defaultdict 13 | 14 | import numpy as np 15 | from keras import backend as K 16 | from keras.layers import (Conv2D, GlobalAveragePooling2D, Input, Reshape, 17 | ZeroPadding2D, UpSampling2D, Activation, Lambda, MaxPooling2D) 18 | from keras.layers.advanced_activations import LeakyReLU 19 | from keras.layers.merge import concatenate, add 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.models import Model 22 | from keras.regularizers import l2 23 | from keras.utils.vis_utils import plot_model as plot 24 | 25 | 26 | parser = argparse.ArgumentParser( 27 | description='Yet Another Darknet To Keras Converter.') 28 | parser.add_argument('config_path', help='Path to Darknet cfg file.') 29 | parser.add_argument('weights_path', help='Path to Darknet weights file.') 30 | parser.add_argument('output_path', help='Path to output Keras model file.') 31 | parser.add_argument( 32 | '-p', 33 | '--plot_model', 34 | help='Plot generated Keras model and save as image.', 35 | action='store_true') 36 | parser.add_argument( 37 | '-flcl', 38 | '--fully_convolutional', 39 | help='Model is fully convolutional so set input shape to (None, None, 3). ' 40 | 'WARNING: This experimental option does not work properly for YOLO_v2.', 41 | action='store_true') 42 | 43 | 44 | def unique_config_sections(config_file): 45 | """Convert all config sections to have unique names. 46 | 47 | Adds unique suffixes to config sections for compability with configparser. 
48 | """ 49 | section_counters = defaultdict(int) 50 | output_stream = io.StringIO() 51 | with open(config_file) as fin: 52 | for line in fin: 53 | if line.startswith('['): 54 | section = line.strip().strip('[]') 55 | _section = section + '_' + str(section_counters[section]) 56 | section_counters[section] += 1 57 | line = line.replace(section, _section) 58 | output_stream.write(line) 59 | output_stream.seek(0) 60 | return output_stream 61 | 62 | 63 | def _main(args): 64 | config_path = os.path.expanduser(args.config_path) 65 | weights_path = os.path.expanduser(args.weights_path) 66 | assert config_path.endswith('.cfg'), '{} is not a .cfg file'.format( 67 | config_path) 68 | assert weights_path.endswith( 69 | '.weights'), '{} is not a .weights file'.format(weights_path) 70 | 71 | output_path = os.path.expanduser(args.output_path) 72 | assert output_path.endswith( 73 | '.h5'), 'output path {} is not a .h5 file'.format(output_path) 74 | output_root = os.path.splitext(output_path)[0] 75 | 76 | # Load weights and config. 77 | print('Loading weights.') 78 | weights_file = open(weights_path, 'rb') 79 | weights_header = np.ndarray( 80 | shape=(5, ), dtype='int32', buffer=weights_file.read(20)) 81 | print('Weights Header: ', weights_header) 82 | # TODO: Check transpose flag when implementing fully connected layers. 83 | # transpose = (weight_header[0] > 1000) or (weight_header[1] > 1000) 84 | 85 | print('Parsing Darknet config.') 86 | unique_config_file = unique_config_sections(config_path) 87 | cfg_parser = configparser.ConfigParser() 88 | cfg_parser.read_file(unique_config_file) 89 | 90 | print('Creating Keras model.') 91 | if args.fully_convolutional: 92 | image_height, image_width = None, None 93 | else: 94 | image_height = int(cfg_parser['net_0']['height']) 95 | image_width = int(cfg_parser['net_0']['width']) 96 | 97 | prev_layer = Input(shape=(image_height, image_width, 3)) 98 | all_layers = [prev_layer] 99 | outputs = [] 100 | 101 | weight_decay = float(cfg_parser['net_0']['decay'] 102 | ) if 'net_0' in cfg_parser.sections() else 5e-4 103 | count = 0 104 | 105 | for section in cfg_parser.sections(): 106 | print('Parsing section {}'.format(section)) 107 | if section.startswith('convolutional'): 108 | filters = int(cfg_parser[section]['filters']) 109 | size = int(cfg_parser[section]['size']) 110 | stride = int(cfg_parser[section]['stride']) 111 | pad = int(cfg_parser[section]['pad']) 112 | activation = cfg_parser[section]['activation'] 113 | batch_normalize = 'batch_normalize' in cfg_parser[section] 114 | 115 | # Setting weights. 116 | # Darknet serializes convolutional weights as: 117 | # [bias/beta, [gamma, mean, variance], conv_weights] 118 | prev_layer_shape = K.int_shape(prev_layer) 119 | 120 | # TODO: This assumes channel last dim_ordering. 121 | weights_shape = (size, size, prev_layer_shape[-1], filters) 122 | darknet_w_shape = (filters, weights_shape[2], size, size) 123 | weights_size = np.product(weights_shape) 124 | 125 | print('conv2d', 'bn' 126 | if batch_normalize else ' ', activation, weights_shape) 127 | 128 | conv_bias = np.ndarray( 129 | shape=(filters, ), 130 | dtype='float32', 131 | buffer=weights_file.read(filters * 4)) 132 | count += filters 133 | 134 | if batch_normalize: 135 | bn_weights = np.ndarray( 136 | shape=(3, filters), 137 | dtype='float32', 138 | buffer=weights_file.read(filters * 12)) 139 | count += 3 * filters 140 | 141 | # TODO: Keras BatchNormalization mistakenly refers to var 142 | # as std. 
143 | bn_weight_list = [ 144 | bn_weights[0], # scale gamma 145 | conv_bias, # shift beta 146 | bn_weights[1], # running mean 147 | bn_weights[2] # running var 148 | ] 149 | 150 | conv_weights = np.ndarray( 151 | shape=darknet_w_shape, 152 | dtype='float32', 153 | buffer=weights_file.read(weights_size * 4)) 154 | count += weights_size 155 | 156 | # DarkNet conv_weights are serialized Caffe-style: 157 | # (out_dim, in_dim, height, width) 158 | # We would like to set these to Tensorflow order: 159 | # (height, width, in_dim, out_dim) 160 | # TODO: Add check for Theano dim ordering. 161 | conv_weights = np.transpose(conv_weights, [2, 3, 1, 0]) 162 | conv_weights = [conv_weights] if batch_normalize else [ 163 | conv_weights, conv_bias 164 | ] 165 | 166 | # Handle activation. 167 | act_fn = None 168 | if activation == 'leaky': 169 | pass # Add advanced activation later. 170 | elif activation != 'linear': 171 | raise ValueError( 172 | 'Unknown activation function `{}` in section {}'.format( 173 | activation, section)) 174 | 175 | padding = 'same' if pad == 1 and stride == 1 else 'valid' 176 | # Adjust padding model for darknet. 177 | if stride == 2: 178 | prev_layer = ZeroPadding2D(((1, 0), (1, 0)))(prev_layer) 179 | 180 | # Create Conv2D layer 181 | conv_layer = (Conv2D( 182 | filters, (size, size), 183 | strides=(stride, stride), 184 | kernel_regularizer=l2(weight_decay), 185 | use_bias=not batch_normalize, 186 | weights=conv_weights, 187 | activation=act_fn, 188 | padding=padding))(prev_layer) 189 | 190 | if batch_normalize: 191 | conv_layer = (BatchNormalization( 192 | weights=bn_weight_list))(conv_layer) 193 | 194 | prev_layer = conv_layer 195 | 196 | if activation == 'linear': 197 | all_layers.append(prev_layer) 198 | elif activation == 'leaky': 199 | act_layer = LeakyReLU(alpha=0.1)(prev_layer) 200 | prev_layer = act_layer 201 | all_layers.append(prev_layer) 202 | 203 | elif section.startswith('maxpool'): 204 | size = int(cfg_parser[section]['size']) 205 | stride = int(cfg_parser[section]['stride']) 206 | all_layers.append( 207 | MaxPooling2D( 208 | padding='same', 209 | pool_size=(size, size), 210 | strides=(stride, stride))(prev_layer)) 211 | prev_layer = all_layers[-1] 212 | 213 | elif section.startswith('avgpool'): 214 | if cfg_parser.items(section) != []: 215 | raise ValueError('{} with params unsupported.'.format(section)) 216 | all_layers.append(GlobalAveragePooling2D()(prev_layer)) 217 | prev_layer = all_layers[-1] 218 | 219 | elif section.startswith('route'): 220 | ids = [int(i) for i in cfg_parser[section]['layers'].split(',')] 221 | if len(ids) == 2: 222 | for i, item in enumerate(ids): 223 | if item != -1: 224 | ids[i] = item + 1 225 | 226 | layers = [all_layers[i] for i in ids] 227 | 228 | if len(layers) > 1: 229 | print('Concatenating route layers:', layers) 230 | concatenate_layer = concatenate(layers) 231 | all_layers.append(concatenate_layer) 232 | prev_layer = concatenate_layer 233 | else: 234 | skip_layer = layers[0] # only one layer to route 235 | all_layers.append(skip_layer) 236 | prev_layer = skip_layer 237 | 238 | elif section.startswith('shortcut'): 239 | ids = [int(i) for i in cfg_parser[section]['from'].split(',')][0] 240 | activation = cfg_parser[section]['activation'] 241 | shortcut = add([all_layers[ids], prev_layer]) 242 | if activation == 'linear': 243 | shortcut = Activation('linear')(shortcut) 244 | all_layers.append(shortcut) 245 | prev_layer = all_layers[-1] 246 | 247 | elif section.startswith('upsample'): 248 | stride = 
int(cfg_parser[section]['stride']) 249 | all_layers.append( 250 | UpSampling2D( 251 | size=(stride, stride))(prev_layer)) 252 | prev_layer = all_layers[-1] 253 | 254 | elif section.startswith('yolo'): 255 | classes = int(cfg_parser[section]['classes']) 256 | # num = int(cfg_parser[section]['num']) 257 | # mask = int(cfg_parser[section]['mask']) 258 | n1, n2 = int(prev_layer.shape[1]), int(prev_layer.shape[2]) 259 | n3 = 3 260 | n4 = (4 + 1 + classes) 261 | yolo = Reshape((n1, n2, n3, n4))(prev_layer) 262 | all_layers.append(yolo) 263 | prev_layer = all_layers[-1] 264 | outputs.append(len(all_layers) - 1) 265 | 266 | elif (section.startswith('net')): 267 | pass # Configs not currently handled during model definition. 268 | else: 269 | raise ValueError( 270 | 'Unsupported section header type: {}'.format(section)) 271 | 272 | # Create and save model. 273 | model = Model(inputs=all_layers[0], 274 | outputs=[all_layers[i] for i in outputs]) 275 | print(model.summary()) 276 | model.save('{}'.format(output_path)) 277 | print('Saved Keras model to {}'.format(output_path)) 278 | # Check to see if all weights have been read. 279 | remaining_weights = len(weights_file.read()) / 4 280 | weights_file.close() 281 | print('Read {} of {} from Darknet weights.'.format(count, count + 282 | remaining_weights)) 283 | if remaining_weights > 0: 284 | print('Warning: {} unused weights'.format(remaining_weights)) 285 | 286 | plot(model, to_file='{}.png'.format(output_root), show_shapes=True) 287 | print('Saved model plot to {}.png'.format(output_root)) 288 | 289 | 290 | if __name__ == '__main__': 291 | _main(parser.parse_args()) 292 | --------------------------------------------------------------------------------
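A quick sanity check for the conversion step described in the README: the Keras model that `yad2k.py` writes should expose three outputs, one per `[yolo]` head, each reshaped to `(grid, grid, 3, 85)`, where 85 = 4 box coordinates + 1 objectness score + 80 class scores, on 13×13, 26×26 and 52×52 grids for a 416×416 input. The snippet below is a minimal sketch, not part of the repository; it assumes the README's conversion command has already produced `data/yolo.h5` and that it is run from the project root.

```python
"""Hedged sketch: verify the output shapes of the converted Keras model."""
import numpy as np
from keras.models import load_model

# Assumes `python yad2k.py cfg/yolo.cfg yolov3.weights data/yolo.h5` was run first.
model = load_model('data/yolo.h5')

# One dummy 416x416 RGB image, scaled to [0, 1] the same way demo.process_image does.
dummy = np.random.rand(1, 416, 416, 3).astype('float32')
outs = model.predict(dummy)

# Expected: (1, 13, 13, 3, 85), (1, 26, 26, 3, 85), (1, 52, 52, 3, 85),
# matching the three [yolo] heads (masks 6,7,8 / 3,4,5 / 0,1,2 in cfg order).
for out in outs:
    print(out.shape)
```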
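And a minimal single-image usage sketch assembled from the repository's own pieces (`YOLO` from `model/yolo_model.py` plus the helpers in `demo.py`), for readers who want detection without the folder walk or video loop in `demo.py`. It assumes `data/yolo.h5` exists and the script runs from the project root; the input path is one of the bundled test images, and the output filename is arbitrary.

```python
"""Hedged sketch: detect objects in a single image with the repo's own helpers."""
import cv2
from model.yolo_model import YOLO
from demo import process_image, get_classes, draw

# Same thresholds as demo.py: 0.6 object confidence, 0.5 NMS IoU.
yolo = YOLO(0.6, 0.5)
all_classes = get_classes('data/coco_classes.txt')

image = cv2.imread('images/test/dog.jpg')   # any BGR image works
pimage = process_image(image)               # (1, 416, 416, 3), values in [0, 1]
boxes, classes, scores = yolo.predict(pimage, image.shape)

if boxes is not None:
    draw(image, boxes, scores, classes, all_classes)
cv2.imwrite('images/res/dog_detect.jpg', image)   # arbitrary output path
```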