├── README.md
├── bus.jpg
├── coco.names
├── convert_darknet.py
├── csdarknet53s-panet-spp.cfg
└── main_yolobile.py

/README.md:
--------------------------------------------------------------------------------
 1 | I just came across a new YOLO object-detection network, YOLObile, on WeChat Moments.
 2 | 
 3 | The paper can be downloaded from https://arxiv.org/abs/2009.05697
 4 | The code is on GitHub at https://github.com/CoCoPIE-Pruning/CoCoPIE-ModelZoo/tree/master/YOLObile
 5 | I downloaded the code and the .pt model file and ran them locally: the project is implemented on the PyTorch framework, and the downloaded .pt file is 245 MB. With a model that large, I have some doubts about whether YOLObile can really do "real-time detection on mobile devices" as the paper claims.
 6 | 
 7 | Next I wanted to write a program that runs YOLObile object detection with OpenCV's dnn module, but the model file is in .pt format, which the dnn module does not support, so I decided to convert the provided .pt file into a .weights file in the Darknet format.
 8 | The conversion is done by convert_darknet.py in this repository: copy it into the YOLObile folder and run it; the script reads the .cfg file and the .pt file and then writes out a .weights file.
 9 | With the .cfg file and the .weights file, we can run object detection through OpenCV's dnn module, with no dependency on the PyTorch framework.
10 | The .weights file is 245 MB and too large to upload to GitHub directly; it can be downloaded from Baidu Netdisk:
11 | Link: https://pan.baidu.com/s/1FsTGBoGuJNYSdGvFSw9Trg  Extraction code: mc9b
12 | 
13 | After downloading the .weights file, place it in this repository's folder and run main_yolobile.py.
14 | 
15 | I also thought about running convert_darknet.py to convert yolov5s.pt from https://github.com/ultralytics/yolov5,
16 | but its .pt file contains both the network structure and the model parameters, and loading it with torch.load
17 | raises the error
18 | "RuntimeError: yolov5s.pt is a zip archive (did you mean to use torch.jit.load()?)".
19 | Moreover, its network structure is described by a .yaml file rather than a .cfg file, so there is no way to run YOLOv5 detection through OpenCV's dnn module this way.
20 | For that reason I have not published a YOLOv5 detection program based on OpenCV's dnn module on GitHub.
21 | 
--------------------------------------------------------------------------------
/bus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/YOLObile-opencv-dnn/ea962e52ee1b78ea7ff37c2705f35abab00038b6/bus.jpg
--------------------------------------------------------------------------------
/coco.names:
--------------------------------------------------------------------------------
 1 | person
 2 | bicycle
 3 | car
 4 | motorcycle
 5 | airplane
 6 | bus
 7 | train
 8 | truck
 9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | couch
59 | potted plant
60 | bed
61 | dining table
62 | toilet
63 | tv
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 | 
--------------------------------------------------------------------------------
/convert_darknet.py:
--------------------------------------------------------------------------------
 1 | from models import *  # models.py from the YOLObile repository provides Darknet, save_weights and torch
 2 | 
 3 | if __name__ == '__main__':
 4 |     device = 'cuda' if torch.cuda.is_available() else 'cpu'
 5 |     cfgpath = 'cfg/csdarknet53s-panet-spp.cfg'
 6 |     weights = 'weights/best8x-514.pt'
 7 |     target_weight = 'weights/yolobile.weights'
 8 |     model = Darknet(cfgpath)
 9 |     model.load_state_dict(torch.load(weights, map_location=device)['model'], strict=False)  # the checkpoint stores its weights under the 'model' key
10 |     model.to(device)
11 |     model.eval()
12 | 
13 |     save_weights(model, path=target_weight, cutoff=-1)  # cutoff=-1 exports every layer to the Darknet .weights file
--------------------------------------------------------------------------------
/csdarknet53s-panet-spp.cfg:
-------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500500 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | #23:104x104 54:52x52 85:26x26 104:13x13 for 416 26 | 27 | 28 | 29 | [convolutional] 30 | batch_normalize=1 31 | filters=32 32 | size=3 33 | stride=1 34 | pad=1 35 | activation=leaky 36 | 37 | # Downsample 38 | 39 | [convolutional] 40 | batch_normalize=1 41 | filters=64 42 | size=3 43 | stride=2 44 | pad=1 45 | activation=leaky 46 | 47 | #[convolutional] 48 | #batch_normalize=1 49 | #filters=64 50 | #size=1 51 | #stride=1 52 | #pad=1 53 | #activation=leaky 54 | 55 | #[route] 56 | #layers = -2 57 | 58 | #[convolutional] 59 | #batch_normalize=1 60 | #filters=64 61 | #size=1 62 | #stride=1 63 | #pad=1 64 | #activation=leaky 65 | 66 | [convolutional] 67 | batch_normalize=1 68 | filters=32 69 | size=1 70 | stride=1 71 | pad=1 72 | activation=leaky 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=64 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [shortcut] 83 | from=-3 84 | activation=linear 85 | 86 | #[convolutional] 87 | #batch_normalize=1 88 | #filters=64 89 | #size=1 90 | #stride=1 91 | #pad=1 92 | #activation=leaky 93 | 94 | #[route] 95 | #layers = -1,-7 96 | 97 | #[convolutional] 98 | #batch_normalize=1 99 | #filters=64 100 | #size=1 101 | #stride=1 102 | #pad=1 103 | #activation=leaky 104 | 105 | # Downsample 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=128 110 | size=3 111 | stride=2 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=64 118 | size=1 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [route] 124 | layers = -2 125 | 126 | [convolutional] 127 | batch_normalize=1 128 | filters=64 129 | size=1 130 | stride=1 131 | pad=1 132 | activation=leaky 133 | 134 | [convolutional] 135 | batch_normalize=1 136 | filters=64 137 | size=1 138 | stride=1 139 | pad=1 140 | activation=leaky 141 | 142 | [convolutional] 143 | batch_normalize=1 144 | filters=64 145 | size=3 146 | stride=1 147 | pad=1 148 | activation=leaky 149 | 150 | [shortcut] 151 | from=-3 152 | activation=linear 153 | 154 | [convolutional] 155 | batch_normalize=1 156 | filters=64 157 | size=1 158 | stride=1 159 | pad=1 160 | activation=leaky 161 | 162 | [convolutional] 163 | batch_normalize=1 164 | filters=64 165 | size=3 166 | stride=1 167 | pad=1 168 | activation=leaky 169 | 170 | [shortcut] 171 | from=-3 172 | activation=linear 173 | 174 | [convolutional] 175 | batch_normalize=1 176 | filters=64 177 | size=1 178 | stride=1 179 | pad=1 180 | activation=leaky 181 | 182 | [route] 183 | layers = -1,-10 184 | 185 | [convolutional] 186 | batch_normalize=1 187 | filters=128 188 | size=1 189 | stride=1 190 | pad=1 191 | activation=leaky 192 | 193 | # Downsample 194 | 195 | [convolutional] 196 | batch_normalize=1 197 | filters=256 198 | size=3 199 | stride=2 200 | pad=1 201 | activation=leaky 202 | 203 | [convolutional] 204 | batch_normalize=1 205 | filters=128 206 | size=1 207 | stride=1 208 | pad=1 209 | activation=leaky 210 | 211 | [route] 212 | layers = -2 213 | 214 | [convolutional] 215 | batch_normalize=1 216 | filters=128 217 | size=1 218 | 
stride=1 219 | pad=1 220 | activation=leaky 221 | 222 | [convolutional] 223 | batch_normalize=1 224 | filters=128 225 | size=1 226 | stride=1 227 | pad=1 228 | activation=leaky 229 | 230 | [convolutional] 231 | batch_normalize=1 232 | filters=128 233 | size=3 234 | stride=1 235 | pad=1 236 | activation=leaky 237 | 238 | [shortcut] 239 | from=-3 240 | activation=linear 241 | 242 | [convolutional] 243 | batch_normalize=1 244 | filters=128 245 | size=1 246 | stride=1 247 | pad=1 248 | activation=leaky 249 | 250 | [convolutional] 251 | batch_normalize=1 252 | filters=128 253 | size=3 254 | stride=1 255 | pad=1 256 | activation=leaky 257 | 258 | [shortcut] 259 | from=-3 260 | activation=linear 261 | 262 | [convolutional] 263 | batch_normalize=1 264 | filters=128 265 | size=1 266 | stride=1 267 | pad=1 268 | activation=leaky 269 | 270 | [convolutional] 271 | batch_normalize=1 272 | filters=128 273 | size=3 274 | stride=1 275 | pad=1 276 | activation=leaky 277 | 278 | [shortcut] 279 | from=-3 280 | activation=linear 281 | 282 | [convolutional] 283 | batch_normalize=1 284 | filters=128 285 | size=1 286 | stride=1 287 | pad=1 288 | activation=leaky 289 | 290 | [convolutional] 291 | batch_normalize=1 292 | filters=128 293 | size=3 294 | stride=1 295 | pad=1 296 | activation=leaky 297 | 298 | [shortcut] 299 | from=-3 300 | activation=linear 301 | 302 | 303 | [convolutional] 304 | batch_normalize=1 305 | filters=128 306 | size=1 307 | stride=1 308 | pad=1 309 | activation=leaky 310 | 311 | [convolutional] 312 | batch_normalize=1 313 | filters=128 314 | size=3 315 | stride=1 316 | pad=1 317 | activation=leaky 318 | 319 | [shortcut] 320 | from=-3 321 | activation=linear 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=128 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=128 334 | size=3 335 | stride=1 336 | pad=1 337 | activation=leaky 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | [convolutional] 344 | batch_normalize=1 345 | filters=128 346 | size=1 347 | stride=1 348 | pad=1 349 | activation=leaky 350 | 351 | [convolutional] 352 | batch_normalize=1 353 | filters=128 354 | size=3 355 | stride=1 356 | pad=1 357 | activation=leaky 358 | 359 | [shortcut] 360 | from=-3 361 | activation=linear 362 | 363 | [convolutional] 364 | batch_normalize=1 365 | filters=128 366 | size=1 367 | stride=1 368 | pad=1 369 | activation=leaky 370 | 371 | [convolutional] 372 | batch_normalize=1 373 | filters=128 374 | size=3 375 | stride=1 376 | pad=1 377 | activation=leaky 378 | 379 | [shortcut] 380 | from=-3 381 | activation=linear 382 | 383 | [convolutional] 384 | batch_normalize=1 385 | filters=128 386 | size=1 387 | stride=1 388 | pad=1 389 | activation=leaky 390 | 391 | [route] 392 | layers = -1,-28 393 | 394 | [convolutional] 395 | batch_normalize=1 396 | filters=256 397 | size=1 398 | stride=1 399 | pad=1 400 | activation=leaky 401 | 402 | # Downsample 403 | 404 | [convolutional] 405 | batch_normalize=1 406 | filters=512 407 | size=3 408 | stride=2 409 | pad=1 410 | activation=leaky 411 | 412 | [convolutional] 413 | batch_normalize=1 414 | filters=256 415 | size=1 416 | stride=1 417 | pad=1 418 | activation=leaky 419 | 420 | [route] 421 | layers = -2 422 | 423 | [convolutional] 424 | batch_normalize=1 425 | filters=256 426 | size=1 427 | stride=1 428 | pad=1 429 | activation=leaky 430 | 431 | [convolutional] 432 | batch_normalize=1 433 | filters=256 434 | size=1 435 | stride=1 436 | pad=1 437 | 
activation=leaky 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=3 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [shortcut] 448 | from=-3 449 | activation=linear 450 | 451 | 452 | [convolutional] 453 | batch_normalize=1 454 | filters=256 455 | size=1 456 | stride=1 457 | pad=1 458 | activation=leaky 459 | 460 | [convolutional] 461 | batch_normalize=1 462 | filters=256 463 | size=3 464 | stride=1 465 | pad=1 466 | activation=leaky 467 | 468 | [shortcut] 469 | from=-3 470 | activation=linear 471 | 472 | 473 | [convolutional] 474 | batch_normalize=1 475 | filters=256 476 | size=1 477 | stride=1 478 | pad=1 479 | activation=leaky 480 | 481 | [convolutional] 482 | batch_normalize=1 483 | filters=256 484 | size=3 485 | stride=1 486 | pad=1 487 | activation=leaky 488 | 489 | [shortcut] 490 | from=-3 491 | activation=linear 492 | 493 | 494 | [convolutional] 495 | batch_normalize=1 496 | filters=256 497 | size=1 498 | stride=1 499 | pad=1 500 | activation=leaky 501 | 502 | [convolutional] 503 | batch_normalize=1 504 | filters=256 505 | size=3 506 | stride=1 507 | pad=1 508 | activation=leaky 509 | 510 | [shortcut] 511 | from=-3 512 | activation=linear 513 | 514 | 515 | [convolutional] 516 | batch_normalize=1 517 | filters=256 518 | size=1 519 | stride=1 520 | pad=1 521 | activation=leaky 522 | 523 | [convolutional] 524 | batch_normalize=1 525 | filters=256 526 | size=3 527 | stride=1 528 | pad=1 529 | activation=leaky 530 | 531 | [shortcut] 532 | from=-3 533 | activation=linear 534 | 535 | 536 | [convolutional] 537 | batch_normalize=1 538 | filters=256 539 | size=1 540 | stride=1 541 | pad=1 542 | activation=leaky 543 | 544 | [convolutional] 545 | batch_normalize=1 546 | filters=256 547 | size=3 548 | stride=1 549 | pad=1 550 | activation=leaky 551 | 552 | [shortcut] 553 | from=-3 554 | activation=linear 555 | 556 | 557 | [convolutional] 558 | batch_normalize=1 559 | filters=256 560 | size=1 561 | stride=1 562 | pad=1 563 | activation=leaky 564 | 565 | [convolutional] 566 | batch_normalize=1 567 | filters=256 568 | size=3 569 | stride=1 570 | pad=1 571 | activation=leaky 572 | 573 | [shortcut] 574 | from=-3 575 | activation=linear 576 | 577 | [convolutional] 578 | batch_normalize=1 579 | filters=256 580 | size=1 581 | stride=1 582 | pad=1 583 | activation=leaky 584 | 585 | [convolutional] 586 | batch_normalize=1 587 | filters=256 588 | size=3 589 | stride=1 590 | pad=1 591 | activation=leaky 592 | 593 | [shortcut] 594 | from=-3 595 | activation=linear 596 | 597 | [convolutional] 598 | batch_normalize=1 599 | filters=256 600 | size=1 601 | stride=1 602 | pad=1 603 | activation=leaky 604 | 605 | [route] 606 | layers = -1,-28 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | filters=512 611 | size=1 612 | stride=1 613 | pad=1 614 | activation=leaky 615 | 616 | # Downsample 617 | 618 | [convolutional] 619 | batch_normalize=1 620 | filters=1024 621 | size=3 622 | stride=2 623 | pad=1 624 | activation=leaky 625 | 626 | [convolutional] 627 | batch_normalize=1 628 | filters=512 629 | size=1 630 | stride=1 631 | pad=1 632 | activation=leaky 633 | 634 | [route] 635 | layers = -2 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=512 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | filters=512 648 | size=1 649 | stride=1 650 | pad=1 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=512 656 | size=3 657 | stride=1 658 | pad=1 659 
| activation=leaky 660 | 661 | [shortcut] 662 | from=-3 663 | activation=linear 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | filters=512 668 | size=1 669 | stride=1 670 | pad=1 671 | activation=leaky 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=512 676 | size=3 677 | stride=1 678 | pad=1 679 | activation=leaky 680 | 681 | [shortcut] 682 | from=-3 683 | activation=linear 684 | 685 | [convolutional] 686 | batch_normalize=1 687 | filters=512 688 | size=1 689 | stride=1 690 | pad=1 691 | activation=leaky 692 | 693 | [convolutional] 694 | batch_normalize=1 695 | filters=512 696 | size=3 697 | stride=1 698 | pad=1 699 | activation=leaky 700 | 701 | [shortcut] 702 | from=-3 703 | activation=linear 704 | 705 | [convolutional] 706 | batch_normalize=1 707 | filters=512 708 | size=1 709 | stride=1 710 | pad=1 711 | activation=leaky 712 | 713 | [convolutional] 714 | batch_normalize=1 715 | filters=512 716 | size=3 717 | stride=1 718 | pad=1 719 | activation=leaky 720 | 721 | [shortcut] 722 | from=-3 723 | activation=linear 724 | 725 | [convolutional] 726 | batch_normalize=1 727 | filters=512 728 | size=1 729 | stride=1 730 | pad=1 731 | activation=leaky 732 | 733 | [route] 734 | layers = -1,-16 735 | 736 | [convolutional] 737 | batch_normalize=1 738 | filters=1024 739 | size=1 740 | stride=1 741 | pad=1 742 | activation=leaky 743 | 744 | ########################## 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | filters=512 749 | size=1 750 | stride=1 751 | pad=1 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | size=3 757 | stride=1 758 | pad=1 759 | filters=1024 760 | activation=leaky 761 | 762 | [convolutional] 763 | batch_normalize=1 764 | filters=512 765 | size=1 766 | stride=1 767 | pad=1 768 | activation=leaky 769 | 770 | ### SPP ### 771 | [maxpool] 772 | stride=1 773 | size=5 774 | 775 | [route] 776 | layers=-2 777 | 778 | [maxpool] 779 | stride=1 780 | size=9 781 | 782 | [route] 783 | layers=-4 784 | 785 | [maxpool] 786 | stride=1 787 | size=13 788 | 789 | [route] 790 | layers=-1,-3,-5,-6 791 | ### End SPP ### 792 | 793 | [convolutional] 794 | batch_normalize=1 795 | filters=512 796 | size=1 797 | stride=1 798 | pad=1 799 | activation=leaky 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | size=3 804 | stride=1 805 | pad=1 806 | filters=1024 807 | activation=leaky 808 | 809 | [convolutional] 810 | batch_normalize=1 811 | filters=512 812 | size=1 813 | stride=1 814 | pad=1 815 | activation=leaky 816 | 817 | [convolutional] 818 | batch_normalize=1 819 | filters=256 820 | size=1 821 | stride=1 822 | pad=1 823 | activation=leaky 824 | 825 | [upsample] 826 | stride=2 827 | 828 | [route] 829 | layers = 79 830 | 831 | [convolutional] 832 | batch_normalize=1 833 | filters=256 834 | size=1 835 | stride=1 836 | pad=1 837 | activation=leaky 838 | 839 | [route] 840 | layers = -1, -3 841 | 842 | [convolutional] 843 | batch_normalize=1 844 | filters=256 845 | size=1 846 | stride=1 847 | pad=1 848 | activation=leaky 849 | 850 | [convolutional] 851 | batch_normalize=1 852 | size=3 853 | stride=1 854 | pad=1 855 | filters=512 856 | activation=leaky 857 | 858 | [convolutional] 859 | batch_normalize=1 860 | filters=256 861 | size=1 862 | stride=1 863 | pad=1 864 | activation=leaky 865 | 866 | [convolutional] 867 | batch_normalize=1 868 | size=3 869 | stride=1 870 | pad=1 871 | filters=512 872 | activation=leaky 873 | 874 | [convolutional] 875 | batch_normalize=1 876 | filters=256 877 | size=1 878 | stride=1 879 | pad=1 880 | 
activation=leaky 881 | 882 | [convolutional] 883 | batch_normalize=1 884 | filters=128 885 | size=1 886 | stride=1 887 | pad=1 888 | activation=leaky 889 | 890 | [upsample] 891 | stride=2 892 | 893 | [route] 894 | layers = 48 895 | 896 | [convolutional] 897 | batch_normalize=1 898 | filters=128 899 | size=1 900 | stride=1 901 | pad=1 902 | activation=leaky 903 | 904 | [route] 905 | layers = -1, -3 906 | 907 | [convolutional] 908 | batch_normalize=1 909 | filters=128 910 | size=1 911 | stride=1 912 | pad=1 913 | activation=leaky 914 | 915 | [convolutional] 916 | batch_normalize=1 917 | size=3 918 | stride=1 919 | pad=1 920 | filters=256 921 | activation=leaky 922 | 923 | [convolutional] 924 | batch_normalize=1 925 | filters=128 926 | size=1 927 | stride=1 928 | pad=1 929 | activation=leaky 930 | 931 | [convolutional] 932 | batch_normalize=1 933 | size=3 934 | stride=1 935 | pad=1 936 | filters=256 937 | activation=leaky 938 | 939 | [convolutional] 940 | batch_normalize=1 941 | filters=128 942 | size=1 943 | stride=1 944 | pad=1 945 | activation=leaky 946 | 947 | ########################## 948 | 949 | [convolutional] 950 | batch_normalize=1 951 | size=3 952 | stride=1 953 | pad=1 954 | filters=256 955 | activation=leaky 956 | 957 | [convolutional] 958 | size=1 959 | stride=1 960 | pad=1 961 | filters=255 962 | activation=linear 963 | 964 | 965 | [yolo] 966 | mask = 0,1,2 967 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 968 | classes=80 969 | num=9 970 | jitter=.3 971 | ignore_thresh = .7 972 | truth_thresh = 1 973 | random=1 974 | 975 | [route] 976 | layers = -4 977 | 978 | [convolutional] 979 | batch_normalize=1 980 | size=3 981 | stride=2 982 | pad=1 983 | filters=256 984 | activation=leaky 985 | 986 | [route] 987 | layers = -1, -16 988 | 989 | [convolutional] 990 | batch_normalize=1 991 | filters=256 992 | size=1 993 | stride=1 994 | pad=1 995 | activation=leaky 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | size=3 1000 | stride=1 1001 | pad=1 1002 | filters=512 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | filters=256 1008 | size=1 1009 | stride=1 1010 | pad=1 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | size=3 1016 | stride=1 1017 | pad=1 1018 | filters=512 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | filters=256 1024 | size=1 1025 | stride=1 1026 | pad=1 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | size=3 1032 | stride=1 1033 | pad=1 1034 | filters=512 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | size=1 1039 | stride=1 1040 | pad=1 1041 | filters=255 1042 | activation=linear 1043 | 1044 | 1045 | [yolo] 1046 | mask = 3,4,5 1047 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 1048 | classes=80 1049 | num=9 1050 | jitter=.3 1051 | ignore_thresh = .7 1052 | truth_thresh = 1 1053 | random=1 1054 | 1055 | [route] 1056 | layers = -4 1057 | 1058 | [convolutional] 1059 | batch_normalize=1 1060 | size=3 1061 | stride=2 1062 | pad=1 1063 | filters=512 1064 | activation=leaky 1065 | 1066 | [route] 1067 | layers = -1, -37 1068 | 1069 | [convolutional] 1070 | batch_normalize=1 1071 | filters=512 1072 | size=1 1073 | stride=1 1074 | pad=1 1075 | activation=leaky 1076 | 1077 | [convolutional] 1078 | batch_normalize=1 1079 | size=3 1080 | stride=1 1081 | pad=1 1082 | filters=1024 1083 | activation=leaky 1084 | 1085 | [convolutional] 1086 | 
batch_normalize=1 1087 | filters=512 1088 | size=1 1089 | stride=1 1090 | pad=1 1091 | activation=leaky 1092 | 1093 | [convolutional] 1094 | batch_normalize=1 1095 | size=3 1096 | stride=1 1097 | pad=1 1098 | filters=1024 1099 | activation=leaky 1100 | 1101 | [convolutional] 1102 | batch_normalize=1 1103 | filters=512 1104 | size=1 1105 | stride=1 1106 | pad=1 1107 | activation=leaky 1108 | 1109 | [convolutional] 1110 | batch_normalize=1 1111 | size=3 1112 | stride=1 1113 | pad=1 1114 | filters=1024 1115 | activation=leaky 1116 | 1117 | [convolutional] 1118 | size=1 1119 | stride=1 1120 | pad=1 1121 | filters=255 1122 | activation=linear 1123 | 1124 | 1125 | [yolo] 1126 | mask = 6,7,8 1127 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 1128 | classes=80 1129 | num=9 1130 | jitter=.3 1131 | ignore_thresh = .7 1132 | truth_thresh = 1 1133 | random=1 1134 | -------------------------------------------------------------------------------- /main_yolobile.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import argparse 3 | import numpy as np 4 | 5 | # Initialize the parameters 6 | confThreshold = 0.25 # Confidence threshold 7 | nmsThreshold = 0.4 # Non-maximum suppression threshold 8 | inpWidth = 320 # Width of network's input image 9 | inpHeight = 320 # Height of network's input image 10 | 11 | # Give the configuration and weight files for the model and load the network using them. 12 | modelConfiguration = "csdarknet53s-panet-spp.cfg" 13 | modelWeights = "yolobile.weights" 14 | 15 | # Load names of classes 16 | classesFile = "coco.names" 17 | classes = None 18 | with open(classesFile, 'rt') as f: 19 | classes = f.read().rstrip('\n').split('\n') 20 | colors = [np.random.randint(0, 255, size=3).tolist() for _ in range(len(classes))] 21 | 22 | # Get the names of the output layers 23 | def getOutputsNames(net): 24 | # Get the names of all the layers in the network 25 | layersNames = net.getLayerNames() 26 | # print(dir(net)) 27 | # Get the names of the output layers, i.e. the layers with unconnected outputs 28 | return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()] 29 | 30 | # Draw the predicted bounding box 31 | def drawPred(classId, conf, left, top, right, bottom): 32 | # Draw a bounding box. 33 | cv.rectangle(frame, (left, top), (right, bottom), (0,0,255), thickness=4) 34 | 35 | label = '%.2f' % conf 36 | 37 | # Get the label for the class name and its confidence 38 | if classes: 39 | assert (classId < len(classes)) 40 | label = '%s:%s' % (classes[classId], label) 41 | 42 | # Display the label at the top of the bounding box 43 | labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1) 44 | top = max(top, labelSize[1]) 45 | # cv.rectangle(frame, (left, top - round(1.5 * labelSize[1])), (left + round(1.5 * labelSize[0]), top + baseLine), (255,255,255), cv.FILLED) 46 | cv.putText(frame, label, (left, top-10), cv.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), thickness=2) 47 | 48 | # Remove the bounding boxes with low confidence using non-maxima suppression 49 | def postprocess(frame, outs): 50 | frameHeight = frame.shape[0] 51 | frameWidth = frame.shape[1] 52 | 53 | # Scan through all the bounding boxes output from the network and keep only the 54 | # ones with high confidence scores. Assign the box's class label as the class with the highest score. 
55 | classIds = [] 56 | confidences = [] 57 | boxes = [] 58 | for out in outs: 59 | for detection in out: 60 | scores = detection[5:] 61 | classId = np.argmax(scores) 62 | confidence = scores[classId] 63 | if confidence > confThreshold: 64 | center_x = int(detection[0] * frameWidth) 65 | center_y = int(detection[1] * frameHeight) 66 | width = int(detection[2] * frameWidth) 67 | height = int(detection[3] * frameHeight) 68 | left = int(center_x - width / 2) 69 | top = int(center_y - height / 2) 70 | classIds.append(classId) 71 | confidences.append(float(confidence)) 72 | boxes.append([left, top, width, height]) 73 | 74 | # Perform non maximum suppression to eliminate redundant overlapping boxes with 75 | # lower confidences. 76 | indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold) 77 | for i in indices: 78 | i = i[0] 79 | box = boxes[i] 80 | left = box[0] 81 | top = box[1] 82 | width = box[2] 83 | height = box[3] 84 | drawPred(classIds[i], confidences[i], left, top, left + width, top + height) 85 | 86 | if __name__=='__main__': 87 | parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV') 88 | parser.add_argument('--image', type=str, default='bus.jpg', help='Path to image file.') 89 | args = parser.parse_args() 90 | 91 | net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights) 92 | net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV) 93 | net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU) 94 | # Process inputs 95 | frame = cv.imread(args.image) 96 | 97 | # Create a 4D blob from a frame. 98 | blob = cv.dnn.blobFromImage(frame, 1/255.0, (inpWidth, inpHeight), [0, 0, 0], swapRB=False, crop=False) 99 | 100 | # Sets the input to the network 101 | net.setInput(blob) 102 | 103 | # Runs the forward pass to get output of the output layers 104 | outs = net.forward(getOutputsNames(net)) 105 | # Remove the bounding boxes with low confidence 106 | postprocess(frame, outs) 107 | 108 | # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes) 109 | t, _ = net.getPerfProfile() 110 | label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency()) 111 | cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255)) 112 | 113 | winName = 'Deep learning object detection in OpenCV' 114 | cv.namedWindow(winName,0) 115 | cv.imshow(winName, frame) 116 | cv.waitKey(0) 117 | cv.destroyAllWindows() 118 | --------------------------------------------------------------------------------
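
A note on OpenCV versions for main_yolobile.py: getOutputsNames() and the NMS loop in postprocess() index the results of getUnconnectedOutLayers() and NMSBoxes() with i[0], which matches older OpenCV 4.x builds where those calls return Nx1 arrays; newer OpenCV builds return flat arrays there, so i[0] raises an error. The sketch below is a version-tolerant variant under that assumption; getOutputsNamesCompat and nmsIndicesCompat are hypothetical helper names, not part of this repository.

```python
import cv2 as cv
import numpy as np

def getOutputsNamesCompat(net):
    # Layer IDs are 1-based while getLayerNames() is a 0-based list, hence the -1.
    # flatten() accepts both the old Nx1 and the new 1-D return shapes.
    layersNames = net.getLayerNames()
    return [layersNames[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

def nmsIndicesCompat(boxes, confidences, confThreshold, nmsThreshold):
    # NMSBoxes() may return [[0], [2], ...] on older OpenCV or [0, 2, ...] on newer ones;
    # flattening gives plain integer indices either way.
    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    return np.array(indices).flatten().astype(int).tolist()
```

With these helpers, postprocess() can iterate directly over integer indices and drop the i = i[0] unpacking, regardless of the installed OpenCV version.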
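
On the yolov5s.pt point in the README: convert_darknet.py expects a checkpoint whose 'model' entry is a plain state dict, which is how best8x-514.pt is laid out. A minimal sketch, assuming only that PyTorch is installed (inspect_checkpoint is a hypothetical helper, not part of this repository), for checking what a .pt file actually contains before attempting a conversion:

```python
import torch

def inspect_checkpoint(path):
    # Load on CPU and report the checkpoint layout. A dict whose 'model' entry is
    # itself a dict of tensors is what convert_darknet.py can convert; a checkpoint
    # whose 'model' entry is a pickled module object cannot be passed to
    # load_state_dict directly.
    ckpt = torch.load(path, map_location='cpu')
    if isinstance(ckpt, dict) and 'model' in ckpt:
        entry = ckpt['model']
        if isinstance(entry, dict):
            print("state dict with %d tensors" % len(entry))
        else:
            print("'model' entry is a %s object, not a state dict" % type(entry).__name__)
    else:
        print("unexpected checkpoint layout: %s" % type(ckpt).__name__)
```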