├── README.md
├── bus.jpg
├── coco.names
├── convert_darknet.py
├── csdarknet53s-panet-spp.cfg
└── main_yolobile.py

/README.md:
--------------------------------------------------------------------------------
 1 | I just came across a new YOLO object-detection network, YOLObile, on WeChat Moments.
 2 | 
 3 | The paper can be downloaded from https://arxiv.org/abs/2009.05697
 4 | The code is on GitHub at https://github.com/CoCoPIE-Pruning/CoCoPIE-ModelZoo/tree/master/YOLObile
 5 | I downloaded the code and the .pt model file and ran them locally: the project is implemented on the PyTorch framework, and the downloaded .pt file is 245 MB. With a model that large, I have some doubts about whether YOLObile can really do "real-time detection on mobile devices" as the paper claims.
 6 | 
 7 | Next I wanted to write a program that runs YOLObile object detection with OpenCV's dnn module, but the model file is in .pt format, which the dnn module does not support, so I decided to convert the provided .pt file into a .weights file in the Darknet format.
 8 | The conversion is done by convert_darknet.py in this repository: copy it into the YOLObile folder and run it; the script reads the .cfg file and the .pt file and then writes out a .weights file.
 9 | With the .cfg file and the .weights file, we can run object detection through OpenCV's dnn module, with no dependency on the PyTorch framework.
10 | The .weights file is 245 MB and too large to upload to GitHub directly; it can be downloaded from Baidu Netdisk:
11 | Link: https://pan.baidu.com/s/1FsTGBoGuJNYSdGvFSw9Trg  Extraction code: mc9b
12 | 
13 | After downloading the .weights file, place it in this repository's folder and run main_yolobile.py.
14 | 
15 | I also thought about running convert_darknet.py to convert yolov5s.pt from https://github.com/ultralytics/yolov5,
16 | but its .pt file contains both the network structure and the model parameters, and loading it with torch.load
17 | raises the error
18 | "RuntimeError: yolov5s.pt is a zip archive (did you mean to use torch.jit.load()?)".
19 | Moreover, its network structure is described by a .yaml file rather than a .cfg file, so there is no way to run YOLOv5 detection through OpenCV's dnn module this way.
20 | For that reason I have not published a YOLOv5 detection program based on OpenCV's dnn module on GitHub.
21 | 
--------------------------------------------------------------------------------
/bus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/YOLObile-opencv-dnn/ea962e52ee1b78ea7ff37c2705f35abab00038b6/bus.jpg
--------------------------------------------------------------------------------
/coco.names:
--------------------------------------------------------------------------------
 1 | person
 2 | bicycle
 3 | car
 4 | motorcycle
 5 | airplane
 6 | bus
 7 | train
 8 | truck
 9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | couch
59 | potted plant
60 | bed
61 | dining table
62 | toilet
63 | tv
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 | 
--------------------------------------------------------------------------------
/convert_darknet.py:
--------------------------------------------------------------------------------
 1 | from models import *  # models.py from the YOLObile repository provides Darknet, save_weights and torch
 2 | 
 3 | if __name__ == '__main__':
 4 |     device = 'cuda' if torch.cuda.is_available() else 'cpu'
 5 |     cfgpath = 'cfg/csdarknet53s-panet-spp.cfg'
 6 |     weights = 'weights/best8x-514.pt'
 7 |     target_weight = 'weights/yolobile.weights'
 8 |     model = Darknet(cfgpath)
 9 |     model.load_state_dict(torch.load(weights, map_location=device)['model'], strict=False)  # the checkpoint stores its weights under the 'model' key
10 |     model.to(device)
11 |     model.eval()
12 | 
13 |     save_weights(model, path=target_weight, cutoff=-1)  # cutoff=-1 exports every layer to the Darknet .weights file
--------------------------------------------------------------------------------
/csdarknet53s-panet-spp.cfg:
-------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500500 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | #23:104x104 54:52x52 85:26x26 104:13x13 for 416 26 | 27 | 28 | 29 | [convolutional] 30 | batch_normalize=1 31 | filters=32 32 | size=3 33 | stride=1 34 | pad=1 35 | activation=leaky 36 | 37 | # Downsample 38 | 39 | [convolutional] 40 | batch_normalize=1 41 | filters=64 42 | size=3 43 | stride=2 44 | pad=1 45 | activation=leaky 46 | 47 | #[convolutional] 48 | #batch_normalize=1 49 | #filters=64 50 | #size=1 51 | #stride=1 52 | #pad=1 53 | #activation=leaky 54 | 55 | #[route] 56 | #layers = -2 57 | 58 | #[convolutional] 59 | #batch_normalize=1 60 | #filters=64 61 | #size=1 62 | #stride=1 63 | #pad=1 64 | #activation=leaky 65 | 66 | [convolutional] 67 | batch_normalize=1 68 | filters=32 69 | size=1 70 | stride=1 71 | pad=1 72 | activation=leaky 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=64 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [shortcut] 83 | from=-3 84 | activation=linear 85 | 86 | #[convolutional] 87 | #batch_normalize=1 88 | #filters=64 89 | #size=1 90 | #stride=1 91 | #pad=1 92 | #activation=leaky 93 | 94 | #[route] 95 | #layers = -1,-7 96 | 97 | #[convolutional] 98 | #batch_normalize=1 99 | #filters=64 100 | #size=1 101 | #stride=1 102 | #pad=1 103 | #activation=leaky 104 | 105 | # Downsample 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=128 110 | size=3 111 | stride=2 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=64 118 | size=1 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [route] 124 | layers = -2 125 | 126 | [convolutional] 127 | batch_normalize=1 128 | filters=64 129 | size=1 130 | stride=1 131 | pad=1 132 | activation=leaky 133 | 134 | [convolutional] 135 | batch_normalize=1 136 | filters=64 137 | size=1 138 | stride=1 139 | pad=1 140 | activation=leaky 141 | 142 | [convolutional] 143 | batch_normalize=1 144 | filters=64 145 | size=3 146 | stride=1 147 | pad=1 148 | activation=leaky 149 | 150 | [shortcut] 151 | from=-3 152 | activation=linear 153 | 154 | [convolutional] 155 | batch_normalize=1 156 | filters=64 157 | size=1 158 | stride=1 159 | pad=1 160 | activation=leaky 161 | 162 | [convolutional] 163 | batch_normalize=1 164 | filters=64 165 | size=3 166 | stride=1 167 | pad=1 168 | activation=leaky 169 | 170 | [shortcut] 171 | from=-3 172 | activation=linear 173 | 174 | [convolutional] 175 | batch_normalize=1 176 | filters=64 177 | size=1 178 | stride=1 179 | pad=1 180 | activation=leaky 181 | 182 | [route] 183 | layers = -1,-10 184 | 185 | [convolutional] 186 | batch_normalize=1 187 | filters=128 188 | size=1 189 | stride=1 190 | pad=1 191 | activation=leaky 192 | 193 | # Downsample 194 | 195 | [convolutional] 196 | batch_normalize=1 197 | filters=256 198 | size=3 199 | stride=2 200 | pad=1 201 | activation=leaky 202 | 203 | [convolutional] 204 | batch_normalize=1 205 | filters=128 206 | size=1 207 | stride=1 208 | pad=1 209 | activation=leaky 210 | 211 | [route] 212 | layers = -2 213 | 214 | [convolutional] 215 | batch_normalize=1 216 | filters=128 217 | size=1 218 | 
stride=1 219 | pad=1 220 | activation=leaky 221 | 222 | [convolutional] 223 | batch_normalize=1 224 | filters=128 225 | size=1 226 | stride=1 227 | pad=1 228 | activation=leaky 229 | 230 | [convolutional] 231 | batch_normalize=1 232 | filters=128 233 | size=3 234 | stride=1 235 | pad=1 236 | activation=leaky 237 | 238 | [shortcut] 239 | from=-3 240 | activation=linear 241 | 242 | [convolutional] 243 | batch_normalize=1 244 | filters=128 245 | size=1 246 | stride=1 247 | pad=1 248 | activation=leaky 249 | 250 | [convolutional] 251 | batch_normalize=1 252 | filters=128 253 | size=3 254 | stride=1 255 | pad=1 256 | activation=leaky 257 | 258 | [shortcut] 259 | from=-3 260 | activation=linear 261 | 262 | [convolutional] 263 | batch_normalize=1 264 | filters=128 265 | size=1 266 | stride=1 267 | pad=1 268 | activation=leaky 269 | 270 | [convolutional] 271 | batch_normalize=1 272 | filters=128 273 | size=3 274 | stride=1 275 | pad=1 276 | activation=leaky 277 | 278 | [shortcut] 279 | from=-3 280 | activation=linear 281 | 282 | [convolutional] 283 | batch_normalize=1 284 | filters=128 285 | size=1 286 | stride=1 287 | pad=1 288 | activation=leaky 289 | 290 | [convolutional] 291 | batch_normalize=1 292 | filters=128 293 | size=3 294 | stride=1 295 | pad=1 296 | activation=leaky 297 | 298 | [shortcut] 299 | from=-3 300 | activation=linear 301 | 302 | 303 | [convolutional] 304 | batch_normalize=1 305 | filters=128 306 | size=1 307 | stride=1 308 | pad=1 309 | activation=leaky 310 | 311 | [convolutional] 312 | batch_normalize=1 313 | filters=128 314 | size=3 315 | stride=1 316 | pad=1 317 | activation=leaky 318 | 319 | [shortcut] 320 | from=-3 321 | activation=linear 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=128 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=128 334 | size=3 335 | stride=1 336 | pad=1 337 | activation=leaky 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | [convolutional] 344 | batch_normalize=1 345 | filters=128 346 | size=1 347 | stride=1 348 | pad=1 349 | activation=leaky 350 | 351 | [convolutional] 352 | batch_normalize=1 353 | filters=128 354 | size=3 355 | stride=1 356 | pad=1 357 | activation=leaky 358 | 359 | [shortcut] 360 | from=-3 361 | activation=linear 362 | 363 | [convolutional] 364 | batch_normalize=1 365 | filters=128 366 | size=1 367 | stride=1 368 | pad=1 369 | activation=leaky 370 | 371 | [convolutional] 372 | batch_normalize=1 373 | filters=128 374 | size=3 375 | stride=1 376 | pad=1 377 | activation=leaky 378 | 379 | [shortcut] 380 | from=-3 381 | activation=linear 382 | 383 | [convolutional] 384 | batch_normalize=1 385 | filters=128 386 | size=1 387 | stride=1 388 | pad=1 389 | activation=leaky 390 | 391 | [route] 392 | layers = -1,-28 393 | 394 | [convolutional] 395 | batch_normalize=1 396 | filters=256 397 | size=1 398 | stride=1 399 | pad=1 400 | activation=leaky 401 | 402 | # Downsample 403 | 404 | [convolutional] 405 | batch_normalize=1 406 | filters=512 407 | size=3 408 | stride=2 409 | pad=1 410 | activation=leaky 411 | 412 | [convolutional] 413 | batch_normalize=1 414 | filters=256 415 | size=1 416 | stride=1 417 | pad=1 418 | activation=leaky 419 | 420 | [route] 421 | layers = -2 422 | 423 | [convolutional] 424 | batch_normalize=1 425 | filters=256 426 | size=1 427 | stride=1 428 | pad=1 429 | activation=leaky 430 | 431 | [convolutional] 432 | batch_normalize=1 433 | filters=256 434 | size=1 435 | stride=1 436 | pad=1 437 | 
activation=leaky 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=3 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [shortcut] 448 | from=-3 449 | activation=linear 450 | 451 | 452 | [convolutional] 453 | batch_normalize=1 454 | filters=256 455 | size=1 456 | stride=1 457 | pad=1 458 | activation=leaky 459 | 460 | [convolutional] 461 | batch_normalize=1 462 | filters=256 463 | size=3 464 | stride=1 465 | pad=1 466 | activation=leaky 467 | 468 | [shortcut] 469 | from=-3 470 | activation=linear 471 | 472 | 473 | [convolutional] 474 | batch_normalize=1 475 | filters=256 476 | size=1 477 | stride=1 478 | pad=1 479 | activation=leaky 480 | 481 | [convolutional] 482 | batch_normalize=1 483 | filters=256 484 | size=3 485 | stride=1 486 | pad=1 487 | activation=leaky 488 | 489 | [shortcut] 490 | from=-3 491 | activation=linear 492 | 493 | 494 | [convolutional] 495 | batch_normalize=1 496 | filters=256 497 | size=1 498 | stride=1 499 | pad=1 500 | activation=leaky 501 | 502 | [convolutional] 503 | batch_normalize=1 504 | filters=256 505 | size=3 506 | stride=1 507 | pad=1 508 | activation=leaky 509 | 510 | [shortcut] 511 | from=-3 512 | activation=linear 513 | 514 | 515 | [convolutional] 516 | batch_normalize=1 517 | filters=256 518 | size=1 519 | stride=1 520 | pad=1 521 | activation=leaky 522 | 523 | [convolutional] 524 | batch_normalize=1 525 | filters=256 526 | size=3 527 | stride=1 528 | pad=1 529 | activation=leaky 530 | 531 | [shortcut] 532 | from=-3 533 | activation=linear 534 | 535 | 536 | [convolutional] 537 | batch_normalize=1 538 | filters=256 539 | size=1 540 | stride=1 541 | pad=1 542 | activation=leaky 543 | 544 | [convolutional] 545 | batch_normalize=1 546 | filters=256 547 | size=3 548 | stride=1 549 | pad=1 550 | activation=leaky 551 | 552 | [shortcut] 553 | from=-3 554 | activation=linear 555 | 556 | 557 | [convolutional] 558 | batch_normalize=1 559 | filters=256 560 | size=1 561 | stride=1 562 | pad=1 563 | activation=leaky 564 | 565 | [convolutional] 566 | batch_normalize=1 567 | filters=256 568 | size=3 569 | stride=1 570 | pad=1 571 | activation=leaky 572 | 573 | [shortcut] 574 | from=-3 575 | activation=linear 576 | 577 | [convolutional] 578 | batch_normalize=1 579 | filters=256 580 | size=1 581 | stride=1 582 | pad=1 583 | activation=leaky 584 | 585 | [convolutional] 586 | batch_normalize=1 587 | filters=256 588 | size=3 589 | stride=1 590 | pad=1 591 | activation=leaky 592 | 593 | [shortcut] 594 | from=-3 595 | activation=linear 596 | 597 | [convolutional] 598 | batch_normalize=1 599 | filters=256 600 | size=1 601 | stride=1 602 | pad=1 603 | activation=leaky 604 | 605 | [route] 606 | layers = -1,-28 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | filters=512 611 | size=1 612 | stride=1 613 | pad=1 614 | activation=leaky 615 | 616 | # Downsample 617 | 618 | [convolutional] 619 | batch_normalize=1 620 | filters=1024 621 | size=3 622 | stride=2 623 | pad=1 624 | activation=leaky 625 | 626 | [convolutional] 627 | batch_normalize=1 628 | filters=512 629 | size=1 630 | stride=1 631 | pad=1 632 | activation=leaky 633 | 634 | [route] 635 | layers = -2 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=512 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | filters=512 648 | size=1 649 | stride=1 650 | pad=1 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=512 656 | size=3 657 | stride=1 658 | pad=1 659 
| activation=leaky 660 | 661 | [shortcut] 662 | from=-3 663 | activation=linear 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | filters=512 668 | size=1 669 | stride=1 670 | pad=1 671 | activation=leaky 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=512 676 | size=3 677 | stride=1 678 | pad=1 679 | activation=leaky 680 | 681 | [shortcut] 682 | from=-3 683 | activation=linear 684 | 685 | [convolutional] 686 | batch_normalize=1 687 | filters=512 688 | size=1 689 | stride=1 690 | pad=1 691 | activation=leaky 692 | 693 | [convolutional] 694 | batch_normalize=1 695 | filters=512 696 | size=3 697 | stride=1 698 | pad=1 699 | activation=leaky 700 | 701 | [shortcut] 702 | from=-3 703 | activation=linear 704 | 705 | [convolutional] 706 | batch_normalize=1 707 | filters=512 708 | size=1 709 | stride=1 710 | pad=1 711 | activation=leaky 712 | 713 | [convolutional] 714 | batch_normalize=1 715 | filters=512 716 | size=3 717 | stride=1 718 | pad=1 719 | activation=leaky 720 | 721 | [shortcut] 722 | from=-3 723 | activation=linear 724 | 725 | [convolutional] 726 | batch_normalize=1 727 | filters=512 728 | size=1 729 | stride=1 730 | pad=1 731 | activation=leaky 732 | 733 | [route] 734 | layers = -1,-16 735 | 736 | [convolutional] 737 | batch_normalize=1 738 | filters=1024 739 | size=1 740 | stride=1 741 | pad=1 742 | activation=leaky 743 | 744 | ########################## 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | filters=512 749 | size=1 750 | stride=1 751 | pad=1 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | size=3 757 | stride=1 758 | pad=1 759 | filters=1024 760 | activation=leaky 761 | 762 | [convolutional] 763 | batch_normalize=1 764 | filters=512 765 | size=1 766 | stride=1 767 | pad=1 768 | activation=leaky 769 | 770 | ### SPP ### 771 | [maxpool] 772 | stride=1 773 | size=5 774 | 775 | [route] 776 | layers=-2 777 | 778 | [maxpool] 779 | stride=1 780 | size=9 781 | 782 | [route] 783 | layers=-4 784 | 785 | [maxpool] 786 | stride=1 787 | size=13 788 | 789 | [route] 790 | layers=-1,-3,-5,-6 791 | ### End SPP ### 792 | 793 | [convolutional] 794 | batch_normalize=1 795 | filters=512 796 | size=1 797 | stride=1 798 | pad=1 799 | activation=leaky 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | size=3 804 | stride=1 805 | pad=1 806 | filters=1024 807 | activation=leaky 808 | 809 | [convolutional] 810 | batch_normalize=1 811 | filters=512 812 | size=1 813 | stride=1 814 | pad=1 815 | activation=leaky 816 | 817 | [convolutional] 818 | batch_normalize=1 819 | filters=256 820 | size=1 821 | stride=1 822 | pad=1 823 | activation=leaky 824 | 825 | [upsample] 826 | stride=2 827 | 828 | [route] 829 | layers = 79 830 | 831 | [convolutional] 832 | batch_normalize=1 833 | filters=256 834 | size=1 835 | stride=1 836 | pad=1 837 | activation=leaky 838 | 839 | [route] 840 | layers = -1, -3 841 | 842 | [convolutional] 843 | batch_normalize=1 844 | filters=256 845 | size=1 846 | stride=1 847 | pad=1 848 | activation=leaky 849 | 850 | [convolutional] 851 | batch_normalize=1 852 | size=3 853 | stride=1 854 | pad=1 855 | filters=512 856 | activation=leaky 857 | 858 | [convolutional] 859 | batch_normalize=1 860 | filters=256 861 | size=1 862 | stride=1 863 | pad=1 864 | activation=leaky 865 | 866 | [convolutional] 867 | batch_normalize=1 868 | size=3 869 | stride=1 870 | pad=1 871 | filters=512 872 | activation=leaky 873 | 874 | [convolutional] 875 | batch_normalize=1 876 | filters=256 877 | size=1 878 | stride=1 879 | pad=1 880 | 
activation=leaky 881 | 882 | [convolutional] 883 | batch_normalize=1 884 | filters=128 885 | size=1 886 | stride=1 887 | pad=1 888 | activation=leaky 889 | 890 | [upsample] 891 | stride=2 892 | 893 | [route] 894 | layers = 48 895 | 896 | [convolutional] 897 | batch_normalize=1 898 | filters=128 899 | size=1 900 | stride=1 901 | pad=1 902 | activation=leaky 903 | 904 | [route] 905 | layers = -1, -3 906 | 907 | [convolutional] 908 | batch_normalize=1 909 | filters=128 910 | size=1 911 | stride=1 912 | pad=1 913 | activation=leaky 914 | 915 | [convolutional] 916 | batch_normalize=1 917 | size=3 918 | stride=1 919 | pad=1 920 | filters=256 921 | activation=leaky 922 | 923 | [convolutional] 924 | batch_normalize=1 925 | filters=128 926 | size=1 927 | stride=1 928 | pad=1 929 | activation=leaky 930 | 931 | [convolutional] 932 | batch_normalize=1 933 | size=3 934 | stride=1 935 | pad=1 936 | filters=256 937 | activation=leaky 938 | 939 | [convolutional] 940 | batch_normalize=1 941 | filters=128 942 | size=1 943 | stride=1 944 | pad=1 945 | activation=leaky 946 | 947 | ########################## 948 | 949 | [convolutional] 950 | batch_normalize=1 951 | size=3 952 | stride=1 953 | pad=1 954 | filters=256 955 | activation=leaky 956 | 957 | [convolutional] 958 | size=1 959 | stride=1 960 | pad=1 961 | filters=255 962 | activation=linear 963 | 964 | 965 | [yolo] 966 | mask = 0,1,2 967 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 968 | classes=80 969 | num=9 970 | jitter=.3 971 | ignore_thresh = .7 972 | truth_thresh = 1 973 | random=1 974 | 975 | [route] 976 | layers = -4 977 | 978 | [convolutional] 979 | batch_normalize=1 980 | size=3 981 | stride=2 982 | pad=1 983 | filters=256 984 | activation=leaky 985 | 986 | [route] 987 | layers = -1, -16 988 | 989 | [convolutional] 990 | batch_normalize=1 991 | filters=256 992 | size=1 993 | stride=1 994 | pad=1 995 | activation=leaky 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | size=3 1000 | stride=1 1001 | pad=1 1002 | filters=512 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | filters=256 1008 | size=1 1009 | stride=1 1010 | pad=1 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | size=3 1016 | stride=1 1017 | pad=1 1018 | filters=512 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | filters=256 1024 | size=1 1025 | stride=1 1026 | pad=1 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | size=3 1032 | stride=1 1033 | pad=1 1034 | filters=512 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | size=1 1039 | stride=1 1040 | pad=1 1041 | filters=255 1042 | activation=linear 1043 | 1044 | 1045 | [yolo] 1046 | mask = 3,4,5 1047 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 1048 | classes=80 1049 | num=9 1050 | jitter=.3 1051 | ignore_thresh = .7 1052 | truth_thresh = 1 1053 | random=1 1054 | 1055 | [route] 1056 | layers = -4 1057 | 1058 | [convolutional] 1059 | batch_normalize=1 1060 | size=3 1061 | stride=2 1062 | pad=1 1063 | filters=512 1064 | activation=leaky 1065 | 1066 | [route] 1067 | layers = -1, -37 1068 | 1069 | [convolutional] 1070 | batch_normalize=1 1071 | filters=512 1072 | size=1 1073 | stride=1 1074 | pad=1 1075 | activation=leaky 1076 | 1077 | [convolutional] 1078 | batch_normalize=1 1079 | size=3 1080 | stride=1 1081 | pad=1 1082 | filters=1024 1083 | activation=leaky 1084 | 1085 | [convolutional] 1086 | 
batch_normalize=1 1087 | filters=512 1088 | size=1 1089 | stride=1 1090 | pad=1 1091 | activation=leaky 1092 | 1093 | [convolutional] 1094 | batch_normalize=1 1095 | size=3 1096 | stride=1 1097 | pad=1 1098 | filters=1024 1099 | activation=leaky 1100 | 1101 | [convolutional] 1102 | batch_normalize=1 1103 | filters=512 1104 | size=1 1105 | stride=1 1106 | pad=1 1107 | activation=leaky 1108 | 1109 | [convolutional] 1110 | batch_normalize=1 1111 | size=3 1112 | stride=1 1113 | pad=1 1114 | filters=1024 1115 | activation=leaky 1116 | 1117 | [convolutional] 1118 | size=1 1119 | stride=1 1120 | pad=1 1121 | filters=255 1122 | activation=linear 1123 | 1124 | 1125 | [yolo] 1126 | mask = 6,7,8 1127 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 1128 | classes=80 1129 | num=9 1130 | jitter=.3 1131 | ignore_thresh = .7 1132 | truth_thresh = 1 1133 | random=1 1134 | -------------------------------------------------------------------------------- /main_yolobile.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import argparse 3 | import numpy as np 4 | 5 | # Initialize the parameters 6 | confThreshold = 0.25 # Confidence threshold 7 | nmsThreshold = 0.4 # Non-maximum suppression threshold 8 | inpWidth = 320 # Width of network's input image 9 | inpHeight = 320 # Height of network's input image 10 | 11 | # Give the configuration and weight files for the model and load the network using them. 12 | modelConfiguration = "csdarknet53s-panet-spp.cfg" 13 | modelWeights = "yolobile.weights" 14 | 15 | # Load names of classes 16 | classesFile = "coco.names" 17 | classes = None 18 | with open(classesFile, 'rt') as f: 19 | classes = f.read().rstrip('\n').split('\n') 20 | colors = [np.random.randint(0, 255, size=3).tolist() for _ in range(len(classes))] 21 | 22 | # Get the names of the output layers 23 | def getOutputsNames(net): 24 | # Get the names of all the layers in the network 25 | layersNames = net.getLayerNames() 26 | # print(dir(net)) 27 | # Get the names of the output layers, i.e. the layers with unconnected outputs 28 | return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()] 29 | 30 | # Draw the predicted bounding box 31 | def drawPred(classId, conf, left, top, right, bottom): 32 | # Draw a bounding box. 33 | cv.rectangle(frame, (left, top), (right, bottom), (0,0,255), thickness=4) 34 | 35 | label = '%.2f' % conf 36 | 37 | # Get the label for the class name and its confidence 38 | if classes: 39 | assert (classId < len(classes)) 40 | label = '%s:%s' % (classes[classId], label) 41 | 42 | # Display the label at the top of the bounding box 43 | labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1) 44 | top = max(top, labelSize[1]) 45 | # cv.rectangle(frame, (left, top - round(1.5 * labelSize[1])), (left + round(1.5 * labelSize[0]), top + baseLine), (255,255,255), cv.FILLED) 46 | cv.putText(frame, label, (left, top-10), cv.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), thickness=2) 47 | 48 | # Remove the bounding boxes with low confidence using non-maxima suppression 49 | def postprocess(frame, outs): 50 | frameHeight = frame.shape[0] 51 | frameWidth = frame.shape[1] 52 | 53 | # Scan through all the bounding boxes output from the network and keep only the 54 | # ones with high confidence scores. Assign the box's class label as the class with the highest score. 
55 | classIds = [] 56 | confidences = [] 57 | boxes = [] 58 | for out in outs: 59 | for detection in out: 60 | scores = detection[5:] 61 | classId = np.argmax(scores) 62 | confidence = scores[classId] 63 | if confidence > confThreshold: 64 | center_x = int(detection[0] * frameWidth) 65 | center_y = int(detection[1] * frameHeight) 66 | width = int(detection[2] * frameWidth) 67 | height = int(detection[3] * frameHeight) 68 | left = int(center_x - width / 2) 69 | top = int(center_y - height / 2) 70 | classIds.append(classId) 71 | confidences.append(float(confidence)) 72 | boxes.append([left, top, width, height]) 73 | 74 | # Perform non maximum suppression to eliminate redundant overlapping boxes with 75 | # lower confidences. 76 | indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold) 77 | for i in indices: 78 | i = i[0] 79 | box = boxes[i] 80 | left = box[0] 81 | top = box[1] 82 | width = box[2] 83 | height = box[3] 84 | drawPred(classIds[i], confidences[i], left, top, left + width, top + height) 85 | 86 | if __name__=='__main__': 87 | parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV') 88 | parser.add_argument('--image', type=str, default='bus.jpg', help='Path to image file.') 89 | args = parser.parse_args() 90 | 91 | net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights) 92 | net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV) 93 | net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU) 94 | # Process inputs 95 | frame = cv.imread(args.image) 96 | 97 | # Create a 4D blob from a frame. 98 | blob = cv.dnn.blobFromImage(frame, 1/255.0, (inpWidth, inpHeight), [0, 0, 0], swapRB=False, crop=False) 99 | 100 | # Sets the input to the network 101 | net.setInput(blob) 102 | 103 | # Runs the forward pass to get output of the output layers 104 | outs = net.forward(getOutputsNames(net)) 105 | # Remove the bounding boxes with low confidence 106 | postprocess(frame, outs) 107 | 108 | # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes) 109 | t, _ = net.getPerfProfile() 110 | label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency()) 111 | cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255)) 112 | 113 | winName = 'Deep learning object detection in OpenCV' 114 | cv.namedWindow(winName,0) 115 | cv.imshow(winName, frame) 116 | cv.waitKey(0) 117 | cv.destroyAllWindows() 118 | --------------------------------------------------------------------------------
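
A note on OpenCV versions for main_yolobile.py: getOutputsNames() and the NMS loop in postprocess() index the results of getUnconnectedOutLayers() and NMSBoxes() with i[0], which matches older OpenCV 4.x builds where those calls return Nx1 arrays; newer OpenCV builds return flat arrays there, so i[0] raises an error. The sketch below is a version-tolerant variant under that assumption; getOutputsNamesCompat and nmsIndicesCompat are hypothetical helper names, not part of this repository.

```python
import cv2 as cv
import numpy as np

def getOutputsNamesCompat(net):
    # Layer IDs are 1-based while getLayerNames() is a 0-based list, hence the -1.
    # flatten() accepts both the old Nx1 and the new 1-D return shapes.
    layersNames = net.getLayerNames()
    return [layersNames[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

def nmsIndicesCompat(boxes, confidences, confThreshold, nmsThreshold):
    # NMSBoxes() may return [[0], [2], ...] on older OpenCV or [0, 2, ...] on newer ones;
    # flattening gives plain integer indices either way.
    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    return np.array(indices).flatten().astype(int).tolist()
```

With these helpers, postprocess() can iterate directly over integer indices and drop the i = i[0] unpacking, regardless of the installed OpenCV version.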
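
On the yolov5s.pt point in the README: convert_darknet.py expects a checkpoint whose 'model' entry is a plain state dict, which is how best8x-514.pt is laid out. A minimal sketch, assuming only that PyTorch is installed (inspect_checkpoint is a hypothetical helper, not part of this repository), for checking what a .pt file actually contains before attempting a conversion:

```python
import torch

def inspect_checkpoint(path):
    # Load on CPU and report the checkpoint layout. A dict whose 'model' entry is
    # itself a dict of tensors is what convert_darknet.py can convert; a checkpoint
    # whose 'model' entry is a pickled module object cannot be passed to
    # load_state_dict directly.
    ckpt = torch.load(path, map_location='cpu')
    if isinstance(ckpt, dict) and 'model' in ckpt:
        entry = ckpt['model']
        if isinstance(entry, dict):
            print("state dict with %d tensors" % len(entry))
        else:
            print("'model' entry is a %s object, not a state dict" % type(entry).__name__)
    else:
        print("unexpected checkpoint layout: %s" % type(ckpt).__name__)
```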