├── README.md ├── coco.names ├── coco_model │   ├── yolov4-tiny.cfg │   ├── yolov4-tiny.weights │   └── yolov4.cfg ├── dog.jpg ├── main2_yolov4.py ├── main_yolov4.py └── person.jpg /README.md: -------------------------------------------------------------------------------- 1 | # yolov4-opencv-dnn 2 | YOLOv4 object detection with OpenCV's dnn module 3 | Runtime environment: Python 3.7 + OpenCV 4.4 4 | 5 | Because the yolov4.weights file is too large to upload directly, download yolov4.weights into the coco_model folder before running the programs. 6 | Download links for yolov4.weights: 7 | Baidu(https://pan.baidu.com/s/1dAGEW8cm-dqK14TbhhVetA Extraction code: dm5b) 8 | Google(https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT) 9 | -------------------------------------------------------------------------------- /coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /coco_model/yolov4-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.00261 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=2 30 | pad=1 31 | activation=leaky 32 | 33 | [convolutional] 34 | batch_normalize=1 35 | filters=64 36 | size=3 37 | stride=2 38 | pad=1 39 | activation=leaky 40 | 41 | [convolutional] 42 | batch_normalize=1 43 | filters=64 44 | size=3 45 | stride=1 46 | pad=1 47 | activation=leaky 48 | 49 | [route] 50 | layers=-1 51 | groups=2 52 | group_id=1 53 | 54 | [convolutional] 55 | batch_normalize=1 56 | filters=32 57 | size=3 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=32 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [route] 71 | layers = -1,-2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [route] 82 | layers = -6,-1 83 | 84 | [maxpool] 85 | size=2 86 | stride=2 87 | 88 | [convolutional] 89 | batch_normalize=1 90 | filters=128 91 | size=3 92 | stride=1 93 | pad=1 94 | activation=leaky 95 | 96 | [route] 97 | layers=-1 98 | groups=2 99 | group_id=1 100 | 101 | [convolutional] 102 | 
batch_normalize=1 103 | filters=64 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [convolutional] 110 | batch_normalize=1 111 | filters=64 112 | size=3 113 | stride=1 114 | pad=1 115 | activation=leaky 116 | 117 | [route] 118 | layers = -1,-2 119 | 120 | [convolutional] 121 | batch_normalize=1 122 | filters=128 123 | size=1 124 | stride=1 125 | pad=1 126 | activation=leaky 127 | 128 | [route] 129 | layers = -6,-1 130 | 131 | [maxpool] 132 | size=2 133 | stride=2 134 | 135 | [convolutional] 136 | batch_normalize=1 137 | filters=256 138 | size=3 139 | stride=1 140 | pad=1 141 | activation=leaky 142 | 143 | [route] 144 | layers=-1 145 | groups=2 146 | group_id=1 147 | 148 | [convolutional] 149 | batch_normalize=1 150 | filters=128 151 | size=3 152 | stride=1 153 | pad=1 154 | activation=leaky 155 | 156 | [convolutional] 157 | batch_normalize=1 158 | filters=128 159 | size=3 160 | stride=1 161 | pad=1 162 | activation=leaky 163 | 164 | [route] 165 | layers = -1,-2 166 | 167 | [convolutional] 168 | batch_normalize=1 169 | filters=256 170 | size=1 171 | stride=1 172 | pad=1 173 | activation=leaky 174 | 175 | [route] 176 | layers = -6,-1 177 | 178 | [maxpool] 179 | size=2 180 | stride=2 181 | 182 | [convolutional] 183 | batch_normalize=1 184 | filters=512 185 | size=3 186 | stride=1 187 | pad=1 188 | activation=leaky 189 | 190 | ################################## 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | filters=256 195 | size=1 196 | stride=1 197 | pad=1 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | filters=512 203 | size=3 204 | stride=1 205 | pad=1 206 | activation=leaky 207 | 208 | [convolutional] 209 | size=1 210 | stride=1 211 | pad=1 212 | filters=255 213 | activation=linear 214 | 215 | 216 | 217 | [yolo] 218 | mask = 3,4,5 219 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 220 | classes=80 221 | num=6 222 | jitter=.3 223 | scale_x_y = 1.05 224 | cls_normalizer=1.0 225 | iou_normalizer=0.07 226 | iou_loss=ciou 227 | ignore_thresh = .7 228 | truth_thresh = 1 229 | random=0 230 | resize=1.5 231 | nms_kind=greedynms 232 | beta_nms=0.6 233 | 234 | [route] 235 | layers = -4 236 | 237 | [convolutional] 238 | batch_normalize=1 239 | filters=128 240 | size=1 241 | stride=1 242 | pad=1 243 | activation=leaky 244 | 245 | [upsample] 246 | stride=2 247 | 248 | [route] 249 | layers = -1, 23 250 | 251 | [convolutional] 252 | batch_normalize=1 253 | filters=256 254 | size=3 255 | stride=1 256 | pad=1 257 | activation=leaky 258 | 259 | [convolutional] 260 | size=1 261 | stride=1 262 | pad=1 263 | filters=255 264 | activation=linear 265 | 266 | [yolo] 267 | mask = 1,2,3 268 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 269 | classes=80 270 | num=6 271 | jitter=.3 272 | scale_x_y = 1.05 273 | cls_normalizer=1.0 274 | iou_normalizer=0.07 275 | iou_loss=ciou 276 | ignore_thresh = .7 277 | truth_thresh = 1 278 | random=0 279 | resize=1.5 280 | nms_kind=greedynms 281 | beta_nms=0.6 282 | -------------------------------------------------------------------------------- /coco_model/yolov4-tiny.weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolov4-opencv-dnn/4b034978d3b51afad77219d10832d5938d5edff9/coco_model/yolov4-tiny.weights -------------------------------------------------------------------------------- /coco_model/yolov4.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 
3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=8 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.949 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.00261 19 | burn_in=1000 20 | max_batches = 500500 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | #cutmix=1 26 | mosaic=1 27 | 28 | #:104x104 54:52x52 85:26x26 104:13x13 for 416 29 | 30 | [convolutional] 31 | batch_normalize=1 32 | filters=32 33 | size=3 34 | stride=1 35 | pad=1 36 | activation=mish 37 | 38 | # Downsample 39 | 40 | [convolutional] 41 | batch_normalize=1 42 | filters=64 43 | size=3 44 | stride=2 45 | pad=1 46 | activation=mish 47 | 48 | [convolutional] 49 | batch_normalize=1 50 | filters=64 51 | size=1 52 | stride=1 53 | pad=1 54 | activation=mish 55 | 56 | [route] 57 | layers = -2 58 | 59 | [convolutional] 60 | batch_normalize=1 61 | filters=64 62 | size=1 63 | stride=1 64 | pad=1 65 | activation=mish 66 | 67 | [convolutional] 68 | batch_normalize=1 69 | filters=32 70 | size=1 71 | stride=1 72 | pad=1 73 | activation=mish 74 | 75 | [convolutional] 76 | batch_normalize=1 77 | filters=64 78 | size=3 79 | stride=1 80 | pad=1 81 | activation=mish 82 | 83 | [shortcut] 84 | from=-3 85 | activation=linear 86 | 87 | [convolutional] 88 | batch_normalize=1 89 | filters=64 90 | size=1 91 | stride=1 92 | pad=1 93 | activation=mish 94 | 95 | [route] 96 | layers = -1,-7 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=64 101 | size=1 102 | stride=1 103 | pad=1 104 | activation=mish 105 | 106 | # Downsample 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=128 111 | size=3 112 | stride=2 113 | pad=1 114 | activation=mish 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=64 119 | size=1 120 | stride=1 121 | pad=1 122 | activation=mish 123 | 124 | [route] 125 | layers = -2 126 | 127 | [convolutional] 128 | batch_normalize=1 129 | filters=64 130 | size=1 131 | stride=1 132 | pad=1 133 | activation=mish 134 | 135 | [convolutional] 136 | batch_normalize=1 137 | filters=64 138 | size=1 139 | stride=1 140 | pad=1 141 | activation=mish 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=64 146 | size=3 147 | stride=1 148 | pad=1 149 | activation=mish 150 | 151 | [shortcut] 152 | from=-3 153 | activation=linear 154 | 155 | [convolutional] 156 | batch_normalize=1 157 | filters=64 158 | size=1 159 | stride=1 160 | pad=1 161 | activation=mish 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=64 166 | size=3 167 | stride=1 168 | pad=1 169 | activation=mish 170 | 171 | [shortcut] 172 | from=-3 173 | activation=linear 174 | 175 | [convolutional] 176 | batch_normalize=1 177 | filters=64 178 | size=1 179 | stride=1 180 | pad=1 181 | activation=mish 182 | 183 | [route] 184 | layers = -1,-10 185 | 186 | [convolutional] 187 | batch_normalize=1 188 | filters=128 189 | size=1 190 | stride=1 191 | pad=1 192 | activation=mish 193 | 194 | # Downsample 195 | 196 | [convolutional] 197 | batch_normalize=1 198 | filters=256 199 | size=3 200 | stride=2 201 | pad=1 202 | activation=mish 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=mish 211 | 212 | [route] 213 | layers = -2 214 | 215 | [convolutional] 216 | batch_normalize=1 217 | filters=128 218 | size=1 219 | stride=1 220 | pad=1 221 | activation=mish 222 | 223 | [convolutional] 224 | batch_normalize=1 225 | filters=128 226 | size=1 227 | 
stride=1 228 | pad=1 229 | activation=mish 230 | 231 | [convolutional] 232 | batch_normalize=1 233 | filters=128 234 | size=3 235 | stride=1 236 | pad=1 237 | activation=mish 238 | 239 | [shortcut] 240 | from=-3 241 | activation=linear 242 | 243 | [convolutional] 244 | batch_normalize=1 245 | filters=128 246 | size=1 247 | stride=1 248 | pad=1 249 | activation=mish 250 | 251 | [convolutional] 252 | batch_normalize=1 253 | filters=128 254 | size=3 255 | stride=1 256 | pad=1 257 | activation=mish 258 | 259 | [shortcut] 260 | from=-3 261 | activation=linear 262 | 263 | [convolutional] 264 | batch_normalize=1 265 | filters=128 266 | size=1 267 | stride=1 268 | pad=1 269 | activation=mish 270 | 271 | [convolutional] 272 | batch_normalize=1 273 | filters=128 274 | size=3 275 | stride=1 276 | pad=1 277 | activation=mish 278 | 279 | [shortcut] 280 | from=-3 281 | activation=linear 282 | 283 | [convolutional] 284 | batch_normalize=1 285 | filters=128 286 | size=1 287 | stride=1 288 | pad=1 289 | activation=mish 290 | 291 | [convolutional] 292 | batch_normalize=1 293 | filters=128 294 | size=3 295 | stride=1 296 | pad=1 297 | activation=mish 298 | 299 | [shortcut] 300 | from=-3 301 | activation=linear 302 | 303 | 304 | [convolutional] 305 | batch_normalize=1 306 | filters=128 307 | size=1 308 | stride=1 309 | pad=1 310 | activation=mish 311 | 312 | [convolutional] 313 | batch_normalize=1 314 | filters=128 315 | size=3 316 | stride=1 317 | pad=1 318 | activation=mish 319 | 320 | [shortcut] 321 | from=-3 322 | activation=linear 323 | 324 | [convolutional] 325 | batch_normalize=1 326 | filters=128 327 | size=1 328 | stride=1 329 | pad=1 330 | activation=mish 331 | 332 | [convolutional] 333 | batch_normalize=1 334 | filters=128 335 | size=3 336 | stride=1 337 | pad=1 338 | activation=mish 339 | 340 | [shortcut] 341 | from=-3 342 | activation=linear 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=128 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=mish 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=128 355 | size=3 356 | stride=1 357 | pad=1 358 | activation=mish 359 | 360 | [shortcut] 361 | from=-3 362 | activation=linear 363 | 364 | [convolutional] 365 | batch_normalize=1 366 | filters=128 367 | size=1 368 | stride=1 369 | pad=1 370 | activation=mish 371 | 372 | [convolutional] 373 | batch_normalize=1 374 | filters=128 375 | size=3 376 | stride=1 377 | pad=1 378 | activation=mish 379 | 380 | [shortcut] 381 | from=-3 382 | activation=linear 383 | 384 | [convolutional] 385 | batch_normalize=1 386 | filters=128 387 | size=1 388 | stride=1 389 | pad=1 390 | activation=mish 391 | 392 | [route] 393 | layers = -1,-28 394 | 395 | [convolutional] 396 | batch_normalize=1 397 | filters=256 398 | size=1 399 | stride=1 400 | pad=1 401 | activation=mish 402 | 403 | # Downsample 404 | 405 | [convolutional] 406 | batch_normalize=1 407 | filters=512 408 | size=3 409 | stride=2 410 | pad=1 411 | activation=mish 412 | 413 | [convolutional] 414 | batch_normalize=1 415 | filters=256 416 | size=1 417 | stride=1 418 | pad=1 419 | activation=mish 420 | 421 | [route] 422 | layers = -2 423 | 424 | [convolutional] 425 | batch_normalize=1 426 | filters=256 427 | size=1 428 | stride=1 429 | pad=1 430 | activation=mish 431 | 432 | [convolutional] 433 | batch_normalize=1 434 | filters=256 435 | size=1 436 | stride=1 437 | pad=1 438 | activation=mish 439 | 440 | [convolutional] 441 | batch_normalize=1 442 | filters=256 443 | size=3 444 | stride=1 445 | pad=1 446 | activation=mish 447 | 
448 | [shortcut] 449 | from=-3 450 | activation=linear 451 | 452 | 453 | [convolutional] 454 | batch_normalize=1 455 | filters=256 456 | size=1 457 | stride=1 458 | pad=1 459 | activation=mish 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=256 464 | size=3 465 | stride=1 466 | pad=1 467 | activation=mish 468 | 469 | [shortcut] 470 | from=-3 471 | activation=linear 472 | 473 | 474 | [convolutional] 475 | batch_normalize=1 476 | filters=256 477 | size=1 478 | stride=1 479 | pad=1 480 | activation=mish 481 | 482 | [convolutional] 483 | batch_normalize=1 484 | filters=256 485 | size=3 486 | stride=1 487 | pad=1 488 | activation=mish 489 | 490 | [shortcut] 491 | from=-3 492 | activation=linear 493 | 494 | 495 | [convolutional] 496 | batch_normalize=1 497 | filters=256 498 | size=1 499 | stride=1 500 | pad=1 501 | activation=mish 502 | 503 | [convolutional] 504 | batch_normalize=1 505 | filters=256 506 | size=3 507 | stride=1 508 | pad=1 509 | activation=mish 510 | 511 | [shortcut] 512 | from=-3 513 | activation=linear 514 | 515 | 516 | [convolutional] 517 | batch_normalize=1 518 | filters=256 519 | size=1 520 | stride=1 521 | pad=1 522 | activation=mish 523 | 524 | [convolutional] 525 | batch_normalize=1 526 | filters=256 527 | size=3 528 | stride=1 529 | pad=1 530 | activation=mish 531 | 532 | [shortcut] 533 | from=-3 534 | activation=linear 535 | 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=256 540 | size=1 541 | stride=1 542 | pad=1 543 | activation=mish 544 | 545 | [convolutional] 546 | batch_normalize=1 547 | filters=256 548 | size=3 549 | stride=1 550 | pad=1 551 | activation=mish 552 | 553 | [shortcut] 554 | from=-3 555 | activation=linear 556 | 557 | 558 | [convolutional] 559 | batch_normalize=1 560 | filters=256 561 | size=1 562 | stride=1 563 | pad=1 564 | activation=mish 565 | 566 | [convolutional] 567 | batch_normalize=1 568 | filters=256 569 | size=3 570 | stride=1 571 | pad=1 572 | activation=mish 573 | 574 | [shortcut] 575 | from=-3 576 | activation=linear 577 | 578 | [convolutional] 579 | batch_normalize=1 580 | filters=256 581 | size=1 582 | stride=1 583 | pad=1 584 | activation=mish 585 | 586 | [convolutional] 587 | batch_normalize=1 588 | filters=256 589 | size=3 590 | stride=1 591 | pad=1 592 | activation=mish 593 | 594 | [shortcut] 595 | from=-3 596 | activation=linear 597 | 598 | [convolutional] 599 | batch_normalize=1 600 | filters=256 601 | size=1 602 | stride=1 603 | pad=1 604 | activation=mish 605 | 606 | [route] 607 | layers = -1,-28 608 | 609 | [convolutional] 610 | batch_normalize=1 611 | filters=512 612 | size=1 613 | stride=1 614 | pad=1 615 | activation=mish 616 | 617 | # Downsample 618 | 619 | [convolutional] 620 | batch_normalize=1 621 | filters=1024 622 | size=3 623 | stride=2 624 | pad=1 625 | activation=mish 626 | 627 | [convolutional] 628 | batch_normalize=1 629 | filters=512 630 | size=1 631 | stride=1 632 | pad=1 633 | activation=mish 634 | 635 | [route] 636 | layers = -2 637 | 638 | [convolutional] 639 | batch_normalize=1 640 | filters=512 641 | size=1 642 | stride=1 643 | pad=1 644 | activation=mish 645 | 646 | [convolutional] 647 | batch_normalize=1 648 | filters=512 649 | size=1 650 | stride=1 651 | pad=1 652 | activation=mish 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=512 657 | size=3 658 | stride=1 659 | pad=1 660 | activation=mish 661 | 662 | [shortcut] 663 | from=-3 664 | activation=linear 665 | 666 | [convolutional] 667 | batch_normalize=1 668 | filters=512 669 | size=1 670 | stride=1 
671 | pad=1 672 | activation=mish 673 | 674 | [convolutional] 675 | batch_normalize=1 676 | filters=512 677 | size=3 678 | stride=1 679 | pad=1 680 | activation=mish 681 | 682 | [shortcut] 683 | from=-3 684 | activation=linear 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=512 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=mish 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | filters=512 697 | size=3 698 | stride=1 699 | pad=1 700 | activation=mish 701 | 702 | [shortcut] 703 | from=-3 704 | activation=linear 705 | 706 | [convolutional] 707 | batch_normalize=1 708 | filters=512 709 | size=1 710 | stride=1 711 | pad=1 712 | activation=mish 713 | 714 | [convolutional] 715 | batch_normalize=1 716 | filters=512 717 | size=3 718 | stride=1 719 | pad=1 720 | activation=mish 721 | 722 | [shortcut] 723 | from=-3 724 | activation=linear 725 | 726 | [convolutional] 727 | batch_normalize=1 728 | filters=512 729 | size=1 730 | stride=1 731 | pad=1 732 | activation=mish 733 | 734 | [route] 735 | layers = -1,-16 736 | 737 | [convolutional] 738 | batch_normalize=1 739 | filters=1024 740 | size=1 741 | stride=1 742 | pad=1 743 | activation=mish 744 | 745 | ########################## 746 | 747 | [convolutional] 748 | batch_normalize=1 749 | filters=512 750 | size=1 751 | stride=1 752 | pad=1 753 | activation=leaky 754 | 755 | [convolutional] 756 | batch_normalize=1 757 | size=3 758 | stride=1 759 | pad=1 760 | filters=1024 761 | activation=leaky 762 | 763 | [convolutional] 764 | batch_normalize=1 765 | filters=512 766 | size=1 767 | stride=1 768 | pad=1 769 | activation=leaky 770 | 771 | ### SPP ### 772 | [maxpool] 773 | stride=1 774 | size=5 775 | 776 | [route] 777 | layers=-2 778 | 779 | [maxpool] 780 | stride=1 781 | size=9 782 | 783 | [route] 784 | layers=-4 785 | 786 | [maxpool] 787 | stride=1 788 | size=13 789 | 790 | [route] 791 | layers=-1,-3,-5,-6 792 | ### End SPP ### 793 | 794 | [convolutional] 795 | batch_normalize=1 796 | filters=512 797 | size=1 798 | stride=1 799 | pad=1 800 | activation=leaky 801 | 802 | [convolutional] 803 | batch_normalize=1 804 | size=3 805 | stride=1 806 | pad=1 807 | filters=1024 808 | activation=leaky 809 | 810 | [convolutional] 811 | batch_normalize=1 812 | filters=512 813 | size=1 814 | stride=1 815 | pad=1 816 | activation=leaky 817 | 818 | [convolutional] 819 | batch_normalize=1 820 | filters=256 821 | size=1 822 | stride=1 823 | pad=1 824 | activation=leaky 825 | 826 | [upsample] 827 | stride=2 828 | 829 | [route] 830 | layers = 85 831 | 832 | [convolutional] 833 | batch_normalize=1 834 | filters=256 835 | size=1 836 | stride=1 837 | pad=1 838 | activation=leaky 839 | 840 | [route] 841 | layers = -1, -3 842 | 843 | [convolutional] 844 | batch_normalize=1 845 | filters=256 846 | size=1 847 | stride=1 848 | pad=1 849 | activation=leaky 850 | 851 | [convolutional] 852 | batch_normalize=1 853 | size=3 854 | stride=1 855 | pad=1 856 | filters=512 857 | activation=leaky 858 | 859 | [convolutional] 860 | batch_normalize=1 861 | filters=256 862 | size=1 863 | stride=1 864 | pad=1 865 | activation=leaky 866 | 867 | [convolutional] 868 | batch_normalize=1 869 | size=3 870 | stride=1 871 | pad=1 872 | filters=512 873 | activation=leaky 874 | 875 | [convolutional] 876 | batch_normalize=1 877 | filters=256 878 | size=1 879 | stride=1 880 | pad=1 881 | activation=leaky 882 | 883 | [convolutional] 884 | batch_normalize=1 885 | filters=128 886 | size=1 887 | stride=1 888 | pad=1 889 | activation=leaky 890 | 891 | [upsample] 892 | stride=2 893 
| 894 | [route] 895 | layers = 54 896 | 897 | [convolutional] 898 | batch_normalize=1 899 | filters=128 900 | size=1 901 | stride=1 902 | pad=1 903 | activation=leaky 904 | 905 | [route] 906 | layers = -1, -3 907 | 908 | [convolutional] 909 | batch_normalize=1 910 | filters=128 911 | size=1 912 | stride=1 913 | pad=1 914 | activation=leaky 915 | 916 | [convolutional] 917 | batch_normalize=1 918 | size=3 919 | stride=1 920 | pad=1 921 | filters=256 922 | activation=leaky 923 | 924 | [convolutional] 925 | batch_normalize=1 926 | filters=128 927 | size=1 928 | stride=1 929 | pad=1 930 | activation=leaky 931 | 932 | [convolutional] 933 | batch_normalize=1 934 | size=3 935 | stride=1 936 | pad=1 937 | filters=256 938 | activation=leaky 939 | 940 | [convolutional] 941 | batch_normalize=1 942 | filters=128 943 | size=1 944 | stride=1 945 | pad=1 946 | activation=leaky 947 | 948 | ########################## 949 | 950 | [convolutional] 951 | batch_normalize=1 952 | size=3 953 | stride=1 954 | pad=1 955 | filters=256 956 | activation=leaky 957 | 958 | [convolutional] 959 | size=1 960 | stride=1 961 | pad=1 962 | filters=255 963 | activation=linear 964 | 965 | 966 | [yolo] 967 | mask = 0,1,2 968 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 969 | classes=80 970 | num=9 971 | jitter=.3 972 | ignore_thresh = .7 973 | truth_thresh = 1 974 | scale_x_y = 1.2 975 | iou_thresh=0.213 976 | cls_normalizer=1.0 977 | iou_normalizer=0.07 978 | iou_loss=ciou 979 | nms_kind=greedynms 980 | beta_nms=0.6 981 | 982 | 983 | [route] 984 | layers = -4 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | size=3 989 | stride=2 990 | pad=1 991 | filters=256 992 | activation=leaky 993 | 994 | [route] 995 | layers = -1, -16 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | filters=256 1000 | size=1 1001 | stride=1 1002 | pad=1 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | size=3 1008 | stride=1 1009 | pad=1 1010 | filters=512 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | filters=256 1016 | size=1 1017 | stride=1 1018 | pad=1 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | size=3 1024 | stride=1 1025 | pad=1 1026 | filters=512 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | filters=256 1032 | size=1 1033 | stride=1 1034 | pad=1 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | batch_normalize=1 1039 | size=3 1040 | stride=1 1041 | pad=1 1042 | filters=512 1043 | activation=leaky 1044 | 1045 | [convolutional] 1046 | size=1 1047 | stride=1 1048 | pad=1 1049 | filters=255 1050 | activation=linear 1051 | 1052 | 1053 | [yolo] 1054 | mask = 3,4,5 1055 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1056 | classes=80 1057 | num=9 1058 | jitter=.3 1059 | ignore_thresh = .7 1060 | truth_thresh = 1 1061 | scale_x_y = 1.1 1062 | iou_thresh=0.213 1063 | cls_normalizer=1.0 1064 | iou_normalizer=0.07 1065 | iou_loss=ciou 1066 | nms_kind=greedynms 1067 | beta_nms=0.6 1068 | 1069 | 1070 | [route] 1071 | layers = -4 1072 | 1073 | [convolutional] 1074 | batch_normalize=1 1075 | size=3 1076 | stride=2 1077 | pad=1 1078 | filters=512 1079 | activation=leaky 1080 | 1081 | [route] 1082 | layers = -1, -37 1083 | 1084 | [convolutional] 1085 | batch_normalize=1 1086 | filters=512 1087 | size=1 1088 | stride=1 1089 | pad=1 1090 | activation=leaky 1091 | 1092 | [convolutional] 1093 | 
batch_normalize=1 1094 | size=3 1095 | stride=1 1096 | pad=1 1097 | filters=1024 1098 | activation=leaky 1099 | 1100 | [convolutional] 1101 | batch_normalize=1 1102 | filters=512 1103 | size=1 1104 | stride=1 1105 | pad=1 1106 | activation=leaky 1107 | 1108 | [convolutional] 1109 | batch_normalize=1 1110 | size=3 1111 | stride=1 1112 | pad=1 1113 | filters=1024 1114 | activation=leaky 1115 | 1116 | [convolutional] 1117 | batch_normalize=1 1118 | filters=512 1119 | size=1 1120 | stride=1 1121 | pad=1 1122 | activation=leaky 1123 | 1124 | [convolutional] 1125 | batch_normalize=1 1126 | size=3 1127 | stride=1 1128 | pad=1 1129 | filters=1024 1130 | activation=leaky 1131 | 1132 | [convolutional] 1133 | size=1 1134 | stride=1 1135 | pad=1 1136 | filters=255 1137 | activation=linear 1138 | 1139 | 1140 | [yolo] 1141 | mask = 6,7,8 1142 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1143 | classes=80 1144 | num=9 1145 | jitter=.3 1146 | ignore_thresh = .7 1147 | truth_thresh = 1 1148 | random=1 1149 | scale_x_y = 1.05 1150 | iou_thresh=0.213 1151 | cls_normalizer=1.0 1152 | iou_normalizer=0.07 1153 | iou_loss=ciou 1154 | nms_kind=greedynms 1155 | beta_nms=0.6 1156 | 1157 | -------------------------------------------------------------------------------- /dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolov4-opencv-dnn/4b034978d3b51afad77219d10832d5938d5edff9/dog.jpg -------------------------------------------------------------------------------- /main2_yolov4.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import argparse 3 | import random 4 | 5 | # 文件需要加载的文件 6 | cfg = "coco_model/yolov4.cfg" 7 | weights = "coco_model/yolov4.weights" 8 | className = "coco.names" 9 | 10 | if __name__=='__main__': 11 | parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV') 12 | parser.add_argument('--image', type=str, default='dog.jpg', help='Path to image file.') 13 | args = parser.parse_args() 14 | 15 | net = cv.dnn_DetectionModel(cfg, weights) 16 | net.setInputSize(608, 608) 17 | net.setInputScale(1.0 / 255) 18 | net.setInputSwapRB(True) 19 | with open(className, 'rt') as f: 20 | names = f.read().rstrip('\n').split('\n') 21 | 22 | img = cv.imread(args.image) 23 | # 模型检测 24 | classes, confidences, boxes = net.detect(img, confThreshold=0.1, nmsThreshold=0.4) 25 | # 将检测结果显示到图像上 26 | for classId, confidence, box in zip(classes.flatten(), confidences.flatten(), boxes): 27 | label = '%s: %.2f' % (names[classId], confidence) 28 | labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1) 29 | left, top, width, height = box 30 | top = max(top, labelSize[1]) 31 | b = random.randint(0, 255) 32 | g = random.randint(0, 255) 33 | r = random.randint(0, 255) 34 | cv.rectangle(img, box, color=(b, g, r), thickness=2) 35 | cv.rectangle(img, (left - 1, top - labelSize[1]), (left + labelSize[0], top), (b, g, r), cv.FILLED) 36 | cv.putText(img, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (255 - b, 255 - g, 255 - r)) 37 | cv.namedWindow('detect out', cv.WINDOW_NORMAL) 38 | cv.imshow('detect out', img) 39 | cv.waitKey(0) -------------------------------------------------------------------------------- /main_yolov4.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import argparse 3 | import numpy as np 4 | 5 | # Initialize the parameters 6 
| confThreshold = 0.1 # Confidence threshold 7 | nmsThreshold = 0.4 # Non-maximum suppression threshold 8 | inpWidth = 608 # Width of network's input image yolov4: 608, yolov4-tiny: 416 9 | inpHeight = 608 # Height of network's input image 10 | 11 | # Give the configuration and weight files for the model and load the network using them. 12 | modelConfiguration = "coco_model/yolov4.cfg" 13 | modelWeights = "coco_model/yolov4.weights" 14 | 15 | # Load names of classes 16 | classesFile = "coco.names" 17 | classes = None 18 | with open(classesFile, 'rt') as f: 19 | classes = f.read().rstrip('\n').split('\n') 20 | colors = [np.random.randint(0, 255, size=3).tolist() for _ in range(len(classes))] 21 | 22 | # Get the names of the output layers 23 | def getOutputsNames(net): 24 | # Get the names of all the layers in the network 25 | layersNames = net.getLayerNames() 26 | # print(dir(net)) 27 | # Get the names of the output layers, i.e. the layers with unconnected outputs 28 | return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()] 29 | 30 | # Draw the predicted bounding box 31 | def drawPred(classId, conf, left, top, right, bottom): 32 | # Draw a bounding box. 33 | cv.rectangle(frame, (left, top), (right, bottom), (0,0,255), thickness=4) 34 | 35 | label = '%.2f' % conf 36 | 37 | # Get the label for the class name and its confidence 38 | if classes: 39 | assert (classId < len(classes)) 40 | label = '%s:%s' % (classes[classId], label) 41 | 42 | # Display the label at the top of the bounding box 43 | labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1) 44 | top = max(top, labelSize[1]) 45 | # cv.rectangle(frame, (left, top - round(1.5 * labelSize[1])), (left + round(1.5 * labelSize[0]), top + baseLine), (255,255,255), cv.FILLED) 46 | cv.putText(frame, label, (left, top-10), cv.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), thickness=2) 47 | 48 | # Remove the bounding boxes with low confidence using non-maxima suppression 49 | def postprocess(frame, outs): 50 | frameHeight = frame.shape[0] 51 | frameWidth = frame.shape[1] 52 | 53 | 54 | 55 | 56 | # Scan through all the bounding boxes output from the network and keep only the 57 | # ones with high confidence scores. Assign the box's class label as the class with the highest score. 58 | classIds = [] 59 | confidences = [] 60 | boxes = [] 61 | for out in outs: 62 | for detection in out: 63 | scores = detection[5:] 64 | classId = np.argmax(scores) 65 | confidence = scores[classId] 66 | if confidence > confThreshold: 67 | center_x = int(detection[0] * frameWidth) 68 | center_y = int(detection[1] * frameHeight) 69 | width = int(detection[2] * frameWidth) 70 | height = int(detection[3] * frameHeight) 71 | left = int(center_x - width / 2) 72 | top = int(center_y - height / 2) 73 | classIds.append(classId) 74 | confidences.append(float(confidence)) 75 | boxes.append([left, top, width, height]) 76 | 77 | # Perform non maximum suppression to eliminate redundant overlapping boxes with 78 | # lower confidences. 
79 | indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold) 80 | for i in indices: 81 | i = i[0] 82 | box = boxes[i] 83 | left = box[0] 84 | top = box[1] 85 | width = box[2] 86 | height = box[3] 87 | drawPred(classIds[i], confidences[i], left, top, left + width, top + height) 88 | 89 | if __name__=='__main__': 90 | parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV') 91 | parser.add_argument('--image', type=str, default='dog.jpg', help='Path to image file.') 92 | args = parser.parse_args() 93 | 94 | net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights) 95 | net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV) 96 | net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU) 97 | # Process inputs 98 | frame = cv.imread(args.image) 99 | 100 | # Create a 4D blob from a frame. 101 | blob = cv.dnn.blobFromImage(frame, 1/255.0, (inpWidth, inpHeight), [0, 0, 0], swapRB=False, crop=False) 102 | 103 | # Sets the input to the network 104 | net.setInput(blob) 105 | 106 | # Runs the forward pass to get output of the output layers 107 | outs = net.forward(getOutputsNames(net)) 108 | # Remove the bounding boxes with low confidence 109 | postprocess(frame, outs) 110 | 111 | # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes) 112 | t, _ = net.getPerfProfile() 113 | label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency()) 114 | cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255)) 115 | 116 | winName = 'Deep learning object detection in OpenCV' 117 | cv.namedWindow(winName,0) 118 | cv.imshow(winName, frame) 119 | cv.waitKey(0) 120 | cv.destroyAllWindows() -------------------------------------------------------------------------------- /person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolov4-opencv-dnn/4b034978d3b51afad77219d10832d5938d5edff9/person.jpg --------------------------------------------------------------------------------
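
Both demo scripts are hard-coded to the full model (coco_model/yolov4.cfg with coco_model/yolov4.weights at a 608x608 input), but the repository also ships coco_model/yolov4-tiny.cfg and coco_model/yolov4-tiny.weights. The following is a minimal sketch, not part of the original scripts, showing how the same cv.dnn_DetectionModel API used in main2_yolov4.py could be pointed at the bundled tiny model; the 416x416 input size comes from the width/height fields in yolov4-tiny.cfg, and the file paths assume the repository layout above.

import cv2 as cv

# Sketch: run the bundled yolov4-tiny model through the DetectionModel API
net = cv.dnn_DetectionModel("coco_model/yolov4-tiny.cfg", "coco_model/yolov4-tiny.weights")
net.setInputSize(416, 416)    # yolov4-tiny.cfg sets width=416, height=416
net.setInputScale(1.0 / 255)  # scale pixel values to [0, 1]
net.setInputSwapRB(True)      # OpenCV reads BGR; Darknet models expect RGB

with open("coco.names", 'rt') as f:
    names = f.read().rstrip('\n').split('\n')

img = cv.imread("dog.jpg")
classes, confidences, boxes = net.detect(img, confThreshold=0.1, nmsThreshold=0.4)
for classId, confidence, box in zip(classes.flatten(), confidences.flatten(), boxes):
    print('%s: %.2f at %s' % (names[classId], confidence, box))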
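
main_yolov4.py targets the Python 3.7 + OpenCV 4.4 environment stated in the README, where net.getUnconnectedOutLayers() and cv.dnn.NMSBoxes() return Nx1 index arrays, so the i[0] indexing in getOutputsNames() and postprocess() is required. In newer OpenCV 4.x releases those calls return flat 1-D arrays and i[0] raises an IndexError. A version-agnostic variant, offered as an assumption rather than a change the repository makes, is to flatten the returned indices:

import numpy as np

def getOutputsNames(net):
    # Flattening handles both the older Nx1 and the newer 1-D index arrays
    layersNames = net.getLayerNames()
    return [layersNames[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

# ...and likewise inside postprocess():
#     for i in np.array(indices).flatten():
#         box = boxes[i]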