├── README.md ├── coco.names ├── coco_model │   ├── yolov4-tiny.cfg │   ├── yolov4-tiny.weights │   └── yolov4.cfg ├── dog.jpg ├── main2_yolov4.py ├── main_yolov4.py └── person.jpg /README.md: -------------------------------------------------------------------------------- 1 | # yolov4-opencv-dnn 2 | YOLOv4 object detection with OpenCV's dnn module 3 | Runtime environment: Python 3.7 + OpenCV 4.4 4 | 5 | Because the yolov4.weights file is too large to upload directly, download yolov4.weights into the coco_model folder before running the programs. 6 | Download links for yolov4.weights: 7 | Baidu(https://pan.baidu.com/s/1dAGEW8cm-dqK14TbhhVetA Extraction code: dm5b) 8 | Google(https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT) 9 | -------------------------------------------------------------------------------- /coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /coco_model/yolov4-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.00261 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=2 30 | pad=1 31 | activation=leaky 32 | 33 | [convolutional] 34 | batch_normalize=1 35 | filters=64 36 | size=3 37 | stride=2 38 | pad=1 39 | activation=leaky 40 | 41 | [convolutional] 42 | batch_normalize=1 43 | filters=64 44 | size=3 45 | stride=1 46 | pad=1 47 | activation=leaky 48 | 49 | [route] 50 | layers=-1 51 | groups=2 52 | group_id=1 53 | 54 | [convolutional] 55 | batch_normalize=1 56 | filters=32 57 | size=3 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=32 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [route] 71 | layers = -1,-2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [route] 82 | layers = -6,-1 83 | 84 | [maxpool] 85 | size=2 86 | stride=2 87 | 88 | [convolutional] 89 | batch_normalize=1 90 | filters=128 91 | size=3 92 | stride=1 93 | pad=1 94 | activation=leaky 95 | 96 | [route] 97 | layers=-1 98 | groups=2 99 | group_id=1 100 | 101 | [convolutional] 102 | 
batch_normalize=1 103 | filters=64 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [convolutional] 110 | batch_normalize=1 111 | filters=64 112 | size=3 113 | stride=1 114 | pad=1 115 | activation=leaky 116 | 117 | [route] 118 | layers = -1,-2 119 | 120 | [convolutional] 121 | batch_normalize=1 122 | filters=128 123 | size=1 124 | stride=1 125 | pad=1 126 | activation=leaky 127 | 128 | [route] 129 | layers = -6,-1 130 | 131 | [maxpool] 132 | size=2 133 | stride=2 134 | 135 | [convolutional] 136 | batch_normalize=1 137 | filters=256 138 | size=3 139 | stride=1 140 | pad=1 141 | activation=leaky 142 | 143 | [route] 144 | layers=-1 145 | groups=2 146 | group_id=1 147 | 148 | [convolutional] 149 | batch_normalize=1 150 | filters=128 151 | size=3 152 | stride=1 153 | pad=1 154 | activation=leaky 155 | 156 | [convolutional] 157 | batch_normalize=1 158 | filters=128 159 | size=3 160 | stride=1 161 | pad=1 162 | activation=leaky 163 | 164 | [route] 165 | layers = -1,-2 166 | 167 | [convolutional] 168 | batch_normalize=1 169 | filters=256 170 | size=1 171 | stride=1 172 | pad=1 173 | activation=leaky 174 | 175 | [route] 176 | layers = -6,-1 177 | 178 | [maxpool] 179 | size=2 180 | stride=2 181 | 182 | [convolutional] 183 | batch_normalize=1 184 | filters=512 185 | size=3 186 | stride=1 187 | pad=1 188 | activation=leaky 189 | 190 | ################################## 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | filters=256 195 | size=1 196 | stride=1 197 | pad=1 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | filters=512 203 | size=3 204 | stride=1 205 | pad=1 206 | activation=leaky 207 | 208 | [convolutional] 209 | size=1 210 | stride=1 211 | pad=1 212 | filters=255 213 | activation=linear 214 | 215 | 216 | 217 | [yolo] 218 | mask = 3,4,5 219 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 220 | classes=80 221 | num=6 222 | jitter=.3 223 | scale_x_y = 1.05 224 | cls_normalizer=1.0 225 | iou_normalizer=0.07 226 | iou_loss=ciou 227 | ignore_thresh = .7 228 | truth_thresh = 1 229 | random=0 230 | resize=1.5 231 | nms_kind=greedynms 232 | beta_nms=0.6 233 | 234 | [route] 235 | layers = -4 236 | 237 | [convolutional] 238 | batch_normalize=1 239 | filters=128 240 | size=1 241 | stride=1 242 | pad=1 243 | activation=leaky 244 | 245 | [upsample] 246 | stride=2 247 | 248 | [route] 249 | layers = -1, 23 250 | 251 | [convolutional] 252 | batch_normalize=1 253 | filters=256 254 | size=3 255 | stride=1 256 | pad=1 257 | activation=leaky 258 | 259 | [convolutional] 260 | size=1 261 | stride=1 262 | pad=1 263 | filters=255 264 | activation=linear 265 | 266 | [yolo] 267 | mask = 1,2,3 268 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 269 | classes=80 270 | num=6 271 | jitter=.3 272 | scale_x_y = 1.05 273 | cls_normalizer=1.0 274 | iou_normalizer=0.07 275 | iou_loss=ciou 276 | ignore_thresh = .7 277 | truth_thresh = 1 278 | random=0 279 | resize=1.5 280 | nms_kind=greedynms 281 | beta_nms=0.6 282 | -------------------------------------------------------------------------------- /coco_model/yolov4-tiny.weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolov4-opencv-dnn/4b034978d3b51afad77219d10832d5938d5edff9/coco_model/yolov4-tiny.weights -------------------------------------------------------------------------------- /coco_model/yolov4.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 
3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=8 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.949 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.00261 19 | burn_in=1000 20 | max_batches = 500500 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | #cutmix=1 26 | mosaic=1 27 | 28 | #:104x104 54:52x52 85:26x26 104:13x13 for 416 29 | 30 | [convolutional] 31 | batch_normalize=1 32 | filters=32 33 | size=3 34 | stride=1 35 | pad=1 36 | activation=mish 37 | 38 | # Downsample 39 | 40 | [convolutional] 41 | batch_normalize=1 42 | filters=64 43 | size=3 44 | stride=2 45 | pad=1 46 | activation=mish 47 | 48 | [convolutional] 49 | batch_normalize=1 50 | filters=64 51 | size=1 52 | stride=1 53 | pad=1 54 | activation=mish 55 | 56 | [route] 57 | layers = -2 58 | 59 | [convolutional] 60 | batch_normalize=1 61 | filters=64 62 | size=1 63 | stride=1 64 | pad=1 65 | activation=mish 66 | 67 | [convolutional] 68 | batch_normalize=1 69 | filters=32 70 | size=1 71 | stride=1 72 | pad=1 73 | activation=mish 74 | 75 | [convolutional] 76 | batch_normalize=1 77 | filters=64 78 | size=3 79 | stride=1 80 | pad=1 81 | activation=mish 82 | 83 | [shortcut] 84 | from=-3 85 | activation=linear 86 | 87 | [convolutional] 88 | batch_normalize=1 89 | filters=64 90 | size=1 91 | stride=1 92 | pad=1 93 | activation=mish 94 | 95 | [route] 96 | layers = -1,-7 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=64 101 | size=1 102 | stride=1 103 | pad=1 104 | activation=mish 105 | 106 | # Downsample 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=128 111 | size=3 112 | stride=2 113 | pad=1 114 | activation=mish 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=64 119 | size=1 120 | stride=1 121 | pad=1 122 | activation=mish 123 | 124 | [route] 125 | layers = -2 126 | 127 | [convolutional] 128 | batch_normalize=1 129 | filters=64 130 | size=1 131 | stride=1 132 | pad=1 133 | activation=mish 134 | 135 | [convolutional] 136 | batch_normalize=1 137 | filters=64 138 | size=1 139 | stride=1 140 | pad=1 141 | activation=mish 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=64 146 | size=3 147 | stride=1 148 | pad=1 149 | activation=mish 150 | 151 | [shortcut] 152 | from=-3 153 | activation=linear 154 | 155 | [convolutional] 156 | batch_normalize=1 157 | filters=64 158 | size=1 159 | stride=1 160 | pad=1 161 | activation=mish 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=64 166 | size=3 167 | stride=1 168 | pad=1 169 | activation=mish 170 | 171 | [shortcut] 172 | from=-3 173 | activation=linear 174 | 175 | [convolutional] 176 | batch_normalize=1 177 | filters=64 178 | size=1 179 | stride=1 180 | pad=1 181 | activation=mish 182 | 183 | [route] 184 | layers = -1,-10 185 | 186 | [convolutional] 187 | batch_normalize=1 188 | filters=128 189 | size=1 190 | stride=1 191 | pad=1 192 | activation=mish 193 | 194 | # Downsample 195 | 196 | [convolutional] 197 | batch_normalize=1 198 | filters=256 199 | size=3 200 | stride=2 201 | pad=1 202 | activation=mish 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=mish 211 | 212 | [route] 213 | layers = -2 214 | 215 | [convolutional] 216 | batch_normalize=1 217 | filters=128 218 | size=1 219 | stride=1 220 | pad=1 221 | activation=mish 222 | 223 | [convolutional] 224 | batch_normalize=1 225 | filters=128 226 | size=1 227 | 
stride=1 228 | pad=1 229 | activation=mish 230 | 231 | [convolutional] 232 | batch_normalize=1 233 | filters=128 234 | size=3 235 | stride=1 236 | pad=1 237 | activation=mish 238 | 239 | [shortcut] 240 | from=-3 241 | activation=linear 242 | 243 | [convolutional] 244 | batch_normalize=1 245 | filters=128 246 | size=1 247 | stride=1 248 | pad=1 249 | activation=mish 250 | 251 | [convolutional] 252 | batch_normalize=1 253 | filters=128 254 | size=3 255 | stride=1 256 | pad=1 257 | activation=mish 258 | 259 | [shortcut] 260 | from=-3 261 | activation=linear 262 | 263 | [convolutional] 264 | batch_normalize=1 265 | filters=128 266 | size=1 267 | stride=1 268 | pad=1 269 | activation=mish 270 | 271 | [convolutional] 272 | batch_normalize=1 273 | filters=128 274 | size=3 275 | stride=1 276 | pad=1 277 | activation=mish 278 | 279 | [shortcut] 280 | from=-3 281 | activation=linear 282 | 283 | [convolutional] 284 | batch_normalize=1 285 | filters=128 286 | size=1 287 | stride=1 288 | pad=1 289 | activation=mish 290 | 291 | [convolutional] 292 | batch_normalize=1 293 | filters=128 294 | size=3 295 | stride=1 296 | pad=1 297 | activation=mish 298 | 299 | [shortcut] 300 | from=-3 301 | activation=linear 302 | 303 | 304 | [convolutional] 305 | batch_normalize=1 306 | filters=128 307 | size=1 308 | stride=1 309 | pad=1 310 | activation=mish 311 | 312 | [convolutional] 313 | batch_normalize=1 314 | filters=128 315 | size=3 316 | stride=1 317 | pad=1 318 | activation=mish 319 | 320 | [shortcut] 321 | from=-3 322 | activation=linear 323 | 324 | [convolutional] 325 | batch_normalize=1 326 | filters=128 327 | size=1 328 | stride=1 329 | pad=1 330 | activation=mish 331 | 332 | [convolutional] 333 | batch_normalize=1 334 | filters=128 335 | size=3 336 | stride=1 337 | pad=1 338 | activation=mish 339 | 340 | [shortcut] 341 | from=-3 342 | activation=linear 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=128 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=mish 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=128 355 | size=3 356 | stride=1 357 | pad=1 358 | activation=mish 359 | 360 | [shortcut] 361 | from=-3 362 | activation=linear 363 | 364 | [convolutional] 365 | batch_normalize=1 366 | filters=128 367 | size=1 368 | stride=1 369 | pad=1 370 | activation=mish 371 | 372 | [convolutional] 373 | batch_normalize=1 374 | filters=128 375 | size=3 376 | stride=1 377 | pad=1 378 | activation=mish 379 | 380 | [shortcut] 381 | from=-3 382 | activation=linear 383 | 384 | [convolutional] 385 | batch_normalize=1 386 | filters=128 387 | size=1 388 | stride=1 389 | pad=1 390 | activation=mish 391 | 392 | [route] 393 | layers = -1,-28 394 | 395 | [convolutional] 396 | batch_normalize=1 397 | filters=256 398 | size=1 399 | stride=1 400 | pad=1 401 | activation=mish 402 | 403 | # Downsample 404 | 405 | [convolutional] 406 | batch_normalize=1 407 | filters=512 408 | size=3 409 | stride=2 410 | pad=1 411 | activation=mish 412 | 413 | [convolutional] 414 | batch_normalize=1 415 | filters=256 416 | size=1 417 | stride=1 418 | pad=1 419 | activation=mish 420 | 421 | [route] 422 | layers = -2 423 | 424 | [convolutional] 425 | batch_normalize=1 426 | filters=256 427 | size=1 428 | stride=1 429 | pad=1 430 | activation=mish 431 | 432 | [convolutional] 433 | batch_normalize=1 434 | filters=256 435 | size=1 436 | stride=1 437 | pad=1 438 | activation=mish 439 | 440 | [convolutional] 441 | batch_normalize=1 442 | filters=256 443 | size=3 444 | stride=1 445 | pad=1 446 | activation=mish 447 | 
448 | [shortcut] 449 | from=-3 450 | activation=linear 451 | 452 | 453 | [convolutional] 454 | batch_normalize=1 455 | filters=256 456 | size=1 457 | stride=1 458 | pad=1 459 | activation=mish 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=256 464 | size=3 465 | stride=1 466 | pad=1 467 | activation=mish 468 | 469 | [shortcut] 470 | from=-3 471 | activation=linear 472 | 473 | 474 | [convolutional] 475 | batch_normalize=1 476 | filters=256 477 | size=1 478 | stride=1 479 | pad=1 480 | activation=mish 481 | 482 | [convolutional] 483 | batch_normalize=1 484 | filters=256 485 | size=3 486 | stride=1 487 | pad=1 488 | activation=mish 489 | 490 | [shortcut] 491 | from=-3 492 | activation=linear 493 | 494 | 495 | [convolutional] 496 | batch_normalize=1 497 | filters=256 498 | size=1 499 | stride=1 500 | pad=1 501 | activation=mish 502 | 503 | [convolutional] 504 | batch_normalize=1 505 | filters=256 506 | size=3 507 | stride=1 508 | pad=1 509 | activation=mish 510 | 511 | [shortcut] 512 | from=-3 513 | activation=linear 514 | 515 | 516 | [convolutional] 517 | batch_normalize=1 518 | filters=256 519 | size=1 520 | stride=1 521 | pad=1 522 | activation=mish 523 | 524 | [convolutional] 525 | batch_normalize=1 526 | filters=256 527 | size=3 528 | stride=1 529 | pad=1 530 | activation=mish 531 | 532 | [shortcut] 533 | from=-3 534 | activation=linear 535 | 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=256 540 | size=1 541 | stride=1 542 | pad=1 543 | activation=mish 544 | 545 | [convolutional] 546 | batch_normalize=1 547 | filters=256 548 | size=3 549 | stride=1 550 | pad=1 551 | activation=mish 552 | 553 | [shortcut] 554 | from=-3 555 | activation=linear 556 | 557 | 558 | [convolutional] 559 | batch_normalize=1 560 | filters=256 561 | size=1 562 | stride=1 563 | pad=1 564 | activation=mish 565 | 566 | [convolutional] 567 | batch_normalize=1 568 | filters=256 569 | size=3 570 | stride=1 571 | pad=1 572 | activation=mish 573 | 574 | [shortcut] 575 | from=-3 576 | activation=linear 577 | 578 | [convolutional] 579 | batch_normalize=1 580 | filters=256 581 | size=1 582 | stride=1 583 | pad=1 584 | activation=mish 585 | 586 | [convolutional] 587 | batch_normalize=1 588 | filters=256 589 | size=3 590 | stride=1 591 | pad=1 592 | activation=mish 593 | 594 | [shortcut] 595 | from=-3 596 | activation=linear 597 | 598 | [convolutional] 599 | batch_normalize=1 600 | filters=256 601 | size=1 602 | stride=1 603 | pad=1 604 | activation=mish 605 | 606 | [route] 607 | layers = -1,-28 608 | 609 | [convolutional] 610 | batch_normalize=1 611 | filters=512 612 | size=1 613 | stride=1 614 | pad=1 615 | activation=mish 616 | 617 | # Downsample 618 | 619 | [convolutional] 620 | batch_normalize=1 621 | filters=1024 622 | size=3 623 | stride=2 624 | pad=1 625 | activation=mish 626 | 627 | [convolutional] 628 | batch_normalize=1 629 | filters=512 630 | size=1 631 | stride=1 632 | pad=1 633 | activation=mish 634 | 635 | [route] 636 | layers = -2 637 | 638 | [convolutional] 639 | batch_normalize=1 640 | filters=512 641 | size=1 642 | stride=1 643 | pad=1 644 | activation=mish 645 | 646 | [convolutional] 647 | batch_normalize=1 648 | filters=512 649 | size=1 650 | stride=1 651 | pad=1 652 | activation=mish 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=512 657 | size=3 658 | stride=1 659 | pad=1 660 | activation=mish 661 | 662 | [shortcut] 663 | from=-3 664 | activation=linear 665 | 666 | [convolutional] 667 | batch_normalize=1 668 | filters=512 669 | size=1 670 | stride=1 
671 | pad=1 672 | activation=mish 673 | 674 | [convolutional] 675 | batch_normalize=1 676 | filters=512 677 | size=3 678 | stride=1 679 | pad=1 680 | activation=mish 681 | 682 | [shortcut] 683 | from=-3 684 | activation=linear 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=512 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=mish 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | filters=512 697 | size=3 698 | stride=1 699 | pad=1 700 | activation=mish 701 | 702 | [shortcut] 703 | from=-3 704 | activation=linear 705 | 706 | [convolutional] 707 | batch_normalize=1 708 | filters=512 709 | size=1 710 | stride=1 711 | pad=1 712 | activation=mish 713 | 714 | [convolutional] 715 | batch_normalize=1 716 | filters=512 717 | size=3 718 | stride=1 719 | pad=1 720 | activation=mish 721 | 722 | [shortcut] 723 | from=-3 724 | activation=linear 725 | 726 | [convolutional] 727 | batch_normalize=1 728 | filters=512 729 | size=1 730 | stride=1 731 | pad=1 732 | activation=mish 733 | 734 | [route] 735 | layers = -1,-16 736 | 737 | [convolutional] 738 | batch_normalize=1 739 | filters=1024 740 | size=1 741 | stride=1 742 | pad=1 743 | activation=mish 744 | 745 | ########################## 746 | 747 | [convolutional] 748 | batch_normalize=1 749 | filters=512 750 | size=1 751 | stride=1 752 | pad=1 753 | activation=leaky 754 | 755 | [convolutional] 756 | batch_normalize=1 757 | size=3 758 | stride=1 759 | pad=1 760 | filters=1024 761 | activation=leaky 762 | 763 | [convolutional] 764 | batch_normalize=1 765 | filters=512 766 | size=1 767 | stride=1 768 | pad=1 769 | activation=leaky 770 | 771 | ### SPP ### 772 | [maxpool] 773 | stride=1 774 | size=5 775 | 776 | [route] 777 | layers=-2 778 | 779 | [maxpool] 780 | stride=1 781 | size=9 782 | 783 | [route] 784 | layers=-4 785 | 786 | [maxpool] 787 | stride=1 788 | size=13 789 | 790 | [route] 791 | layers=-1,-3,-5,-6 792 | ### End SPP ### 793 | 794 | [convolutional] 795 | batch_normalize=1 796 | filters=512 797 | size=1 798 | stride=1 799 | pad=1 800 | activation=leaky 801 | 802 | [convolutional] 803 | batch_normalize=1 804 | size=3 805 | stride=1 806 | pad=1 807 | filters=1024 808 | activation=leaky 809 | 810 | [convolutional] 811 | batch_normalize=1 812 | filters=512 813 | size=1 814 | stride=1 815 | pad=1 816 | activation=leaky 817 | 818 | [convolutional] 819 | batch_normalize=1 820 | filters=256 821 | size=1 822 | stride=1 823 | pad=1 824 | activation=leaky 825 | 826 | [upsample] 827 | stride=2 828 | 829 | [route] 830 | layers = 85 831 | 832 | [convolutional] 833 | batch_normalize=1 834 | filters=256 835 | size=1 836 | stride=1 837 | pad=1 838 | activation=leaky 839 | 840 | [route] 841 | layers = -1, -3 842 | 843 | [convolutional] 844 | batch_normalize=1 845 | filters=256 846 | size=1 847 | stride=1 848 | pad=1 849 | activation=leaky 850 | 851 | [convolutional] 852 | batch_normalize=1 853 | size=3 854 | stride=1 855 | pad=1 856 | filters=512 857 | activation=leaky 858 | 859 | [convolutional] 860 | batch_normalize=1 861 | filters=256 862 | size=1 863 | stride=1 864 | pad=1 865 | activation=leaky 866 | 867 | [convolutional] 868 | batch_normalize=1 869 | size=3 870 | stride=1 871 | pad=1 872 | filters=512 873 | activation=leaky 874 | 875 | [convolutional] 876 | batch_normalize=1 877 | filters=256 878 | size=1 879 | stride=1 880 | pad=1 881 | activation=leaky 882 | 883 | [convolutional] 884 | batch_normalize=1 885 | filters=128 886 | size=1 887 | stride=1 888 | pad=1 889 | activation=leaky 890 | 891 | [upsample] 892 | stride=2 893 
| 894 | [route] 895 | layers = 54 896 | 897 | [convolutional] 898 | batch_normalize=1 899 | filters=128 900 | size=1 901 | stride=1 902 | pad=1 903 | activation=leaky 904 | 905 | [route] 906 | layers = -1, -3 907 | 908 | [convolutional] 909 | batch_normalize=1 910 | filters=128 911 | size=1 912 | stride=1 913 | pad=1 914 | activation=leaky 915 | 916 | [convolutional] 917 | batch_normalize=1 918 | size=3 919 | stride=1 920 | pad=1 921 | filters=256 922 | activation=leaky 923 | 924 | [convolutional] 925 | batch_normalize=1 926 | filters=128 927 | size=1 928 | stride=1 929 | pad=1 930 | activation=leaky 931 | 932 | [convolutional] 933 | batch_normalize=1 934 | size=3 935 | stride=1 936 | pad=1 937 | filters=256 938 | activation=leaky 939 | 940 | [convolutional] 941 | batch_normalize=1 942 | filters=128 943 | size=1 944 | stride=1 945 | pad=1 946 | activation=leaky 947 | 948 | ########################## 949 | 950 | [convolutional] 951 | batch_normalize=1 952 | size=3 953 | stride=1 954 | pad=1 955 | filters=256 956 | activation=leaky 957 | 958 | [convolutional] 959 | size=1 960 | stride=1 961 | pad=1 962 | filters=255 963 | activation=linear 964 | 965 | 966 | [yolo] 967 | mask = 0,1,2 968 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 969 | classes=80 970 | num=9 971 | jitter=.3 972 | ignore_thresh = .7 973 | truth_thresh = 1 974 | scale_x_y = 1.2 975 | iou_thresh=0.213 976 | cls_normalizer=1.0 977 | iou_normalizer=0.07 978 | iou_loss=ciou 979 | nms_kind=greedynms 980 | beta_nms=0.6 981 | 982 | 983 | [route] 984 | layers = -4 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | size=3 989 | stride=2 990 | pad=1 991 | filters=256 992 | activation=leaky 993 | 994 | [route] 995 | layers = -1, -16 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | filters=256 1000 | size=1 1001 | stride=1 1002 | pad=1 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | size=3 1008 | stride=1 1009 | pad=1 1010 | filters=512 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | filters=256 1016 | size=1 1017 | stride=1 1018 | pad=1 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | size=3 1024 | stride=1 1025 | pad=1 1026 | filters=512 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | filters=256 1032 | size=1 1033 | stride=1 1034 | pad=1 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | batch_normalize=1 1039 | size=3 1040 | stride=1 1041 | pad=1 1042 | filters=512 1043 | activation=leaky 1044 | 1045 | [convolutional] 1046 | size=1 1047 | stride=1 1048 | pad=1 1049 | filters=255 1050 | activation=linear 1051 | 1052 | 1053 | [yolo] 1054 | mask = 3,4,5 1055 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1056 | classes=80 1057 | num=9 1058 | jitter=.3 1059 | ignore_thresh = .7 1060 | truth_thresh = 1 1061 | scale_x_y = 1.1 1062 | iou_thresh=0.213 1063 | cls_normalizer=1.0 1064 | iou_normalizer=0.07 1065 | iou_loss=ciou 1066 | nms_kind=greedynms 1067 | beta_nms=0.6 1068 | 1069 | 1070 | [route] 1071 | layers = -4 1072 | 1073 | [convolutional] 1074 | batch_normalize=1 1075 | size=3 1076 | stride=2 1077 | pad=1 1078 | filters=512 1079 | activation=leaky 1080 | 1081 | [route] 1082 | layers = -1, -37 1083 | 1084 | [convolutional] 1085 | batch_normalize=1 1086 | filters=512 1087 | size=1 1088 | stride=1 1089 | pad=1 1090 | activation=leaky 1091 | 1092 | [convolutional] 1093 | 
batch_normalize=1 1094 | size=3 1095 | stride=1 1096 | pad=1 1097 | filters=1024 1098 | activation=leaky 1099 | 1100 | [convolutional] 1101 | batch_normalize=1 1102 | filters=512 1103 | size=1 1104 | stride=1 1105 | pad=1 1106 | activation=leaky 1107 | 1108 | [convolutional] 1109 | batch_normalize=1 1110 | size=3 1111 | stride=1 1112 | pad=1 1113 | filters=1024 1114 | activation=leaky 1115 | 1116 | [convolutional] 1117 | batch_normalize=1 1118 | filters=512 1119 | size=1 1120 | stride=1 1121 | pad=1 1122 | activation=leaky 1123 | 1124 | [convolutional] 1125 | batch_normalize=1 1126 | size=3 1127 | stride=1 1128 | pad=1 1129 | filters=1024 1130 | activation=leaky 1131 | 1132 | [convolutional] 1133 | size=1 1134 | stride=1 1135 | pad=1 1136 | filters=255 1137 | activation=linear 1138 | 1139 | 1140 | [yolo] 1141 | mask = 6,7,8 1142 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1143 | classes=80 1144 | num=9 1145 | jitter=.3 1146 | ignore_thresh = .7 1147 | truth_thresh = 1 1148 | random=1 1149 | scale_x_y = 1.05 1150 | iou_thresh=0.213 1151 | cls_normalizer=1.0 1152 | iou_normalizer=0.07 1153 | iou_loss=ciou 1154 | nms_kind=greedynms 1155 | beta_nms=0.6 1156 | 1157 | -------------------------------------------------------------------------------- /dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolov4-opencv-dnn/4b034978d3b51afad77219d10832d5938d5edff9/dog.jpg -------------------------------------------------------------------------------- /main2_yolov4.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import argparse 3 | import random 4 | 5 | # 文件需要加载的文件 6 | cfg = "coco_model/yolov4.cfg" 7 | weights = "coco_model/yolov4.weights" 8 | className = "coco.names" 9 | 10 | if __name__=='__main__': 11 | parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV') 12 | parser.add_argument('--image', type=str, default='dog.jpg', help='Path to image file.') 13 | args = parser.parse_args() 14 | 15 | net = cv.dnn_DetectionModel(cfg, weights) 16 | net.setInputSize(608, 608) 17 | net.setInputScale(1.0 / 255) 18 | net.setInputSwapRB(True) 19 | with open(className, 'rt') as f: 20 | names = f.read().rstrip('\n').split('\n') 21 | 22 | img = cv.imread(args.image) 23 | # 模型检测 24 | classes, confidences, boxes = net.detect(img, confThreshold=0.1, nmsThreshold=0.4) 25 | # 将检测结果显示到图像上 26 | for classId, confidence, box in zip(classes.flatten(), confidences.flatten(), boxes): 27 | label = '%s: %.2f' % (names[classId], confidence) 28 | labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1) 29 | left, top, width, height = box 30 | top = max(top, labelSize[1]) 31 | b = random.randint(0, 255) 32 | g = random.randint(0, 255) 33 | r = random.randint(0, 255) 34 | cv.rectangle(img, box, color=(b, g, r), thickness=2) 35 | cv.rectangle(img, (left - 1, top - labelSize[1]), (left + labelSize[0], top), (b, g, r), cv.FILLED) 36 | cv.putText(img, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (255 - b, 255 - g, 255 - r)) 37 | cv.namedWindow('detect out', cv.WINDOW_NORMAL) 38 | cv.imshow('detect out', img) 39 | cv.waitKey(0) -------------------------------------------------------------------------------- /main_yolov4.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import argparse 3 | import numpy as np 4 | 5 | # Initialize the parameters 6 
| confThreshold = 0.1 # Confidence threshold 7 | nmsThreshold = 0.4 # Non-maximum suppression threshold 8 | inpWidth = 608 # Width of network's input image yolov4: 608, yolov4-tiny: 416 9 | inpHeight = 608 # Height of network's input image 10 | 11 | # Give the configuration and weight files for the model and load the network using them. 12 | modelConfiguration = "coco_model/yolov4.cfg" 13 | modelWeights = "coco_model/yolov4.weights" 14 | 15 | # Load names of classes 16 | classesFile = "coco.names" 17 | classes = None 18 | with open(classesFile, 'rt') as f: 19 | classes = f.read().rstrip('\n').split('\n') 20 | colors = [np.random.randint(0, 255, size=3).tolist() for _ in range(len(classes))] 21 | 22 | # Get the names of the output layers 23 | def getOutputsNames(net): 24 | # Get the names of all the layers in the network 25 | layersNames = net.getLayerNames() 26 | # print(dir(net)) 27 | # Get the names of the output layers, i.e. the layers with unconnected outputs 28 | return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()] 29 | 30 | # Draw the predicted bounding box 31 | def drawPred(classId, conf, left, top, right, bottom): 32 | # Draw a bounding box. 33 | cv.rectangle(frame, (left, top), (right, bottom), (0,0,255), thickness=4) 34 | 35 | label = '%.2f' % conf 36 | 37 | # Get the label for the class name and its confidence 38 | if classes: 39 | assert (classId < len(classes)) 40 | label = '%s:%s' % (classes[classId], label) 41 | 42 | # Display the label at the top of the bounding box 43 | labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1) 44 | top = max(top, labelSize[1]) 45 | # cv.rectangle(frame, (left, top - round(1.5 * labelSize[1])), (left + round(1.5 * labelSize[0]), top + baseLine), (255,255,255), cv.FILLED) 46 | cv.putText(frame, label, (left, top-10), cv.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), thickness=2) 47 | 48 | # Remove the bounding boxes with low confidence using non-maxima suppression 49 | def postprocess(frame, outs): 50 | frameHeight = frame.shape[0] 51 | frameWidth = frame.shape[1] 52 | 53 | 54 | 55 | 56 | # Scan through all the bounding boxes output from the network and keep only the 57 | # ones with high confidence scores. Assign the box's class label as the class with the highest score. 58 | classIds = [] 59 | confidences = [] 60 | boxes = [] 61 | for out in outs: 62 | for detection in out: 63 | scores = detection[5:] 64 | classId = np.argmax(scores) 65 | confidence = scores[classId] 66 | if confidence > confThreshold: 67 | center_x = int(detection[0] * frameWidth) 68 | center_y = int(detection[1] * frameHeight) 69 | width = int(detection[2] * frameWidth) 70 | height = int(detection[3] * frameHeight) 71 | left = int(center_x - width / 2) 72 | top = int(center_y - height / 2) 73 | classIds.append(classId) 74 | confidences.append(float(confidence)) 75 | boxes.append([left, top, width, height]) 76 | 77 | # Perform non maximum suppression to eliminate redundant overlapping boxes with 78 | # lower confidences. 
79 | indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold) 80 | for i in indices: 81 | i = i[0] 82 | box = boxes[i] 83 | left = box[0] 84 | top = box[1] 85 | width = box[2] 86 | height = box[3] 87 | drawPred(classIds[i], confidences[i], left, top, left + width, top + height) 88 | 89 | if __name__=='__main__': 90 | parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV') 91 | parser.add_argument('--image', type=str, default='dog.jpg', help='Path to image file.') 92 | args = parser.parse_args() 93 | 94 | net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights) 95 | net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV) 96 | net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU) 97 | # Process inputs 98 | frame = cv.imread(args.image) 99 | 100 | # Create a 4D blob from a frame. 101 | blob = cv.dnn.blobFromImage(frame, 1/255.0, (inpWidth, inpHeight), [0, 0, 0], swapRB=False, crop=False) 102 | 103 | # Sets the input to the network 104 | net.setInput(blob) 105 | 106 | # Runs the forward pass to get output of the output layers 107 | outs = net.forward(getOutputsNames(net)) 108 | # Remove the bounding boxes with low confidence 109 | postprocess(frame, outs) 110 | 111 | # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes) 112 | t, _ = net.getPerfProfile() 113 | label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency()) 114 | cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255)) 115 | 116 | winName = 'Deep learning object detection in OpenCV' 117 | cv.namedWindow(winName,0) 118 | cv.imshow(winName, frame) 119 | cv.waitKey(0) 120 | cv.destroyAllWindows() -------------------------------------------------------------------------------- /person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/yolov4-opencv-dnn/4b034978d3b51afad77219d10832d5938d5edff9/person.jpg --------------------------------------------------------------------------------
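
Both demo scripts are hard-coded to the full model (coco_model/yolov4.cfg with coco_model/yolov4.weights at a 608x608 input), but the repository also ships coco_model/yolov4-tiny.cfg and coco_model/yolov4-tiny.weights. The following is a minimal sketch, not part of the original scripts, showing how the same cv.dnn_DetectionModel API used in main2_yolov4.py could be pointed at the bundled tiny model; the 416x416 input size comes from the width/height fields in yolov4-tiny.cfg, and the file paths assume the repository layout above.

import cv2 as cv

# Sketch: run the bundled yolov4-tiny model through the DetectionModel API
net = cv.dnn_DetectionModel("coco_model/yolov4-tiny.cfg", "coco_model/yolov4-tiny.weights")
net.setInputSize(416, 416)    # yolov4-tiny.cfg sets width=416, height=416
net.setInputScale(1.0 / 255)  # scale pixel values to [0, 1]
net.setInputSwapRB(True)      # OpenCV reads BGR; Darknet models expect RGB

with open("coco.names", 'rt') as f:
    names = f.read().rstrip('\n').split('\n')

img = cv.imread("dog.jpg")
classes, confidences, boxes = net.detect(img, confThreshold=0.1, nmsThreshold=0.4)
for classId, confidence, box in zip(classes.flatten(), confidences.flatten(), boxes):
    print('%s: %.2f at %s' % (names[classId], confidence, box))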
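
main_yolov4.py targets the Python 3.7 + OpenCV 4.4 environment stated in the README, where net.getUnconnectedOutLayers() and cv.dnn.NMSBoxes() return Nx1 index arrays, so the i[0] indexing in getOutputsNames() and postprocess() is required. In newer OpenCV 4.x releases those calls return flat 1-D arrays and i[0] raises an IndexError. A version-agnostic variant, offered as an assumption rather than a change the repository makes, is to flatten the returned indices:

import numpy as np

def getOutputsNames(net):
    # Flattening handles both the older Nx1 and the newer 1-D index arrays
    layersNames = net.getLayerNames()
    return [layersNames[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

# ...and likewise inside postprocess():
#     for i in np.array(indices).flatten():
#         box = boxes[i]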