├── data
    ├── 5k.txt
    ├── custom
    │   ├── classes.names
    │   ├── train.txt
    │   ├── valid.txt
    │   ├── labels
    │   │   └── train.txt
    │   └── images
    │   │   └── train.jpg
    ├── samples
    │   ├── dog.jpg
    │   ├── eagle.jpg
    │   ├── field.jpg
    │   ├── messi.jpg
    │   ├── person.jpg
    │   ├── room.jpg
    │   ├── street.jpg
    │   ├── giraffe.jpg
    │   └── herd_of_horses.jpg
    ├── temptest
    │   └── field.jpg
    ├── coco.names
    └── get_coco_dataset.sh
├── utils
    ├── __init__.py
    ├── __pycache__
    │   ├── utils.cpython-36.pyc
    │   ├── logger.cpython-36.pyc
    │   ├── __init__.cpython-36.pyc
    │   ├── datasets.cpython-36.pyc
    │   ├── parse_config.cpython-36.pyc
    │   └── augmentations.cpython-36.pyc
    ├── augmentations.py
    ├── logger.py
    ├── parse_config.py
    ├── datasets.py
    └── utils.py
├── assets
    ├── dog.png
    ├── giraffe.png
    ├── messi.png
    └── traffic.png
├── output
    ├── dog.png
    ├── room.png
    ├── eagle.png
    ├── field.png
    ├── giraffe.png
    ├── messi.png
    ├── person.png
    ├── street.png
    └── herd_of_horses.png
├── __pycache__
    ├── models.cpython-36.pyc
    └── test.cpython-36.pyc
├── config
    ├── custom.data
    ├── coco.data
    ├── yolov3-tiny.cfg
    ├── yolov3.cfg
    └── create_custom_model.sh
├── requirements.txt
├── weights
    └── download_weights.sh
├── check.py
├── README.md
├── test.py
├── train.py
├── detect.py
└── models.py


/data/5k.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/custom/classes.names:
--------------------------------------------------------------------------------
1 | train
2 | 


--------------------------------------------------------------------------------
/data/custom/train.txt:
--------------------------------------------------------------------------------
1 | data/custom/images/train.jpg
2 | 


--------------------------------------------------------------------------------
/data/custom/valid.txt:
--------------------------------------------------------------------------------
1 | data/custom/images/train.jpg
2 | 


--------------------------------------------------------------------------------
/data/custom/labels/train.txt:
--------------------------------------------------------------------------------
1 | 0 0.515 0.5 0.21694873 0.18286777
2 | 


--------------------------------------------------------------------------------
/assets/dog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/assets/dog.png


--------------------------------------------------------------------------------
/output/dog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/dog.png


--------------------------------------------------------------------------------
/output/room.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/room.png


--------------------------------------------------------------------------------
/assets/giraffe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/assets/giraffe.png


--------------------------------------------------------------------------------
/assets/messi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/assets/messi.png


--------------------------------------------------------------------------------
/assets/traffic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/assets/traffic.png


--------------------------------------------------------------------------------
/output/eagle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/eagle.png


--------------------------------------------------------------------------------
/output/field.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/field.png


--------------------------------------------------------------------------------
/output/giraffe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/giraffe.png


--------------------------------------------------------------------------------
/output/messi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/messi.png


--------------------------------------------------------------------------------
/output/person.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/person.png


--------------------------------------------------------------------------------
/output/street.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/street.png


--------------------------------------------------------------------------------
/data/samples/dog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/dog.jpg


--------------------------------------------------------------------------------
/data/samples/eagle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/eagle.jpg


--------------------------------------------------------------------------------
/data/samples/field.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/field.jpg


--------------------------------------------------------------------------------
/data/samples/messi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/messi.jpg


--------------------------------------------------------------------------------
/data/samples/person.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/person.jpg


--------------------------------------------------------------------------------
/data/samples/room.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/room.jpg


--------------------------------------------------------------------------------
/data/samples/street.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/street.jpg


--------------------------------------------------------------------------------
/data/temptest/field.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/temptest/field.jpg


--------------------------------------------------------------------------------
/data/samples/giraffe.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/giraffe.jpg


--------------------------------------------------------------------------------
/output/herd_of_horses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/output/herd_of_horses.png


--------------------------------------------------------------------------------
/data/custom/images/train.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/custom/images/train.jpg


--------------------------------------------------------------------------------
/__pycache__/models.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/__pycache__/models.cpython-36.pyc


--------------------------------------------------------------------------------
/__pycache__/test.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/__pycache__/test.cpython-36.pyc


--------------------------------------------------------------------------------
/data/samples/herd_of_horses.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/data/samples/herd_of_horses.jpg


--------------------------------------------------------------------------------
/config/custom.data:
--------------------------------------------------------------------------------
1 | classes= 1
2 | train=data/custom/train.txt
3 | valid=data/custom/valid.txt
4 | names=data/custom/classes.names
5 | 


--------------------------------------------------------------------------------
/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/utils/__pycache__/utils.cpython-36.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/logger.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/utils/__pycache__/logger.cpython-36.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/utils/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/datasets.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/utils/__pycache__/datasets.cpython-36.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/parse_config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/utils/__pycache__/parse_config.cpython-36.pyc


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy
 2 | torch>=1.0
 3 | torchvision
 4 | matplotlib
 5 | tensorflow
 6 | tensorboard
 7 | terminaltables
 8 | pillow
 9 | tqdm
10 | 


--------------------------------------------------------------------------------
/utils/__pycache__/augmentations.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocF/YOLOv3-Torch2TRT/HEAD/utils/__pycache__/augmentations.cpython-36.pyc


--------------------------------------------------------------------------------
/config/coco.data:
--------------------------------------------------------------------------------
1 | classes= 80
2 | train=data/coco/trainvalno5k.txt
3 | valid=data/coco/5k.txt
4 | names=data/coco.names
5 | backup=backup/
6 | eval=coco
7 | 


--------------------------------------------------------------------------------
/utils/augmentations.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | import numpy as np
 4 | 
 5 | 
 6 | def horisontal_flip(images, targets):
 7 |     images = torch.flip(images, [-1])
 8 |     targets[:, 2] = 1 - targets[:, 2]
 9 |     return images, targets
10 | 


--------------------------------------------------------------------------------
/weights/download_weights.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Fetch the pretrained weight files into the current directory.
# `wget -c` continues a partially downloaded file instead of restarting it.
weight_urls=(
    # vanilla YOLOv3
    https://pjreddie.com/media/files/yolov3.weights
    # tiny YOLOv3
    https://pjreddie.com/media/files/yolov3-tiny.weights
    # backbone network
    https://pjreddie.com/media/files/darknet53.conv.74
)
for url in "${weight_urls[@]}"; do
    wget -c "$url"
done
8 | 


--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
 3 | 
 4 | class Logger(object):
 5 |     def __init__(self, log_dir):
 6 |         """Create a summary writer logging to log_dir."""
 7 |         self.writer = tf.summary.FileWriter(log_dir) 
 8 | 
 9 |     def scalar_summary(self, tag, value, step):
10 |         """Log a scalar variable."""
11 |         summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
12 |         self.writer.add_summary(summary, step)
13 | 
14 |     def list_of_scalars_summary(self, tag_value_pairs, step):
15 |         """Log scalar variables."""
16 |         summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value) for tag, value in tag_value_pairs])
17 |         self.writer.add_summary(summary, step)
18 | 


--------------------------------------------------------------------------------
/check.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | # Author: Richard Fang
 3 | 
 4 | import torch
 5 | import torch.nn as nn
 6 | import torch.nn.functional as F
 7 | 
 8 | import tensorrt as trt
 9 | from torch2trt import torch2trt, tensorrt_converter
10 | 
11 | 
class UpsamlpleNet(nn.Module):
    """Minimal module that interpolates its input to a 64x64 spatial size."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` resized to (64, 64) with nearest-neighbour interpolation."""
        target_size = (64, 64)
        return F.interpolate(x, size=target_size, mode='nearest')
21 | 
22 | 
if __name__ == '__main__':
    # Reference output from the plain PyTorch module, evaluated on the GPU.
    net = UpsamlpleNet().eval().cuda()
    sample = torch.rand(size=(1, 3, 64, 64)).cuda()
    expected = net(sample)

    # Convert the module with torch2trt using the same sample input,
    # then run the converted model on that input.
    net_trt = torch2trt(net, [sample])
    actual = net_trt(sample)

    # Print the largest absolute element-wise difference between both outputs.
    max_abs_diff = torch.max(torch.abs(expected - actual))
    print(max_abs_diff)
34 | 
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/data/coco.names:
--------------------------------------------------------------------------------
 1 | person
 2 | bicycle
 3 | car
 4 | motorbike
 5 | aeroplane
 6 | bus
 7 | train
 8 | truck
 9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 | 


--------------------------------------------------------------------------------
/data/get_coco_dataset.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # CREDIT: https://github.com/pjreddie/darknet/tree/master/scripts/get_coco_dataset.sh
 4 | 
 5 | # Clone COCO API
 6 | git clone https://github.com/pdollar/coco
 7 | cd coco
 8 | 
 9 | mkdir images
10 | cd images
11 | 
12 | # Download Images
13 | wget -c https://pjreddie.com/media/files/train2014.zip
14 | wget -c https://pjreddie.com/media/files/val2014.zip
15 | 
16 | # Unzip
17 | unzip -q train2014.zip
18 | unzip -q val2014.zip
19 | 
20 | cd ..
21 | 
22 | # Download COCO Metadata
23 | wget -c https://pjreddie.com/media/files/instances_train-val2014.zip
24 | wget -c https://pjreddie.com/media/files/coco/5k.part
25 | wget -c https://pjreddie.com/media/files/coco/trainvalno5k.part
26 | wget -c https://pjreddie.com/media/files/coco/labels.tgz
27 | tar xzf labels.tgz
28 | unzip -q instances_train-val2014.zip
29 | 
30 | # Set Up Image Lists
31 | paste <(awk "{print \"$PWD\"}" <5k.part) 5k.part | tr -d '\t' > 5k.txt
32 | paste <(awk "{print \"$PWD\"}" <trainvalno5k.part) trainvalno5k.part | tr -d '\t' > trainvalno5k.txt
33 | 


--------------------------------------------------------------------------------
/utils/parse_config.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | def parse_model_config(path):
 4 |     """Parses the yolo-v3 layer configuration file and returns module definitions"""
 5 |     file = open(path, 'r')
 6 |     lines = file.read().split('\n')
 7 |     lines = [x for x in lines if x and not x.startswith('#')]
 8 |     lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces
 9 |     module_defs = []
10 |     for line in lines:
11 |         if line.startswith('['): # This marks the start of a new block
12 |             module_defs.append({})
13 |             module_defs[-1]['type'] = line[1:-1].rstrip()
14 |             if module_defs[-1]['type'] == 'convolutional':
15 |                 module_defs[-1]['batch_normalize'] = 0
16 |         else:
17 |             key, value = line.split("=")
18 |             value = value.strip()
19 |             module_defs[-1][key.rstrip()] = value.strip()
20 | 
21 |     return module_defs
22 | 
23 | 
def parse_data_config(path):
    """Parses the data configuration file.

    Args:
        path: Path of the ``key=value`` data config file.

    Returns:
        Dict mapping each key to its value, both stripped strings. The dict is
        pre-populated with ``'gpus': '0,1,2,3'`` and ``'num_workers': '10'``,
        which the file may override. Blank lines and lines starting with ``#``
        are ignored.

    Raises:
        ValueError: If a remaining line contains no ``=``.
    """
    options = dict()
    options['gpus'] = '0,1,2,3'
    options['num_workers'] = '10'
    with open(path, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line == '' or line.startswith('#'):
                continue
            # Split on the first '=' only, so values (e.g. paths) may contain '='.
            key, value = line.split('=', 1)
            options[key.strip()] = value.strip()
    return options
38 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # YOLOv3-Torch2TRT
 2 | 
 3 | ## Introduction
 4 | Convert YOLOv3 and YOLOv3-tiny (PyTorch version) into TensorRT models, through the torch2trt Python API.
 5 | 
 6 | ## Installation 
 7 | #### Clone the repo
 8 |     git clone https://github.com/DocF/YOLOv3-Torch2TRT.git
 9 |     
10 | #### Download pretrained weights
11 |     $ cd weights/
12 |     $ bash download_weights.sh
13 |  
14 | ## Requirements
15 | Two special Python packages are needed:
16 |   
17 | * tensorrt
18 |   
19 | * torch2trt
20 |   
21 |  Because YOLO uses an upsampling operation, you need to install the torch2trt version with plugins, as described in the torch2trt API introduction.
22 |  
23 |  Installation reference: https://github.com/NVIDIA-AI-IOT/torch2trt
24 |  
25 | #### Check torch2trt API
26 | 
27 |     python3 check.py
28 |  
29 |  
30 | ## Inference Acceleration
31 | Acceleration techniques:
32 | * FP16
33 | * TensorRT
34 | 
35 | 
36 | Here are some results on TITAN xp:
37 | 
38 | | Model name | Input Size |  FP16 | Entire Model*(FPS) | Backbone+FeatureNet(FPS) | 
39 | |:---------: |------------|:-----:|:-----------------:|:-------------:|
40 | | YOLOv3  | 320×320 |  | 87.58 Hz| 102.95 Hz| 
41 | |         | 320×320 | ✔️ | 83.63 Hz| 100.36 Hz| 
42 | | YOLOv3-TRT  | 320×320 |  | 110.74 Hz| 121.81 Hz| 
43 | |             | 320×320 |  ✔️ | 106.92 Hz| 124.95 Hz| 
44 | | YOLOv3-tiny  | 320×320 | | 354.10 Hz| 668.71 Hz| 
45 | |              | 320×320 |  ✔️ | 379.11 Hz| 727.82 Hz| 
46 | | YOLOv3-tiny-TRT | 320×320 |  |684.75 Hz| 1035.11 Hz| 
47 | |                 | 320×320 |  ✔️ |649.71 Hz| 1012.66 Hz| 
48 | 
49 | Entire Model* = Backbone + Feature Net + YOLO Head
50 | 
51 |     python3 detect.py
52 | 
53 | ## Statement
54 | This repo is based on [PyTorch-YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3). Thanks to its authors for the great repo.
55 | 
56 | 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/config/yolov3-tiny.cfg:
--------------------------------------------------------------------------------
  1 | [net]
  2 | # Testing
  3 | batch=1
  4 | subdivisions=1
  5 | # Training
  6 | # batch=64
  7 | # subdivisions=2
  8 | width=416
  9 | height=416
 10 | channels=3
 11 | momentum=0.9
 12 | decay=0.0005
 13 | angle=0
 14 | saturation = 1.5
 15 | exposure = 1.5
 16 | hue=.1
 17 | 
 18 | learning_rate=0.001
 19 | burn_in=1000
 20 | max_batches = 500200
 21 | policy=steps
 22 | steps=400000,450000
 23 | scales=.1,.1
 24 | 
 25 | # 0
 26 | [convolutional]
 27 | batch_normalize=1
 28 | filters=16
 29 | size=3
 30 | stride=1
 31 | pad=1
 32 | activation=leaky
 33 | 
 34 | # 1
 35 | [maxpool]
 36 | size=2
 37 | stride=2
 38 | 
 39 | # 2
 40 | [convolutional]
 41 | batch_normalize=1
 42 | filters=32
 43 | size=3
 44 | stride=1
 45 | pad=1
 46 | activation=leaky
 47 | 
 48 | # 3
 49 | [maxpool]
 50 | size=2
 51 | stride=2
 52 | 
 53 | # 4
 54 | [convolutional]
 55 | batch_normalize=1
 56 | filters=64
 57 | size=3
 58 | stride=1
 59 | pad=1
 60 | activation=leaky
 61 | 
 62 | # 5
 63 | [maxpool]
 64 | size=2
 65 | stride=2
 66 | 
 67 | # 6
 68 | [convolutional]
 69 | batch_normalize=1
 70 | filters=128
 71 | size=3
 72 | stride=1
 73 | pad=1
 74 | activation=leaky
 75 | 
 76 | # 7
 77 | [maxpool]
 78 | size=2
 79 | stride=2
 80 | 
 81 | # 8
 82 | [convolutional]
 83 | batch_normalize=1
 84 | filters=256
 85 | size=3
 86 | stride=1
 87 | pad=1
 88 | activation=leaky
 89 | 
 90 | # 9
 91 | [maxpool]
 92 | size=2
 93 | stride=2
 94 | 
 95 | # 10
 96 | [convolutional]
 97 | batch_normalize=1
 98 | filters=512
 99 | size=3
100 | stride=1
101 | pad=1
102 | activation=leaky
103 | 
104 | # 11
105 | [maxpool]
106 | size=2
107 | stride=1
108 | 
109 | # 12
110 | [convolutional]
111 | batch_normalize=1
112 | filters=1024
113 | size=3
114 | stride=1
115 | pad=1
116 | activation=leaky
117 | 
118 | ###########
119 | 
120 | # 13
121 | [convolutional]
122 | batch_normalize=1
123 | filters=256
124 | size=1
125 | stride=1
126 | pad=1
127 | activation=leaky
128 | 
129 | # 14
130 | [convolutional]
131 | batch_normalize=1
132 | filters=512
133 | size=3
134 | stride=1
135 | pad=1
136 | activation=leaky
137 | 
138 | # 15
139 | [convolutional]
140 | size=1
141 | stride=1
142 | pad=1
143 | filters=255
144 | activation=linear
145 | 
146 | 
147 | 
148 | # 16
149 | [yolo]
150 | mask = 3,4,5
151 | anchors = 10,14,  23,27,  37,58,  81,82,  135,169,  344,319
152 | classes=80
153 | num=6
154 | jitter=.3
155 | ignore_thresh = .7
156 | truth_thresh = 1
157 | random=1
158 | 
159 | # 17
160 | [route]
161 | layers = -4
162 | 
163 | # 18
164 | [convolutional]
165 | batch_normalize=1
166 | filters=128
167 | size=1
168 | stride=1
169 | pad=1
170 | activation=leaky
171 | 
172 | # 19
173 | [upsample]
174 | stride=2
175 | 
176 | # 20
177 | [route]
178 | layers = -1, 8
179 | 
180 | # 21
181 | [convolutional]
182 | batch_normalize=1
183 | filters=256
184 | size=3
185 | stride=1
186 | pad=1
187 | activation=leaky
188 | 
189 | # 22
190 | [convolutional]
191 | size=1
192 | stride=1
193 | pad=1
194 | filters=255
195 | activation=linear
196 | 
197 | # 23
198 | [yolo]
199 | mask = 1,2,3
200 | anchors = 10,14,  23,27,  37,58,  81,82,  135,169,  344,319
201 | classes=80
202 | num=6
203 | jitter=.3
204 | ignore_thresh = .7
205 | truth_thresh = 1
206 | random=1
207 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | 
  3 | from models import *
  4 | from utils.utils import *
  5 | from utils.datasets import *
  6 | from utils.parse_config import *
  7 | 
  8 | import os
  9 | import sys
 10 | import time
 11 | import datetime
 12 | import argparse
 13 | import tqdm
 14 | 
 15 | import torch
 16 | from torch.utils.data import DataLoader
 17 | from torchvision import datasets
 18 | from torchvision import transforms
 19 | from torch.autograd import Variable
 20 | import torch.optim as optim
 21 | 
 22 | 
 23 | def evaluate(model, path, iou_thres, conf_thres, nms_thres, img_size, batch_size):
 24 |     model.eval()
 25 | 
 26 |     # Get dataloader
 27 |     dataset = ListDataset(path, img_size=img_size, augment=False, multiscale=False)
 28 |     dataloader = torch.utils.data.DataLoader(
 29 |         dataset, batch_size=batch_size, shuffle=False, num_workers=1, collate_fn=dataset.collate_fn
 30 |     )
 31 | 
 32 |     Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
 33 | 
 34 |     labels = []
 35 |     sample_metrics = []  # List of tuples (TP, confs, pred)
 36 |     for batch_i, (_, imgs, targets) in enumerate(tqdm.tqdm(dataloader, desc="Detecting objects")):
 37 | 
 38 |         # Extract labels
 39 |         labels += targets[:, 1].tolist()
 40 |         # Rescale target
 41 |         targets[:, 2:] = xywh2xyxy(targets[:, 2:])
 42 |         targets[:, 2:] *= img_size
 43 | 
 44 |         imgs = Variable(imgs.type(Tensor), requires_grad=False)
 45 | 
 46 |         with torch.no_grad():
 47 |             outputs = model(imgs)
 48 |             outputs = non_max_suppression(outputs, conf_thres=conf_thres, nms_thres=nms_thres)
 49 | 
 50 |         sample_metrics += get_batch_statistics(outputs, targets, iou_threshold=iou_thres)
 51 | 
 52 |     # Concatenate sample statistics
 53 |     true_positives, pred_scores, pred_labels = [np.concatenate(x, 0) for x in list(zip(*sample_metrics))]
 54 |     precision, recall, AP, f1, ap_class = ap_per_class(true_positives, pred_scores, pred_labels, labels)
 55 | 
 56 |     return precision, recall, AP, f1, ap_class
 57 | 
 58 | 
 59 | if __name__ == "__main__":
 60 |     parser = argparse.ArgumentParser()
 61 |     parser.add_argument("--batch_size", type=int, default=8, help="size of each image batch")
 62 |     parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file")
 63 |     parser.add_argument("--data_config", type=str, default="config/coco.data", help="path to data config file")
 64 |     parser.add_argument("--weights_path", type=str, default="weights/yolov3.weights", help="path to weights file")
 65 |     parser.add_argument("--class_path", type=str, default="data/coco.names", help="path to class label file")
 66 |     parser.add_argument("--iou_thres", type=float, default=0.5, help="iou threshold required to qualify as detected")
 67 |     parser.add_argument("--conf_thres", type=float, default=0.001, help="object confidence threshold")
 68 |     parser.add_argument("--nms_thres", type=float, default=0.5, help="iou thresshold for non-maximum suppression")
 69 |     parser.add_argument("--n_cpu", type=int, default=8, help="number of cpu threads to use during batch generation")
 70 |     parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
 71 |     opt = parser.parse_args()
 72 |     print(opt)
 73 | 
 74 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 75 | 
 76 |     data_config = parse_data_config(opt.data_config)
 77 |     valid_path = data_config["valid"]
 78 |     class_names = load_classes(data_config["names"])
 79 | 
 80 |     # Initiate model
 81 |     model = Darknet(opt.model_def).to(device)
 82 |     if opt.weights_path.endswith(".weights"):
 83 |         # Load darknet weights
 84 |         model.load_darknet_weights(opt.weights_path)
 85 |     else:
 86 |         # Load checkpoint weights
 87 |         model.load_state_dict(torch.load(opt.weights_path))
 88 | 
 89 |     print("Compute mAP...")
 90 | 
 91 |     precision, recall, AP, f1, ap_class = evaluate(
 92 |         model,
 93 |         path=valid_path,
 94 |         iou_thres=opt.iou_thres,
 95 |         conf_thres=opt.conf_thres,
 96 |         nms_thres=opt.nms_thres,
 97 |         img_size=opt.img_size,
 98 |         batch_size=8,
 99 |     )
100 | 
101 |     print("Average Precisions:")
102 |     for i, c in enumerate(ap_class):
103 |         print(f"+ Class '{c}' ({class_names[c]}) - AP: {AP[i]}")
104 | 
105 |     print(f"mAP: {AP.mean()}")
106 | 


--------------------------------------------------------------------------------
/utils/datasets.py:
--------------------------------------------------------------------------------
  1 | import glob
  2 | import random
  3 | import os
  4 | import sys
  5 | import numpy as np
  6 | from PIL import Image
  7 | import torch
  8 | import torch.nn.functional as F
  9 | 
 10 | from utils.augmentations import horisontal_flip
 11 | from torch.utils.data import Dataset
 12 | import torchvision.transforms as transforms
 13 | 
 14 | 
 15 | def pad_to_square(img, pad_value):
 16 |     c, h, w = img.shape
 17 |     dim_diff = np.abs(h - w)
 18 |     # (upper / left) padding and (lower / right) padding
 19 |     pad1, pad2 = dim_diff // 2, dim_diff - dim_diff // 2
 20 |     # Determine padding
 21 |     pad = (0, 0, pad1, pad2) if h <= w else (pad1, pad2, 0, 0)
 22 |     # Add padding
 23 |     img = F.pad(img, pad, "constant", value=pad_value)
 24 | 
 25 |     return img, pad
 26 | 
 27 | 
 28 | def resize(image, size):
 29 |     image = F.interpolate(image.unsqueeze(0), size=size, mode="nearest").squeeze(0)
 30 |     return image
 31 | 
 32 | 
 33 | def random_resize(images, min_size=288, max_size=448):
 34 |     new_size = random.sample(list(range(min_size, max_size + 1, 32)), 1)[0]
 35 |     images = F.interpolate(images, size=new_size, mode="nearest")
 36 |     return images
 37 | 
 38 | 
 39 | class ImageFolder(Dataset):
 40 |     def __init__(self, folder_path, img_size=416):
 41 |         self.files = sorted(glob.glob("%s/*.*" % folder_path))
 42 |         self.img_size = img_size
 43 | 
 44 |     def __getitem__(self, index):
 45 |         img_path = self.files[index % len(self.files)]
 46 |         # Extract image as PyTorch tensor
 47 |         img = transforms.ToTensor()(Image.open(img_path))
 48 |         # Pad to square resolution
 49 |         img, _ = pad_to_square(img, 0)
 50 |         # Resize
 51 |         img = resize(img, self.img_size)
 52 | 
 53 |         return img_path, img
 54 | 
 55 |     def __len__(self):
 56 |         return len(self.files)
 57 | 
 58 | 
 59 | class ListDataset(Dataset):
 60 |     def __init__(self, list_path, img_size=416, augment=True, multiscale=True, normalized_labels=True):
 61 |         with open(list_path, "r") as file:
 62 |             self.img_files = file.readlines()
 63 | 
 64 |         self.label_files = [
 65 |             path.replace("images", "labels").replace(".png", ".txt").replace(".jpg", ".txt")
 66 |             for path in self.img_files
 67 |         ]
 68 |         self.img_size = img_size
 69 |         self.max_objects = 100
 70 |         self.augment = augment
 71 |         self.multiscale = multiscale
 72 |         self.normalized_labels = normalized_labels
 73 |         self.min_size = self.img_size - 3 * 32
 74 |         self.max_size = self.img_size + 3 * 32
 75 |         self.batch_count = 0
 76 | 
 77 |     def __getitem__(self, index):
 78 | 
 79 |         # ---------
 80 |         #  Image
 81 |         # ---------
 82 | 
 83 |         img_path = self.img_files[index % len(self.img_files)].rstrip()
 84 | 
 85 |         # Extract image as PyTorch tensor
 86 |         img = transforms.ToTensor()(Image.open(img_path).convert('RGB'))
 87 | 
 88 |         # Handle images with less than three channels
 89 |         if len(img.shape) != 3:
 90 |             img = img.unsqueeze(0)
 91 |             img = img.expand((3, img.shape[1:]))
 92 | 
 93 |         _, h, w = img.shape
 94 |         h_factor, w_factor = (h, w) if self.normalized_labels else (1, 1)
 95 |         # Pad to square resolution
 96 |         img, pad = pad_to_square(img, 0)
 97 |         _, padded_h, padded_w = img.shape
 98 | 
 99 |         # ---------
100 |         #  Label
101 |         # ---------
102 | 
103 |         label_path = self.label_files[index % len(self.img_files)].rstrip()
104 | 
105 |         targets = None
106 |         if os.path.exists(label_path):
107 |             boxes = torch.from_numpy(np.loadtxt(label_path).reshape(-1, 5))
108 |             # Extract coordinates for unpadded + unscaled image
109 |             x1 = w_factor * (boxes[:, 1] - boxes[:, 3] / 2)
110 |             y1 = h_factor * (boxes[:, 2] - boxes[:, 4] / 2)
111 |             x2 = w_factor * (boxes[:, 1] + boxes[:, 3] / 2)
112 |             y2 = h_factor * (boxes[:, 2] + boxes[:, 4] / 2)
113 |             # Adjust for added padding
114 |             x1 += pad[0]
115 |             y1 += pad[2]
116 |             x2 += pad[1]
117 |             y2 += pad[3]
118 |             # Returns (x, y, w, h)
119 |             boxes[:, 1] = ((x1 + x2) / 2) / padded_w
120 |             boxes[:, 2] = ((y1 + y2) / 2) / padded_h
121 |             boxes[:, 3] *= w_factor / padded_w
122 |             boxes[:, 4] *= h_factor / padded_h
123 | 
124 |             targets = torch.zeros((len(boxes), 6))
125 |             targets[:, 1:] = boxes
126 | 
127 |         # Apply augmentations
128 |         if self.augment:
129 |             if np.random.random() < 0.5:
130 |                 img, targets = horisontal_flip(img, targets)
131 | 
132 |         return img_path, img, targets
133 | 
134 |     def collate_fn(self, batch):
135 |         paths, imgs, targets = list(zip(*batch))
136 |         # Remove empty placeholder targets
137 |         targets = [boxes for boxes in targets if boxes is not None]
138 |         # Add sample index to targets
139 |         for i, boxes in enumerate(targets):
140 |             boxes[:, 0] = i
141 |         targets = torch.cat(targets, 0)
142 |         # Selects new image size every tenth batch
143 |         if self.multiscale and self.batch_count % 10 == 0:
144 |             self.img_size = random.choice(range(self.min_size, self.max_size + 1, 32))
145 |         # Resize images to input shape
146 |         imgs = torch.stack([resize(img, self.img_size) for img in imgs])
147 |         self.batch_count += 1
148 |         return paths, imgs, targets
149 | 
150 |     def __len__(self):
151 |         return len(self.img_files)
152 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | 
  3 | from models import *
  4 | from utils.logger import *
  5 | from utils.utils import *
  6 | from utils.datasets import *
  7 | from utils.parse_config import *
  8 | from test import evaluate
  9 | 
 10 | from terminaltables import AsciiTable
 11 | 
 12 | import os
 13 | import sys
 14 | import time
 15 | import datetime
 16 | import argparse
 17 | 
 18 | import torch
 19 | from torch.utils.data import DataLoader
 20 | from torchvision import datasets
 21 | from torchvision import transforms
 22 | from torch.autograd import Variable
 23 | import torch.optim as optim
 24 | 
 25 | if __name__ == "__main__":
 26 |     parser = argparse.ArgumentParser()
 27 |     parser.add_argument("--epochs", type=int, default=100, help="number of epochs")
 28 |     parser.add_argument("--batch_size", type=int, default=16, help="size of each image batch")
 29 |     parser.add_argument("--gradient_accumulations", type=int, default=2, help="number of gradient accums before step")
 30 |     parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file")
 31 |     parser.add_argument("--data_config", type=str, default="config/coco.data", help="path to data config file")
 32 |     parser.add_argument("--pretrained_weights", type=str, help="if specified starts from checkpoint model")
 33 |     parser.add_argument("--n_cpu", type=int, default=8, help="number of cpu threads to use during batch generation")
 34 |     parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
 35 |     parser.add_argument("--checkpoint_interval", type=int, default=1, help="interval between saving model weights")
 36 |     parser.add_argument("--evaluation_interval", type=int, default=1, help="interval evaluations on validation set")
 37 |     parser.add_argument("--compute_map", default=False, help="if True computes mAP every tenth batch")
 38 |     parser.add_argument("--multiscale_training", default=True, help="allow for multi-scale training")
 39 |     opt = parser.parse_args()
 40 |     print(opt)
 41 | 
 42 |     logger = Logger("logs")
 43 | 
 44 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 45 | 
 46 |     os.makedirs("output", exist_ok=True)
 47 |     os.makedirs("checkpoints", exist_ok=True)
 48 | 
 49 |     # Get data configuration
 50 |     data_config = parse_data_config(opt.data_config)
 51 |     train_path = data_config["train"]
 52 |     valid_path = data_config["valid"]
 53 |     class_names = load_classes(data_config["names"])
 54 | 
 55 |     # Initiate model
 56 |     model = Darknet(opt.model_def).to(device)
 57 |     model.apply(weights_init_normal)
 58 | 
 59 |     # If specified we start from checkpoint
 60 |     if opt.pretrained_weights:
 61 |         if opt.pretrained_weights.endswith(".pth"):
 62 |             model.load_state_dict(torch.load(opt.pretrained_weights))
 63 |         else:
 64 |             model.load_darknet_weights(opt.pretrained_weights)
 65 | 
 66 |     # Get dataloader
 67 |     dataset = ListDataset(train_path, augment=True, multiscale=opt.multiscale_training)
 68 |     dataloader = torch.utils.data.DataLoader(
 69 |         dataset,
 70 |         batch_size=opt.batch_size,
 71 |         shuffle=True,
 72 |         num_workers=opt.n_cpu,
 73 |         pin_memory=True,
 74 |         collate_fn=dataset.collate_fn,
 75 |     )
 76 | 
 77 |     optimizer = torch.optim.Adam(model.parameters())
 78 | 
 79 |     metrics = [
 80 |         "grid_size",
 81 |         "loss",
 82 |         "x",
 83 |         "y",
 84 |         "w",
 85 |         "h",
 86 |         "conf",
 87 |         "cls",
 88 |         "cls_acc",
 89 |         "recall50",
 90 |         "recall75",
 91 |         "precision",
 92 |         "conf_obj",
 93 |         "conf_noobj",
 94 |     ]
 95 | 
 96 |     for epoch in range(opt.epochs):
 97 |         model.train()
 98 |         start_time = time.time()
 99 |         for batch_i, (_, imgs, targets) in enumerate(dataloader):
100 |             batches_done = len(dataloader) * epoch + batch_i
101 | 
102 |             imgs = Variable(imgs.to(device))
103 |             targets = Variable(targets.to(device), requires_grad=False)
104 | 
105 |             loss, outputs = model(imgs, targets)
106 |             loss.backward()
107 | 
108 |             if batches_done % opt.gradient_accumulations:
109 |                 # Accumulates gradient before each step
110 |                 optimizer.step()
111 |                 optimizer.zero_grad()
112 | 
113 |             # ----------------
114 |             #   Log progress
115 |             # ----------------
116 | 
117 |             log_str = "\n---- [Epoch %d/%d, Batch %d/%d] ----\n" % (epoch, opt.epochs, batch_i, len(dataloader))
118 | 
119 |             metric_table = [["Metrics", *[f"YOLO Layer {i}" for i in range(len(model.yolo_layers))]]]
120 | 
121 |             # Log metrics at each YOLO layer
122 |             for i, metric in enumerate(metrics):
123 |                 formats = {m: "%.6f" for m in metrics}
124 |                 formats["grid_size"] = "%2d"
125 |                 formats["cls_acc"] = "%.2f%%"
126 |                 row_metrics = [formats[metric] % yolo.metrics.get(metric, 0) for yolo in model.yolo_layers]
127 |                 metric_table += [[metric, *row_metrics]]
128 | 
129 |                 # Tensorboard logging
130 |                 tensorboard_log = []
131 |                 for j, yolo in enumerate(model.yolo_layers):
132 |                     for name, metric in yolo.metrics.items():
133 |                         if name != "grid_size":
134 |                             tensorboard_log += [(f"{name}_{j+1}", metric)]
135 |                 tensorboard_log += [("loss", loss.item())]
136 |                 logger.list_of_scalars_summary(tensorboard_log, batches_done)
137 | 
138 |             log_str += AsciiTable(metric_table).table
139 |             log_str += f"\nTotal loss {loss.item()}"
140 | 
141 |             # Determine approximate time left for epoch
142 |             epoch_batches_left = len(dataloader) - (batch_i + 1)
143 |             time_left = datetime.timedelta(seconds=epoch_batches_left * (time.time() - start_time) / (batch_i + 1))
144 |             log_str += f"\n---- ETA {time_left}"
145 | 
146 |             print(log_str)
147 | 
148 |             model.seen += imgs.size(0)
149 | 
150 |         if epoch % opt.evaluation_interval == 0:
151 |             print("\n---- Evaluating Model ----")
152 |             # Evaluate the model on the validation set
153 |             precision, recall, AP, f1, ap_class = evaluate(
154 |                 model,
155 |                 path=valid_path,
156 |                 iou_thres=0.5,
157 |                 conf_thres=0.5,
158 |                 nms_thres=0.5,
159 |                 img_size=opt.img_size,
160 |                 batch_size=8,
161 |             )
162 |             evaluation_metrics = [
163 |                 ("val_precision", precision.mean()),
164 |                 ("val_recall", recall.mean()),
165 |                 ("val_mAP", AP.mean()),
166 |                 ("val_f1", f1.mean()),
167 |             ]
168 |             logger.list_of_scalars_summary(evaluation_metrics, epoch)
169 | 
170 |             # Print class APs and mAP
171 |             ap_table = [["Index", "Class name", "AP"]]
172 |             for i, c in enumerate(ap_class):
173 |                 ap_table += [[c, class_names[c], "%.5f" % AP[i]]]
174 |             print(AsciiTable(ap_table).table)
175 |             print(f"---- mAP {AP.mean()}")
176 | 
177 |         if epoch % opt.checkpoint_interval == 0:
178 |             torch.save(model.state_dict(), f"checkpoints/yolov3_ckpt_%d.pth" % epoch)
179 | 


--------------------------------------------------------------------------------
/detect.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | 
  3 | from models import *
  4 | from utils.utils import *
  5 | from utils.datasets import *
  6 | 
  7 | import os
  8 | import sys
  9 | import time
 10 | import datetime
 11 | import argparse
 12 | 
 13 | from PIL import Image
 14 | 
 15 | import torch
 16 | from torch.utils.data import DataLoader
 17 | from torchvision import datasets
 18 | from torch.autograd import Variable
 19 | 
 20 | import matplotlib.pyplot as plt
 21 | import matplotlib.patches as patches
 22 | from matplotlib.ticker import NullLocator
 23 | 
 24 | from torchstat import stat
 25 | from torch2trt import torch2trt
 26 | 
 27 | 
 28 | def Speed(TensorRT, Half):
 29 |     if Half:
 30 |         test_data = torch.rand(size=(1000, 1, 3, opt.img_size, opt.img_size)).cuda().half()
 31 |     else:
 32 |         test_data = torch.rand(size=(1000, 1, 3, opt.img_size, opt.img_size)).cuda()
 33 | 
 34 |     print("Start test speed")
 35 |     # 模型推理
 36 |     if TensorRT:
 37 | 
 38 |         start = time.time()
 39 |         for i in range(test_data.size()[0]):
 40 |             detections = model_backbone(test_data[i])
 41 |         end = time.time()
 42 |         if Half is False:
 43 |             print("Fp32 Backbone Speed:", 1 / (end - start) * test_data.size()[0], "Hz")
 44 |         else:
 45 |             print("Fp16 Backbone Speed:", 1 / (end - start) * test_data.size()[0], "Hz")
 46 | 
 47 |         start = time.time()
 48 |         for i in range(test_data.size()[0]):
 49 |             detections = model_trt(test_data[i])
 50 |         end = time.time()
 51 |         if Half is False:
 52 |             print("Fp32 TensorRT Backbone Speed:", 1 / (end - start) * test_data.size()[0], "Hz")
 53 |         else:
 54 |             print("Fp16 TensorRT Backbone Speed:", 1 / (end - start) * test_data.size()[0], "Hz")
 55 | 
 56 |         start = time.time()
 57 |         for i in range(test_data.size()[0]):
 58 |             detections = yolo_head(model_trt(test_data[i]))
 59 |             detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres, method=1)
 60 |         end = time.time()
 61 |         if Half is False:
 62 |             print("Fp32 TensorRT Model Detect Speed:", 1 / (end - start) * test_data.size()[0], "Hz")
 63 |         else:
 64 |             print("Fp16 TensorRT Model Detect Speed:", 1 / (end - start) * test_data.size()[0], "Hz")
 65 | 
 66 |     else:
 67 | 
 68 |         start = time.time()
 69 |         for i in range(test_data.size()[0]):
 70 |             detections = model(test_data[i])
 71 |             detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres, method=1)
 72 |         end = time.time()
 73 |         if Half is False:
 74 |             print("Fp32 Original Model Detect Speed:", 1 / (end - start) * test_data.size()[0], "Hz")
 75 |         else:
 76 |             print("Fp16 Original Model Detect Speed:", 1 / (end - start) * test_data.size()[0], "Hz")
 77 | 
 78 | 
 79 | if __name__ == "__main__":
 80 |     parser = argparse.ArgumentParser()
 81 |     parser.add_argument("--image_folder", type=str, default="data/samples/", help="path to dataset")
 82 |     parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file")
 83 |     parser.add_argument("--weights_path", type=str, default="weights/yolov3.weights", help="path to weights file")
 84 |     parser.add_argument("--class_path", type=str, default="data/coco.names", help="path to class label file")
 85 |     parser.add_argument("--conf_thres", type=float, default=0.8, help="object confidence threshold")
 86 |     parser.add_argument("--nms_thres", type=float, default=0.4, help="iou thresshold for non-maximum suppression")
 87 |     parser.add_argument("--batch_size", type=int, default=1, help="size of the batches")
 88 |     parser.add_argument("--n_cpu", type=int, default=0, help="number of cpu threads to use during batch generation")
 89 |     parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
 90 |     parser.add_argument("--checkpoint_model", type=str, help="path to checkpoint model")
 91 |     opt = parser.parse_args()
 92 |     # print(opt)
 93 | 
 94 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 95 | 
 96 |     os.makedirs("output", exist_ok=True)
 97 | 
 98 |     # 模型选择
 99 |     # TensorRT 只能加速 YOLO 的特征提取网络, YOLO Head 目前还不能应用
100 |     Speed_Test = True
101 |     TensorRT = True
102 |     Half = True    # 半精度
103 | 
104 |     if TensorRT is True:
105 |         if Half is True:
106 |             model_backbone = Darknet_Backbone(opt.model_def, img_size=opt.img_size).to(device).half()
107 |         else:
108 |             model_backbone = Darknet_Backbone(opt.model_def, img_size=opt.img_size).to(device)
109 | 
110 |         # 权重加载
111 |         if opt.weights_path.endswith(".weights"):
112 |             # Load darknet weights
113 |             model_backbone.load_darknet_weights(opt.weights_path)
114 |         else:
115 |             # Load checkpoint weights
116 |             model_backbone.load_state_dict(torch.load(opt.weights_path))
117 |         # Set in evaluation mode 前向推理时候会忽略 BatchNormalization 和 Dropout
118 |         model_backbone.eval()
119 |         # 添加 Detection Head
120 |         yolo_head = YOLOHead(config_path=opt.model_def)
121 | 
122 |         # DarknetBackbone 转换为 TensorRT 模型
123 |         if Half is True:
124 |             x = torch.rand(size=(1, 3,  opt.img_size, opt.img_size)).cuda().half()
125 |             model_trt = torch2trt(model_backbone, [x], fp16_mode=True)
126 |         else:
127 |             x = torch.rand(size=(1, 3,  opt.img_size, opt.img_size)).cuda()
128 |             model_trt = torch2trt(model_backbone, [x])
129 | 
130 |     else:
131 |         if Half:
132 |             model = Darknet(opt.model_def, img_size=opt.img_size, TensorRT=False, Half=True).to(device).half()
133 |         else:
134 |             model = Darknet(opt.model_def, img_size=opt.img_size).to(device)
135 | 
136 |         # 权重加载
137 |         if opt.weights_path.endswith(".weights"):
138 |             # Load darknet weights
139 |             model.load_darknet_weights(opt.weights_path)
140 |         else:
141 |             # Load checkpoint weights
142 |             model.load_state_dict(torch.load(opt.weights_path))
143 |         # Set in evaluation mode 前向推理时候会忽略 BatchNormalization 和 Dropout
144 |         model.eval()
145 | 
146 |     # 速度测试
147 |     if Speed_Test:
148 |         Speed(TensorRT, Half)
149 | 
150 |     dataloader = DataLoader(
151 |         ImageFolder(opt.image_folder, img_size=opt.img_size),
152 |         batch_size=opt.batch_size,
153 |         shuffle=False,
154 |         num_workers=opt.n_cpu,
155 |     )
156 | 
157 |     classes = load_classes(opt.class_path)  # Extracts class labels from file
158 | 
159 |     Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
160 | 
161 |     imgs = []  # Stores image paths
162 |     img_detections = []  # Stores detections for each image index
163 | 
164 |     # # YOLO 参数、运算量测试
165 |     # model.to("cpu")
166 |     # stat(model, input_size=(3, 416, 416))
167 | 
168 |     print("\nPerforming object detection:")
169 |     prev_time = time.time()
170 |     for batch_i, (img_paths, input_imgs) in enumerate(dataloader):
171 |         # Configure input
172 |         input_imgs = Variable(input_imgs.type(Tensor))
173 | 
174 |         # # 半精度模型输入
175 |         # input_imgs = Variable(input_imgs.type(Tensor)).half()
176 | 
177 |         # Get detections
178 |         with torch.no_grad():
179 | 
180 |             # 注释说明
181 |             # YOLOv3  return tensor size [batch_size, 10647, 85]
182 |             # YOLOv3-tiny return tensor size [batch_size, 2535, 85]
183 |             # 10647 = 3×13×13 + 3×26×26 + 3×52×52
184 |             # 2535 = 3×13×13 + 3×26×26
185 |             # 85: 其中前4维度为坐标，第5个维度为bbox的置信度，后面80个维度为coco80类目标的对应概率
186 | 
187 |             # TensorRT 加速
188 |             if TensorRT:
189 |                 detections = yolo_head(model_trt(input_imgs))
190 |                 detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres, method=2)
191 |             else:
192 |                 detections = model(input_imgs)
193 |                 detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres, method=2)
194 | 
195 |         # # 保存模型
196 |         # torch.save(model, "YOLOv3.pth")
197 |         # # 打印模型
198 |         # print(model)
199 | 
200 |         # Log progress
201 |         current_time = time.time()
202 |         inference_time = datetime.timedelta(seconds=current_time - prev_time)
203 |         prev_time = current_time
204 |         print("\t+ Batch %d, Inference Time: %s" % (batch_i, inference_time))
205 | 
206 |         # Save image and detections
207 |         imgs.extend(img_paths)
208 |         img_detections.extend(detections)
209 | 
210 |     # Bounding-box colors
211 |     cmap = plt.get_cmap("tab20b")
212 |     colors = [cmap(i) for i in np.linspace(0, 1, 20)]
213 | 
214 |     print("\nSaving images:")
215 |     # Iterate through images and save plot of detections
216 |     for img_i, (path, detections) in enumerate(zip(imgs, img_detections)):
217 | 
218 |         print("(%d) Image: '%s'" % (img_i, path))
219 | 
220 |         # Create plot
221 |         img = np.array(Image.open(path))
222 |         plt.figure()
223 |         fig, ax = plt.subplots(1)
224 |         ax.imshow(img)
225 | 
226 |         # Draw bounding boxes and labels of detections
227 |         if detections is not None:
228 |             # Rescale boxes to original image
229 |             detections = rescale_boxes(detections, opt.img_size, img.shape[:2])
230 |             unique_labels = detections[:, -1].cpu().unique()
231 |             n_cls_preds = len(unique_labels)
232 |             bbox_colors = random.sample(colors, n_cls_preds)
233 |             for x1, y1, x2, y2, conf, cls_conf, cls_pred in detections:
234 | 
235 |                 print("\t+ Label: %s, Conf: %.5f" % (classes[int(cls_pred)], cls_conf.item()))
236 | 
237 |                 box_w = x2 - x1
238 |                 box_h = y2 - y1
239 | 
240 |                 color = bbox_colors[int(np.where(unique_labels == int(cls_pred))[0])]
241 |                 # Create a Rectangle patch
242 |                 bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=color, facecolor="none")
243 |                 # Add the bbox to the plot
244 |                 ax.add_patch(bbox)
245 |                 # Add label
246 |                 plt.text(
247 |                     x1,
248 |                     y1,
249 |                     s=classes[int(cls_pred)],
250 |                     color="white",
251 |                     verticalalignment="top",
252 |                     bbox={"color": color, "pad": 0},
253 |                 )
254 | 
255 |         # Save generated image with detections
256 |         plt.axis("off")
257 |         plt.gca().xaxis.set_major_locator(NullLocator())
258 |         plt.gca().yaxis.set_major_locator(NullLocator())
259 |         filename = path.split("/")[-1].split(".")[0]
260 |         plt.savefig(f"output/{filename}.png", bbox_inches="tight", pad_inches=0.0)
261 |         plt.close()
262 | 


--------------------------------------------------------------------------------
/config/yolov3.cfg:
--------------------------------------------------------------------------------
  1 | [net]
  2 | # Testing
  3 | batch=1
  4 | subdivisions=1
  5 | # Training
  6 | ;batch=16
  7 | ;subdivisions=1
  8 | width=416
  9 | height=416
 10 | channels=3
 11 | momentum=0.9
 12 | decay=0.0005
 13 | angle=0
 14 | saturation = 1.5
 15 | exposure = 1.5
 16 | hue=.1
 17 | 
 18 | learning_rate=0.001
 19 | burn_in=1000
 20 | max_batches = 500200
 21 | policy=steps
 22 | steps=400000,450000
 23 | scales=.1,.1
 24 | 
 25 | [convolutional]
 26 | batch_normalize=1
 27 | filters=32
 28 | size=3
 29 | stride=1
 30 | pad=1
 31 | activation=leaky
 32 | 
 33 | # Downsample
 34 | 
 35 | [convolutional]
 36 | batch_normalize=1
 37 | filters=64
 38 | size=3
 39 | stride=2
 40 | pad=1
 41 | activation=leaky
 42 | 
 43 | [convolutional]
 44 | batch_normalize=1
 45 | filters=32
 46 | size=1
 47 | stride=1
 48 | pad=1
 49 | activation=leaky
 50 | 
 51 | [convolutional]
 52 | batch_normalize=1
 53 | filters=64
 54 | size=3
 55 | stride=1
 56 | pad=1
 57 | activation=leaky
 58 | 
 59 | [shortcut]
 60 | from=-3
 61 | activation=linear
 62 | 
 63 | # Downsample
 64 | 
 65 | [convolutional]
 66 | batch_normalize=1
 67 | filters=128
 68 | size=3
 69 | stride=2
 70 | pad=1
 71 | activation=leaky
 72 | 
 73 | [convolutional]
 74 | batch_normalize=1
 75 | filters=64
 76 | size=1
 77 | stride=1
 78 | pad=1
 79 | activation=leaky
 80 | 
 81 | [convolutional]
 82 | batch_normalize=1
 83 | filters=128
 84 | size=3
 85 | stride=1
 86 | pad=1
 87 | activation=leaky
 88 | 
 89 | [shortcut]
 90 | from=-3
 91 | activation=linear
 92 | 
 93 | [convolutional]
 94 | batch_normalize=1
 95 | filters=64
 96 | size=1
 97 | stride=1
 98 | pad=1
 99 | activation=leaky
100 | 
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 | 
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 | 
113 | # Downsample
114 | 
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 | 
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 | 
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 | 
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 | 
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 | 
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 | 
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 | 
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 | 
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 | 
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 | 
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 | 
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 | 
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 | 
203 | 
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 | 
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 | 
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 | 
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 | 
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 | 
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 | 
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 | 
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 | 
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 | 
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 | 
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 | 
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 | 
284 | # Downsample
285 | 
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 | 
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 | 
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 | 
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 | 
314 | 
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 | 
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 | 
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 | 
335 | 
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 | 
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 | 
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 | 
356 | 
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 | 
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 | 
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 | 
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 | 
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 | 
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 | 
397 | 
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 | 
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 | 
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 | 
418 | 
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 | 
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 | 
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 | 
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 | 
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 | 
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 | 
459 | # Downsample
460 | 
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 | 
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 | 
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 | 
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 | 
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 | 
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 | 
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 | 
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 | 
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 | 
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 | 
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 | 
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 | 
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 | 
549 | ######################
550 | 
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 | 
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 | 
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 | 
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 | 
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 | 
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 | 
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 | 
606 | 
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 | 
617 | 
618 | [route]
619 | layers = -4
620 | 
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 | 
629 | [upsample]
630 | stride=2
631 | 
632 | [route]
633 | layers = -1, 61
634 | 
635 | 
636 | 
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 | 
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 | 
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 | 
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 | 
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 | 
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 | 
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 | 
692 | 
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 | 
703 | 
704 | 
705 | [route]
706 | layers = -4
707 | 
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 | 
716 | [upsample]
717 | stride=2
718 | 
719 | [route]
720 | layers = -1, 36
721 | 
722 | 
723 | 
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 | 
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 | 
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 | 
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 | 
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 | 
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 | 
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 | 
779 | 
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 | 


--------------------------------------------------------------------------------
/config/create_custom_model.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | NUM_CLASSES=$1
  4 | 
  5 | echo "
  6 | [net]
  7 | # Testing
  8 | #batch=1
  9 | #subdivisions=1
 10 | # Training
 11 | batch=16
 12 | subdivisions=1
 13 | width=416
 14 | height=416
 15 | channels=3
 16 | momentum=0.9
 17 | decay=0.0005
 18 | angle=0
 19 | saturation = 1.5
 20 | exposure = 1.5
 21 | hue=.1
 22 | 
 23 | learning_rate=0.001
 24 | burn_in=1000
 25 | max_batches = 500200
 26 | policy=steps
 27 | steps=400000,450000
 28 | scales=.1,.1
 29 | 
 30 | [convolutional]
 31 | batch_normalize=1
 32 | filters=32
 33 | size=3
 34 | stride=1
 35 | pad=1
 36 | activation=leaky
 37 | 
 38 | # Downsample
 39 | 
 40 | [convolutional]
 41 | batch_normalize=1
 42 | filters=64
 43 | size=3
 44 | stride=2
 45 | pad=1
 46 | activation=leaky
 47 | 
 48 | [convolutional]
 49 | batch_normalize=1
 50 | filters=32
 51 | size=1
 52 | stride=1
 53 | pad=1
 54 | activation=leaky
 55 | 
 56 | [convolutional]
 57 | batch_normalize=1
 58 | filters=64
 59 | size=3
 60 | stride=1
 61 | pad=1
 62 | activation=leaky
 63 | 
 64 | [shortcut]
 65 | from=-3
 66 | activation=linear
 67 | 
 68 | # Downsample
 69 | 
 70 | [convolutional]
 71 | batch_normalize=1
 72 | filters=128
 73 | size=3
 74 | stride=2
 75 | pad=1
 76 | activation=leaky
 77 | 
 78 | [convolutional]
 79 | batch_normalize=1
 80 | filters=64
 81 | size=1
 82 | stride=1
 83 | pad=1
 84 | activation=leaky
 85 | 
 86 | [convolutional]
 87 | batch_normalize=1
 88 | filters=128
 89 | size=3
 90 | stride=1
 91 | pad=1
 92 | activation=leaky
 93 | 
 94 | [shortcut]
 95 | from=-3
 96 | activation=linear
 97 | 
 98 | [convolutional]
 99 | batch_normalize=1
100 | filters=64
101 | size=1
102 | stride=1
103 | pad=1
104 | activation=leaky
105 | 
106 | [convolutional]
107 | batch_normalize=1
108 | filters=128
109 | size=3
110 | stride=1
111 | pad=1
112 | activation=leaky
113 | 
114 | [shortcut]
115 | from=-3
116 | activation=linear
117 | 
118 | # Downsample
119 | 
120 | [convolutional]
121 | batch_normalize=1
122 | filters=256
123 | size=3
124 | stride=2
125 | pad=1
126 | activation=leaky
127 | 
128 | [convolutional]
129 | batch_normalize=1
130 | filters=128
131 | size=1
132 | stride=1
133 | pad=1
134 | activation=leaky
135 | 
136 | [convolutional]
137 | batch_normalize=1
138 | filters=256
139 | size=3
140 | stride=1
141 | pad=1
142 | activation=leaky
143 | 
144 | [shortcut]
145 | from=-3
146 | activation=linear
147 | 
148 | [convolutional]
149 | batch_normalize=1
150 | filters=128
151 | size=1
152 | stride=1
153 | pad=1
154 | activation=leaky
155 | 
156 | [convolutional]
157 | batch_normalize=1
158 | filters=256
159 | size=3
160 | stride=1
161 | pad=1
162 | activation=leaky
163 | 
164 | [shortcut]
165 | from=-3
166 | activation=linear
167 | 
168 | [convolutional]
169 | batch_normalize=1
170 | filters=128
171 | size=1
172 | stride=1
173 | pad=1
174 | activation=leaky
175 | 
176 | [convolutional]
177 | batch_normalize=1
178 | filters=256
179 | size=3
180 | stride=1
181 | pad=1
182 | activation=leaky
183 | 
184 | [shortcut]
185 | from=-3
186 | activation=linear
187 | 
188 | [convolutional]
189 | batch_normalize=1
190 | filters=128
191 | size=1
192 | stride=1
193 | pad=1
194 | activation=leaky
195 | 
196 | [convolutional]
197 | batch_normalize=1
198 | filters=256
199 | size=3
200 | stride=1
201 | pad=1
202 | activation=leaky
203 | 
204 | [shortcut]
205 | from=-3
206 | activation=linear
207 | 
208 | 
209 | [convolutional]
210 | batch_normalize=1
211 | filters=128
212 | size=1
213 | stride=1
214 | pad=1
215 | activation=leaky
216 | 
217 | [convolutional]
218 | batch_normalize=1
219 | filters=256
220 | size=3
221 | stride=1
222 | pad=1
223 | activation=leaky
224 | 
225 | [shortcut]
226 | from=-3
227 | activation=linear
228 | 
229 | [convolutional]
230 | batch_normalize=1
231 | filters=128
232 | size=1
233 | stride=1
234 | pad=1
235 | activation=leaky
236 | 
237 | [convolutional]
238 | batch_normalize=1
239 | filters=256
240 | size=3
241 | stride=1
242 | pad=1
243 | activation=leaky
244 | 
245 | [shortcut]
246 | from=-3
247 | activation=linear
248 | 
249 | [convolutional]
250 | batch_normalize=1
251 | filters=128
252 | size=1
253 | stride=1
254 | pad=1
255 | activation=leaky
256 | 
257 | [convolutional]
258 | batch_normalize=1
259 | filters=256
260 | size=3
261 | stride=1
262 | pad=1
263 | activation=leaky
264 | 
265 | [shortcut]
266 | from=-3
267 | activation=linear
268 | 
269 | [convolutional]
270 | batch_normalize=1
271 | filters=128
272 | size=1
273 | stride=1
274 | pad=1
275 | activation=leaky
276 | 
277 | [convolutional]
278 | batch_normalize=1
279 | filters=256
280 | size=3
281 | stride=1
282 | pad=1
283 | activation=leaky
284 | 
285 | [shortcut]
286 | from=-3
287 | activation=linear
288 | 
289 | # Downsample
290 | 
291 | [convolutional]
292 | batch_normalize=1
293 | filters=512
294 | size=3
295 | stride=2
296 | pad=1
297 | activation=leaky
298 | 
299 | [convolutional]
300 | batch_normalize=1
301 | filters=256
302 | size=1
303 | stride=1
304 | pad=1
305 | activation=leaky
306 | 
307 | [convolutional]
308 | batch_normalize=1
309 | filters=512
310 | size=3
311 | stride=1
312 | pad=1
313 | activation=leaky
314 | 
315 | [shortcut]
316 | from=-3
317 | activation=linear
318 | 
319 | 
320 | [convolutional]
321 | batch_normalize=1
322 | filters=256
323 | size=1
324 | stride=1
325 | pad=1
326 | activation=leaky
327 | 
328 | [convolutional]
329 | batch_normalize=1
330 | filters=512
331 | size=3
332 | stride=1
333 | pad=1
334 | activation=leaky
335 | 
336 | [shortcut]
337 | from=-3
338 | activation=linear
339 | 
340 | 
341 | [convolutional]
342 | batch_normalize=1
343 | filters=256
344 | size=1
345 | stride=1
346 | pad=1
347 | activation=leaky
348 | 
349 | [convolutional]
350 | batch_normalize=1
351 | filters=512
352 | size=3
353 | stride=1
354 | pad=1
355 | activation=leaky
356 | 
357 | [shortcut]
358 | from=-3
359 | activation=linear
360 | 
361 | 
362 | [convolutional]
363 | batch_normalize=1
364 | filters=256
365 | size=1
366 | stride=1
367 | pad=1
368 | activation=leaky
369 | 
370 | [convolutional]
371 | batch_normalize=1
372 | filters=512
373 | size=3
374 | stride=1
375 | pad=1
376 | activation=leaky
377 | 
378 | [shortcut]
379 | from=-3
380 | activation=linear
381 | 
382 | [convolutional]
383 | batch_normalize=1
384 | filters=256
385 | size=1
386 | stride=1
387 | pad=1
388 | activation=leaky
389 | 
390 | [convolutional]
391 | batch_normalize=1
392 | filters=512
393 | size=3
394 | stride=1
395 | pad=1
396 | activation=leaky
397 | 
398 | [shortcut]
399 | from=-3
400 | activation=linear
401 | 
402 | 
403 | [convolutional]
404 | batch_normalize=1
405 | filters=256
406 | size=1
407 | stride=1
408 | pad=1
409 | activation=leaky
410 | 
411 | [convolutional]
412 | batch_normalize=1
413 | filters=512
414 | size=3
415 | stride=1
416 | pad=1
417 | activation=leaky
418 | 
419 | [shortcut]
420 | from=-3
421 | activation=linear
422 | 
423 | 
424 | [convolutional]
425 | batch_normalize=1
426 | filters=256
427 | size=1
428 | stride=1
429 | pad=1
430 | activation=leaky
431 | 
432 | [convolutional]
433 | batch_normalize=1
434 | filters=512
435 | size=3
436 | stride=1
437 | pad=1
438 | activation=leaky
439 | 
440 | [shortcut]
441 | from=-3
442 | activation=linear
443 | 
444 | [convolutional]
445 | batch_normalize=1
446 | filters=256
447 | size=1
448 | stride=1
449 | pad=1
450 | activation=leaky
451 | 
452 | [convolutional]
453 | batch_normalize=1
454 | filters=512
455 | size=3
456 | stride=1
457 | pad=1
458 | activation=leaky
459 | 
460 | [shortcut]
461 | from=-3
462 | activation=linear
463 | 
464 | # Downsample
465 | 
466 | [convolutional]
467 | batch_normalize=1
468 | filters=1024
469 | size=3
470 | stride=2
471 | pad=1
472 | activation=leaky
473 | 
474 | [convolutional]
475 | batch_normalize=1
476 | filters=512
477 | size=1
478 | stride=1
479 | pad=1
480 | activation=leaky
481 | 
482 | [convolutional]
483 | batch_normalize=1
484 | filters=1024
485 | size=3
486 | stride=1
487 | pad=1
488 | activation=leaky
489 | 
490 | [shortcut]
491 | from=-3
492 | activation=linear
493 | 
494 | [convolutional]
495 | batch_normalize=1
496 | filters=512
497 | size=1
498 | stride=1
499 | pad=1
500 | activation=leaky
501 | 
502 | [convolutional]
503 | batch_normalize=1
504 | filters=1024
505 | size=3
506 | stride=1
507 | pad=1
508 | activation=leaky
509 | 
510 | [shortcut]
511 | from=-3
512 | activation=linear
513 | 
514 | [convolutional]
515 | batch_normalize=1
516 | filters=512
517 | size=1
518 | stride=1
519 | pad=1
520 | activation=leaky
521 | 
522 | [convolutional]
523 | batch_normalize=1
524 | filters=1024
525 | size=3
526 | stride=1
527 | pad=1
528 | activation=leaky
529 | 
530 | [shortcut]
531 | from=-3
532 | activation=linear
533 | 
534 | [convolutional]
535 | batch_normalize=1
536 | filters=512
537 | size=1
538 | stride=1
539 | pad=1
540 | activation=leaky
541 | 
542 | [convolutional]
543 | batch_normalize=1
544 | filters=1024
545 | size=3
546 | stride=1
547 | pad=1
548 | activation=leaky
549 | 
550 | [shortcut]
551 | from=-3
552 | activation=linear
553 | 
554 | ######################
555 | 
556 | [convolutional]
557 | batch_normalize=1
558 | filters=512
559 | size=1
560 | stride=1
561 | pad=1
562 | activation=leaky
563 | 
564 | [convolutional]
565 | batch_normalize=1
566 | size=3
567 | stride=1
568 | pad=1
569 | filters=1024
570 | activation=leaky
571 | 
572 | [convolutional]
573 | batch_normalize=1
574 | filters=512
575 | size=1
576 | stride=1
577 | pad=1
578 | activation=leaky
579 | 
580 | [convolutional]
581 | batch_normalize=1
582 | size=3
583 | stride=1
584 | pad=1
585 | filters=1024
586 | activation=leaky
587 | 
588 | [convolutional]
589 | batch_normalize=1
590 | filters=512
591 | size=1
592 | stride=1
593 | pad=1
594 | activation=leaky
595 | 
596 | [convolutional]
597 | batch_normalize=1
598 | size=3
599 | stride=1
600 | pad=1
601 | filters=1024
602 | activation=leaky
603 | 
604 | [convolutional]
605 | size=1
606 | stride=1
607 | pad=1
608 | filters=$(expr 3 \* $(expr $NUM_CLASSES \+ 5))
609 | activation=linear
610 | 
611 | 
612 | [yolo]
613 | mask = 6,7,8
614 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
615 | classes=$NUM_CLASSES
616 | num=9
617 | jitter=.3
618 | ignore_thresh = .7
619 | truth_thresh = 1
620 | random=1
621 | 
622 | 
623 | [route]
624 | layers = -4
625 | 
626 | [convolutional]
627 | batch_normalize=1
628 | filters=256
629 | size=1
630 | stride=1
631 | pad=1
632 | activation=leaky
633 | 
634 | [upsample]
635 | stride=2
636 | 
637 | [route]
638 | layers = -1, 61
639 | 
640 | 
641 | 
642 | [convolutional]
643 | batch_normalize=1
644 | filters=256
645 | size=1
646 | stride=1
647 | pad=1
648 | activation=leaky
649 | 
650 | [convolutional]
651 | batch_normalize=1
652 | size=3
653 | stride=1
654 | pad=1
655 | filters=512
656 | activation=leaky
657 | 
658 | [convolutional]
659 | batch_normalize=1
660 | filters=256
661 | size=1
662 | stride=1
663 | pad=1
664 | activation=leaky
665 | 
666 | [convolutional]
667 | batch_normalize=1
668 | size=3
669 | stride=1
670 | pad=1
671 | filters=512
672 | activation=leaky
673 | 
674 | [convolutional]
675 | batch_normalize=1
676 | filters=256
677 | size=1
678 | stride=1
679 | pad=1
680 | activation=leaky
681 | 
682 | [convolutional]
683 | batch_normalize=1
684 | size=3
685 | stride=1
686 | pad=1
687 | filters=512
688 | activation=leaky
689 | 
690 | [convolutional]
691 | size=1
692 | stride=1
693 | pad=1
694 | filters=$(expr 3 \* $(expr $NUM_CLASSES \+ 5))
695 | activation=linear
696 | 
697 | 
698 | [yolo]
699 | mask = 3,4,5
700 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
701 | classes=$NUM_CLASSES
702 | num=9
703 | jitter=.3
704 | ignore_thresh = .7
705 | truth_thresh = 1
706 | random=1
707 | 
708 | 
709 | 
710 | [route]
711 | layers = -4
712 | 
713 | [convolutional]
714 | batch_normalize=1
715 | filters=128
716 | size=1
717 | stride=1
718 | pad=1
719 | activation=leaky
720 | 
721 | [upsample]
722 | stride=2
723 | 
724 | [route]
725 | layers = -1, 36
726 | 
727 | 
728 | 
729 | [convolutional]
730 | batch_normalize=1
731 | filters=128
732 | size=1
733 | stride=1
734 | pad=1
735 | activation=leaky
736 | 
737 | [convolutional]
738 | batch_normalize=1
739 | size=3
740 | stride=1
741 | pad=1
742 | filters=256
743 | activation=leaky
744 | 
745 | [convolutional]
746 | batch_normalize=1
747 | filters=128
748 | size=1
749 | stride=1
750 | pad=1
751 | activation=leaky
752 | 
753 | [convolutional]
754 | batch_normalize=1
755 | size=3
756 | stride=1
757 | pad=1
758 | filters=256
759 | activation=leaky
760 | 
761 | [convolutional]
762 | batch_normalize=1
763 | filters=128
764 | size=1
765 | stride=1
766 | pad=1
767 | activation=leaky
768 | 
769 | [convolutional]
770 | batch_normalize=1
771 | size=3
772 | stride=1
773 | pad=1
774 | filters=256
775 | activation=leaky
776 | 
777 | [convolutional]
778 | size=1
779 | stride=1
780 | pad=1
781 | filters=$(expr 3 \* $(expr $NUM_CLASSES \+ 5))
782 | activation=linear
783 | 
784 | 
785 | [yolo]
786 | mask = 0,1,2
787 | anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
788 | classes=$NUM_CLASSES
789 | num=9
790 | jitter=.3
791 | ignore_thresh = .7
792 | truth_thresh = 1
793 | random=1
794 | " >> yolov3-custom.cfg
795 | 


--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import math
  3 | import time
  4 | import tqdm
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | from torch.autograd import Variable
  9 | import numpy as np
 10 | import matplotlib.pyplot as plt
 11 | import matplotlib.patches as patches
 12 | 
 13 | from torch2trt import torch2trt
 14 | 
 15 | 
 16 | def to_cpu(tensor):
 17 |     return tensor.detach().cpu().float()
 18 | 
 19 | 
 20 | def load_classes(path):
 21 |     """
 22 |     Loads class labels at 'path'
 23 |     """
 24 |     fp = open(path, "r")
 25 |     names = fp.read().split("\n")[:-1]
 26 |     return names
 27 | 
 28 | 
 29 | def weights_init_normal(m):
 30 |     classname = m.__class__.__name__
 31 |     if classname.find("Conv") != -1:
 32 |         torch.nn.init.normal_(m.weight.data, 0.0, 0.02)
 33 |     elif classname.find("BatchNorm2d") != -1:
 34 |         torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
 35 |         torch.nn.init.constant_(m.bias.data, 0.0)
 36 | 
 37 | 
 38 | def rescale_boxes(boxes, current_dim, original_shape):
 39 |     """ Rescales bounding boxes to the original shape """
 40 |     orig_h, orig_w = original_shape
 41 |     # The amount of padding that was added
 42 |     pad_x = max(orig_h - orig_w, 0) * (current_dim / max(original_shape))
 43 |     pad_y = max(orig_w - orig_h, 0) * (current_dim / max(original_shape))
 44 |     # Image height and width after padding is removed
 45 |     unpad_h = current_dim - pad_y
 46 |     unpad_w = current_dim - pad_x
 47 |     # Rescale bounding boxes to dimension of original image
 48 |     boxes[:, 0] = ((boxes[:, 0] - pad_x // 2) / unpad_w) * orig_w
 49 |     boxes[:, 1] = ((boxes[:, 1] - pad_y // 2) / unpad_h) * orig_h
 50 |     boxes[:, 2] = ((boxes[:, 2] - pad_x // 2) / unpad_w) * orig_w
 51 |     boxes[:, 3] = ((boxes[:, 3] - pad_y // 2) / unpad_h) * orig_h
 52 |     return boxes
 53 | 
 54 | 
 55 | def xywh2xyxy(x):
 56 |     y = x.new(x.shape)
 57 |     y[..., 0] = x[..., 0] - x[..., 2] / 2
 58 |     y[..., 1] = x[..., 1] - x[..., 3] / 2
 59 |     y[..., 2] = x[..., 0] + x[..., 2] / 2
 60 |     y[..., 3] = x[..., 1] + x[..., 3] / 2
 61 |     return y
 62 | 
 63 | 
 64 | def ap_per_class(tp, conf, pred_cls, target_cls):
 65 |     """ Compute the average precision, given the recall and precision curves.
 66 |     Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
 67 |     # Arguments
 68 |         tp:    True positives (list).
 69 |         conf:  Objectness value from 0-1 (list).
 70 |         pred_cls: Predicted object classes (list).
 71 |         target_cls: True object classes (list).
 72 |     # Returns
 73 |         The average precision as computed in py-faster-rcnn.
 74 |     """
 75 | 
 76 |     # Sort by objectness
 77 |     i = np.argsort(-conf)
 78 |     tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
 79 | 
 80 |     # Find unique classes
 81 |     unique_classes = np.unique(target_cls)
 82 | 
 83 |     # Create Precision-Recall curve and compute AP for each class
 84 |     ap, p, r = [], [], []
 85 |     for c in tqdm.tqdm(unique_classes, desc="Computing AP"):
 86 |         i = pred_cls == c
 87 |         n_gt = (target_cls == c).sum()  # Number of ground truth objects
 88 |         n_p = i.sum()  # Number of predicted objects
 89 | 
 90 |         if n_p == 0 and n_gt == 0:
 91 |             continue
 92 |         elif n_p == 0 or n_gt == 0:
 93 |             ap.append(0)
 94 |             r.append(0)
 95 |             p.append(0)
 96 |         else:
 97 |             # Accumulate FPs and TPs
 98 |             fpc = (1 - tp[i]).cumsum()
 99 |             tpc = (tp[i]).cumsum()
100 | 
101 |             # Recall
102 |             recall_curve = tpc / (n_gt + 1e-16)
103 |             r.append(recall_curve[-1])
104 | 
105 |             # Precision
106 |             precision_curve = tpc / (tpc + fpc)
107 |             p.append(precision_curve[-1])
108 | 
109 |             # AP from recall-precision curve
110 |             ap.append(compute_ap(recall_curve, precision_curve))
111 | 
112 |     # Compute F1 score (harmonic mean of precision and recall)
113 |     p, r, ap = np.array(p), np.array(r), np.array(ap)
114 |     f1 = 2 * p * r / (p + r + 1e-16)
115 | 
116 |     return p, r, ap, f1, unique_classes.astype("int32")
117 | 
118 | 
def compute_ap(recall, precision):
    """ Area under the recall-precision curve, using the precision envelope.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.

    # Arguments
        recall:    The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """
    # Pad both curves with sentinel values at the start and the end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([0.0], precision, [0.0]))

    # Precision envelope: every point becomes the maximum of itself and all
    # points to its right (a running maximum taken from the end backwards)
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]

    # Only positions where recall changes contribute area
    steps = np.where(mrec[1:] != mrec[:-1])[0]

    # Sum of (delta recall) * precision over those positions
    recall_deltas = mrec[steps + 1] - mrec[steps]
    return np.sum(recall_deltas * mpre[steps + 1])
145 | 
146 | 
def get_batch_statistics(outputs, targets, iou_threshold):
    """ Compute true positives, predicted scores and predicted labels per sample

    Samples whose entry in `outputs` is None contribute nothing. For every
    other sample one [true_positives, pred_scores, pred_labels] entry is
    appended to the returned list.
    """
    batch_metrics = []
    for sample_i, output in enumerate(outputs):
        if output is None:
            continue

        # Columns: first four are the box, column 4 the score, last the label
        pred_boxes = output[:, :4]
        pred_scores = output[:, 4]
        pred_labels = output[:, -1]

        true_positives = np.zeros(pred_boxes.shape[0])

        # Rows of `targets` that belong to this sample, sample index stripped
        annotations = targets[targets[:, 0] == sample_i][:, 1:]
        target_labels = annotations[:, 0] if len(annotations) else []
        if len(annotations):
            detected_boxes = []
            target_boxes = annotations[:, 1:]

            for pred_i, pred_box in enumerate(pred_boxes):
                pred_label = pred_labels[pred_i]

                # Every target already matched: nothing left to find
                if len(detected_boxes) == len(annotations):
                    break

                # Skip predictions whose label does not occur in the targets
                if pred_label not in target_labels:
                    continue

                # Best-overlapping target for this prediction
                iou, box_index = bbox_iou(pred_box.unsqueeze(0), target_boxes).max(0)
                if iou >= iou_threshold and box_index not in detected_boxes:
                    true_positives[pred_i] = 1
                    detected_boxes.append(box_index)
        batch_metrics.append([true_positives, pred_scores, pred_labels])
    return batch_metrics
184 | 
185 | 
def bbox_wh_iou(wh1, wh2):
    """IoU computed from widths and heights only.

    `wh1` is indexed as (w, h); `wh2` is transposed first, so its rows are
    (w, h) pairs. The intersection is min(w) * min(h).
    """
    transposed = wh2.t()
    w1, h1 = wh1[0], wh1[1]
    w2, h2 = transposed[0], transposed[1]
    overlap = torch.min(w1, w2) * torch.min(h1, h2)
    # The small epsilon keeps the denominator non-zero
    combined = (w1 * h1 + 1e-16) + w2 * h2 - overlap
    return overlap / combined
193 | 
194 | 
def bbox_iou(box1, box2, x1y1x2y2=True):
    """
    Returns the IoU of two bounding boxes

    With x1y1x2y2=True the columns are read as (x1, y1, x2, y2); otherwise
    they are read as (center x, center y, width, height) and converted.
    Box extents are measured with a +1 on each side length.
    """
    if x1y1x2y2:
        # Columns already hold corner coordinates
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
    else:
        # Expand center/size into corner coordinates
        b1_x1 = box1[:, 0] - box1[:, 2] / 2
        b1_x2 = box1[:, 0] + box1[:, 2] / 2
        b1_y1 = box1[:, 1] - box1[:, 3] / 2
        b1_y2 = box1[:, 1] + box1[:, 3] / 2
        b2_x1 = box2[:, 0] - box2[:, 2] / 2
        b2_x2 = box2[:, 0] + box2[:, 2] / 2
        b2_y1 = box2[:, 1] - box2[:, 3] / 2
        b2_y2 = box2[:, 1] + box2[:, 3] / 2

    # Corners of the intersection rectangle
    left = torch.max(b1_x1, b2_x1)
    top = torch.max(b1_y1, b2_y1)
    right = torch.min(b1_x2, b2_x2)
    bottom = torch.min(b1_y2, b2_y2)

    # Clamp so that non-overlapping boxes give zero area
    inter_w = torch.clamp(right - left + 1, min=0)
    inter_h = torch.clamp(bottom - top + 1, min=0)
    inter_area = inter_w * inter_h

    # Individual areas, then union = a + b - intersection
    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

    return inter_area / (b1_area + b2_area - inter_area + 1e-16)
226 | 
227 | 
def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4, method=1):
    """
    Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression to further filter detections.

    # Arguments
        prediction: Per-image predictions; the first four entries of the last
                    axis are (center x, center y, width, height), entry 4 is
                    the object confidence and entries 5: are class scores.
                    NOTE: the box entries are converted to corner format
                    in place.
        conf_thres: Minimum object confidence for a detection to be kept.
        nms_thres:  IoU above which same-label boxes are merged (method 1).
        method:     1 = NMS with confidence-weighted merging of overlapping
                    boxes, 2 = soft NMS via soft_nms_pytorch. Any other value
                    leaves every output entry as None.
    # Returns
        A list with one entry per image: None when nothing was kept,
        otherwise a tensor whose rows are
        (x1, y1, x2, y2, object_conf, class_score, class_pred)
    """

    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])
    output = [None for _ in range(len(prediction))]
    for image_i, image_pred in enumerate(prediction):
        # Filter out confidence scores below threshold
        image_pred = image_pred[image_pred[:, 4] >= conf_thres]

        # If none are remaining => process next image
        if not image_pred.size(0):
            continue
        # Object confidence times class confidence.
        # image_pred[:, 5:].max(1) returns a pair:
        #   [0] is the highest class confidence of each row,
        #   [1] is the index of that confidence, i.e. the predicted class.
        score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0]
        # Sort by it
        image_pred = image_pred[(-score).argsort()]
        class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True)
        detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1)

        if method == 1:
            # Perform non-maximum suppression
            keep_boxes = []
            while detections.size(0):
                large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres
                label_match = detections[0, -1] == detections[:, -1]
                # Indices of boxes with lower confidence scores, large IOUs and matching labels
                invalid = large_overlap & label_match
                weights = detections[invalid, 4:5]
                # Merge overlapping bboxes by order of confidence
                detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
                keep_boxes += [detections[0]]
                detections = detections[~invalid]
            if keep_boxes:
                output[image_i] = torch.stack(keep_boxes)

        elif method == 2:
            # Perform soft non-maximum suppression. The CUDA flag follows the
            # device of the detections; a hard-coded 0 makes the helper mix
            # CPU and GPU tensors when the predictions live on the GPU.
            keep_boxes = soft_nms_pytorch(detections, sigma=0.25, thresh=0.1, cuda=detections.is_cuda)
            if keep_boxes:
                output[image_i] = torch.stack(keep_boxes)

    return output
283 | 
284 | 
def soft_nms_pytorch(dets, sigma=0.5, thresh=0.2, cuda=0):
    """
    Build a pytorch implement of Soft NMS algorithm.

    Boxes are visited in order of decreasing score; after each box is
    selected, the scores of the remaining boxes are decayed by a Gaussian
    of their IoU with it. `dets` is modified in place (rows are reordered
    and column 4 holds the decayed scores).

    # Arguments
        dets:        boxes coordinate tensor (format:[x1, y1, x2, y2, bbox_confs, cls_confs, cls_pred])
        sigma:       variance of Gaussian function
        thresh:      score thresh
        cuda:        CUDA flag. Kept for backward compatibility only: all
                     work is done on the device of `dets`, so the flag no
                     longer has any effect.
    # Return
        the selected bboxes, as a list of row tensors
    """

    N = dets.shape[0]

    # The order of boxes coordinate is [x1, y1, x2, y2]
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    # View into dets: swapping rows or decaying scores updates both
    scores = dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)

    for i in range(N):
        pos = i + 1

        if i != N - 1:
            # Bring the highest-scoring remaining box to position i
            maxscore, maxpos = torch.max(scores[pos:], dim=0)
            if scores[i] < maxscore:
                j = maxpos.item() + pos
                dets[i], dets[j] = dets[j].clone(), dets[i].clone()
                areas[i], areas[j] = areas[j].clone(), areas[i].clone()

        # IoU of box i with every later box, computed with torch ops so the
        # result stays on the same device as dets (no CPU/NumPy round trip)
        xx1 = torch.max(dets[i, 0], dets[pos:, 0])
        yy1 = torch.max(dets[i, 1], dets[pos:, 1])
        xx2 = torch.min(dets[i, 2], dets[pos:, 2])
        yy2 = torch.min(dets[i, 3], dets[pos:, 3])

        w = torch.clamp(xx2 - xx1 + 1, min=0)
        h = torch.clamp(yy2 - yy1 + 1, min=0)
        inter = w * h
        ovr = inter / (areas[i] + areas[pos:] - inter)

        # Gaussian decay
        weight = torch.exp(-(ovr * ovr) / sigma)
        scores[pos:] = weight * scores[pos:]

    # select the boxes whose decayed score is still above the threshold
    keep = dets[scores > thresh]
    return [keep[i] for i in range(keep.size(0))]
342 | 
343 | 
def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres):
    """
    Build the per-cell training targets and masks for one detection layer.

    # Arguments
        pred_boxes:   predicted boxes; sizes 0, 1 and 2 are read as batch size,
                      number of anchors and grid size
        pred_cls:     class predictions; the last dimension is the class count
        target:       2-D tensor; column 0 is used as the image index, column 1 as
                      the class label and columns 2-5 as a box that is multiplied
                      by the grid size (presumably normalised x, y, w, h - confirm
                      against the dataset loader)
        anchors:      anchor sizes, indexable as anchors[idx][:, 0] / [:, 1]
        ignore_thres: anchors whose IoU with a target exceeds this value are
                      removed from the no-object mask
    # Return
        iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf
    """
    on_gpu = pred_boxes.is_cuda
    ByteTensor = torch.cuda.ByteTensor if on_gpu else torch.ByteTensor
    FloatTensor = torch.cuda.FloatTensor if on_gpu else torch.FloatTensor

    batch_n = pred_boxes.size(0)
    anchor_n = pred_boxes.size(1)
    class_n = pred_cls.size(-1)
    grid_n = pred_boxes.size(2)
    cells = (batch_n, anchor_n, grid_n, grid_n)

    def zeros():
        # One float slot per (image, anchor, row, column), initialised to zero.
        return FloatTensor(*cells).fill_(0)

    # Output tensors
    obj_mask = ByteTensor(*cells).fill_(0)
    noobj_mask = ByteTensor(*cells).fill_(1)
    class_mask = zeros()
    iou_scores = zeros()
    tx = zeros()
    ty = zeros()
    tw = zeros()
    th = zeros()
    tcls = FloatTensor(*cells, class_n).fill_(0)

    # Target boxes expressed in grid units
    target_boxes = target[:, 2:6] * grid_n
    centers = target_boxes[:, :2]
    sizes = target_boxes[:, 2:]

    # IoU of every anchor with every target (anchors along dim 0); pick the best anchor
    anchor_ious = torch.stack([bbox_wh_iou(anchor, sizes) for anchor in anchors])
    best_ious, best_anchor = anchor_ious.max(0)

    # Split the target columns
    img_idx, labels = target[:, :2].long().t()
    gx, gy = centers.t()
    gw, gh = sizes.t()
    col, row = centers.long().t()

    # Mark the responsible (image, anchor, row, col) cells
    obj_mask[img_idx, best_anchor, row, col] = 1
    noobj_mask[img_idx, best_anchor, row, col] = 0

    # Anchors overlapping a target by more than ignore_thres are not penalised as background
    for k, per_target_ious in enumerate(anchor_ious.t()):
        noobj_mask[img_idx[k], per_target_ious > ignore_thres, row[k], col[k]] = 0

    # Centre offsets inside the cell
    tx[img_idx, best_anchor, row, col] = gx - gx.floor()
    ty[img_idx, best_anchor, row, col] = gy - gy.floor()
    # Log-ratio of target size to anchor size
    chosen = anchors[best_anchor]
    tw[img_idx, best_anchor, row, col] = torch.log(gw / chosen[:, 0] + 1e-16)
    th[img_idx, best_anchor, row, col] = torch.log(gh / chosen[:, 1] + 1e-16)
    # One-hot class label
    tcls[img_idx, best_anchor, row, col, labels] = 1
    # Whether the arg-max class matches the label, and IoU of the prediction at the best anchor
    predicted_label = pred_cls[img_idx, best_anchor, row, col].argmax(-1)
    class_mask[img_idx, best_anchor, row, col] = (predicted_label == labels).float()
    iou_scores[img_idx, best_anchor, row, col] = bbox_iou(
        pred_boxes[img_idx, best_anchor, row, col], target_boxes, x1y1x2y2=False
    )

    tconf = obj_mask.float()
    return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf
399 | 
400 | 
401 | 
402 | 


--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | from torch.autograd import Variable
  7 | import numpy as np
  8 | 
  9 | from utils.parse_config import *
 10 | from utils.utils import build_targets, to_cpu, non_max_suppression
 11 | 
 12 | import matplotlib.pyplot as plt
 13 | import matplotlib.patches as patches
 14 | 
 15 | from torch2trt import torch2trt
 16 | 
 17 | 
 18 | def create_modules(module_defs, TensorRT):
 19 |     """
 20 |     Constructs module list of layer blocks from module configuration in module_defs
 21 |     """
 22 |     hyperparams = module_defs.pop(0)
 23 |     output_filters = [int(hyperparams["channels"])]
 24 |     module_list = nn.ModuleList()
 25 |     for module_i, module_def in enumerate(module_defs):
 26 | 
 27 |         modules = nn.Sequential()
 28 | 
 29 |         if module_def["type"] == "convolutional":
 30 |             bn = int(module_def["batch_normalize"])
 31 |             filters = int(module_def["filters"])
 32 |             kernel_size = int(module_def["size"])
 33 |             pad = (kernel_size - 1) // 2
 34 |             modules.add_module(
 35 |                 f"conv_{module_i}",
 36 |                 nn.Conv2d(
 37 |                     in_channels=output_filters[-1],
 38 |                     out_channels=filters,
 39 |                     kernel_size=kernel_size,
 40 |                     stride=int(module_def["stride"]),
 41 |                     padding=pad,
 42 |                     bias=not bn,
 43 |                 ),
 44 |             )
 45 |             if bn:
 46 |                 modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5))
 47 |             if module_def["activation"] == "leaky":
 48 |                 modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))
 49 | 
 50 |         elif module_def["type"] == "maxpool":
 51 |             kernel_size = int(module_def["size"])
 52 |             stride = int(module_def["stride"])
 53 |             if kernel_size == 2 and stride == 1:
 54 |                 modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
 55 |             maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2))
 56 |             modules.add_module(f"maxpool_{module_i}", maxpool)
 57 | 
 58 |         elif module_def["type"] == "upsample":
 59 |             upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
 60 |             modules.add_module(f"upsample_{module_i}", upsample)
 61 | 
 62 |         elif module_def["type"] == "route":
 63 |             layers = [int(x) for x in module_def["layers"].split(",")]
 64 |             filters = sum([output_filters[1:][i] for i in layers])
 65 |             modules.add_module(f"route_{module_i}", EmptyLayer())
 66 | 
 67 |         elif module_def["type"] == "shortcut":
 68 |             filters = output_filters[1:][int(module_def["from"])]
 69 |             modules.add_module(f"shortcut_{module_i}", EmptyLayer())
 70 | 
 71 |         elif module_def["type"] == "yolo":
 72 |             if TensorRT:
 73 |                 pass
 74 |             else:
 75 |                 anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
 76 |                 # print(anchor_idxs)
 77 |                 # Extract anchors
 78 |                 anchors = [int(x) for x in module_def["anchors"].split(",")]
 79 |                 anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
 80 |                 anchors = [anchors[i] for i in anchor_idxs]
 81 |                 # print(anchors)
 82 |                 num_classes = int(module_def["classes"])
 83 |                 img_size = int(hyperparams["height"])
 84 | 
 85 |                 # Define detection layer
 86 |                 yolo_layer = YOLOLayer(anchors, num_classes, img_size)
 87 |                 modules.add_module(f"yolo_{module_i}", yolo_layer)
 88 |         # Register module list and number of output filters
 89 |         module_list.append(modules)
 90 |         output_filters.append(filters)
 91 | 
 92 |     return hyperparams, module_list
 93 | 
 94 | 
 95 | class Upsample(nn.Module):
 96 |     """ nn.Upsample is deprecated """
 97 | 
 98 |     def __init__(self, scale_factor, mode="nearest"):
 99 |         super(Upsample, self).__init__()
100 |         self.scale_factor = scale_factor
101 |         self.mode = mode
102 | 
103 |     def forward(self, x):
104 |         # print("F.interpolate")
105 |         x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
106 |         return x
107 | 
108 | 
class EmptyLayer(nn.Module):
    """No-op module registered for 'route' and 'shortcut' entries; it defines no forward of its own."""

    def __init__(self):
        super().__init__()
114 | 
115 | 
class YOLOLayer(nn.Module):
    """Detection layer: decodes a raw head feature map into boxes and, when targets are given, computes the loss."""

    def __init__(self, anchors, num_classes, img_dim=416):
        """
        # Arguments
            anchors:     sequence of (width, height) anchor pairs for this layer
            num_classes: number of object classes
            img_dim:     input image size used until forward() is given another one
        """
        super(YOLOLayer, self).__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.ignore_thres = 0.5
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()
        self.obj_scale = 1  # weight of the confidence loss on object cells
        self.noobj_scale = 100  # weight of the confidence loss on no-object cells
        self.metrics = {}
        self.img_dim = img_dim
        self.grid_size = 0  # grid size

        # Attributes filled in by compute_grid_offsets(); declared here as placeholders
        self.stride = 0
        self.grid_x = 0
        self.grid_y = 0
        self.scaled_anchors = 0
        self.anchor_w = 0
        self.anchor_h = 0

    @staticmethod
    def _tensor_type(cuda, half):
        """Pick the legacy tensor class matching the requested device and precision."""
        if cuda:
            return torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
        return torch.HalfTensor if half else torch.FloatTensor

    def compute_grid_offsets(self, grid_size, img_dim, cuda=True, Half=False):
        """
        Cache the stride, the per-cell x/y offsets and the anchors rescaled to grid units.

        # Arguments
            grid_size: number of cells along one side of the feature map
            img_dim:   input image size in pixels
            cuda:      build the cached tensors on the GPU when true, on the CPU otherwise
            Half:      build the cached tensors in half precision when true
        """
        self.grid_size = grid_size
        g = self.grid_size
        # Honour BOTH flags; previously the device choice was overwritten and a CUDA
        # tensor type was always selected, which broke CPU execution.
        FloatTensor = self._tensor_type(cuda, Half)
        self.img_dim = img_dim
        self.stride = self.img_dim / self.grid_size
        # Calculate offsets for each grid
        self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
        self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
        self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
        self.anchor_w = self.scaled_anchors[:, 0].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1].view((1, self.num_anchors, 1, 1))

    def forward(self, x, targets=None, img_dim=None, Half=False):
        """
        Decode the feature map `x` and optionally compute the training loss.

        # Arguments
            x:       feature map of shape (batch, num_anchors * (num_classes + 5), grid, grid)
            targets: ground-truth tensor passed on to build_targets, or None for inference
            img_dim: input image size; when None the value stored on the layer is kept
            Half:    forwarded to compute_grid_offsets when the grid cache is rebuilt
        # Return
            (output, loss): `output` has shape (batch, num_anchors * grid * grid, num_classes + 5)
            with the box columns multiplied by the stride; `loss` is 0 when targets is None,
            otherwise the summed loss tensor. In training mode self.metrics is refreshed too.
        """
        # Tensor type matching the input's device and precision
        FloatTensor = self._tensor_type(x.is_cuda, x.dtype == torch.float16)

        # With a 416x416 input, x is (batch, 255, 13, 13) or (batch, 255, 26, 26) for COCO:
        # 255 = 3 * (4 + 1 + 80) -> 3 boxes per cell, 4 box coordinates, 1 box confidence,
        # 80 class scores. Box coordinates are ordered centre x, centre y, width, height.
        if img_dim is None:
            # Fall back to the stored size instead of overwriting it with None
            img_dim = self.img_dim
        self.img_dim = img_dim
        num_samples = x.size(0)
        grid_size = x.size(2)

        # prediction: (batch, num_anchors, grid, grid, num_classes + 5)
        prediction = (
            x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
            .permute(0, 1, 3, 4, 2)  # move the per-box values to the last dimension
            .contiguous()
        )

        # Centre x, centre y, confidence and class scores are squashed into (0, 1) by a
        # sigmoid; width and height stay unbounded because values above 1 do occur.
        x = torch.sigmoid(prediction[..., 0])  # Center x
        y = torch.sigmoid(prediction[..., 1])  # Center y
        w = prediction[..., 2]  # Width
        h = prediction[..., 3]  # Height
        pred_conf = torch.sigmoid(prediction[..., 4])  # Conf (box confidence)
        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

        # If grid size does not match current we compute new offsets
        if grid_size != self.grid_size:
            self.compute_grid_offsets(grid_size, img_dim, cuda=x.is_cuda, Half=Half)

        # pred_boxes holds the boxes predicted by the network (in grid units):
        # add the cell offset to the centre and scale the anchors by exp(w), exp(h)
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = x.data + self.grid_x
        pred_boxes[..., 1] = y.data + self.grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

        output = torch.cat(
            (
                pred_boxes.view(num_samples, -1, 4) * self.stride,
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )

        # `targets` distinguishes training (given) from inference (None)
        if targets is None:
            return output, 0
        else:
            iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
                pred_boxes=pred_boxes,
                pred_cls=pred_cls,
                target=targets,
                anchors=self.scaled_anchors,
                ignore_thres=self.ignore_thres,
            )

            # Loss : Mask outputs to ignore non-existing objects (except with conf. loss)
            loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
            loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
            loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
            loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
            loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
            loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
            # Positive and negative samples carry their own weights (obj_scale, noobj_scale)
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
            loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
            total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

            # Metrics
            cls_acc = 100 * class_mask[obj_mask].mean()
            conf_obj = pred_conf[obj_mask].mean()
            conf_noobj = pred_conf[noobj_mask].mean()
            conf50 = (pred_conf > 0.5).float()
            iou50 = (iou_scores > 0.5).float()
            iou75 = (iou_scores > 0.75).float()
            detected_mask = conf50 * class_mask * tconf
            precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
            recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
            recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

            self.metrics = {
                "loss": to_cpu(total_loss).item(),
                "x": to_cpu(loss_x).item(),
                "y": to_cpu(loss_y).item(),
                "w": to_cpu(loss_w).item(),
                "h": to_cpu(loss_h).item(),
                "conf": to_cpu(loss_conf).item(),
                "cls": to_cpu(loss_cls).item(),
                "cls_acc": to_cpu(cls_acc).item(),
                "recall50": to_cpu(recall50).item(),
                "recall75": to_cpu(recall75).item(),
                "precision": to_cpu(precision).item(),
                "conf_obj": to_cpu(conf_obj).item(),
                "conf_noobj": to_cpu(conf_noobj).item(),
                "grid_size": grid_size,
            }

            return output, total_loss
276 | 
277 | 
class Darknet_Backbone(nn.Module):
    """YOLOv3 network body: runs every layer except the detection step and returns the maps feeding each 'yolo' entry."""

    def __init__(self, config_path, img_size=416, TensorRT=False, Half=False):
        """
        # Arguments
            config_path: path of the model configuration parsed by parse_model_config
            img_size:    stored on the instance as `img_size`
            TensorRT:    forwarded to create_modules
            Half:        accepted for signature compatibility; not used here
        """
        super(Darknet_Backbone, self).__init__()
        self.module_defs = parse_model_config(config_path)
        self.hyperparams, self.module_list = create_modules(self.module_defs, TensorRT)
        self.img_size = img_size
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None):
        """
        Run the network and collect the input of every 'yolo' entry.

        # Arguments
            x:       input batch
            targets: accepted for signature compatibility; not used here
        # Return
            list with one feature map per 'yolo' entry, in configuration order
        """
        layer_outputs = []
        last_convs = []
        for module_def, module in zip(self.module_defs, self.module_list):
            layer_type = module_def["type"]
            if layer_type in ("convolutional", "upsample", "maxpool"):
                x = module(x)
            elif layer_type == "route":
                # Concatenate the referenced earlier outputs along the channel axis
                x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)
            elif layer_type == "shortcut":
                # Residual connection (part of the YOLOv3 structure)
                x = layer_outputs[-1] + layer_outputs[int(module_def["from"])]
            elif layer_type == "yolo":
                # Do not decode here: hand the raw map to the caller
                last_convs.append(x)
            layer_outputs.append(x)

        return last_convs

    @staticmethod
    def _copy_from(weights, ptr, tensor):
        """Copy tensor.numel() values starting at weights[ptr] into `tensor`; return the advanced offset."""
        count = tensor.numel()
        tensor.data.copy_(torch.from_numpy(weights[ptr : ptr + count]).view_as(tensor))
        return ptr + count

    def load_darknet_weights(self, weights_path):
        """Parses and loads the weights stored in 'weights_path'"""

        # Open the weights file
        with open(weights_path, "rb") as f:
            header = np.fromfile(f, dtype=np.int32, count=5)  # First five are header values
            self.header_info = header  # Needed to write header when saving weights
            self.seen = header[3]  # number of images seen during training
            weights = np.fromfile(f, dtype=np.float32)  # The rest are weights

        # Establish cutoff for loading backbone weights
        cutoff = None
        if "darknet53.conv.74" in weights_path:
            cutoff = 75

        ptr = 0
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if i == cutoff:
                break
            if module_def["type"] != "convolutional":
                continue
            conv_layer = module[0]
            # Parse with int() exactly like create_modules does: the raw value may be the
            # string "0", which is truthy and would wrongly select the batch-norm branch.
            if int(module_def["batch_normalize"]):
                # Stored order: BN bias, BN weight, running mean, running variance
                bn_layer = module[1]
                ptr = self._copy_from(weights, ptr, bn_layer.bias)
                ptr = self._copy_from(weights, ptr, bn_layer.weight)
                ptr = self._copy_from(weights, ptr, bn_layer.running_mean)
                ptr = self._copy_from(weights, ptr, bn_layer.running_var)
            else:
                # Load conv. bias
                ptr = self._copy_from(weights, ptr, conv_layer.bias)
            # Load conv. weights
            ptr = self._copy_from(weights, ptr, conv_layer.weight)
368 | 
369 | 
class YOLOHead(nn.Module):
    """
    Detection head: one YOLOLayer per "yolo" entry of the configuration, applied to
    the matching backbone feature map.
    """

    def __init__(self, config_path, img_size=416):
        """
        # Arguments
            config_path: path of the model configuration parsed by parse_model_config
            img_size:    image size passed to every YOLOLayer as `img_dim` in forward()
        """
        super(YOLOHead, self).__init__()
        self.img_size = img_size
        self.module_defs = parse_model_config(config_path)
        self.yolo_layer = self.build()

    def build(self):
        """
        Build the YOLO layers.

        Pops the hyper-parameter entry off self.module_defs and returns a plain list
        with one YOLOLayer per "yolo" entry, in configuration order.
        """
        hyperparams = self.module_defs.pop(0)
        img_size = int(hyperparams["height"])
        yolo_layer = []
        for module_def in self.module_defs:
            if module_def["type"] != "yolo":
                continue
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            # Extract the (w, h) anchor pairs selected by the mask
            anchors = [int(x) for x in module_def["anchors"].split(",")]
            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
            anchors = [anchors[i] for i in anchor_idxs]
            num_classes = int(module_def["classes"])
            # Define detection layer
            yolo_layer.append(YOLOLayer(anchors=anchors, num_classes=num_classes, img_dim=img_size))
        return yolo_layer

    def forward(self, backbone_out):
        """
        Decode every backbone feature map with its detection layer.

        # Arguments
            backbone_out: sequence of feature maps; entry k is fed to detection layer k.
                          It must hold at least as many entries as there are detection
                          layers (IndexError otherwise).
        # Return
            the per-layer detections concatenated along dim 1 and passed through to_cpu
        """
        # Works for any number of heads (2 for tiny, 3 for the full model, ...) instead of
        # hard-coding those two cases. Inference only, so the per-layer loss is dropped.
        decoded = [
            layer(backbone_out[idx], targets=None, img_dim=self.img_size)[0]
            for idx, layer in enumerate(self.yolo_layer)
        ]
        return to_cpu(torch.cat(decoded, 1))
410 | 
411 | 
class Darknet(nn.Module):
    """YOLOv3 object detection model"""

    def __init__(self, config_path, img_size=416, TensorRT=False, Half=False):
        """
        # Arguments
            config_path: path of the model configuration parsed by parse_model_config
            img_size:    stored on the instance as `img_size`
            TensorRT:    forwarded to create_modules
            Half:        accepted for signature compatibility; precision is detected
                         from the input in forward()
        """
        super(Darknet, self).__init__()
        self.module_defs = parse_model_config(config_path)
        self.hyperparams, self.module_list = create_modules(self.module_defs, TensorRT)
        self.img_size = img_size
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None):
        """
        Run the full network.

        # Arguments
            x:       input batch; x.shape[2] is passed to the detection layers as image size
            targets: ground truth handed to every detection layer, or None for inference
        # Return
            the concatenated detections (passed through to_cpu) when targets is None,
            otherwise the tuple (loss, detections)
        """
        Half = x.type() == "torch.cuda.HalfTensor"
        img_dim = x.shape[2]
        loss = 0
        layer_outputs, yolo_outputs = [], []
        for module_def, module in zip(self.module_defs, self.module_list):
            layer_type = module_def["type"]
            if layer_type in ("convolutional", "upsample", "maxpool"):
                x = module(x)
            elif layer_type == "route":
                # Concatenate the referenced earlier outputs along the channel axis
                x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)
            elif layer_type == "shortcut":
                # Residual connection (part of the YOLOv3 structure)
                x = layer_outputs[-1] + layer_outputs[int(module_def["from"])]
            elif layer_type == "yolo":
                x, layer_loss = module[0](x, targets, img_dim, Half)
                loss += layer_loss
                yolo_outputs.append(x)
            layer_outputs.append(x)
        yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
        return yolo_outputs if targets is None else (loss, yolo_outputs)

    @staticmethod
    def _copy_from(weights, ptr, tensor):
        """Copy tensor.numel() values starting at weights[ptr] into `tensor`; return the advanced offset."""
        count = tensor.numel()
        tensor.data.copy_(torch.from_numpy(weights[ptr : ptr + count]).view_as(tensor))
        return ptr + count

    def load_darknet_weights(self, weights_path):
        """Parses and loads the weights stored in 'weights_path'"""

        # Open the weights file
        with open(weights_path, "rb") as f:
            header = np.fromfile(f, dtype=np.int32, count=5)  # First five are header values
            self.header_info = header  # Needed to write header when saving weights
            self.seen = header[3]  # number of images seen during training
            weights = np.fromfile(f, dtype=np.float32)  # The rest are weights

        # Establish cutoff for loading backbone weights
        cutoff = None
        if "darknet53.conv.74" in weights_path:
            cutoff = 75

        ptr = 0
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if i == cutoff:
                break
            if module_def["type"] != "convolutional":
                continue
            conv_layer = module[0]
            # Parse with int() exactly like create_modules does: the raw value may be the
            # string "0", which is truthy and would wrongly select the batch-norm branch.
            if int(module_def["batch_normalize"]):
                # Stored order: BN bias, BN weight, running mean, running variance
                bn_layer = module[1]
                ptr = self._copy_from(weights, ptr, bn_layer.bias)
                ptr = self._copy_from(weights, ptr, bn_layer.weight)
                ptr = self._copy_from(weights, ptr, bn_layer.running_mean)
                ptr = self._copy_from(weights, ptr, bn_layer.running_var)
            else:
                # Load conv. bias
                ptr = self._copy_from(weights, ptr, conv_layer.bias)
            # Load conv. weights
            ptr = self._copy_from(weights, ptr, conv_layer.weight)

    def save_darknet_weights(self, path, cutoff=-1):
        """
            @:param path    - path of the new weights file
            @:param cutoff  - save layers between 0 and cutoff (cutoff = -1 -> all are saved)
        """
        # -1 means "every layer"; a plain [:-1] slice would silently drop the last one.
        stop = None if cutoff == -1 else cutoff
        self.header_info[3] = self.seen

        # The context manager closes the file even if a write fails part-way
        with open(path, "wb") as fp:
            self.header_info.tofile(fp)

            # Iterate through layers, writing in the order load_darknet_weights reads
            for module_def, module in zip(self.module_defs[:stop], self.module_list[:stop]):
                if module_def["type"] != "convolutional":
                    continue
                conv_layer = module[0]
                # If batch norm, save bn first
                if int(module_def["batch_normalize"]):
                    bn_layer = module[1]
                    bn_layer.bias.data.cpu().numpy().tofile(fp)
                    bn_layer.weight.data.cpu().numpy().tofile(fp)
                    bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                    bn_layer.running_var.data.cpu().numpy().tofile(fp)
                # Save conv bias
                else:
                    conv_layer.bias.data.cpu().numpy().tofile(fp)
                # Save conv weights
                conv_layer.weight.data.cpu().numpy().tofile(fp)
537 | 
538 | 


--------------------------------------------------------------------------------