├── nets
    ├── __init__.py
    ├── backbone.py
    ├── yolo.py
    └── yolo_training.py
├── utils
    ├── __init__.py
    ├── utils_fit.py
    ├── utils.py
    ├── callbacks.py
    └── dataloader.py
├── logs
    └── README.md
├── VOCdevkit
    └── VOC2007
    │   ├── Annotations
    │       └── README.md
    │   ├── JPEGImages
    │       └── README.md
    │   └── ImageSets
    │       └── Main
    │           └── README.md
├── img
    └── street.jpg
├── model_data
    ├── simhei.ttf
    ├── yolo_anchors.txt
    ├── voc_classes.txt
    └── coco_classes.txt
├── requirements.txt
├── summary.py
├── .gitignore
├── utils_coco
    ├── coco_annotation.py
    └── get_map_coco.py
├── kmeans_for_anchors.py
├── get_map.py
├── voc_annotation.py
├── README.md
├── predict.py
├── 常见问题汇总.md
└── train.py


/nets/__init__.py:
--------------------------------------------------------------------------------
1 | #


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | #


--------------------------------------------------------------------------------
/logs/README.md:
--------------------------------------------------------------------------------
1 | 训练好的权重会保存在这里
2 | 


--------------------------------------------------------------------------------
/VOCdevkit/VOC2007/Annotations/README.md:
--------------------------------------------------------------------------------
1 | 存放标签文件


--------------------------------------------------------------------------------
/VOCdevkit/VOC2007/JPEGImages/README.md:
--------------------------------------------------------------------------------
1 | 存放图片文件


--------------------------------------------------------------------------------
/VOCdevkit/VOC2007/ImageSets/Main/README.md:
--------------------------------------------------------------------------------
1 | 存放训练索引文件


--------------------------------------------------------------------------------
/img/street.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bubbliiiing/yolov7-pytorch/HEAD/img/street.jpg


--------------------------------------------------------------------------------
/model_data/simhei.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bubbliiiing/yolov7-pytorch/HEAD/model_data/simhei.ttf


--------------------------------------------------------------------------------
/model_data/yolo_anchors.txt:
--------------------------------------------------------------------------------
1 | 12, 16,  19, 36,  40, 28,  36, 75,  76, 55,  72, 146,  142, 110,  192, 243,  459, 401


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | torch
 2 | torchvision
 3 | tensorboard
 4 | scipy==1.2.1
 5 | numpy==1.17.0
 6 | matplotlib==3.1.2
 7 | opencv_python==4.1.2.30
 8 | tqdm==4.60.0
 9 | Pillow==8.2.0
10 | h5py==2.10.0


--------------------------------------------------------------------------------
/model_data/voc_classes.txt:
--------------------------------------------------------------------------------
 1 | aeroplane
 2 | bicycle
 3 | bird
 4 | boat
 5 | bottle
 6 | bus
 7 | car
 8 | cat
 9 | chair
10 | cow
11 | diningtable
12 | dog
13 | horse
14 | motorbike
15 | person
16 | pottedplant
17 | sheep
18 | sofa
19 | train
20 | tvmonitor


--------------------------------------------------------------------------------
/model_data/coco_classes.txt:
--------------------------------------------------------------------------------
 1 | person
 2 | bicycle
 3 | car
 4 | motorbike
 5 | aeroplane
 6 | bus
 7 | train
 8 | truck
 9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 | 


--------------------------------------------------------------------------------
/summary.py:
--------------------------------------------------------------------------------
 1 | #--------------------------------------------#
 2 | #   该部分代码用于看网络结构
 3 | #--------------------------------------------#
 4 | import torch
 5 | from thop import clever_format, profile
 6 | 
 7 | from nets.yolo import YoloBody
 8 | 
 9 | if __name__ == "__main__":
10 |     input_shape     = [640, 640]
11 |     anchors_mask    = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
12 |     num_classes     = 80
13 |     phi             = 'l'
14 |     
15 |     device  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16 |     m       = YoloBody(anchors_mask, num_classes, phi, False).to(device)
17 |     for i in m.children():
18 |         print(i)
19 |         print('==============================')
20 |     
21 |     dummy_input     = torch.randn(1, 3, input_shape[0], input_shape[1]).to(device)
22 |     flops, params   = profile(m.to(device), (dummy_input, ), verbose=False)
23 |     #--------------------------------------------------------#
24 |     #   flops * 2是因为profile没有将卷积作为两个operations
25 |     #   有些论文将卷积算乘法、加法两个operations。此时乘2
26 |     #   有些论文只考虑乘法的运算次数，忽略加法。此时不乘2
27 |     #   本代码选择乘2，参考YOLOX。
28 |     #--------------------------------------------------------#
29 |     flops           = flops * 2
30 |     flops, params   = clever_format([flops, params], "%.3f")
31 |     print('Total GFLOPS: %s' % (flops))
32 |     print('Total params: %s' % (params))
33 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # ignore map, miou, datasets
  2 | map_out/
  3 | miou_out/
  4 | VOCdevkit/
  5 | datasets/
  6 | Medical_Datasets/
  7 | lfw/
  8 | logs/
  9 | model_data/
 10 | .temp_map_out/
 11 | 
 12 | # Byte-compiled / optimized / DLL files
 13 | __pycache__/
 14 | *.py[cod]
 15 | *$py.class
 16 | 
 17 | # C extensions
 18 | *.so
 19 | 
 20 | # Distribution / packaging
 21 | .Python
 22 | build/
 23 | develop-eggs/
 24 | dist/
 25 | downloads/
 26 | eggs/
 27 | .eggs/
 28 | lib/
 29 | lib64/
 30 | parts/
 31 | sdist/
 32 | var/
 33 | wheels/
 34 | pip-wheel-metadata/
 35 | share/python-wheels/
 36 | *.egg-info/
 37 | .installed.cfg
 38 | *.egg
 39 | MANIFEST
 40 | 
 41 | # PyInstaller
 42 | #  Usually these files are written by a python script from a template
 43 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 44 | *.manifest
 45 | *.spec
 46 | 
 47 | # Installer logs
 48 | pip-log.txt
 49 | pip-delete-this-directory.txt
 50 | 
 51 | # Unit test / coverage reports
 52 | htmlcov/
 53 | .tox/
 54 | .nox/
 55 | .coverage
 56 | .coverage.*
 57 | .cache
 58 | nosetests.xml
 59 | coverage.xml
 60 | *.cover
 61 | *.py,cover
 62 | .hypothesis/
 63 | .pytest_cache/
 64 | 
 65 | # Translations
 66 | *.mo
 67 | *.pot
 68 | 
 69 | # Django stuff:
 70 | *.log
 71 | local_settings.py
 72 | db.sqlite3
 73 | db.sqlite3-journal
 74 | 
 75 | # Flask stuff:
 76 | instance/
 77 | .webassets-cache
 78 | 
 79 | # Scrapy stuff:
 80 | .scrapy
 81 | 
 82 | # Sphinx documentation
 83 | docs/_build/
 84 | 
 85 | # PyBuilder
 86 | target/
 87 | 
 88 | # Jupyter Notebook
 89 | .ipynb_checkpoints
 90 | 
 91 | # IPython
 92 | profile_default/
 93 | ipython_config.py
 94 | 
 95 | # pyenv
 96 | .python-version
 97 | 
 98 | # pipenv
 99 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
101 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
102 | #   install all needed dependencies.
103 | #Pipfile.lock
104 | 
105 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
106 | __pypackages__/
107 | 
108 | # Celery stuff
109 | celerybeat-schedule
110 | celerybeat.pid
111 | 
112 | # SageMath parsed files
113 | *.sage.py
114 | 
115 | # Environments
116 | .env
117 | .venv
118 | env/
119 | venv/
120 | ENV/
121 | env.bak/
122 | venv.bak/
123 | 
124 | # Spyder project settings
125 | .spyderproject
126 | .spyproject
127 | 
128 | # Rope project settings
129 | .ropeproject
130 | 
131 | # mkdocs documentation
132 | /site
133 | 
134 | # mypy
135 | .mypy_cache/
136 | .dmypy.json
137 | dmypy.json
138 | 
139 | # Pyre type checker
140 | .pyre/
141 | 


--------------------------------------------------------------------------------
/utils_coco/coco_annotation.py:
--------------------------------------------------------------------------------
  1 | #-------------------------------------------------------#
  2 | #   用于处理COCO数据集，根据json文件生成txt文件用于训练
  3 | #-------------------------------------------------------#
  4 | import json
  5 | import os
  6 | from collections import defaultdict
  7 | 
  8 | #-------------------------------------------------------#
  9 | #   指向了COCO训练集与验证集图片的路径
 10 | #-------------------------------------------------------#
 11 | train_datasets_path     = "coco_dataset/train2017"
 12 | val_datasets_path       = "coco_dataset/val2017"
 13 | 
 14 | #-------------------------------------------------------#
 15 | #   指向了COCO训练集与验证集标签的路径
 16 | #-------------------------------------------------------#
 17 | train_annotation_path   = "coco_dataset/annotations/instances_train2017.json"
 18 | val_annotation_path     = "coco_dataset/annotations/instances_val2017.json"
 19 | 
 20 | #-------------------------------------------------------#
 21 | #   生成的txt文件路径
 22 | #-------------------------------------------------------#
 23 | train_output_path       = "coco_train.txt"
 24 | val_output_path         = "coco_val.txt"
 25 | 
 26 | if __name__ == "__main__":
 27 |     name_box_id = defaultdict(list)
 28 |     id_name     = dict()
 29 |     f           = open(train_annotation_path, encoding='utf-8')
 30 |     data        = json.load(f)
 31 | 
 32 |     annotations = data['annotations']
 33 |     for ant in annotations:
 34 |         id = ant['image_id']
 35 |         name = os.path.join(train_datasets_path, '%012d.jpg' % id)
 36 |         cat = ant['category_id']
 37 |         if cat >= 1 and cat <= 11:
 38 |             cat = cat - 1
 39 |         elif cat >= 13 and cat <= 25:
 40 |             cat = cat - 2
 41 |         elif cat >= 27 and cat <= 28:
 42 |             cat = cat - 3
 43 |         elif cat >= 31 and cat <= 44:
 44 |             cat = cat - 5
 45 |         elif cat >= 46 and cat <= 65:
 46 |             cat = cat - 6
 47 |         elif cat == 67:
 48 |             cat = cat - 7
 49 |         elif cat == 70:
 50 |             cat = cat - 9
 51 |         elif cat >= 72 and cat <= 82:
 52 |             cat = cat - 10
 53 |         elif cat >= 84 and cat <= 90:
 54 |             cat = cat - 11
 55 |         name_box_id[name].append([ant['bbox'], cat])
 56 | 
 57 |     f = open(train_output_path, 'w')
 58 |     for key in name_box_id.keys():
 59 |         f.write(key)
 60 |         box_infos = name_box_id[key]
 61 |         for info in box_infos:
 62 |             x_min = int(info[0][0])
 63 |             y_min = int(info[0][1])
 64 |             x_max = x_min + int(info[0][2])
 65 |             y_max = y_min + int(info[0][3])
 66 | 
 67 |             box_info = " %d,%d,%d,%d,%d" % (
 68 |                 x_min, y_min, x_max, y_max, int(info[1]))
 69 |             f.write(box_info)
 70 |         f.write('\n')
 71 |     f.close()
 72 | 
 73 |     name_box_id = defaultdict(list)
 74 |     id_name     = dict()
 75 |     f           = open(val_annotation_path, encoding='utf-8')
 76 |     data        = json.load(f)
 77 | 
 78 |     annotations = data['annotations']
 79 |     for ant in annotations:
 80 |         id = ant['image_id']
 81 |         name = os.path.join(val_datasets_path, '%012d.jpg' % id)
 82 |         cat = ant['category_id']
 83 |         if cat >= 1 and cat <= 11:
 84 |             cat = cat - 1
 85 |         elif cat >= 13 and cat <= 25:
 86 |             cat = cat - 2
 87 |         elif cat >= 27 and cat <= 28:
 88 |             cat = cat - 3
 89 |         elif cat >= 31 and cat <= 44:
 90 |             cat = cat - 5
 91 |         elif cat >= 46 and cat <= 65:
 92 |             cat = cat - 6
 93 |         elif cat == 67:
 94 |             cat = cat - 7
 95 |         elif cat == 70:
 96 |             cat = cat - 9
 97 |         elif cat >= 72 and cat <= 82:
 98 |             cat = cat - 10
 99 |         elif cat >= 84 and cat <= 90:
100 |             cat = cat - 11
101 |         name_box_id[name].append([ant['bbox'], cat])
102 | 
103 |     f = open(val_output_path, 'w')
104 |     for key in name_box_id.keys():
105 |         f.write(key)
106 |         box_infos = name_box_id[key]
107 |         for info in box_infos:
108 |             x_min = int(info[0][0])
109 |             y_min = int(info[0][1])
110 |             x_max = x_min + int(info[0][2])
111 |             y_max = y_min + int(info[0][3])
112 | 
113 |             box_info = " %d,%d,%d,%d,%d" % (
114 |                 x_min, y_min, x_max, y_max, int(info[1]))
115 |             f.write(box_info)
116 |         f.write('\n')
117 |     f.close()
118 | 


--------------------------------------------------------------------------------
/utils/utils_fit.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import torch
  4 | from tqdm import tqdm
  5 | 
  6 | from utils.utils import get_lr
  7 |         
def fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, fp16, scaler, save_period, save_dir, local_rank=0):
    """Run one training pass and one validation pass, then log and save weights.

    Args:
        model_train: module called on the images during training; it is put
            into train mode here and, when ``ema`` is falsy, into eval mode
            for validation.
        model: module whose ``state_dict()`` is saved when ``ema`` is falsy.
        ema: optional object exposing ``update(model_train)`` and an ``ema``
            attribute. When truthy, ``ema.ema`` is used for validation and
            its ``state_dict()`` is what gets saved.
        yolo_loss: callable invoked as ``yolo_loss(outputs, targets, images)``;
            its result must support ``.backward()`` and ``.item()``.
        loss_history: object exposing ``append_loss(epoch, loss, val_loss)``
            and a ``val_loss`` sequence.
        eval_callback: object exposing ``on_epoch_end(epoch, model)``.
        optimizer: optimizer stepped once per training batch.
        epoch: zero-based epoch index (reported as ``epoch + 1``).
        epoch_step: number of training batches to consume; also the divisor
            of the reported training loss.
        epoch_step_val: number of validation batches to consume; also the
            divisor of the reported validation loss.
        gen: iterable of training batches; ``batch[0]`` is the images and
            ``batch[1]`` the targets.
        gen_val: iterable of validation batches with the same layout.
        Epoch: total number of epochs (used for display and the final save).
        cuda: when truthy, batches are moved with ``.cuda(local_rank)``.
        fp16: when truthy, the forward pass runs under ``autocast`` and the
            backward/step go through ``scaler``.
        scaler: gradient scaler used only when ``fp16`` is truthy
            (presumably a ``torch.cuda.amp.GradScaler`` -- confirm at caller).
        save_period: a checkpoint is written every ``save_period`` epochs.
        save_dir: directory that receives the ``.pth`` files.
        local_rank: only rank 0 prints, drives the progress bars, logs the
            losses, runs the evaluation callback and writes weights.
    """
    loss        = 0
    val_loss    = 0

    if local_rank == 0:
        print('Start Train')
        pbar = tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3)
    model_train.train()
    for iteration, batch in enumerate(gen):
        # Stop once the requested number of training batches was consumed.
        if iteration >= epoch_step:
            break

        images, targets = batch[0], batch[1]
        with torch.no_grad():
            if cuda:
                images  = images.cuda(local_rank)
                targets = targets.cuda(local_rank)
        #----------------------#
        #   Zero the gradients
        #----------------------#
        optimizer.zero_grad()
        if not fp16:
            #----------------------#
            #   Forward pass
            #----------------------#
            outputs         = model_train(images)
            loss_value      = yolo_loss(outputs, targets, images)

            #----------------------#
            #   Backward pass
            #----------------------#
            loss_value.backward()
            optimizer.step()
        else:
            from torch.cuda.amp import autocast
            with autocast():
                #----------------------#
                #   Forward pass
                #----------------------#
                outputs         = model_train(images)
                loss_value      = yolo_loss(outputs, targets, images)

            #----------------------#
            #   Backward pass
            #----------------------#
            scaler.scale(loss_value).backward()
            scaler.step(optimizer)
            scaler.update()
        if ema:
            ema.update(model_train)

        loss += loss_value.item()
        
        if local_rank == 0:
            # Show the running mean loss and the current learning rate.
            pbar.set_postfix(**{'loss'  : loss / (iteration + 1), 
                                'lr'    : get_lr(optimizer)})
            pbar.update(1)

    if local_rank == 0:
        pbar.close()
        print('Finish Train')
        print('Start Validation')
        pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3)

    # Validate with the EMA model when one is provided.
    # NOTE(review): model_train is only switched to eval mode in the
    # non-EMA branch; whether ema.ema is already in eval mode is not
    # visible here -- confirm in the EMA implementation.
    if ema:
        model_train_eval = ema.ema
    else:
        model_train_eval = model_train.eval()
        
    for iteration, batch in enumerate(gen_val):
        # Stop once the requested number of validation batches was consumed.
        if iteration >= epoch_step_val:
            break
        images, targets = batch[0], batch[1]
        with torch.no_grad():
            if cuda:
                images  = images.cuda(local_rank)
                targets = targets.cuda(local_rank)
            #----------------------#
            #   Zero the gradients
            #----------------------#
            optimizer.zero_grad()
            #----------------------#
            #   Forward pass
            #----------------------#
            outputs         = model_train_eval(images)
            loss_value      = yolo_loss(outputs, targets, images)

        val_loss += loss_value.item()
        if local_rank == 0:
            pbar.set_postfix(**{'val_loss': val_loss / (iteration + 1)})
            pbar.update(1)
            
    if local_rank == 0:
        pbar.close()
        print('Finish Validation')
        # The reported means divide by the configured step counts, not by
        # the number of batches actually iterated.
        loss_history.append_loss(epoch + 1, loss / epoch_step, val_loss / epoch_step_val)
        eval_callback.on_epoch_end(epoch + 1, model_train_eval)
        print('Epoch:'+ str(epoch + 1) + '/' + str(Epoch))
        print('Total Loss: %.3f || Val Loss: %.3f ' % (loss / epoch_step, val_loss / epoch_step_val))
        
        #-----------------------------------------------#
        #   Save the weights
        #-----------------------------------------------#
        if ema:
            save_state_dict = ema.ema.state_dict()
        else:
            save_state_dict = model.state_dict()

        # Periodic checkpoint, plus one at the final epoch.
        if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch:
            torch.save(save_state_dict, os.path.join(save_dir, "ep%03d-loss%.3f-val_loss%.3f.pth" % (epoch + 1, loss / epoch_step, val_loss / epoch_step_val)))
            
        # Best checkpoint: written when the history holds at most one entry
        # or this epoch's validation loss is not above the recorded minimum.
        # (presumably append_loss above already added this epoch's value to
        # loss_history.val_loss -- verify in the LossHistory implementation)
        if len(loss_history.val_loss) <= 1 or (val_loss / epoch_step_val) <= min(loss_history.val_loss):
            print('Save best model to best_epoch_weights.pth')
            torch.save(save_state_dict, os.path.join(save_dir, "best_epoch_weights.pth"))
            
        # Always refresh the most recent weights.
        torch.save(save_state_dict, os.path.join(save_dir, "last_epoch_weights.pth"))


--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | from PIL import Image
  6 | 
  7 | 
  8 | #---------------------------------------------------------#
  9 | #   将图像转换成RGB图像，防止灰度图在预测时报错。
 10 | #   代码仅仅支持RGB图像的预测，所有其它类型的图像都会转化成RGB
 11 | #---------------------------------------------------------#
 12 | def cvtColor(image):
 13 |     if len(np.shape(image)) == 3 and np.shape(image)[2] == 3:
 14 |         return image 
 15 |     else:
 16 |         image = image.convert('RGB')
 17 |         return image 
 18 | 
 19 | #---------------------------------------------------#
 20 | #   对输入图像进行resize
 21 | #---------------------------------------------------#
 22 | def resize_image(image, size, letterbox_image, mode='PIL'):
 23 |     if mode == 'PIL':
 24 |         iw, ih  = image.size
 25 |         w, h    = size
 26 | 
 27 |         if letterbox_image:
 28 |             scale   = min(w/iw, h/ih)
 29 |             nw      = int(iw*scale)
 30 |             nh      = int(ih*scale)
 31 | 
 32 |             image   = image.resize((nw,nh), Image.BICUBIC)
 33 |             new_image = Image.new('RGB', size, (128,128,128))
 34 |             new_image.paste(image, ((w-nw)//2, (h-nh)//2))
 35 |         else:
 36 |             new_image = image.resize((w, h), Image.BICUBIC)
 37 |     else:
 38 |         image = np.array(image)
 39 |         if letterbox_image:
 40 |             # 获得现在的shape
 41 |             shape       = np.shape(image)[:2]
 42 |             # 获得输出的shape
 43 |             if isinstance(size, int):
 44 |                 size    = (size, size)
 45 | 
 46 |             # 计算缩放的比例
 47 |             r = min(size[0] / shape[0], size[1] / shape[1])
 48 | 
 49 |             # 计算缩放后图片的高宽
 50 |             new_unpad   = int(round(shape[1] * r)), int(round(shape[0] * r))
 51 |             dw, dh      = size[1] - new_unpad[0], size[0] - new_unpad[1]
 52 | 
 53 |             # 除以2以padding到两边
 54 |             dw          /= 2  
 55 |             dh          /= 2
 56 |     
 57 |             # 对图像进行resize
 58 |             if shape[::-1] != new_unpad:  # resize
 59 |                 image = cv2.resize(image, new_unpad, interpolation=cv2.INTER_LINEAR)
 60 |             top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
 61 |             left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
 62 |     
 63 |             new_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(128, 128, 128))  # add border
 64 |         else:
 65 |             new_image = cv2.resize(image, (w, h))
 66 | 
 67 |     return new_image
 68 | 
 69 | #---------------------------------------------------#
 70 | #   获得类
 71 | #---------------------------------------------------#
 72 | def get_classes(classes_path):
 73 |     with open(classes_path, encoding='utf-8') as f:
 74 |         class_names = f.readlines()
 75 |     class_names = [c.strip() for c in class_names]
 76 |     return class_names, len(class_names)
 77 | 
 78 | #---------------------------------------------------#
 79 | #   获得先验框
 80 | #---------------------------------------------------#
 81 | def get_anchors(anchors_path):
 82 |     '''loads the anchors from a file'''
 83 |     with open(anchors_path, encoding='utf-8') as f:
 84 |         anchors = f.readline()
 85 |     anchors = [float(x) for x in anchors.split(',')]
 86 |     anchors = np.array(anchors).reshape(-1, 2)
 87 |     return anchors, len(anchors)
 88 | 
 89 | #---------------------------------------------------#
 90 | #   获得学习率
 91 | #---------------------------------------------------#
 92 | def get_lr(optimizer):
 93 |     for param_group in optimizer.param_groups:
 94 |         return param_group['lr']
 95 | 
 96 | #---------------------------------------------------#
 97 | #   设置种子
 98 | #---------------------------------------------------#
 99 | def seed_everything(seed=11):
100 |     random.seed(seed)
101 |     np.random.seed(seed)
102 |     torch.manual_seed(seed)
103 |     torch.cuda.manual_seed(seed)
104 |     torch.cuda.manual_seed_all(seed)
105 |     torch.backends.cudnn.deterministic = True
106 |     torch.backends.cudnn.benchmark = False
107 | 
#---------------------------------------------------#
#   Set the seeds of a Dataloader worker
#---------------------------------------------------#
def worker_init_fn(worker_id, rank, seed):
    """Seed Python, numpy and torch with ``rank + seed``.

    Args:
        worker_id: accepted but not used; the seed does not depend on it.
        rank: added to ``seed`` to form the seed that is applied.
        seed: base seed.
    """
    combined_seed = rank + seed
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(combined_seed)
116 | 
def preprocess_input(image):
    """Divide ``image`` by 255.0 in place and return it.

    Args:
        image: object supporting in-place true division (``/=``), e.g. a
            floating-point numpy array.

    Returns:
        The object bound to ``image`` after ``image /= 255.0``; for a numpy
        array this is the same array that was passed in.
    """
    # In-place division: the caller's array is modified.
    image /= 255.0
    return image
120 | 
def show_config(**kwargs):
    """Print the given keyword arguments as a two-column table.

    Args:
        **kwargs: configuration entries; each key and value is converted
            with ``str`` and printed on its own row, in the order given.
    """
    rule = '-' * 70
    row_format = '|%25s | %40s|'

    print('Configurations:')
    print(rule)
    print(row_format % ('keys', 'values'))
    print(rule)
    for name in kwargs:
        print(row_format % (str(name), str(kwargs[name])))
    print(rule)
129 |         
def download_weights(phi, model_dir="./model_data"):
    """Download the backbone weights for model variant ``phi``.

    Args:
        phi: model variant key, ``"l"`` or ``"x"``.
        model_dir: directory passed to ``load_state_dict_from_url`` as the
            download location; created first if it does not exist.

    Raises:
        KeyError: if ``phi`` is not one of the known variants.
    """
    import os
    from torch.hub import load_state_dict_from_url

    release_root = 'https://github.com/bubbliiiing/yolov7-pytorch/releases/download/v1.0/'
    weight_files = {
        "l" : 'yolov7_backbone_weights.pth',
        "x" : 'yolov7_x_backbone_weights.pth',
    }
    # Resolve the URL first so an unknown variant fails before any
    # directory is created or any download is attempted.
    url = release_root + weight_files[phi]

    directory_missing = not os.path.exists(model_dir)
    if directory_missing:
        os.makedirs(model_dir)
    load_state_dict_from_url(url, model_dir)


--------------------------------------------------------------------------------
/utils_coco/get_map_coco.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | 
  4 | import numpy as np
  5 | import torch
  6 | from PIL import Image
  7 | from pycocotools.coco import COCO
  8 | from pycocotools.cocoeval import COCOeval
  9 | from tqdm import tqdm
 10 | 
 11 | from utils.utils import cvtColor, preprocess_input, resize_image
 12 | from yolo import YOLO
 13 | 
 14 | #---------------------------------------------------------------------------#
 15 | #   map_mode用于指定该文件运行时计算的内容
 16 | #   map_mode为0代表整个map计算流程，包括获得预测结果、计算map。
 17 | #   map_mode为1代表仅仅获得预测结果。
 18 | #   map_mode为2代表仅仅获得计算map。
 19 | #---------------------------------------------------------------------------#
 20 | map_mode            = 0
 21 | #-------------------------------------------------------#
 22 | #   指向了验证集标签与图片路径
 23 | #-------------------------------------------------------#
 24 | cocoGt_path         = 'coco_dataset/annotations/instances_val2017.json'
 25 | dataset_img_path    = 'coco_dataset/val2017'
 26 | #-------------------------------------------------------#
 27 | #   结果输出的文件夹，默认为map_out
 28 | #-------------------------------------------------------#
 29 | temp_save_path      = 'map_out/coco_eval'
 30 | 
 31 | class mAP_YOLO(YOLO):
 32 |     #---------------------------------------------------#
 33 |     #   检测图片
 34 |     #---------------------------------------------------#
 35 |     def detect_image(self, image_id, image, results, clsid2catid):
 36 |         #---------------------------------------------------#
 37 |         #   计算输入图片的高和宽
 38 |         #---------------------------------------------------#
 39 |         image_shape = np.array(np.shape(image)[0:2])
 40 |         #---------------------------------------------------------#
 41 |         #   在这里将图像转换成RGB图像，防止灰度图在预测时报错。
 42 |         #   代码仅仅支持RGB图像的预测，所有其它类型的图像都会转化成RGB
 43 |         #---------------------------------------------------------#
 44 |         image       = cvtColor(image)
 45 |         #---------------------------------------------------------#
 46 |         #   给图像增加灰条，实现不失真的resize
 47 |         #   也可以直接resize进行识别
 48 |         #---------------------------------------------------------#
 49 |         image_data  = resize_image(image, (self.input_shape[1],self.input_shape[0]), self.letterbox_image)
 50 |         #---------------------------------------------------------#
 51 |         #   添加上batch_size维度
 52 |         #---------------------------------------------------------#
 53 |         image_data  = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
 54 | 
 55 |         with torch.no_grad():
 56 |             images = torch.from_numpy(image_data)
 57 |             if self.cuda:
 58 |                 images = images.cuda()
 59 |             #---------------------------------------------------------#
 60 |             #   将图像输入网络当中进行预测！
 61 |             #---------------------------------------------------------#
 62 |             outputs = self.net(images)
 63 |             outputs = self.bbox_util.decode_box(outputs)
 64 |             #---------------------------------------------------------#
 65 |             #   将预测框进行堆叠，然后进行非极大抑制
 66 |             #---------------------------------------------------------#
 67 |             outputs = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape, 
 68 |                         image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
 69 |                                                     
 70 |             if outputs[0] is None: 
 71 |                 return results
 72 | 
 73 |             top_label   = np.array(outputs[0][:, 6], dtype = 'int32')
 74 |             top_conf    = outputs[0][:, 4] * outputs[0][:, 5]
 75 |             top_boxes   = outputs[0][:, :4]
 76 | 
 77 |         for i, c in enumerate(top_label):
 78 |             result                      = {}
 79 |             top, left, bottom, right    = top_boxes[i]
 80 | 
 81 |             result["image_id"]      = int(image_id)
 82 |             result["category_id"]   = clsid2catid[c]
 83 |             result["bbox"]          = [float(left),float(top),float(right-left),float(bottom-top)]
 84 |             result["score"]         = float(top_conf[i])
 85 |             results.append(result)
 86 |         return results
 87 | 
 88 | if __name__ == "__main__":
 89 |     if not os.path.exists(temp_save_path):
 90 |         os.makedirs(temp_save_path)
 91 | 
 92 |     cocoGt      = COCO(cocoGt_path)
 93 |     ids         = list(cocoGt.imgToAnns.keys())
 94 |     clsid2catid = cocoGt.getCatIds()
 95 | 
 96 |     if map_mode == 0 or map_mode == 1:
 97 |         yolo = mAP_YOLO(confidence = 0.001, nms_iou = 0.65)
 98 | 
 99 |         with open(os.path.join(temp_save_path, 'eval_results.json'),"w") as f:
100 |             results = []
101 |             for image_id in tqdm(ids):
102 |                 image_path  = os.path.join(dataset_img_path, cocoGt.loadImgs(image_id)[0]['file_name'])
103 |                 image       = Image.open(image_path)
104 |                 results     = yolo.detect_image(image_id, image, results, clsid2catid)
105 |             json.dump(results, f)
106 | 
107 |     if map_mode == 0 or map_mode == 2:
108 |         cocoDt      = cocoGt.loadRes(os.path.join(temp_save_path, 'eval_results.json'))
109 |         cocoEval    = COCOeval(cocoGt, cocoDt, 'bbox') 
110 |         cocoEval.evaluate()
111 |         cocoEval.accumulate()
112 |         cocoEval.summarize()
113 |         print("Get map done.")
114 | 


--------------------------------------------------------------------------------
/nets/backbone.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | 
  4 | 
  5 | def autopad(k, p=None):
  6 |     if p is None:
  7 |         p = k // 2 if isinstance(k, int) else [x // 2 for x in k] 
  8 |     return p
  9 | 
 10 | class SiLU(nn.Module):  
 11 |     @staticmethod
 12 |     def forward(x):
 13 |         return x * torch.sigmoid(x)
 14 |     
 15 | class Conv(nn.Module):
 16 |     def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=SiLU()):  # ch_in, ch_out, kernel, stride, padding, groups
 17 |         super(Conv, self).__init__()
 18 |         self.conv   = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
 19 |         self.bn     = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03)
 20 |         self.act    = nn.LeakyReLU(0.1, inplace=True) if act is True else (act if isinstance(act, nn.Module) else nn.Identity())
 21 | 
 22 |     def forward(self, x):
 23 |         return self.act(self.bn(self.conv(x)))
 24 | 
 25 |     def fuseforward(self, x):
 26 |         return self.act(self.conv(x))
 27 |     
 28 | class Multi_Concat_Block(nn.Module):
 29 |     def __init__(self, c1, c2, c3, n=4, e=1, ids=[0]):
 30 |         super(Multi_Concat_Block, self).__init__()
 31 |         c_ = int(c2 * e)
 32 |         
 33 |         self.ids = ids
 34 |         self.cv1 = Conv(c1, c_, 1, 1)
 35 |         self.cv2 = Conv(c1, c_, 1, 1)
 36 |         self.cv3 = nn.ModuleList(
 37 |             [Conv(c_ if i ==0 else c2, c2, 3, 1) for i in range(n)]
 38 |         )
 39 |         self.cv4 = Conv(c_ * 2 + c2 * (len(ids) - 2), c3, 1, 1)
 40 | 
 41 |     def forward(self, x):
 42 |         x_1 = self.cv1(x)
 43 |         x_2 = self.cv2(x)
 44 |         
 45 |         x_all = [x_1, x_2]
 46 |         # [-1, -3, -5, -6] => [5, 3, 1, 0]
 47 |         for i in range(len(self.cv3)):
 48 |             x_2 = self.cv3[i](x_2)
 49 |             x_all.append(x_2)
 50 |             
 51 |         out = self.cv4(torch.cat([x_all[id] for id in self.ids], 1))
 52 |         return out
 53 | 
 54 | class MP(nn.Module):
 55 |     def __init__(self, k=2):
 56 |         super(MP, self).__init__()
 57 |         self.m = nn.MaxPool2d(kernel_size=k, stride=k)
 58 | 
 59 |     def forward(self, x):
 60 |         return self.m(x)
 61 |     
 62 | class Transition_Block(nn.Module):
 63 |     def __init__(self, c1, c2):
 64 |         super(Transition_Block, self).__init__()
 65 |         self.cv1 = Conv(c1, c2, 1, 1)
 66 |         self.cv2 = Conv(c1, c2, 1, 1)
 67 |         self.cv3 = Conv(c2, c2, 3, 2)
 68 |         
 69 |         self.mp  = MP()
 70 | 
 71 |     def forward(self, x):
 72 |         # 160, 160, 256 => 80, 80, 256 => 80, 80, 128
 73 |         x_1 = self.mp(x)
 74 |         x_1 = self.cv1(x_1)
 75 |         
 76 |         # 160, 160, 256 => 160, 160, 128 => 80, 80, 128
 77 |         x_2 = self.cv2(x)
 78 |         x_2 = self.cv3(x_2)
 79 |         
 80 |         # 80, 80, 128 cat 80, 80, 128 => 80, 80, 256
 81 |         return torch.cat([x_2, x_1], 1)
 82 |     
 83 | class Backbone(nn.Module):
 84 |     def __init__(self, transition_channels, block_channels, n, phi, pretrained=False):
 85 |         super().__init__()
 86 |         #-----------------------------------------------#
 87 |         #   输入图片是640, 640, 3
 88 |         #-----------------------------------------------#
 89 |         ids = {
 90 |             'l' : [-1, -3, -5, -6],
 91 |             'x' : [-1, -3, -5, -7, -8], 
 92 |         }[phi]
 93 |         # 640, 640, 3 => 640, 640, 32 => 320, 320, 64
 94 |         self.stem = nn.Sequential(
 95 |             Conv(3, transition_channels, 3, 1),
 96 |             Conv(transition_channels, transition_channels * 2, 3, 2),
 97 |             Conv(transition_channels * 2, transition_channels * 2, 3, 1),
 98 |         )
 99 |         # 320, 320, 64 => 160, 160, 128 => 160, 160, 256
100 |         self.dark2 = nn.Sequential(
101 |             Conv(transition_channels * 2, transition_channels * 4, 3, 2),
102 |             Multi_Concat_Block(transition_channels * 4, block_channels * 2, transition_channels * 8, n=n, ids=ids),
103 |         )
104 |         # 160, 160, 256 => 80, 80, 256 => 80, 80, 512
105 |         self.dark3 = nn.Sequential(
106 |             Transition_Block(transition_channels * 8, transition_channels * 4),
107 |             Multi_Concat_Block(transition_channels * 8, block_channels * 4, transition_channels * 16, n=n, ids=ids),
108 |         )
109 |         # 80, 80, 512 => 40, 40, 512 => 40, 40, 1024
110 |         self.dark4 = nn.Sequential(
111 |             Transition_Block(transition_channels * 16, transition_channels * 8),
112 |             Multi_Concat_Block(transition_channels * 16, block_channels * 8, transition_channels * 32, n=n, ids=ids),
113 |         )
114 |         # 40, 40, 1024 => 20, 20, 1024 => 20, 20, 1024
115 |         self.dark5 = nn.Sequential(
116 |             Transition_Block(transition_channels * 32, transition_channels * 16),
117 |             Multi_Concat_Block(transition_channels * 32, block_channels * 8, transition_channels * 32, n=n, ids=ids),
118 |         )
119 |         
120 |         if pretrained:
121 |             url = {
122 |                 "l" : 'https://github.com/bubbliiiing/yolov7-pytorch/releases/download/v1.0/yolov7_backbone_weights.pth',
123 |                 "x" : 'https://github.com/bubbliiiing/yolov7-pytorch/releases/download/v1.0/yolov7_x_backbone_weights.pth',
124 |             }[phi]
125 |             checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", model_dir="./model_data")
126 |             self.load_state_dict(checkpoint, strict=False)
127 |             print("Load weights from " + url.split('/')[-1])
128 | 
129 |     def forward(self, x):
130 |         x = self.stem(x)
131 |         x = self.dark2(x)
132 |         #-----------------------------------------------#
133 |         #   dark3的输出为80, 80, 512，是一个有效特征层
134 |         #-----------------------------------------------#
135 |         x = self.dark3(x)
136 |         feat1 = x
137 |         #-----------------------------------------------#
138 |         #   dark4的输出为40, 40, 1024，是一个有效特征层
139 |         #-----------------------------------------------#
140 |         x = self.dark4(x)
141 |         feat2 = x
142 |         #-----------------------------------------------#
143 |         #   dark5的输出为20, 20, 1024，是一个有效特征层
144 |         #-----------------------------------------------#
145 |         x = self.dark5(x)
146 |         feat3 = x
147 |         return feat1, feat2, feat3
148 | 


--------------------------------------------------------------------------------
/kmeans_for_anchors.py:
--------------------------------------------------------------------------------
  1 | #-------------------------------------------------------------------------------------------------------#
  2 | #   kmeans虽然会对数据集中的框进行聚类，但是很多数据集由于框的大小相近，聚类出来的9个框相差不大，
  3 | #   这样的框反而不利于模型的训练。因为不同的特征层适合不同大小的先验框，shape越小的特征层适合越大的先验框
  4 | #   原始网络的先验框已经按大中小比例分配好了，不进行聚类也会有非常好的效果。
  5 | #-------------------------------------------------------------------------------------------------------#
  6 | import glob
  7 | import xml.etree.ElementTree as ET
  8 | 
  9 | import matplotlib.pyplot as plt
 10 | import numpy as np
 11 | from tqdm import tqdm
 12 | 
 13 | 
 14 | def cas_ratio(box,cluster):
 15 |     ratios_of_box_cluster = box / cluster
 16 |     ratios_of_cluster_box = cluster / box
 17 |     ratios = np.concatenate([ratios_of_box_cluster, ratios_of_cluster_box], axis = -1)
 18 | 
 19 |     return np.max(ratios, -1)
 20 | 
 21 | def avg_ratio(box,cluster):
 22 |     return np.mean([np.min(cas_ratio(box[i],cluster)) for i in range(box.shape[0])])
 23 | 
 24 | def kmeans(box,k):
 25 |     #-------------------------------------------------------------#
 26 |     #   取出一共有多少框
 27 |     #-------------------------------------------------------------#
 28 |     row = box.shape[0]
 29 |     
 30 |     #-------------------------------------------------------------#
 31 |     #   每个框各个点的位置
 32 |     #-------------------------------------------------------------#
 33 |     distance = np.empty((row,k))
 34 |     
 35 |     #-------------------------------------------------------------#
 36 |     #   最后的聚类位置
 37 |     #-------------------------------------------------------------#
 38 |     last_clu = np.zeros((row,))
 39 | 
 40 |     np.random.seed()
 41 | 
 42 |     #-------------------------------------------------------------#
 43 |     #   随机选5个当聚类中心
 44 |     #-------------------------------------------------------------#
 45 |     cluster = box[np.random.choice(row,k,replace = False)]
 46 | 
 47 |     iter = 0
 48 |     while True:
 49 |         #-------------------------------------------------------------#
 50 |         #   计算当前框和先验框的宽高比例
 51 |         #-------------------------------------------------------------#
 52 |         for i in range(row):
 53 |             distance[i] = cas_ratio(box[i],cluster)
 54 |         
 55 |         #-------------------------------------------------------------#
 56 |         #   取出最小点
 57 |         #-------------------------------------------------------------#
 58 |         near = np.argmin(distance,axis=1)
 59 | 
 60 |         if (last_clu == near).all():
 61 |             break
 62 |         
 63 |         #-------------------------------------------------------------#
 64 |         #   求每一个类的中位点
 65 |         #-------------------------------------------------------------#
 66 |         for j in range(k):
 67 |             cluster[j] = np.median(
 68 |                 box[near == j],axis=0)
 69 | 
 70 |         last_clu = near
 71 |         if iter % 5 == 0:
 72 |             print('iter: {:d}. avg_ratio:{:.2f}'.format(iter, avg_ratio(box,cluster)))
 73 |         iter += 1
 74 | 
 75 |     return cluster, near
 76 | 
 77 | def load_data(path):
 78 |     data = []
 79 |     #-------------------------------------------------------------#
 80 |     #   对于每一个xml都寻找box
 81 |     #-------------------------------------------------------------#
 82 |     for xml_file in tqdm(glob.glob('{}/*xml'.format(path))):
 83 |         tree = ET.parse(xml_file)
 84 |         height = int(tree.findtext('./size/height'))
 85 |         width = int(tree.findtext('./size/width'))
 86 |         if height<=0 or width<=0:
 87 |             continue
 88 |         
 89 |         #-------------------------------------------------------------#
 90 |         #   对于每一个目标都获得它的宽高
 91 |         #-------------------------------------------------------------#
 92 |         for obj in tree.iter('object'):
 93 |             xmin = int(float(obj.findtext('bndbox/xmin'))) / width
 94 |             ymin = int(float(obj.findtext('bndbox/ymin'))) / height
 95 |             xmax = int(float(obj.findtext('bndbox/xmax'))) / width
 96 |             ymax = int(float(obj.findtext('bndbox/ymax'))) / height
 97 | 
 98 |             xmin = np.float64(xmin)
 99 |             ymin = np.float64(ymin)
100 |             xmax = np.float64(xmax)
101 |             ymax = np.float64(ymax)
102 |             # 得到宽高
103 |             data.append([xmax-xmin,ymax-ymin])
104 |     return np.array(data)
105 | 
if __name__ == '__main__':
    np.random.seed(0)
    #-------------------------------------------------------------#
    #   Running this script clusters the boxes found in the xml
    #   files under './VOCdevkit/VOC2007/Annotations' and writes
    #   the resulting anchors to yolo_anchors.txt
    #-------------------------------------------------------------#
    input_shape = [640, 640]
    anchors_num = 9
    #-------------------------------------------------------------#
    #   Dataset location; VOC-style xml annotations are expected
    #-------------------------------------------------------------#
    path        = 'VOCdevkit/VOC2007/Annotations'

    #-------------------------------------------------------------#
    #   Load every xml; boxes are stored as width, height
    #   normalised by the image size
    #-------------------------------------------------------------#
    print('Load xmls.')
    data = load_data(path)
    print('Load xmls done.')

    #-------------------------------------------------------------#
    #   Run k-means
    #-------------------------------------------------------------#
    print('K-means boxes.')
    cluster, near   = kmeans(data, anchors_num)
    print('K-means boxes done.')
    # Scale the normalised sizes to pixels of the network input (width, height).
    pixel_scale     = np.array([input_shape[1], input_shape[0]])
    data            = data * pixel_scale
    cluster         = cluster * pixel_scale

    #-------------------------------------------------------------#
    #   Plot every cluster's boxes and mark its centre
    #-------------------------------------------------------------#
    for j in range(anchors_num):
        members = data[near == j]
        plt.scatter(members[:, 0], members[:, 1])
        plt.scatter(cluster[j][0], cluster[j][1], marker='x', c='black')
    plt.savefig("kmeans_for_anchors.jpg")
    plt.show()
    print('Save kmeans_for_anchors.jpg in root dir.')

    # Order anchors by area, smallest first.
    cluster = cluster[np.argsort(cluster[:, 0] * cluster[:, 1])]
    print('avg_ratio:{:.2f}'.format(avg_ratio(data, cluster)))
    print(cluster)

    # Write "w,h, w,h, ..." on a single line; the context manager closes the
    # file even if formatting or writing fails.
    with open("yolo_anchors.txt", 'w') as f:
        f.write(", ".join("%d,%d" % (anchor[0], anchor[1]) for anchor in cluster))
159 | 


--------------------------------------------------------------------------------
/get_map.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import xml.etree.ElementTree as ET
  3 | 
  4 | from PIL import Image
  5 | from tqdm import tqdm
  6 | 
  7 | from utils.utils import get_classes
  8 | from utils.utils_map import get_coco_map, get_map
  9 | from yolo import YOLO
 10 | 
 11 | if __name__ == "__main__":
 12 |     '''
 13 |     Recall和Precision不像AP是一个面积的概念，因此在门限值（Confidence）不同时，网络的Recall和Precision值是不同的。
 14 |     默认情况下，本代码计算的Recall和Precision代表的是当门限值（Confidence）为0.5时，所对应的Recall和Precision值。
 15 | 
 16 |     受到mAP计算原理的限制，网络在计算mAP时需要获得近乎所有的预测框，这样才可以计算不同门限条件下的Recall和Precision值
 17 |     因此，本代码获得的map_out/detection-results/里面的txt的框的数量一般会比直接predict多一些，目的是列出所有可能的预测框，
 18 |     '''
 19 |     #------------------------------------------------------------------------------------------------------------------#
 20 |     #   map_mode用于指定该文件运行时计算的内容
 21 |     #   map_mode为0代表整个map计算流程，包括获得预测结果、获得真实框、计算VOC_map。
 22 |     #   map_mode为1代表仅仅获得预测结果。
 23 |     #   map_mode为2代表仅仅获得真实框。
 24 |     #   map_mode为3代表仅仅计算VOC_map。
 25 |     #   map_mode为4代表利用COCO工具箱计算当前数据集的0.50:0.95map。需要获得预测结果、获得真实框后并安装pycocotools才行
 26 |     #-------------------------------------------------------------------------------------------------------------------#
 27 |     map_mode        = 0
 28 |     #--------------------------------------------------------------------------------------#
 29 |     #   此处的classes_path用于指定需要测量VOC_map的类别
 30 |     #   一般情况下与训练和预测所用的classes_path一致即可
 31 |     #--------------------------------------------------------------------------------------#
 32 |     classes_path    = 'model_data/voc_classes.txt'
 33 |     #--------------------------------------------------------------------------------------#
 34 |     #   MINOVERLAP用于指定想要获得的mAP0.x，mAP0.x的意义是什么请同学们百度一下。
 35 |     #   比如计算mAP0.75，可以设定MINOVERLAP = 0.75。
 36 |     #
 37 |     #   当某一预测框与真实框重合度大于MINOVERLAP时，该预测框被认为是正样本，否则为负样本。
 38 |     #   因此MINOVERLAP的值越大，预测框要预测的越准确才能被认为是正样本，此时算出来的mAP值越低，
 39 |     #--------------------------------------------------------------------------------------#
 40 |     MINOVERLAP      = 0.5
 41 |     #--------------------------------------------------------------------------------------#
 42 |     #   受到mAP计算原理的限制，网络在计算mAP时需要获得近乎所有的预测框，这样才可以计算mAP
 43 |     #   因此，confidence的值应当设置的尽量小进而获得全部可能的预测框。
 44 |     #   
 45 |     #   该值一般不调整。因为计算mAP需要获得近乎所有的预测框，此处的confidence不能随便更改。
 46 |     #   想要获得不同门限值下的Recall和Precision值，请修改下方的score_threhold。
 47 |     #--------------------------------------------------------------------------------------#
 48 |     confidence      = 0.001
 49 |     #--------------------------------------------------------------------------------------#
 50 |     #   预测时使用到的非极大抑制值的大小，越大表示非极大抑制越不严格。
 51 |     #   
 52 |     #   该值一般不调整。
 53 |     #--------------------------------------------------------------------------------------#
 54 |     nms_iou         = 0.5
 55 |     #---------------------------------------------------------------------------------------------------------------#
 56 |     #   Recall和Precision不像AP是一个面积的概念，因此在门限值不同时，网络的Recall和Precision值是不同的。
 57 |     #   
 58 |     #   默认情况下，本代码计算的Recall和Precision代表的是当门限值为0.5（此处定义为score_threhold）时所对应的Recall和Precision值。
 59 |     #   因为计算mAP需要获得近乎所有的预测框，上面定义的confidence不能随便更改。
 60 |     #   这里专门定义一个score_threhold用于代表门限值，进而在计算mAP时找到门限值对应的Recall和Precision值。
 61 |     #---------------------------------------------------------------------------------------------------------------#
 62 |     score_threhold  = 0.5
 63 |     #-------------------------------------------------------#
 64 |     #   map_vis用于指定是否开启VOC_map计算的可视化
 65 |     #-------------------------------------------------------#
 66 |     map_vis         = False
 67 |     #-------------------------------------------------------#
 68 |     #   指向VOC数据集所在的文件夹
 69 |     #   默认指向根目录下的VOC数据集
 70 |     #-------------------------------------------------------#
 71 |     VOCdevkit_path  = 'VOCdevkit'
 72 |     #-------------------------------------------------------#
 73 |     #   结果输出的文件夹，默认为map_out
 74 |     #-------------------------------------------------------#
 75 |     map_out_path    = 'map_out'
 76 | 
 77 |     image_ids = open(os.path.join(VOCdevkit_path, "VOC2007/ImageSets/Main/test.txt")).read().strip().split()
 78 | 
 79 |     if not os.path.exists(map_out_path):
 80 |         os.makedirs(map_out_path)
 81 |     if not os.path.exists(os.path.join(map_out_path, 'ground-truth')):
 82 |         os.makedirs(os.path.join(map_out_path, 'ground-truth'))
 83 |     if not os.path.exists(os.path.join(map_out_path, 'detection-results')):
 84 |         os.makedirs(os.path.join(map_out_path, 'detection-results'))
 85 |     if not os.path.exists(os.path.join(map_out_path, 'images-optional')):
 86 |         os.makedirs(os.path.join(map_out_path, 'images-optional'))
 87 | 
 88 |     class_names, _ = get_classes(classes_path)
 89 | 
 90 |     if map_mode == 0 or map_mode == 1:
 91 |         print("Load model.")
 92 |         yolo = YOLO(confidence = confidence, nms_iou = nms_iou)
 93 |         print("Load model done.")
 94 | 
 95 |         print("Get predict result.")
 96 |         for image_id in tqdm(image_ids):
 97 |             image_path  = os.path.join(VOCdevkit_path, "VOC2007/JPEGImages/"+image_id+".jpg")
 98 |             image       = Image.open(image_path)
 99 |             if map_vis:
100 |                 image.save(os.path.join(map_out_path, "images-optional/" + image_id + ".jpg"))
101 |             yolo.get_map_txt(image_id, image, class_names, map_out_path)
102 |         print("Get predict result done.")
103 |         
104 |     if map_mode == 0 or map_mode == 2:
105 |         print("Get ground truth result.")
106 |         for image_id in tqdm(image_ids):
107 |             with open(os.path.join(map_out_path, "ground-truth/"+image_id+".txt"), "w") as new_f:
108 |                 root = ET.parse(os.path.join(VOCdevkit_path, "VOC2007/Annotations/"+image_id+".xml")).getroot()
109 |                 for obj in root.findall('object'):
110 |                     difficult_flag = False
111 |                     if obj.find('difficult')!=None:
112 |                         difficult = obj.find('difficult').text
113 |                         if int(difficult)==1:
114 |                             difficult_flag = True
115 |                     obj_name = obj.find('name').text
116 |                     if obj_name not in class_names:
117 |                         continue
118 |                     bndbox  = obj.find('bndbox')
119 |                     left    = bndbox.find('xmin').text
120 |                     top     = bndbox.find('ymin').text
121 |                     right   = bndbox.find('xmax').text
122 |                     bottom  = bndbox.find('ymax').text
123 | 
124 |                     if difficult_flag:
125 |                         new_f.write("%s %s %s %s %s difficult\n" % (obj_name, left, top, right, bottom))
126 |                     else:
127 |                         new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
128 |         print("Get ground truth result done.")
129 | 
130 |     if map_mode == 0 or map_mode == 3:
131 |         print("Get map.")
132 |         get_map(MINOVERLAP, True, score_threhold = score_threhold, path = map_out_path)
133 |         print("Get map done.")
134 | 
135 |     if map_mode == 4:
136 |         print("Get map.")
137 |         get_coco_map(class_names = class_names, path = map_out_path)
138 |         print("Get map done.")
139 | 


--------------------------------------------------------------------------------
/voc_annotation.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import xml.etree.ElementTree as ET
  4 | 
  5 | import numpy as np
  6 | 
  7 | from utils.utils import get_classes
  8 | 
#--------------------------------------------------------------------------------------------------------------------------------#
#   annotation_mode selects what this script does when run:
#   0: the whole pipeline - write the split txt files in VOCdevkit/VOC2007/ImageSets and the training lists 2007_train.txt / 2007_val.txt
#   1: only write the txt files in VOCdevkit/VOC2007/ImageSets
#   2: only write the training lists 2007_train.txt / 2007_val.txt
#--------------------------------------------------------------------------------------------------------------------------------#
annotation_mode     = 0
#-------------------------------------------------------------------#
#   Must be changed for a custom dataset: it supplies the class names
#   used to write the object entries of 2007_train.txt / 2007_val.txt.
#   Use the same classes_path as for training and prediction.
#   If the generated 2007_train.txt contains no object entries,
#   the classes are not set correctly.
#   Only used when annotation_mode is 0 or 2.
#-------------------------------------------------------------------#
classes_path        = 'model_data/voc_classes.txt'
#--------------------------------------------------------------------------------------------------------------------------------#
#   trainval_percent is the share of (train + val) versus test; by default (train + val) : test = 9 : 1
#   train_percent is the share of train inside (train + val); by default train : val = 9 : 1
#   Only used when annotation_mode is 0 or 1.
#--------------------------------------------------------------------------------------------------------------------------------#
trainval_percent    = 0.9
train_percent       = 0.9
#-------------------------------------------------------#
#   Folder containing the VOC dataset;
#   defaults to the VOC dataset in the repository root
#-------------------------------------------------------#
VOCdevkit_path  = 'VOCdevkit'

# (year, image_set) pairs for which a "<year>_<image_set>.txt" training list is generated.
VOCdevkit_sets  = [('2007', 'train'), ('2007', 'val')]
classes, _      = get_classes(classes_path)

#-------------------------------------------------------#
#   Counters: images per set and objects per class
#-------------------------------------------------------#
photo_nums  = np.zeros(len(VOCdevkit_sets))
nums        = np.zeros(len(classes))
 45 | def convert_annotation(year, image_id, list_file):
 46 |     in_file = open(os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml'%(year, image_id)), encoding='utf-8')
 47 |     tree=ET.parse(in_file)
 48 |     root = tree.getroot()
 49 | 
 50 |     for obj in root.iter('object'):
 51 |         difficult = 0 
 52 |         if obj.find('difficult')!=None:
 53 |             difficult = obj.find('difficult').text
 54 |         cls = obj.find('name').text
 55 |         if cls not in classes or int(difficult)==1:
 56 |             continue
 57 |         cls_id = classes.index(cls)
 58 |         xmlbox = obj.find('bndbox')
 59 |         b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)), int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text)))
 60 |         list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
 61 |         
 62 |         nums[classes.index(cls)] = nums[classes.index(cls)] + 1
 63 |         
 64 | if __name__ == "__main__":
 65 |     random.seed(0)
 66 |     if " " in os.path.abspath(VOCdevkit_path):
 67 |         raise ValueError("数据集存放的文件夹路径与图片名称中不可以存在空格，否则会影响正常的模型训练，请注意修改。")
 68 | 
 69 |     if annotation_mode == 0 or annotation_mode == 1:
 70 |         print("Generate txt in ImageSets.")
 71 |         xmlfilepath     = os.path.join(VOCdevkit_path, 'VOC2007/Annotations')
 72 |         saveBasePath    = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets/Main')
 73 |         temp_xml        = os.listdir(xmlfilepath)
 74 |         total_xml       = []
 75 |         for xml in temp_xml:
 76 |             if xml.endswith(".xml"):
 77 |                 total_xml.append(xml)
 78 | 
 79 |         num     = len(total_xml)  
 80 |         list    = range(num)  
 81 |         tv      = int(num*trainval_percent)  
 82 |         tr      = int(tv*train_percent)  
 83 |         trainval= random.sample(list,tv)  
 84 |         train   = random.sample(trainval,tr)  
 85 |         
 86 |         print("train and val size",tv)
 87 |         print("train size",tr)
 88 |         ftrainval   = open(os.path.join(saveBasePath,'trainval.txt'), 'w')  
 89 |         ftest       = open(os.path.join(saveBasePath,'test.txt'), 'w')  
 90 |         ftrain      = open(os.path.join(saveBasePath,'train.txt'), 'w')  
 91 |         fval        = open(os.path.join(saveBasePath,'val.txt'), 'w')  
 92 |         
 93 |         for i in list:  
 94 |             name=total_xml[i][:-4]+'\n'  
 95 |             if i in trainval:  
 96 |                 ftrainval.write(name)  
 97 |                 if i in train:  
 98 |                     ftrain.write(name)  
 99 |                 else:  
100 |                     fval.write(name)  
101 |             else:  
102 |                 ftest.write(name)  
103 |         
104 |         ftrainval.close()  
105 |         ftrain.close()  
106 |         fval.close()  
107 |         ftest.close()
108 |         print("Generate txt in ImageSets done.")
109 | 
110 |     if annotation_mode == 0 or annotation_mode == 2:
111 |         print("Generate 2007_train.txt and 2007_val.txt for train.")
112 |         type_index = 0
113 |         for year, image_set in VOCdevkit_sets:
114 |             image_ids = open(os.path.join(VOCdevkit_path, 'VOC%s/ImageSets/Main/%s.txt'%(year, image_set)), encoding='utf-8').read().strip().split()
115 |             list_file = open('%s_%s.txt'%(year, image_set), 'w', encoding='utf-8')
116 |             for image_id in image_ids:
117 |                 list_file.write('%s/VOC%s/JPEGImages/%s.jpg'%(os.path.abspath(VOCdevkit_path), year, image_id))
118 | 
119 |                 convert_annotation(year, image_id, list_file)
120 |                 list_file.write('\n')
121 |             photo_nums[type_index] = len(image_ids)
122 |             type_index += 1
123 |             list_file.close()
124 |         print("Generate 2007_train.txt and 2007_val.txt for train done.")
125 |         
126 |         def printTable(List1, List2):
127 |             for i in range(len(List1[0])):
128 |                 print("|", end=' ')
129 |                 for j in range(len(List1)):
130 |                     print(List1[j][i].rjust(int(List2[j])), end=' ')
131 |                     print("|", end=' ')
132 |                 print()
133 | 
134 |         str_nums = [str(int(x)) for x in nums]
135 |         tableData = [
136 |             classes, str_nums
137 |         ]
138 |         colWidths = [0]*len(tableData)
139 |         len1 = 0
140 |         for i in range(len(tableData)):
141 |             for j in range(len(tableData[i])):
142 |                 if len(tableData[i][j]) > colWidths[i]:
143 |                     colWidths[i] = len(tableData[i][j])
144 |         printTable(tableData, colWidths)
145 | 
146 |         if photo_nums[0] <= 500:
147 |             print("训练集数量小于500，属于较小的数据量，请注意设置较大的训练世代（Epoch）以满足足够的梯度下降次数（Step）。")
148 | 
149 |         if np.sum(nums) == 0:
150 |             print("在数据集中并未获得任何目标，请注意修改classes_path对应自己的数据集，并且保证标签名字正确，否则训练将会没有任何效果！")
151 |             print("在数据集中并未获得任何目标，请注意修改classes_path对应自己的数据集，并且保证标签名字正确，否则训练将会没有任何效果！")
152 |             print("在数据集中并未获得任何目标，请注意修改classes_path对应自己的数据集，并且保证标签名字正确，否则训练将会没有任何效果！")
153 |             print("（重要的事情说三遍）。")
154 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## YOLOV7：You Only Look Once目标检测模型在pytorch当中的实现
  2 | ---
  3 | 
  4 | ## 目录
  5 | 1. [仓库更新 Top News](#仓库更新)
  6 | 2. [相关仓库 Related code](#相关仓库)
  7 | 3. [性能情况 Performance](#性能情况)
  8 | 4. [所需环境 Environment](#所需环境)
  9 | 5. [文件下载 Download](#文件下载)
 10 | 6. [训练步骤 How2train](#训练步骤)
 11 | 7. [预测步骤 How2predict](#预测步骤)
 12 | 8. [评估步骤 How2eval](#评估步骤)
 13 | 9. [参考资料 Reference](#Reference)
 14 | 
 15 | ## Top News
 16 | **`2022-07`**:**仓库创建，支持step、cos学习率下降法、支持adam、sgd优化器选择、支持学习率根据batch_size自适应调整、新增图片裁剪、支持多GPU训练、支持各个种类目标数量计算、支持heatmap、支持EMA。**  
 17 | 
 18 | ## 相关仓库
 19 | | 模型 | 路径 |
 20 | | :----- | :----- |
 21 | YoloV3 | https://github.com/bubbliiiing/yolo3-pytorch  
 22 | Efficientnet-Yolo3 | https://github.com/bubbliiiing/efficientnet-yolo3-pytorch  
 23 | YoloV4 | https://github.com/bubbliiiing/yolov4-pytorch
 24 | YoloV4-tiny | https://github.com/bubbliiiing/yolov4-tiny-pytorch
 25 | Mobilenet-Yolov4 | https://github.com/bubbliiiing/mobilenet-yolov4-pytorch
 26 | YoloV5-V5.0 | https://github.com/bubbliiiing/yolov5-pytorch
 27 | YoloV5-V6.1 | https://github.com/bubbliiiing/yolov5-v6.1-pytorch
 28 | YoloX | https://github.com/bubbliiiing/yolox-pytorch
 29 | YoloV7 | https://github.com/bubbliiiing/yolov7-pytorch
 30 | YoloV7-tiny | https://github.com/bubbliiiing/yolov7-tiny-pytorch
 31 | 
 32 | ## 性能情况
 33 | | 训练数据集 | 权值文件名称 | 测试数据集 | 输入图片大小 | mAP 0.5:0.95 | mAP 0.5 |
 34 | | :-----: | :-----: | :------: | :------: | :------: | :-----: |
 35 | | COCO-Train2017 | [yolov7_weights.pth](https://github.com/bubbliiiing/yolov7-pytorch/releases/download/v1.0/yolov7_weights.pth) | COCO-Val2017 | 640x640 | 50.7 | 69.2
 36 | | COCO-Train2017 | [yolov7_x_weights.pth](https://github.com/bubbliiiing/yolov7-pytorch/releases/download/v1.0/yolov7_x_weights.pth) | COCO-Val2017 | 640x640 | 52.4 | 70.5
 37 | 
 38 | ## 所需环境
 39 | torch==1.2.0    
 40 | 为了使用amp混合精度，推荐使用torch1.7.1以上的版本。
 41 | 
 42 | ## 文件下载
 43 | 训练所需的权值可在百度网盘中下载。  
 44 | 链接: https://pan.baidu.com/s/1uYpjWC1uOo3Q-klpUEy9LQ     
 45 | 提取码: pmua    
 46 | 
 47 | VOC数据集下载地址如下，里面已经包括了训练集、测试集、验证集（与测试集一样），无需再次划分：  
 48 | 链接: https://pan.baidu.com/s/19Mw2u_df_nBzsC2lg20fQA    
 49 | 提取码: j5ge   
 50 | 
 51 | ## 训练步骤
 52 | ### a、训练VOC07+12数据集
 53 | 1. 数据集的准备   
 54 | **本文使用VOC格式进行训练，训练前需要下载好VOC07+12的数据集，解压后放在根目录**  
 55 | 
 56 | 2. 数据集的处理   
 57 | 修改voc_annotation.py里面的annotation_mode=2，运行voc_annotation.py生成根目录下的2007_train.txt和2007_val.txt。   
 58 | 
 59 | 3. 开始网络训练   
 60 | train.py的默认参数用于训练VOC数据集，直接运行train.py即可开始训练。   
 61 | 
 62 | 4. 训练结果预测   
 63 | 训练结果预测需要用到两个文件，分别是yolo.py和predict.py。我们首先需要去yolo.py里面修改model_path以及classes_path，这两个参数必须要修改。   
 64 | **model_path指向训练好的权值文件，在logs文件夹里。   
 65 | classes_path指向检测类别所对应的txt。**   
 66 | 完成修改后就可以运行predict.py进行检测了。运行后输入图片路径即可检测。   
 67 | 
 68 | ### b、训练自己的数据集
 69 | 1. 数据集的准备  
 70 | **本文使用VOC格式进行训练，训练前需要自己制作好数据集，**    
训练前将标签文件放在VOCdevkit文件夹下的VOC2007文件夹下的Annotations中。   
 72 | 训练前将图片文件放在VOCdevkit文件夹下的VOC2007文件夹下的JPEGImages中。   
 73 | 
 74 | 2. 数据集的处理  
 75 | 在完成数据集的摆放之后，我们需要利用voc_annotation.py获得训练用的2007_train.txt和2007_val.txt。   
 76 | 修改voc_annotation.py里面的参数。第一次训练可以仅修改classes_path，classes_path用于指向检测类别所对应的txt。   
 77 | 训练自己的数据集时，可以自己建立一个cls_classes.txt，里面写自己所需要区分的类别。   
 78 | model_data/cls_classes.txt文件内容为：      
 79 | ```python
 80 | cat
 81 | dog
 82 | ...
 83 | ```
 84 | 修改voc_annotation.py中的classes_path，使其对应cls_classes.txt，并运行voc_annotation.py。  
 85 | 
 86 | 3. 开始网络训练  
 87 | **训练的参数较多，均在train.py中，大家可以在下载库后仔细看注释，其中最重要的部分依然是train.py里的classes_path。**  
 88 | **classes_path用于指向检测类别所对应的txt，这个txt和voc_annotation.py里面的txt一样！训练自己的数据集必须要修改！**  
 89 | 修改完classes_path后就可以运行train.py开始训练了，在训练多个epoch后，权值会生成在logs文件夹中。  
 90 | 
 91 | 4. 训练结果预测  
 92 | 训练结果预测需要用到两个文件，分别是yolo.py和predict.py。在yolo.py里面修改model_path以及classes_path。  
 93 | **model_path指向训练好的权值文件，在logs文件夹里。  
 94 | classes_path指向检测类别所对应的txt。**  
 95 | 完成修改后就可以运行predict.py进行检测了。运行后输入图片路径即可检测。  
 96 | 
 97 | ## 预测步骤
 98 | ### a、使用预训练权重
 99 | 1. 下载完库后解压，在百度网盘下载权值，放入model_data，运行predict.py，输入  
100 | ```python
101 | img/street.jpg
102 | ```
103 | 2. 在predict.py里面进行设置可以进行fps测试和video视频检测。  
104 | ### b、使用自己训练的权重
105 | 1. 按照训练步骤训练。  
106 | 2. 在yolo.py文件里面，在如下部分修改model_path和classes_path使其对应训练好的文件；**model_path对应logs文件夹下面的权值文件，classes_path是model_path对应分的类**。  
107 | ```python
108 | _defaults = {
109 |     #--------------------------------------------------------------------------#
110 |     #   使用自己训练好的模型进行预测一定要修改model_path和classes_path！
111 |     #   model_path指向logs文件夹下的权值文件，classes_path指向model_data下的txt
112 |     #
113 |     #   训练好后logs文件夹下存在多个权值文件，选择验证集损失较低的即可。
114 |     #   验证集损失较低不代表mAP较高，仅代表该权值在验证集上泛化性能较好。
115 |     #   如果出现shape不匹配，同时要注意训练时的model_path和classes_path参数的修改
116 |     #--------------------------------------------------------------------------#
117 |     "model_path"        : 'model_data/yolov7_weights.pth',
118 |     "classes_path"      : 'model_data/coco_classes.txt',
119 |     #---------------------------------------------------------------------#
120 |     #   anchors_path代表先验框对应的txt文件，一般不修改。
121 |     #   anchors_mask用于帮助代码找到对应的先验框，一般不修改。
122 |     #---------------------------------------------------------------------#
123 |     "anchors_path"      : 'model_data/yolo_anchors.txt',
124 |     "anchors_mask"      : [[6, 7, 8], [3, 4, 5], [0, 1, 2]],
125 |     #---------------------------------------------------------------------#
126 |     #   输入图片的大小，必须为32的倍数。
127 |     #---------------------------------------------------------------------#
128 |     "input_shape"       : [640, 640],
129 |     #------------------------------------------------------#
130 |     #   所使用到的yolov7的版本，本仓库一共提供两个：
131 |     #   l : 对应yolov7
132 |     #   x : 对应yolov7_x
133 |     #------------------------------------------------------#
134 |     "phi"               : 'l',
135 |     #---------------------------------------------------------------------#
136 |     #   只有得分大于置信度的预测框会被保留下来
137 |     #---------------------------------------------------------------------#
138 |     "confidence"        : 0.5,
139 |     #---------------------------------------------------------------------#
140 |     #   非极大抑制所用到的nms_iou大小
141 |     #---------------------------------------------------------------------#
142 |     "nms_iou"           : 0.3,
143 |     #---------------------------------------------------------------------#
144 |     #   该变量用于控制是否使用letterbox_image对输入图像进行不失真的resize，
145 |     #   在多次测试后，发现关闭letterbox_image直接resize的效果更好
146 |     #---------------------------------------------------------------------#
147 |     "letterbox_image"   : True,
148 |     #-------------------------------#
149 |     #   是否使用Cuda
150 |     #   没有GPU可以设置成False
151 |     #-------------------------------#
152 |     "cuda"              : True,
153 | }
154 | ```
155 | 3. 运行predict.py，输入  
156 | ```python
157 | img/street.jpg
158 | ```
159 | 4. 在predict.py里面进行设置可以进行fps测试和video视频检测。  
160 | 
161 | ## 评估步骤 
162 | ### a、评估VOC07+12的测试集
163 | 1. 本文使用VOC格式进行评估。VOC07+12已经划分好了测试集，无需利用voc_annotation.py生成ImageSets文件夹下的txt。
164 | 2. 在yolo.py里面修改model_path以及classes_path。**model_path指向训练好的权值文件，在logs文件夹里。classes_path指向检测类别所对应的txt。**  
165 | 3. 运行get_map.py即可获得评估结果，评估结果会保存在map_out文件夹中。
166 | 
167 | ### b、评估自己的数据集
168 | 1. 本文使用VOC格式进行评估。  
169 | 2. 如果在训练前已经运行过voc_annotation.py文件，代码会自动将数据集划分成训练集、验证集和测试集。如果想要修改测试集的比例，可以修改voc_annotation.py文件下的trainval_percent。trainval_percent用于指定(训练集+验证集)与测试集的比例，默认情况下 (训练集+验证集):测试集 = 9:1。train_percent用于指定(训练集+验证集)中训练集与验证集的比例，默认情况下 训练集:验证集 = 9:1。
170 | 3. 利用voc_annotation.py划分测试集后，前往get_map.py文件修改classes_path，classes_path用于指向检测类别所对应的txt，这个txt和训练时的txt一样。评估自己的数据集必须要修改。
171 | 4. 在yolo.py里面修改model_path以及classes_path。**model_path指向训练好的权值文件，在logs文件夹里。classes_path指向检测类别所对应的txt。**  
172 | 5. 运行get_map.py即可获得评估结果，评估结果会保存在map_out文件夹中。
173 | 
174 | ## Reference
175 | https://github.com/WongKinYiu/yolov7
176 | 


--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
  1 | #-----------------------------------------------------------------------#
  2 | #   predict.py将单张图片预测、摄像头检测、FPS测试和目录遍历检测等功能
  3 | #   整合到了一个py文件中，通过指定mode进行模式的修改。
  4 | #-----------------------------------------------------------------------#
  5 | import time
  6 | 
  7 | import cv2
  8 | import numpy as np
  9 | from PIL import Image
 10 | 
 11 | from yolo import YOLO, YOLO_ONNX
 12 | 
 13 | if __name__ == "__main__":
 14 |     #----------------------------------------------------------------------------------------------------------#
 15 |     #   mode用于指定测试的模式：
 16 |     #   'predict'           表示单张图片预测，如果想对预测过程进行修改，如保存图片，截取对象等，可以先看下方详细的注释
 17 |     #   'video'             表示视频检测，可调用摄像头或者视频进行检测，详情查看下方注释。
 18 |     #   'fps'               表示测试fps，使用的图片是img里面的street.jpg，详情查看下方注释。
 19 |     #   'dir_predict'       表示遍历文件夹进行检测并保存。默认遍历img文件夹，保存img_out文件夹，详情查看下方注释。
 20 |     #   'heatmap'           表示进行预测结果的热力图可视化，详情查看下方注释。
 21 |     #   'export_onnx'       表示将模型导出为onnx，需要pytorch1.7.1以上。
 22 |     #   'predict_onnx'      表示利用导出的onnx模型进行预测，相关参数的修改在yolo.py_423行左右处的YOLO_ONNX
 23 |     #----------------------------------------------------------------------------------------------------------#
 24 |     mode = "predict"
 25 |     #-------------------------------------------------------------------------#
 26 |     #   crop                指定了是否在单张图片预测后对目标进行截取
 27 |     #   count               指定了是否进行目标的计数
 28 |     #   crop、count仅在mode='predict'时有效
 29 |     #-------------------------------------------------------------------------#
 30 |     crop            = False
 31 |     count           = False
 32 |     #----------------------------------------------------------------------------------------------------------#
 33 |     #   video_path          用于指定视频的路径，当video_path=0时表示检测摄像头
 34 |     #                       想要检测视频，则设置如video_path = "xxx.mp4"即可，代表读取出根目录下的xxx.mp4文件。
 35 |     #   video_save_path     表示视频保存的路径，当video_save_path=""时表示不保存
 36 |     #                       想要保存视频，则设置如video_save_path = "yyy.mp4"即可，代表保存为根目录下的yyy.mp4文件。
 37 |     #   video_fps           用于保存的视频的fps
 38 |     #
 39 |     #   video_path、video_save_path和video_fps仅在mode='video'时有效
 40 |     #   保存视频时需要ctrl+c退出或者运行到最后一帧才会完成完整的保存步骤。
 41 |     #----------------------------------------------------------------------------------------------------------#
 42 |     video_path      = 0
 43 |     video_save_path = ""
 44 |     video_fps       = 25.0
 45 |     #----------------------------------------------------------------------------------------------------------#
 46 |     #   test_interval       用于指定测量fps的时候，图片检测的次数。理论上test_interval越大，fps越准确。
 47 |     #   fps_image_path      用于指定测试的fps图片
 48 |     #   
 49 |     #   test_interval和fps_image_path仅在mode='fps'有效
 50 |     #----------------------------------------------------------------------------------------------------------#
 51 |     test_interval   = 100
 52 |     fps_image_path  = "img/street.jpg"
 53 |     #-------------------------------------------------------------------------#
 54 |     #   dir_origin_path     指定了用于检测的图片的文件夹路径
 55 |     #   dir_save_path       指定了检测完图片的保存路径
 56 |     #   
 57 |     #   dir_origin_path和dir_save_path仅在mode='dir_predict'时有效
 58 |     #-------------------------------------------------------------------------#
 59 |     dir_origin_path = "img/"
 60 |     dir_save_path   = "img_out/"
 61 |     #-------------------------------------------------------------------------#
 62 |     #   heatmap_save_path   热力图的保存路径，默认保存在model_data下
 63 |     #   
 64 |     #   heatmap_save_path仅在mode='heatmap'有效
 65 |     #-------------------------------------------------------------------------#
 66 |     heatmap_save_path = "model_data/heatmap_vision.png"
 67 |     #-------------------------------------------------------------------------#
 68 |     #   simplify            使用Simplify onnx
 69 |     #   onnx_save_path      指定了onnx的保存路径
 70 |     #-------------------------------------------------------------------------#
 71 |     simplify        = True
 72 |     onnx_save_path  = "model_data/models.onnx"
 73 | 
 74 |     if mode != "predict_onnx":
 75 |         yolo = YOLO()
 76 |     else:
 77 |         yolo = YOLO_ONNX()
 78 | 
 79 |     if mode == "predict":
 80 |         '''
 81 |         1、如果想要进行检测完的图片的保存，利用r_image.save("img.jpg")即可保存，直接在predict.py里进行修改即可。 
 82 |         2、如果想要获得预测框的坐标，可以进入yolo.detect_image函数，在绘图部分读取top，left，bottom，right这四个值。
 83 |         3、如果想要利用预测框截取下目标，可以进入yolo.detect_image函数，在绘图部分利用获取到的top，left，bottom，right这四个值
 84 |         在原图上利用矩阵的方式进行截取。
 85 |         4、如果想要在预测图上写额外的字，比如检测到的特定目标的数量，可以进入yolo.detect_image函数，在绘图部分对predicted_class进行判断，
 86 |         比如判断if predicted_class == 'car': 即可判断当前目标是否为车，然后记录数量即可。利用draw.text即可写字。
 87 |         '''
 88 |         while True:
 89 |             img = input('Input image filename:')
 90 |             try:
 91 |                 image = Image.open(img)
 92 |             except:
 93 |                 print('Open Error! Try again!')
 94 |                 continue
 95 |             else:
 96 |                 r_image = yolo.detect_image(image, crop = crop, count=count)
 97 |                 r_image.show()
 98 | 
 99 |     elif mode == "video":
100 |         capture = cv2.VideoCapture(video_path)
101 |         if video_save_path!="":
102 |             fourcc  = cv2.VideoWriter_fourcc(*'XVID')
103 |             size    = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
104 |             out     = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)
105 | 
106 |         ref, frame = capture.read()
107 |         if not ref:
108 |             raise ValueError("未能正确读取摄像头（视频），请注意是否正确安装摄像头（是否正确填写视频路径）。")
109 | 
110 |         fps = 0.0
111 |         while(True):
112 |             t1 = time.time()
113 |             # 读取某一帧
114 |             ref, frame = capture.read()
115 |             if not ref:
116 |                 break
117 |             # 格式转变，BGRtoRGB
118 |             frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
119 |             # 转变成Image
120 |             frame = Image.fromarray(np.uint8(frame))
121 |             # 进行检测
122 |             frame = np.array(yolo.detect_image(frame))
123 |             # RGBtoBGR满足opencv显示格式
124 |             frame = cv2.cvtColor(frame,cv2.COLOR_RGB2BGR)
125 |             
126 |             fps  = ( fps + (1./(time.time()-t1)) ) / 2
127 |             print("fps= %.2f"%(fps))
128 |             frame = cv2.putText(frame, "fps= %.2f"%(fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
129 |             
130 |             cv2.imshow("video",frame)
131 |             c= cv2.waitKey(1) & 0xff 
132 |             if video_save_path!="":
133 |                 out.write(frame)
134 | 
135 |             if c==27:
136 |                 capture.release()
137 |                 break
138 | 
139 |         print("Video Detection Done!")
140 |         capture.release()
141 |         if video_save_path!="":
142 |             print("Save processed video to the path :" + video_save_path)
143 |             out.release()
144 |         cv2.destroyAllWindows()
145 |         
146 |     elif mode == "fps":
147 |         img = Image.open(fps_image_path)
148 |         tact_time = yolo.get_FPS(img, test_interval)
149 |         print(str(tact_time) + ' seconds, ' + str(1/tact_time) + 'FPS, @batch_size 1')
150 | 
151 |     elif mode == "dir_predict":
152 |         import os
153 | 
154 |         from tqdm import tqdm
155 | 
156 |         img_names = os.listdir(dir_origin_path)
157 |         for img_name in tqdm(img_names):
158 |             if img_name.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
159 |                 image_path  = os.path.join(dir_origin_path, img_name)
160 |                 image       = Image.open(image_path)
161 |                 r_image     = yolo.detect_image(image)
162 |                 if not os.path.exists(dir_save_path):
163 |                     os.makedirs(dir_save_path)
164 |                 r_image.save(os.path.join(dir_save_path, img_name.replace(".jpg", ".png")), quality=95, subsampling=0)
165 | 
166 |     elif mode == "heatmap":
167 |         while True:
168 |             img = input('Input image filename:')
169 |             try:
170 |                 image = Image.open(img)
171 |             except:
172 |                 print('Open Error! Try again!')
173 |                 continue
174 |             else:
175 |                 yolo.detect_heatmap(image, heatmap_save_path)
176 |                 
177 |     elif mode == "export_onnx":
178 |         yolo.convert_to_onnx(simplify, onnx_save_path)
179 | 
180 |     elif mode == "predict_onnx":
181 |         while True:
182 |             img = input('Input image filename:')
183 |             try:
184 |                 image = Image.open(img)
185 |             except:
186 |                 print('Open Error! Try again!')
187 |                 continue
188 |             else:
189 |                 r_image = yolo.detect_image(image)
190 |                 r_image.show()
191 |     else:
192 |         raise AssertionError("Please specify the correct mode: 'predict', 'video', 'fps', 'heatmap', 'export_onnx', 'dir_predict'.")
193 | 


--------------------------------------------------------------------------------
/utils/callbacks.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import os
  3 | 
  4 | import torch
  5 | import matplotlib
  6 | matplotlib.use('Agg')
  7 | import scipy.signal
  8 | from matplotlib import pyplot as plt
  9 | from torch.utils.tensorboard import SummaryWriter
 10 | 
 11 | import shutil
 12 | import numpy as np
 13 | 
 14 | from PIL import Image
 15 | from tqdm import tqdm
 16 | from .utils import cvtColor, preprocess_input, resize_image
 17 | from .utils_bbox import DecodeBox
 18 | from .utils_map import get_coco_map, get_map
 19 | 
 20 | 
 21 | class LossHistory():
 22 |     def __init__(self, log_dir, model, input_shape):
 23 |         self.log_dir    = log_dir
 24 |         self.losses     = []
 25 |         self.val_loss   = []
 26 |         
 27 |         os.makedirs(self.log_dir)
 28 |         self.writer     = SummaryWriter(self.log_dir)
 29 |         try:
 30 |             dummy_input     = torch.randn(2, 3, input_shape[0], input_shape[1])
 31 |             self.writer.add_graph(model, dummy_input)
 32 |         except:
 33 |             pass
 34 | 
 35 |     def append_loss(self, epoch, loss, val_loss):
 36 |         if not os.path.exists(self.log_dir):
 37 |             os.makedirs(self.log_dir)
 38 | 
 39 |         self.losses.append(loss)
 40 |         self.val_loss.append(val_loss)
 41 | 
 42 |         with open(os.path.join(self.log_dir, "epoch_loss.txt"), 'a') as f:
 43 |             f.write(str(loss))
 44 |             f.write("\n")
 45 |         with open(os.path.join(self.log_dir, "epoch_val_loss.txt"), 'a') as f:
 46 |             f.write(str(val_loss))
 47 |             f.write("\n")
 48 | 
 49 |         self.writer.add_scalar('loss', loss, epoch)
 50 |         self.writer.add_scalar('val_loss', val_loss, epoch)
 51 |         self.loss_plot()
 52 | 
 53 |     def loss_plot(self):
 54 |         iters = range(len(self.losses))
 55 | 
 56 |         plt.figure()
 57 |         plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss')
 58 |         plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss')
 59 |         try:
 60 |             if len(self.losses) < 25:
 61 |                 num = 5
 62 |             else:
 63 |                 num = 15
 64 |             
 65 |             plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss')
 66 |             plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss')
 67 |         except:
 68 |             pass
 69 | 
 70 |         plt.grid(True)
 71 |         plt.xlabel('Epoch')
 72 |         plt.ylabel('Loss')
 73 |         plt.legend(loc="upper right")
 74 | 
 75 |         plt.savefig(os.path.join(self.log_dir, "epoch_loss.png"))
 76 | 
 77 |         plt.cla()
 78 |         plt.close("all")
 79 | 
 80 | class EvalCallback():
 81 |     def __init__(self, net, input_shape, anchors, anchors_mask, class_names, num_classes, val_lines, log_dir, cuda, \
 82 |             map_out_path=".temp_map_out", max_boxes=100, confidence=0.05, nms_iou=0.5, letterbox_image=True, MINOVERLAP=0.5, eval_flag=True, period=1):
 83 |         super(EvalCallback, self).__init__()
 84 |         
 85 |         self.net                = net
 86 |         self.input_shape        = input_shape
 87 |         self.anchors            = anchors
 88 |         self.anchors_mask       = anchors_mask
 89 |         self.class_names        = class_names
 90 |         self.num_classes        = num_classes
 91 |         self.val_lines          = val_lines
 92 |         self.log_dir            = log_dir
 93 |         self.cuda               = cuda
 94 |         self.map_out_path       = map_out_path
 95 |         self.max_boxes          = max_boxes
 96 |         self.confidence         = confidence
 97 |         self.nms_iou            = nms_iou
 98 |         self.letterbox_image    = letterbox_image
 99 |         self.MINOVERLAP         = MINOVERLAP
100 |         self.eval_flag          = eval_flag
101 |         self.period             = period
102 |         
103 |         self.bbox_util          = DecodeBox(self.anchors, self.num_classes, (self.input_shape[0], self.input_shape[1]), self.anchors_mask)
104 |         
105 |         self.maps       = [0]
106 |         self.epoches    = [0]
107 |         if self.eval_flag:
108 |             with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
109 |                 f.write(str(0))
110 |                 f.write("\n")
111 | 
112 |     def get_map_txt(self, image_id, image, class_names, map_out_path):
113 |         f = open(os.path.join(map_out_path, "detection-results/"+image_id+".txt"), "w", encoding='utf-8') 
114 |         image_shape = np.array(np.shape(image)[0:2])
115 |         #---------------------------------------------------------#
116 |         #   在这里将图像转换成RGB图像，防止灰度图在预测时报错。
117 |         #   代码仅仅支持RGB图像的预测，所有其它类型的图像都会转化成RGB
118 |         #---------------------------------------------------------#
119 |         image       = cvtColor(image)
120 |         #---------------------------------------------------------#
121 |         #   给图像增加灰条，实现不失真的resize
122 |         #   也可以直接resize进行识别
123 |         #---------------------------------------------------------#
124 |         image_data  = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
125 |         #---------------------------------------------------------#
126 |         #   添加上batch_size维度
127 |         #---------------------------------------------------------#
128 |         image_data  = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
129 | 
130 |         with torch.no_grad():
131 |             images = torch.from_numpy(image_data)
132 |             if self.cuda:
133 |                 images = images.cuda()
134 |             #---------------------------------------------------------#
135 |             #   将图像输入网络当中进行预测！
136 |             #---------------------------------------------------------#
137 |             outputs = self.net(images)
138 |             outputs = self.bbox_util.decode_box(outputs)
139 |             #---------------------------------------------------------#
140 |             #   将预测框进行堆叠，然后进行非极大抑制
141 |             #---------------------------------------------------------#
142 |             results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape, 
143 |                         image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
144 |                                                     
145 |             if results[0] is None: 
146 |                 return 
147 | 
148 |             top_label   = np.array(results[0][:, 6], dtype = 'int32')
149 |             top_conf    = results[0][:, 4] * results[0][:, 5]
150 |             top_boxes   = results[0][:, :4]
151 | 
152 |         top_100     = np.argsort(top_conf)[::-1][:self.max_boxes]
153 |         top_boxes   = top_boxes[top_100]
154 |         top_conf    = top_conf[top_100]
155 |         top_label   = top_label[top_100]
156 | 
157 |         for i, c in list(enumerate(top_label)):
158 |             predicted_class = self.class_names[int(c)]
159 |             box             = top_boxes[i]
160 |             score           = str(top_conf[i])
161 | 
162 |             top, left, bottom, right = box
163 |             if predicted_class not in class_names:
164 |                 continue
165 | 
166 |             f.write("%s %s %s %s %s %s\n" % (predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)),str(int(bottom))))
167 | 
168 |         f.close()
169 |         return 
170 |     
171 |     def on_epoch_end(self, epoch, model_eval):
172 |         if epoch % self.period == 0 and self.eval_flag:
173 |             self.net = model_eval
174 |             if not os.path.exists(self.map_out_path):
175 |                 os.makedirs(self.map_out_path)
176 |             if not os.path.exists(os.path.join(self.map_out_path, "ground-truth")):
177 |                 os.makedirs(os.path.join(self.map_out_path, "ground-truth"))
178 |             if not os.path.exists(os.path.join(self.map_out_path, "detection-results")):
179 |                 os.makedirs(os.path.join(self.map_out_path, "detection-results"))
180 |             print("Get map.")
181 |             for annotation_line in tqdm(self.val_lines):
182 |                 line        = annotation_line.split()
183 |                 image_id    = os.path.basename(line[0]).split('.')[0]
184 |                 #------------------------------#
185 |                 #   读取图像并转换成RGB图像
186 |                 #------------------------------#
187 |                 image       = Image.open(line[0])
188 |                 #------------------------------#
189 |                 #   获得预测框
190 |                 #------------------------------#
191 |                 gt_boxes    = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
192 |                 #------------------------------#
193 |                 #   获得预测txt
194 |                 #------------------------------#
195 |                 self.get_map_txt(image_id, image, self.class_names, self.map_out_path)
196 |                 
197 |                 #------------------------------#
198 |                 #   获得真实框txt
199 |                 #------------------------------#
200 |                 with open(os.path.join(self.map_out_path, "ground-truth/"+image_id+".txt"), "w") as new_f:
201 |                     for box in gt_boxes:
202 |                         left, top, right, bottom, obj = box
203 |                         obj_name = self.class_names[obj]
204 |                         new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
205 |                         
206 |             print("Calculate Map.")
207 |             try:
208 |                 temp_map = get_coco_map(class_names = self.class_names, path = self.map_out_path)[1]
209 |             except:
210 |                 temp_map = get_map(self.MINOVERLAP, False, path = self.map_out_path)
211 |             self.maps.append(temp_map)
212 |             self.epoches.append(epoch)
213 | 
214 |             with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
215 |                 f.write(str(temp_map))
216 |                 f.write("\n")
217 |             
218 |             plt.figure()
219 |             plt.plot(self.epoches, self.maps, 'red', linewidth = 2, label='train map')
220 | 
221 |             plt.grid(True)
222 |             plt.xlabel('Epoch')
223 |             plt.ylabel('Map %s'%str(self.MINOVERLAP))
224 |             plt.title('A Map Curve')
225 |             plt.legend(loc="upper right")
226 | 
227 |             plt.savefig(os.path.join(self.log_dir, "epoch_map.png"))
228 |             plt.cla()
229 |             plt.close("all")
230 | 
231 |             print("Get map done.")
232 |             shutil.rmtree(self.map_out_path)
233 | 


--------------------------------------------------------------------------------
/utils/dataloader.py:
--------------------------------------------------------------------------------
  1 | from random import sample, shuffle
  2 | 
  3 | import cv2
  4 | import numpy as np
  5 | import torch
  6 | from PIL import Image
  7 | from torch.utils.data.dataset import Dataset
  8 | 
  9 | from utils.utils import cvtColor, preprocess_input
 10 | 
 11 | 
 12 | class YoloDataset(Dataset):
 13 |     def __init__(self, annotation_lines, input_shape, num_classes, anchors, anchors_mask, epoch_length, \
 14 |                         mosaic, mixup, mosaic_prob, mixup_prob, train, special_aug_ratio = 0.7):
 15 |         super(YoloDataset, self).__init__()
 16 |         self.annotation_lines   = annotation_lines
 17 |         self.input_shape        = input_shape
 18 |         self.num_classes        = num_classes
 19 |         self.anchors            = anchors
 20 |         self.anchors_mask       = anchors_mask
 21 |         self.epoch_length       = epoch_length
 22 |         self.mosaic             = mosaic
 23 |         self.mosaic_prob        = mosaic_prob
 24 |         self.mixup              = mixup
 25 |         self.mixup_prob         = mixup_prob
 26 |         self.train              = train
 27 |         self.special_aug_ratio  = special_aug_ratio
 28 | 
 29 |         self.epoch_now          = -1
 30 |         self.length             = len(self.annotation_lines)
 31 |         
 32 |         self.bbox_attrs         = 5 + num_classes
 33 | 
    def __len__(self):
        """Return the dataset size stored in ``self.length``."""
        return self.length
 36 | 
 37 |     def __getitem__(self, index):
 38 |         index       = index % self.length
 39 | 
 40 |         #---------------------------------------------------#
 41 |         #   训练时进行数据的随机增强
 42 |         #   验证时不进行数据的随机增强
 43 |         #---------------------------------------------------#
 44 |         if self.mosaic and self.rand() < self.mosaic_prob and self.epoch_now < self.epoch_length * self.special_aug_ratio:
 45 |             lines = sample(self.annotation_lines, 3)
 46 |             lines.append(self.annotation_lines[index])
 47 |             shuffle(lines)
 48 |             image, box  = self.get_random_data_with_Mosaic(lines, self.input_shape)
 49 |             
 50 |             if self.mixup and self.rand() < self.mixup_prob:
 51 |                 lines           = sample(self.annotation_lines, 1)
 52 |                 image_2, box_2  = self.get_random_data(lines[0], self.input_shape, random = self.train)
 53 |                 image, box      = self.get_random_data_with_MixUp(image, box, image_2, box_2)
 54 |         else:
 55 |             image, box      = self.get_random_data(self.annotation_lines[index], self.input_shape, random = self.train)
 56 | 
 57 |         image       = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
 58 |         box         = np.array(box, dtype=np.float32)
 59 |         
 60 |         #---------------------------------------------------#
 61 |         #   对真实框进行预处理
 62 |         #---------------------------------------------------#
 63 |         nL          = len(box)
 64 |         labels_out  = np.zeros((nL, 6))
 65 |         if nL:
 66 |             #---------------------------------------------------#
 67 |             #   对真实框进行归一化，调整到0-1之间
 68 |             #---------------------------------------------------#
 69 |             box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1]
 70 |             box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
 71 |             #---------------------------------------------------#
 72 |             #   序号为0、1的部分，为真实框的中心
 73 |             #   序号为2、3的部分，为真实框的宽高
 74 |             #   序号为4的部分，为真实框的种类
 75 |             #---------------------------------------------------#
 76 |             box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
 77 |             box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2
 78 |             
 79 |             #---------------------------------------------------#
 80 |             #   调整顺序，符合训练的格式
 81 |             #   labels_out中序号为0的部分在collect时处理
 82 |             #---------------------------------------------------#
 83 |             labels_out[:, 1] = box[:, -1]
 84 |             labels_out[:, 2:] = box[:, :4]
 85 |             
 86 |         return image, labels_out
 87 | 
 88 |     def rand(self, a=0, b=1):
 89 |         return np.random.rand()*(b-a) + a
 90 | 
 91 |     def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
 92 |         line    = annotation_line.split()
 93 |         #------------------------------#
 94 |         #   读取图像并转换成RGB图像
 95 |         #------------------------------#
 96 |         image   = Image.open(line[0])
 97 |         image   = cvtColor(image)
 98 |         #------------------------------#
 99 |         #   获得图像的高宽与目标高宽
100 |         #------------------------------#
101 |         iw, ih  = image.size
102 |         h, w    = input_shape
103 |         #------------------------------#
104 |         #   获得预测框
105 |         #------------------------------#
106 |         box     = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
107 | 
108 |         if not random:
109 |             scale = min(w/iw, h/ih)
110 |             nw = int(iw*scale)
111 |             nh = int(ih*scale)
112 |             dx = (w-nw)//2
113 |             dy = (h-nh)//2
114 | 
115 |             #---------------------------------#
116 |             #   将图像多余的部分加上灰条
117 |             #---------------------------------#
118 |             image       = image.resize((nw,nh), Image.BICUBIC)
119 |             new_image   = Image.new('RGB', (w,h), (128,128,128))
120 |             new_image.paste(image, (dx, dy))
121 |             image_data  = np.array(new_image, np.float32)
122 | 
123 |             #---------------------------------#
124 |             #   对真实框进行调整
125 |             #---------------------------------#
126 |             if len(box)>0:
127 |                 np.random.shuffle(box)
128 |                 box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
129 |                 box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
130 |                 box[:, 0:2][box[:, 0:2]<0] = 0
131 |                 box[:, 2][box[:, 2]>w] = w
132 |                 box[:, 3][box[:, 3]>h] = h
133 |                 box_w = box[:, 2] - box[:, 0]
134 |                 box_h = box[:, 3] - box[:, 1]
135 |                 box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
136 | 
137 |             return image_data, box
138 |                 
139 |         #------------------------------------------#
140 |         #   对图像进行缩放并且进行长和宽的扭曲
141 |         #------------------------------------------#
142 |         new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
143 |         scale = self.rand(.25, 2)
144 |         if new_ar < 1:
145 |             nh = int(scale*h)
146 |             nw = int(nh*new_ar)
147 |         else:
148 |             nw = int(scale*w)
149 |             nh = int(nw/new_ar)
150 |         image = image.resize((nw,nh), Image.BICUBIC)
151 | 
152 |         #------------------------------------------#
153 |         #   将图像多余的部分加上灰条
154 |         #------------------------------------------#
155 |         dx = int(self.rand(0, w-nw))
156 |         dy = int(self.rand(0, h-nh))
157 |         new_image = Image.new('RGB', (w,h), (128,128,128))
158 |         new_image.paste(image, (dx, dy))
159 |         image = new_image
160 | 
161 |         #------------------------------------------#
162 |         #   翻转图像
163 |         #------------------------------------------#
164 |         flip = self.rand()<.5
165 |         if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
166 | 
167 |         image_data      = np.array(image, np.uint8)
168 |         #---------------------------------#
169 |         #   对图像进行色域变换
170 |         #   计算色域变换的参数
171 |         #---------------------------------#
172 |         r               = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
173 |         #---------------------------------#
174 |         #   将图像转到HSV上
175 |         #---------------------------------#
176 |         hue, sat, val   = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
177 |         dtype           = image_data.dtype
178 |         #---------------------------------#
179 |         #   应用变换
180 |         #---------------------------------#
181 |         x       = np.arange(0, 256, dtype=r.dtype)
182 |         lut_hue = ((x * r[0]) % 180).astype(dtype)
183 |         lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
184 |         lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
185 | 
186 |         image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
187 |         image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)
188 | 
189 |         #---------------------------------#
190 |         #   对真实框进行调整
191 |         #---------------------------------#
192 |         if len(box)>0:
193 |             np.random.shuffle(box)
194 |             box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
195 |             box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
196 |             if flip: box[:, [0,2]] = w - box[:, [2,0]]
197 |             box[:, 0:2][box[:, 0:2]<0] = 0
198 |             box[:, 2][box[:, 2]>w] = w
199 |             box[:, 3][box[:, 3]>h] = h
200 |             box_w = box[:, 2] - box[:, 0]
201 |             box_h = box[:, 3] - box[:, 1]
202 |             box = box[np.logical_and(box_w>1, box_h>1)] 
203 |         
204 |         return image_data, box
205 |     
206 |     def merge_bboxes(self, bboxes, cutx, cuty):
207 |         merge_bbox = []
208 |         for i in range(len(bboxes)):
209 |             for box in bboxes[i]:
210 |                 tmp_box = []
211 |                 x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
212 | 
213 |                 if i == 0:
214 |                     if y1 > cuty or x1 > cutx:
215 |                         continue
216 |                     if y2 >= cuty and y1 <= cuty:
217 |                         y2 = cuty
218 |                     if x2 >= cutx and x1 <= cutx:
219 |                         x2 = cutx
220 | 
221 |                 if i == 1:
222 |                     if y2 < cuty or x1 > cutx:
223 |                         continue
224 |                     if y2 >= cuty and y1 <= cuty:
225 |                         y1 = cuty
226 |                     if x2 >= cutx and x1 <= cutx:
227 |                         x2 = cutx
228 | 
229 |                 if i == 2:
230 |                     if y2 < cuty or x2 < cutx:
231 |                         continue
232 |                     if y2 >= cuty and y1 <= cuty:
233 |                         y1 = cuty
234 |                     if x2 >= cutx and x1 <= cutx:
235 |                         x1 = cutx
236 | 
237 |                 if i == 3:
238 |                     if y1 > cuty or x2 < cutx:
239 |                         continue
240 |                     if y2 >= cuty and y1 <= cuty:
241 |                         y2 = cuty
242 |                     if x2 >= cutx and x1 <= cutx:
243 |                         x1 = cutx
244 |                 tmp_box.append(x1)
245 |                 tmp_box.append(y1)
246 |                 tmp_box.append(x2)
247 |                 tmp_box.append(y2)
248 |                 tmp_box.append(box[-1])
249 |                 merge_bbox.append(tmp_box)
250 |         return merge_bbox
251 | 
252 |     def get_random_data_with_Mosaic(self, annotation_line, input_shape, jitter=0.3, hue=.1, sat=0.7, val=0.4):
253 |         h, w = input_shape
254 |         min_offset_x = self.rand(0.3, 0.7)
255 |         min_offset_y = self.rand(0.3, 0.7)
256 | 
257 |         image_datas = [] 
258 |         box_datas   = []
259 |         index       = 0
260 |         for line in annotation_line:
261 |             #---------------------------------#
262 |             #   每一行进行分割
263 |             #---------------------------------#
264 |             line_content = line.split()
265 |             #---------------------------------#
266 |             #   打开图片
267 |             #---------------------------------#
268 |             image = Image.open(line_content[0])
269 |             image = cvtColor(image)
270 |             
271 |             #---------------------------------#
272 |             #   图片的大小
273 |             #---------------------------------#
274 |             iw, ih = image.size
275 |             #---------------------------------#
276 |             #   保存框的位置
277 |             #---------------------------------#
278 |             box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]])
279 |             
280 |             #---------------------------------#
281 |             #   是否翻转图片
282 |             #---------------------------------#
283 |             flip = self.rand()<.5
284 |             if flip and len(box)>0:
285 |                 image = image.transpose(Image.FLIP_LEFT_RIGHT)
286 |                 box[:, [0,2]] = iw - box[:, [2,0]]
287 | 
288 |             #------------------------------------------#
289 |             #   对图像进行缩放并且进行长和宽的扭曲
290 |             #------------------------------------------#
291 |             new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
292 |             scale = self.rand(.4, 1)
293 |             if new_ar < 1:
294 |                 nh = int(scale*h)
295 |                 nw = int(nh*new_ar)
296 |             else:
297 |                 nw = int(scale*w)
298 |                 nh = int(nw/new_ar)
299 |             image = image.resize((nw, nh), Image.BICUBIC)
300 | 
301 |             #-----------------------------------------------#
302 |             #   将图片进行放置，分别对应四张分割图片的位置
303 |             #-----------------------------------------------#
304 |             if index == 0:
305 |                 dx = int(w*min_offset_x) - nw
306 |                 dy = int(h*min_offset_y) - nh
307 |             elif index == 1:
308 |                 dx = int(w*min_offset_x) - nw
309 |                 dy = int(h*min_offset_y)
310 |             elif index == 2:
311 |                 dx = int(w*min_offset_x)
312 |                 dy = int(h*min_offset_y)
313 |             elif index == 3:
314 |                 dx = int(w*min_offset_x)
315 |                 dy = int(h*min_offset_y) - nh
316 |             
317 |             new_image = Image.new('RGB', (w,h), (128,128,128))
318 |             new_image.paste(image, (dx, dy))
319 |             image_data = np.array(new_image)
320 | 
321 |             index = index + 1
322 |             box_data = []
323 |             #---------------------------------#
324 |             #   对box进行重新处理
325 |             #---------------------------------#
326 |             if len(box)>0:
327 |                 np.random.shuffle(box)
328 |                 box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
329 |                 box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
330 |                 box[:, 0:2][box[:, 0:2]<0] = 0
331 |                 box[:, 2][box[:, 2]>w] = w
332 |                 box[:, 3][box[:, 3]>h] = h
333 |                 box_w = box[:, 2] - box[:, 0]
334 |                 box_h = box[:, 3] - box[:, 1]
335 |                 box = box[np.logical_and(box_w>1, box_h>1)]
336 |                 box_data = np.zeros((len(box),5))
337 |                 box_data[:len(box)] = box
338 |             
339 |             image_datas.append(image_data)
340 |             box_datas.append(box_data)
341 | 
342 |         #---------------------------------#
343 |         #   将图片分割，放在一起
344 |         #---------------------------------#
345 |         cutx = int(w * min_offset_x)
346 |         cuty = int(h * min_offset_y)
347 | 
348 |         new_image = np.zeros([h, w, 3])
349 |         new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :]
350 |         new_image[cuty:, :cutx, :] = image_datas[1][cuty:, :cutx, :]
351 |         new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :]
352 |         new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :]
353 | 
354 |         new_image       = np.array(new_image, np.uint8)
355 |         #---------------------------------#
356 |         #   对图像进行色域变换
357 |         #   计算色域变换的参数
358 |         #---------------------------------#
359 |         r               = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
360 |         #---------------------------------#
361 |         #   将图像转到HSV上
362 |         #---------------------------------#
363 |         hue, sat, val   = cv2.split(cv2.cvtColor(new_image, cv2.COLOR_RGB2HSV))
364 |         dtype           = new_image.dtype
365 |         #---------------------------------#
366 |         #   应用变换
367 |         #---------------------------------#
368 |         x       = np.arange(0, 256, dtype=r.dtype)
369 |         lut_hue = ((x * r[0]) % 180).astype(dtype)
370 |         lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
371 |         lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
372 | 
373 |         new_image = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
374 |         new_image = cv2.cvtColor(new_image, cv2.COLOR_HSV2RGB)
375 | 
376 |         #---------------------------------#
377 |         #   对框进行进一步的处理
378 |         #---------------------------------#
379 |         new_boxes = self.merge_bboxes(box_datas, cutx, cuty)
380 | 
381 |         return new_image, new_boxes
382 | 
383 |     def get_random_data_with_MixUp(self, image_1, box_1, image_2, box_2):
384 |         new_image = np.array(image_1, np.float32) * 0.5 + np.array(image_2, np.float32) * 0.5
385 |         if len(box_1) == 0:
386 |             new_boxes = box_2
387 |         elif len(box_2) == 0:
388 |             new_boxes = box_1
389 |         else:
390 |             new_boxes = np.concatenate([box_1, box_2], axis=0)
391 |         return new_image, new_boxes
392 |     
393 |     
# Used as the collate_fn of the DataLoader
def yolo_dataset_collate(batch):
    """Collate ``(image, boxes)`` samples into two float tensors.

    Column 0 of every sample's box array is overwritten in place with the
    sample's position inside the batch, so that rows can still be traced back
    to their image once all box arrays are concatenated along axis 0.

    Returns ``(images, bboxes)``: the stacked images and the concatenated
    boxes, both as ``torch.FloatTensor``.
    """
    stacked_images = []
    tagged_boxes   = []
    for sample_idx, sample in enumerate(batch):
        img, box = sample
        box[:, 0] = sample_idx
        stacked_images.append(img)
        tagged_boxes.append(box)

    image_tensor = torch.from_numpy(np.array(stacked_images)).type(torch.FloatTensor)
    box_tensor   = torch.from_numpy(np.concatenate(tagged_boxes, 0)).type(torch.FloatTensor)
    return image_tensor, box_tensor
406 | 


--------------------------------------------------------------------------------
/nets/yolo.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import torch.nn as nn
  4 | 
  5 | from nets.backbone import Backbone, Multi_Concat_Block, Conv, SiLU, Transition_Block, autopad
  6 | 
  7 | 
  8 | class SPPCSPC(nn.Module):
  9 |     # CSP https://github.com/WongKinYiu/CrossStagePartialNetworks
 10 |     def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, k=(5, 9, 13)):
 11 |         super(SPPCSPC, self).__init__()
 12 |         c_ = int(2 * c2 * e)  # hidden channels
 13 |         self.cv1 = Conv(c1, c_, 1, 1)
 14 |         self.cv2 = Conv(c1, c_, 1, 1)
 15 |         self.cv3 = Conv(c_, c_, 3, 1)
 16 |         self.cv4 = Conv(c_, c_, 1, 1)
 17 |         self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
 18 |         self.cv5 = Conv(4 * c_, c_, 1, 1)
 19 |         self.cv6 = Conv(c_, c_, 3, 1)
 20 |         # 输出通道数为c2
 21 |         self.cv7 = Conv(2 * c_, c2, 1, 1)
 22 | 
 23 |     def forward(self, x):
 24 |         x1 = self.cv4(self.cv3(self.cv1(x)))
 25 |         y1 = self.cv6(self.cv5(torch.cat([x1] + [m(x1) for m in self.m], 1)))
 26 |         y2 = self.cv2(x)
 27 |         return self.cv7(torch.cat((y1, y2), dim=1))
 28 | 
 29 | class RepConv(nn.Module):
 30 |     # Represented convolution
 31 |     # https://arxiv.org/abs/2101.03697
 32 |     def __init__(self, c1, c2, k=3, s=1, p=None, g=1, act=SiLU(), deploy=False):
 33 |         super(RepConv, self).__init__()
 34 |         self.deploy         = deploy
 35 |         self.groups         = g
 36 |         self.in_channels    = c1
 37 |         self.out_channels   = c2
 38 |         
 39 |         assert k == 3
 40 |         assert autopad(k, p) == 1
 41 | 
 42 |         padding_11  = autopad(k, p) - k // 2
 43 |         self.act    = nn.LeakyReLU(0.1, inplace=True) if act is True else (act if isinstance(act, nn.Module) else nn.Identity())
 44 | 
 45 |         if deploy:
 46 |             self.rbr_reparam    = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=True)
 47 |         else:
 48 |             self.rbr_identity   = (nn.BatchNorm2d(num_features=c1, eps=0.001, momentum=0.03) if c2 == c1 and s == 1 else None)
 49 |             self.rbr_dense      = nn.Sequential(
 50 |                 nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False),
 51 |                 nn.BatchNorm2d(num_features=c2, eps=0.001, momentum=0.03),
 52 |             )
 53 |             self.rbr_1x1        = nn.Sequential(
 54 |                 nn.Conv2d( c1, c2, 1, s, padding_11, groups=g, bias=False),
 55 |                 nn.BatchNorm2d(num_features=c2, eps=0.001, momentum=0.03),
 56 |             )
 57 | 
 58 |     def forward(self, inputs):
 59 |         if hasattr(self, "rbr_reparam"):
 60 |             return self.act(self.rbr_reparam(inputs))
 61 |         if self.rbr_identity is None:
 62 |             id_out = 0
 63 |         else:
 64 |             id_out = self.rbr_identity(inputs)
 65 |         return self.act(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)
 66 |     
 67 |     def get_equivalent_kernel_bias(self):
 68 |         kernel3x3, bias3x3  = self._fuse_bn_tensor(self.rbr_dense)
 69 |         kernel1x1, bias1x1  = self._fuse_bn_tensor(self.rbr_1x1)
 70 |         kernelid, biasid    = self._fuse_bn_tensor(self.rbr_identity)
 71 |         return (
 72 |             kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid,
 73 |             bias3x3 + bias1x1 + biasid,
 74 |         )
 75 | 
 76 |     def _pad_1x1_to_3x3_tensor(self, kernel1x1):
 77 |         if kernel1x1 is None:
 78 |             return 0
 79 |         else:
 80 |             return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
 81 | 
 82 |     def _fuse_bn_tensor(self, branch):
 83 |         if branch is None:
 84 |             return 0, 0
 85 |         if isinstance(branch, nn.Sequential):
 86 |             kernel      = branch[0].weight
 87 |             running_mean = branch[1].running_mean
 88 |             running_var = branch[1].running_var
 89 |             gamma       = branch[1].weight
 90 |             beta        = branch[1].bias
 91 |             eps         = branch[1].eps
 92 |         else:
 93 |             assert isinstance(branch, nn.BatchNorm2d)
 94 |             if not hasattr(self, "id_tensor"):
 95 |                 input_dim = self.in_channels // self.groups
 96 |                 kernel_value = np.zeros(
 97 |                     (self.in_channels, input_dim, 3, 3), dtype=np.float32
 98 |                 )
 99 |                 for i in range(self.in_channels):
100 |                     kernel_value[i, i % input_dim, 1, 1] = 1
101 |                 self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
102 |             kernel      = self.id_tensor
103 |             running_mean = branch.running_mean
104 |             running_var = branch.running_var
105 |             gamma       = branch.weight
106 |             beta        = branch.bias
107 |             eps         = branch.eps
108 |         std = (running_var + eps).sqrt()
109 |         t   = (gamma / std).reshape(-1, 1, 1, 1)
110 |         return kernel * t, beta - running_mean * gamma / std
111 | 
112 |     def repvgg_convert(self):
113 |         kernel, bias = self.get_equivalent_kernel_bias()
114 |         return (
115 |             kernel.detach().cpu().numpy(),
116 |             bias.detach().cpu().numpy(),
117 |         )
118 | 
119 |     def fuse_conv_bn(self, conv, bn):
120 |         std     = (bn.running_var + bn.eps).sqrt()
121 |         bias    = bn.bias - bn.running_mean * bn.weight / std
122 | 
123 |         t       = (bn.weight / std).reshape(-1, 1, 1, 1)
124 |         weights = conv.weight * t
125 | 
126 |         bn      = nn.Identity()
127 |         conv    = nn.Conv2d(in_channels = conv.in_channels,
128 |                               out_channels = conv.out_channels,
129 |                               kernel_size = conv.kernel_size,
130 |                               stride=conv.stride,
131 |                               padding = conv.padding,
132 |                               dilation = conv.dilation,
133 |                               groups = conv.groups,
134 |                               bias = True,
135 |                               padding_mode = conv.padding_mode)
136 | 
137 |         conv.weight = torch.nn.Parameter(weights)
138 |         conv.bias   = torch.nn.Parameter(bias)
139 |         return conv
140 | 
141 |     def fuse_repvgg_block(self):    
142 |         if self.deploy:
143 |             return
144 |         print(f"RepConv.fuse_repvgg_block")
145 |         self.rbr_dense  = self.fuse_conv_bn(self.rbr_dense[0], self.rbr_dense[1])
146 |         
147 |         self.rbr_1x1    = self.fuse_conv_bn(self.rbr_1x1[0], self.rbr_1x1[1])
148 |         rbr_1x1_bias    = self.rbr_1x1.bias
149 |         weight_1x1_expanded = torch.nn.functional.pad(self.rbr_1x1.weight, [1, 1, 1, 1])
150 |         
151 |         # Fuse self.rbr_identity
152 |         if (isinstance(self.rbr_identity, nn.BatchNorm2d) or isinstance(self.rbr_identity, nn.modules.batchnorm.SyncBatchNorm)):
153 |             identity_conv_1x1 = nn.Conv2d(
154 |                     in_channels=self.in_channels,
155 |                     out_channels=self.out_channels,
156 |                     kernel_size=1,
157 |                     stride=1,
158 |                     padding=0,
159 |                     groups=self.groups, 
160 |                     bias=False)
161 |             identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.to(self.rbr_1x1.weight.data.device)
162 |             identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.squeeze().squeeze()
163 |             identity_conv_1x1.weight.data.fill_(0.0)
164 |             identity_conv_1x1.weight.data.fill_diagonal_(1.0)
165 |             identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.unsqueeze(2).unsqueeze(3)
166 | 
167 |             identity_conv_1x1           = self.fuse_conv_bn(identity_conv_1x1, self.rbr_identity)
168 |             bias_identity_expanded      = identity_conv_1x1.bias
169 |             weight_identity_expanded    = torch.nn.functional.pad(identity_conv_1x1.weight, [1, 1, 1, 1])            
170 |         else:
171 |             bias_identity_expanded      = torch.nn.Parameter( torch.zeros_like(rbr_1x1_bias) )
172 |             weight_identity_expanded    = torch.nn.Parameter( torch.zeros_like(weight_1x1_expanded) )            
173 |         
174 |         self.rbr_dense.weight   = torch.nn.Parameter(self.rbr_dense.weight + weight_1x1_expanded + weight_identity_expanded)
175 |         self.rbr_dense.bias     = torch.nn.Parameter(self.rbr_dense.bias + rbr_1x1_bias + bias_identity_expanded)
176 |                 
177 |         self.rbr_reparam    = self.rbr_dense
178 |         self.deploy         = True
179 | 
180 |         if self.rbr_identity is not None:
181 |             del self.rbr_identity
182 |             self.rbr_identity = None
183 | 
184 |         if self.rbr_1x1 is not None:
185 |             del self.rbr_1x1
186 |             self.rbr_1x1 = None
187 | 
188 |         if self.rbr_dense is not None:
189 |             del self.rbr_dense
190 |             self.rbr_dense = None
191 |             
def fuse_conv_and_bn(conv, bn):
    """Fold a BatchNorm layer into the convolution that precedes it.

    Builds a new ``nn.Conv2d`` (with bias, gradients disabled, on the device
    of ``conv.weight``) whose output equals ``bn(conv(x))`` when ``bn`` uses
    its running statistics. The new layer mirrors the geometry of ``conv``,
    including ``dilation`` and ``padding_mode``, so dilated or non-zero-padded
    convolutions fuse to an equivalent operator.

    Args:
        conv: the convolution to fuse; its bias may be ``None``.
        bn: the batch-norm layer applied right after ``conv``.

    Returns:
        The fused ``nn.Conv2d``.
    """
    fusedconv = nn.Conv2d(conv.in_channels,
                          conv.out_channels,
                          kernel_size=conv.kernel_size,
                          stride=conv.stride,
                          padding=conv.padding,
                          dilation=conv.dilation,
                          groups=conv.groups,
                          bias=True,
                          padding_mode=conv.padding_mode).requires_grad_(False).to(conv.weight.device)

    # Per-output-channel BN scale: gamma / sqrt(running_var + eps).
    scale = bn.weight.div(torch.sqrt(bn.eps + bn.running_var))

    # Scaling every output filter is the same as multiplying by diag(scale),
    # without materialising the dense diagonal matrix.
    fusedconv.weight.copy_((conv.weight * scale.reshape(-1, 1, 1, 1)).detach())

    b_conv  = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
    b_bn    = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
    fusedconv.bias.copy_((scale * b_conv + b_bn).detach())
    return fusedconv
211 | 
212 | #---------------------------------------------------#
213 | #   yolo_body
214 | #---------------------------------------------------#
class YoloBody(nn.Module):
    """YOLOv7 detector: backbone, SPPCSPC + PANet neck and three detection heads.

    Args:
        anchors_mask: per-head anchor index lists; entry 0 belongs to the
            coarsest head (P5), entry 2 to the finest head (P3). Only the
            lengths are used here.
        num_classes: number of object classes.
        phi: model variant, ``'l'`` or ``'x'`` (any other value raises
            ``KeyError``).
        pretrained: forwarded to ``Backbone``.

    The shape comments below follow the original author's notes and assume
    ``phi='l'`` with a 640x640x3 input.
    """
    def __init__(self, anchors_mask, num_classes, phi, pretrained=False):
        super(YoloBody, self).__init__()
        #-----------------------------------------------#
        #   Per-variant hyper-parameters of YOLOv7
        #-----------------------------------------------#
        transition_channels = {'l' : 32, 'x' : 40}[phi]
        block_channels      = 32
        panet_channels      = {'l' : 32, 'x' : 64}[phi]
        e       = {'l' : 2, 'x' : 1}[phi]
        n       = {'l' : 4, 'x' : 6}[phi]
        ids     = {'l' : [-1, -2, -3, -4, -5, -6], 'x' : [-1, -3, -5, -7, -8]}[phi]
        conv    = {'l' : RepConv, 'x' : Conv}[phi]
        #-----------------------------------------------#
        #   The input image is 640, 640, 3
        #-----------------------------------------------#

        #---------------------------------------------------#
        #   Build the backbone.
        #   It yields three feature maps with shapes:
        #   80, 80, 512
        #   40, 40, 1024
        #   20, 20, 1024
        #---------------------------------------------------#
        self.backbone   = Backbone(transition_channels, block_channels, n, phi, pretrained=pretrained)

        #------------------------ feature-fusion neck ------------------------#
        self.upsample   = nn.Upsample(scale_factor=2, mode="nearest")

        # 20, 20, 1024 => 20, 20, 512
        self.sppcspc                = SPPCSPC(transition_channels * 32, transition_channels * 16)
        # 20, 20, 512 => 20, 20, 256 => 40, 40, 256
        self.conv_for_P5            = Conv(transition_channels * 16, transition_channels * 8)
        # 40, 40, 1024 => 40, 40, 256
        self.conv_for_feat2         = Conv(transition_channels * 32, transition_channels * 8)
        # 40, 40, 512 => 40, 40, 256
        self.conv3_for_upsample1    = Multi_Concat_Block(transition_channels * 16, panet_channels * 4, transition_channels * 8, e=e, n=n, ids=ids)

        # 40, 40, 256 => 40, 40, 128 => 80, 80, 128
        self.conv_for_P4            = Conv(transition_channels * 8, transition_channels * 4)
        # 80, 80, 512 => 80, 80, 128
        self.conv_for_feat1         = Conv(transition_channels * 16, transition_channels * 4)
        # 80, 80, 256 => 80, 80, 128
        self.conv3_for_upsample2    = Multi_Concat_Block(transition_channels * 8, panet_channels * 2, transition_channels * 4, e=e, n=n, ids=ids)

        # 80, 80, 128 => 40, 40, 256
        self.down_sample1           = Transition_Block(transition_channels * 4, transition_channels * 4)
        # 40, 40, 512 => 40, 40, 256
        self.conv3_for_downsample1  = Multi_Concat_Block(transition_channels * 16, panet_channels * 4, transition_channels * 8, e=e, n=n, ids=ids)

        # 40, 40, 256 => 20, 20, 512
        self.down_sample2           = Transition_Block(transition_channels * 8, transition_channels * 8)
        # 20, 20, 1024 => 20, 20, 512
        self.conv3_for_downsample2  = Multi_Concat_Block(transition_channels * 32, panet_channels * 8, transition_channels * 16, e=e, n=n, ids=ids)
        #------------------------ feature-fusion neck ------------------------#

        # 80, 80, 128 => 80, 80, 256
        self.rep_conv_1 = conv(transition_channels * 4, transition_channels * 8, 3, 1)
        # 40, 40, 256 => 40, 40, 512
        self.rep_conv_2 = conv(transition_channels * 8, transition_channels * 16, 3, 1)
        # 20, 20, 512 => 20, 20, 1024
        self.rep_conv_3 = conv(transition_channels * 16, transition_channels * 32, 3, 1)

        # Every anchor predicts 4 box values + 1 objectness + num_classes scores.
        # 80, 80, 256 => 80, 80, 3 * 25 (4 + 1 + 20) or 3 * 85 (4 + 1 + 80)
        self.yolo_head_P3 = nn.Conv2d(transition_channels * 8, len(anchors_mask[2]) * (5 + num_classes), 1)
        # 40, 40, 512 => 40, 40, 3 * 25 or 3 * 85
        self.yolo_head_P4 = nn.Conv2d(transition_channels * 16, len(anchors_mask[1]) * (5 + num_classes), 1)
        # 20, 20, 1024 => 20, 20, 3 * 25 or 3 * 85
        self.yolo_head_P5 = nn.Conv2d(transition_channels * 32, len(anchors_mask[0]) * (5 + num_classes), 1)

    def fuse(self):
        """Fuse RepConv branches and Conv+BN pairs in place for inference; returns ``self``."""
        print('Fusing layers... ')
        for m in self.modules():
            if isinstance(m, RepConv):
                m.fuse_repvgg_block()
            elif type(m) is Conv and hasattr(m, 'bn'):
                # Exact type match on purpose: subclasses of Conv are left alone.
                m.conv = fuse_conv_and_bn(m.conv, m.bn)
                delattr(m, 'bn')
                m.forward = m.fuseforward
        return self

    def forward(self, x):
        """Return the raw head outputs ``[out0, out1, out2]`` (P5, P4, P3 order)."""
        # Backbone. The module is called rather than its ``forward`` method so
        # that hooks registered on the backbone are executed.
        feat1, feat2, feat3 = self.backbone(x)

        #------------------------ feature-fusion neck ------------------------#
        # 20, 20, 1024 => 20, 20, 512
        P5          = self.sppcspc(feat3)
        # 20, 20, 512 => 20, 20, 256
        P5_conv     = self.conv_for_P5(P5)
        # 20, 20, 256 => 40, 40, 256
        P5_upsample = self.upsample(P5_conv)
        # 40, 40, 256 cat 40, 40, 256 => 40, 40, 512
        P4          = torch.cat([self.conv_for_feat2(feat2), P5_upsample], 1)
        # 40, 40, 512 => 40, 40, 256
        P4          = self.conv3_for_upsample1(P4)

        # 40, 40, 256 => 40, 40, 128
        P4_conv     = self.conv_for_P4(P4)
        # 40, 40, 128 => 80, 80, 128
        P4_upsample = self.upsample(P4_conv)
        # 80, 80, 128 cat 80, 80, 128 => 80, 80, 256
        P3          = torch.cat([self.conv_for_feat1(feat1), P4_upsample], 1)
        # 80, 80, 256 => 80, 80, 128
        P3          = self.conv3_for_upsample2(P3)

        # 80, 80, 128 => 40, 40, 256
        P3_downsample = self.down_sample1(P3)
        # 40, 40, 256 cat 40, 40, 256 => 40, 40, 512
        P4 = torch.cat([P3_downsample, P4], 1)
        # 40, 40, 512 => 40, 40, 256
        P4 = self.conv3_for_downsample1(P4)

        # 40, 40, 256 => 20, 20, 512
        P4_downsample = self.down_sample2(P4)
        # 20, 20, 512 cat 20, 20, 512 => 20, 20, 1024
        P5 = torch.cat([P4_downsample, P5], 1)
        # 20, 20, 1024 => 20, 20, 512
        P5 = self.conv3_for_downsample2(P5)
        #------------------------ feature-fusion neck ------------------------#
        # P3 80, 80, 128
        # P4 40, 40, 256
        # P5 20, 20, 512

        P3 = self.rep_conv_1(P3)
        P4 = self.rep_conv_2(P4)
        P5 = self.rep_conv_3(P5)
        #---------------------------------------------------#
        #   Third (finest) feature level
        #   y3=(batch_size, 75, 80, 80)
        #---------------------------------------------------#
        out2 = self.yolo_head_P3(P3)
        #---------------------------------------------------#
        #   Second feature level
        #   y2=(batch_size, 75, 40, 40)
        #---------------------------------------------------#
        out1 = self.yolo_head_P4(P4)
        #---------------------------------------------------#
        #   First (coarsest) feature level
        #   y1=(batch_size, 75, 20, 20)
        #---------------------------------------------------#
        out0 = self.yolo_head_P5(P5)

        return [out0, out1, out2]
360 | 


--------------------------------------------------------------------------------
/常见问题汇总.md:
--------------------------------------------------------------------------------
  1 | 问题汇总的博客地址为[https://blog.csdn.net/weixin_44791964/article/details/107517428](https://blog.csdn.net/weixin_44791964/article/details/107517428)。
  2 | 
  3 | # 问题汇总
  4 | ## 1、下载问题
  5 | ### a、代码下载
  6 | **问：up主，可以给我发一份代码吗，代码在哪里下载啊？ 
  7 | 答：Github上的地址就在视频简介里。复制一下就能进去下载了。**
  8 | 
  9 | **问：up主，为什么我下载的代码提示压缩包损坏？
 10 | 答：重新去Github下载。**
 11 | 
 12 | **问：up主，为什么我下载的代码和你在视频以及博客上的代码不一样？
 13 | 答：我常常会对代码进行更新，最终以实际的代码为准。**
 14 | 
 15 | ### b、 权值下载
 16 | **问：up主，为什么我下载的代码里面，model_data下面没有.pth或者.h5文件？ 
 17 | 答：我一般会把权值上传到Github和百度网盘，在GITHUB的README里面就能找到。**
 18 | 
 19 | ### c、 数据集下载
 20 | **问：up主，XXXX数据集在哪里下载啊？
 21 | 答：一般数据集的下载地址我会放在README里面，基本上都有，没有的话请及时联系我添加，直接发github的issue即可。**
 22 | 
 23 | ## 2、环境配置问题
 24 | ### a、20系列及以下显卡环境配置
 25 | **pytorch代码对应的pytorch版本为1.2，博客地址对应**[https://blog.csdn.net/weixin_44791964/article/details/106037141](https://blog.csdn.net/weixin_44791964/article/details/106037141)。
 26 | 
 27 | **keras代码对应的tensorflow版本为1.13.2，keras版本是2.1.5，博客地址对应**[https://blog.csdn.net/weixin_44791964/article/details/104702142](https://blog.csdn.net/weixin_44791964/article/details/104702142)。
 28 | 
 29 | **tf2代码对应的tensorflow版本为2.2.0，无需安装keras，博客地址对应**[https://blog.csdn.net/weixin_44791964/article/details/109161493](https://blog.csdn.net/weixin_44791964/article/details/109161493)。
 30 | 
 31 | **问：你的代码某某某版本的tensorflow和pytorch能用嘛？
 32 | 答：最好按照我推荐的配置，配置教程也有！其它版本的我没有试过！可能出现问题但是一般问题不大。仅需要改少量代码即可。**
 33 | 
 34 | ### b、30系列显卡环境配置
 35 | 30系显卡由于框架更新不可使用上述环境配置教程。
 36 | 当前我已经测试的可以用的30显卡配置如下：
 37 | **pytorch代码对应的pytorch版本为1.7.0，cuda为11.0，cudnn为8.0.5，博客地址对应**[https://blog.csdn.net/weixin_44791964/article/details/120668551](https://blog.csdn.net/weixin_44791964/article/details/120668551)。
 38 | 
 39 | **keras代码无法在win10下配置cuda11，在ubuntu下可以百度查询一下，配置tensorflow版本为1.15.4，keras版本是2.1.5或者2.3.1（少量函数接口不同，代码可能还需要少量调整。）**
 40 | 
 41 | **tf2代码对应的tensorflow版本为2.4.0，cuda为11.0，cudnn为8.0.5，博客地址对应为**[https://blog.csdn.net/weixin_44791964/article/details/120657664](https://blog.csdn.net/weixin_44791964/article/details/120657664)。
 42 | 
 43 | ### c、CPU环境配置
 44 | **pytorch代码对应的pytorch-cpu版本为1.2，博客地址对应**[https://blog.csdn.net/weixin_44791964/article/details/120655098](https://blog.csdn.net/weixin_44791964/article/details/120655098)
 45 | 
 46 | **keras代码对应的tensorflow-cpu版本为1.13.2，keras版本是2.1.5，博客地址对应**[https://blog.csdn.net/weixin_44791964/article/details/120653717](https://blog.csdn.net/weixin_44791964/article/details/120653717)。
 47 | 
 48 | **tf2代码对应的tensorflow-cpu版本为2.2.0，无需安装keras，博客地址对应**[https://blog.csdn.net/weixin_44791964/article/details/120656291](https://blog.csdn.net/weixin_44791964/article/details/120656291)。
 49 | 
 50 | 
 51 | ### d、GPU利用问题与环境使用问题
 52 | **问：为什么我安装了tensorflow-gpu但是却没有利用GPU进行训练呢？
 53 | 答：确认tensorflow-gpu已经装好，利用pip list查看tensorflow版本，然后查看任务管理器或者利用nvidia命令看看是否使用了gpu进行训练，任务管理器的话要看显存使用情况。**
 54 | 
 55 | **问：up主，我好像没有在用gpu进行训练啊，怎么看是不是用了GPU进行训练？
 56 | 答：查看是否使用GPU进行训练一般使用NVIDIA在命令行的查看命令。在windows电脑中打开cmd然后利用nvidia-smi指令查看GPU利用情况**
 57 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/f88ef794c9a341918f000eb2b1c67af6.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBAQnViYmxpaWlpbmc=,size_20,color_FFFFFF,t_70,g_se,x_16)
 58 | **如果一定要看任务管理器的话，请看性能部分GPU的显存是否利用，或者查看任务管理器的Cuda，而非Copy。**
 59 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201013234241524.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDc5MTk2NA==,size_16,color_FFFFFF,t_70#pic_center)
 60 | 
 61 | ### e、DLL load failed: 找不到指定的模块
 62 | **问：出现如下错误**
 63 | ```python
 64 | Traceback (most recent call last):
 65 |   File "C:\Users\focus\Anaconda3\ana\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 58, in <module>
 66 |  from tensorflow.python.pywrap_tensorflow_internal import *
 67 | File "C:\Users\focus\Anaconda3\ana\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\pywrap_tensorflow_internal.py", line 28, in <module>
 68 | pywrap_tensorflow_internal = swig_import_helper()
 69 |   File "C:\Users\focus\Anaconda3\ana\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\pywrap_tensorflow_internal.py", line 24, in swig_import_helper
 70 |     _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)
 71 | File "C:\Users\focus\Anaconda3\ana\envs\tensorflow-gpu\lib\imp.py", line 243, in load_modulereturn load_dynamic(name, filename, file)
 72 | File "C:\Users\focus\Anaconda3\ana\envs\tensorflow-gpu\lib\imp.py", line 343, in load_dynamic
 73 |     return _load(spec)
 74 | ImportError: DLL load failed: 找不到指定的模块。
 75 | ```
 76 | **答：如果没重启过就重启一下，否则重新按照步骤安装，还无法解决则把你的GPU、CUDA、CUDNN、TF版本以及PYTORCH版本私聊告诉我。**
 77 | 
 78 | ### f、no module问题（no module name utils.utils、no module named 'matplotlib' ）
 79 | **问：为什么提示说no module name utils.utils（no module name nets.yolo、no module name nets.ssd等一系列问题）啊？
 80 | 答：utils并不需要用pip装，它就在我上传的仓库的根目录，出现这个问题的原因是根目录不对，查查相对目录和根目录的概念。查了基本上就明白了。**
 81 | 
 82 | **问：为什么提示说no module name matplotlib（no module name PIL，no module name cv2等等）？
 83 | 答：这个库没安装打开命令行安装就好。pip install matplotlib**
 84 | 
 85 | **问：为什么我已经用pip装了opencv（pillow、matplotlib等），还是提示no module name cv2？
 86 | 答：没有激活环境装，要激活对应的conda环境进行安装才可以正常使用**
 87 | 
 88 | **问：为什么提示说No module named 'torch' ？
 89 | 答：其实我也真的很想知道为什么会有这个问题……这个pytorch没装是什么情况？一般就俩情况，一个是真的没装，还有一个是装到其它环境了，当前激活的环境不是自己装的环境。**
 90 | 
 91 | **问：为什么提示说No module named 'tensorflow' ？
 92 | 答：同上。**
 93 | 
 94 | ### g、cuda安装失败问题
 95 | 一般cuda安装前需要安装Visual Studio，装个2017版本即可。
 96 | 
 97 | ### h、Ubuntu系统问题
 98 | **所有代码在Ubuntu下可以使用，我两个系统都试过。**
 99 | 
100 | ### i、VSCODE提示错误的问题
101 | **问：为什么在VSCODE里面提示一大堆的错误啊？
102 | 答：我也提示一大堆的错误，但是不影响，是VSCODE的问题，如果不想看错误的话就装Pycharm。
103 | 最好将设置里面的Python:Language Server，调整为Pylance。**
104 | 
105 | ### j、使用cpu进行训练与预测的问题
106 | **对于keras和tf2的代码而言，如果想用cpu进行训练和预测，直接装cpu版本的tensorflow就可以了。**
107 | 
108 | **对于pytorch的代码而言，如果想用cpu进行训练和预测，需要将cuda=True修改成cuda=False。**
109 | 
110 | ### k、tqdm没有pos参数问题
111 | **问：运行代码提示'tqdm' object has no attribute 'pos'。
112 | 答：重装tqdm，换个版本就可以了。**
113 | 
114 | ### l、提示decode(“utf-8”)的问题
115 | **由于h5py库的更新，安装过程中会自动安装h5py=3.0.0以上的版本，会导致decode("utf-8")的错误！
116 | 各位一定要在安装完tensorflow后利用命令装h5py=2.10.0！**
117 | ```
118 | pip install h5py==2.10.0
119 | ```
120 | 
121 | ### m、提示TypeError: __array__() takes 1 positional argument but 2 were given错误
122 | 可以修改pillow版本解决。
123 | ```
124 | pip install pillow==8.2.0
125 | ```
126 | ### n、如何查看当前cuda和cudnn
127 | **Windows下cuda版本查看方式如下：
128 | 1、打开cmd窗口。
129 | 2、输入nvcc -V。
130 | 3、Cuda compilation tools, release XXXXXXXX中的XXXXXXXX即cuda版本。**
131 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/0389ea35107a408a80ab5cb6590d5a74.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBAQnViYmxpaWlpbmc=,size_20,color_FFFFFF,t_70,g_se,x_16)
132 | Windows下cudnn版本查看方式如下：
133 | 1、进入cuda安装目录，进入include文件夹。
134 | 2、找到cudnn.h文件。
135 | 3、右键文本打开，下拉，看到#define处可获得cudnn版本。
136 | ```python
137 | #define CUDNN_MAJOR 7
138 | #define CUDNN_MINOR 4
139 | #define CUDNN_PATCHLEVEL 1
140 | ```
141 | 代表cudnn为7.4.1。
142 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/7a86b68b17c84feaa6fa95780d4ae4b4.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBAQnViYmxpaWlpbmc=,size_20,color_FFFFFF,t_70,g_se,x_16)
143 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/81bb7c3e13cc492292530e4b69df86a9.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBAQnViYmxpaWlpbmc=,size_20,color_FFFFFF,t_70,g_se,x_16)
144 | 
145 | ### o、为什么按照你的环境配置后还是不能使用
146 | **问：up主，为什么我按照你的环境配置后还是不能使用？
147 | 答：请把你的GPU、CUDA、CUDNN、TF版本以及PYTORCH版本B站私聊告诉我。**
148 | 
149 | ### p、其它问题
150 | **问：为什么提示TypeError: cat() got an unexpected keyword argument 'axis'，Traceback (most recent call last)，AttributeError: 'Tensor' object has no attribute 'bool'？
151 | 答：这是版本问题，建议使用torch1.2以上版本**
152 | 
153 | **其它有很多稀奇古怪的问题，很多是版本问题，建议按照我的视频教程安装Keras和tensorflow。比如装的是tensorflow2，就不用问我说为什么我没法运行Keras-yolo啥的。那是必然不行的。**
154 | 
155 | ## 3、目标检测库问题汇总（人脸检测和分类库也可参考）
156 | ### a、shape不匹配问题。
157 | #### 1）、训练时shape不匹配问题。
158 | **问：up主，为什么运行train.py会提示shape不匹配啊？
159 | 答：在keras环境中，因为你训练的种类和原始的种类不同，网络结构会变化，所以最尾部的shape会有少量不匹配。**
160 | 
161 | #### 2）、预测时shape不匹配问题。
162 | **问：为什么我运行predict.py会提示我说shape不匹配呀。**
163 | ##### i、copying a param with shape torch.Size([75, 704, 1, 1]) from checkpoint
164 | 在Pytorch里面是这样的：
165 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200722171631901.png)
166 | ##### ii、Shapes are [1,1,1024,75] and [255,1024,1,1]. for 'Assign_360' (op: 'Assign') with input shapes: [1,1,1024,75], [255,1024,1,1].
167 | 在Keras里面是这样的：
168 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200722171523380.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDc5MTk2NA==,size_16,color_FFFFFF,t_70)
169 | **答：原因主要有仨：
170 | 1、训练的classes_path没改，就开始训练了。
171 | 2、预测的model_path没改。
172 | 3、预测的classes_path没改。
173 | 请检查清楚了！确定自己所用的model_path和classes_path是对应的！训练的时候用到的num_classes或者classes_path也需要检查！**
174 | 
175 | ### b、显存不足问题（OOM、RuntimeError: CUDA out of memory）。
176 | **问：为什么我运行train.py下面的命令行闪的贼快，还提示OOM啥的？ 
177 | 答：这是在keras中出现的，爆显存了，可以改小batch_size，SSD的显存占用率是最小的，建议用SSD；
178 | 2G显存：SSD、YOLOV4-TINY
179 | 4G显存：YOLOV3
180 | 6G显存：YOLOV4、Retinanet、M2det、Efficientdet、Faster RCNN等
181 | 8G+显存：随便选吧。**
182 | **需要注意的是，受到BatchNorm2d影响，batch_size不可为1，至少为2。**
183 | 
184 | **问：为什么提示 RuntimeError: CUDA out of memory. Tried to allocate 52.00 MiB (GPU 0; 15.90 GiB total capacity; 14.85 GiB already allocated; 51.88 MiB free; 15.07 GiB reserved in total by PyTorch)？ 
185 | 答：这是pytorch中出现的，爆显存了，同上。**
186 | 
187 | **问：为什么我显存都没利用，就直接爆显存了？ 
188 | 答：都爆显存了，自然就不利用了，模型没有开始训练。**
189 | ### c、为什么要进行冻结训练与解冻训练，不进行行吗？
190 | **问：为什么要冻结训练和解冻训练呀？
191 | 答：可以不进行，本质上是为了保证性能不足的同学的训练，如果电脑性能完全不够，可以将Freeze_Epoch和UnFreeze_Epoch设置成一样，只进行冻结训练。**
192 | 
193 | **同时这也是迁移学习的思想，因为神经网络主干特征提取部分所提取到的特征是通用的，我们冻结起来训练可以加快训练效率，也可以防止权值被破坏。**
194 | 在冻结阶段，模型的主干被冻结了，特征提取网络不发生改变。占用的显存较小，仅对网络进行微调。
195 | 在解冻阶段，模型的主干不被冻结了，特征提取网络会发生改变。占用的显存较大，网络所有的参数都会发生改变。
196 | 
197 | ### d、我的LOSS好大啊，有问题吗？（我的LOSS好小啊，有问题吗？）
198 | **问：为什么我的网络不收敛啊，LOSS是XXXX。
199 | 答：不同网络的LOSS不同，LOSS只是一个参考指标，用于查看网络是否收敛，而非评价网络好坏，我的yolo代码都没有归一化，所以LOSS值看起来比较高，LOSS的值不重要，重要的是是否在变小，预测是否有效果。**
200 | 
201 | ### e、为什么我训练出来的模型没有预测结果？
202 | **问：为什么我的训练效果不好？预测了没有框（框不准）。
203 | 答：**
204 | 考虑几个问题：
205 | 1、目标信息问题，查看2007_train.txt文件是否有目标信息，没有的话请修改voc_annotation.py。
206 | 2、数据集问题，小于500的自行考虑增加数据集，同时测试不同的模型，确认数据集是好的。
207 | 3、是否解冻训练，如果数据集分布与常规画面差距过大需要进一步解冻训练，调整主干，加强特征提取能力。
208 | 4、网络问题，比如SSD不适合小目标，因为先验框固定了。
209 | 5、训练时长问题，有些同学只训练了几代表示没有效果，按默认参数训练完。
210 | 6、确认自己是否按照步骤去做了，比如voc_annotation.py里面的classes是否修改了等。
211 | 7、不同网络的LOSS不同，LOSS只是一个参考指标，用于查看网络是否收敛，而非评价网络好坏，LOSS的值不重要，重要的是是否收敛。
212 | 8、是否修改了网络的主干，如果修改了没有预训练权重，网络不容易收敛，自然效果不好。
213 | 
214 | ### f、为什么我计算出来的map是0？
215 | **问：为什么我的训练效果不好？没有map？
216 | 答：**
217 | 首先尝试利用predict.py预测一下，如果有效果的话应该是get_map.py里面的classes_path设置错误。如果没有预测结果的话，解决方法同e问题，对下面几点进行检查：
218 | 1、目标信息问题，查看2007_train.txt文件是否有目标信息，没有的话请修改voc_annotation.py。
219 | 2、数据集问题，小于500的自行考虑增加数据集，同时测试不同的模型，确认数据集是好的。
220 | 3、是否解冻训练，如果数据集分布与常规画面差距过大需要进一步解冻训练，调整主干，加强特征提取能力。
221 | 4、网络问题，比如SSD不适合小目标，因为先验框固定了。
222 | 5、训练时长问题，有些同学只训练了几代表示没有效果，按默认参数训练完。
223 | 6、确认自己是否按照步骤去做了，比如voc_annotation.py里面的classes是否修改了等。
224 | 7、不同网络的LOSS不同，LOSS只是一个参考指标，用于查看网络是否收敛，而非评价网络好坏，LOSS的值不重要，重要的是是否收敛。
225 | 8、是否修改了网络的主干，如果修改了没有预训练权重，网络不容易收敛，自然效果不好。
226 | 
227 | ### g、gbk编码错误（'gbk' codec can't decode byte）。
228 | **问：我怎么出现了gbk什么的编码错误啊：**
229 | ```python
230 | UnicodeDecodeError: 'gbk' codec can't decode byte 0xa6 in position 446: illegal multibyte sequence
231 | ```
232 | **答：标签和路径不要使用中文，如果一定要使用中文，请注意处理时的编码问题，将打开文件的encoding方式改为utf-8。**
233 | 
234 | ### h、我的图片是xxx*xxx的分辨率的，可以用吗？
235 | **问：我的图片是xxx*xxx的分辨率的，可以用吗！**
236 | **答：可以用，代码里面会自动进行resize与数据增强。**
237 | 
238 | ### i、我想进行数据增强！怎么增强？
239 | **问：我想要进行数据增强！怎么做呢？**
240 | **答：无需额外操作，代码里面会自动进行resize与数据增强。**
241 | 
242 | ### j、多GPU训练。
243 | **问：怎么进行多GPU训练？
244 | 答：pytorch的大多数代码可以直接使用多gpu训练，keras的话直接百度就好了，实现并不复杂，我没有多卡没法详细测试，还需要各位同学自己努力了。**
245 | 
246 | ### k、能不能训练灰度图？
247 | **问：能不能训练灰度图（预测灰度图）啊？
248 | 答：我的大多数库会将灰度图转化成RGB进行训练和预测，如果遇到代码不能训练或者预测灰度图的情况，可以尝试一下在get_random_data里面将Image.open后的结果转换成RGB，预测的时候也这样试试。（仅供参考）**
249 | 
250 | ### l、断点续练问题。
251 | **问：我已经训练过几个世代了，能不能从这个基础上继续开始训练
252 | 答：可以，你在训练前，和载入预训练权重一样载入训练过的权重就行了。一般训练好的权重会保存在logs文件夹里面，将model_path修改成你要开始的权值的路径即可。**
253 | 
254 | ### m、我要训练其它的数据集，预训练权重能不能用？
255 | **问：如果我要训练其它的数据集，预训练权重要怎么办啊？**
256 | **答：数据的预训练权重对不同数据集是通用的，因为特征是通用的，预训练权重对于99%的情况都必须要用，不用的话权值太过随机，特征提取效果不明显，网络训练的结果也不会好。**
257 | 
258 | ### n、网络如何从0开始训练？
259 | **问：我要怎么不使用预训练权重啊？
260 | 答：看一看注释、大多数代码是model_path = ''，Freeze_Train = False**，如果设置model_path无用，**那么把载入预训练权重的代码注释了就行。**
261 | 
262 | ### o、为什么从0开始训练效果这么差（修改了网络主干，效果不好怎么办）？
263 | **问：为什么我不使用预训练权重效果这么差啊？
264 | 答：因为随机初始化的权值不好，提取的特征不好，也就导致了模型训练的效果不好，voc07+12、coco+voc07+12效果都不一样，预训练权重还是非常重要的。**
265 | 
266 | **问：up，我修改了网络，预训练权重还能用吗？
267 | 答：修改了主干的话，如果不是用的现有的网络，基本上预训练权重是不能用的，要么就自己判断权值里卷积核的shape然后自己匹配，要么只能自己预训练去了；修改了后半部分的话，前半部分的主干部分的预训练权重还是可以用的，如果是pytorch代码的话，需要自己修改一下载入权值的方式，判断shape后载入，如果是keras代码，直接by_name=True,skip_mismatch=True即可。**
268 | 权值匹配的方式可以参考如下：
269 | ```python
270 | # 加快模型训练的效率
271 | print('Loading weights into state dict...')
272 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
273 | model_dict = model.state_dict()
274 | pretrained_dict = torch.load(model_path, map_location=device)
275 | a = {}
276 | for k, v in pretrained_dict.items():
277 |     try:    
278 |         if np.shape(model_dict[k]) ==  np.shape(v):
279 |             a[k]=v
280 |     except:
281 |         pass
282 | model_dict.update(a)
283 | model.load_state_dict(model_dict)
284 | print('Finished!')
285 | ```
286 | 
287 | **问：为什么从0开始训练效果这么差（我修改了网络主干，效果不好怎么办）？
288 | 答：一般来讲，网络从0开始的训练效果会很差，因为权值太过随机，特征提取效果不明显，因此非常、非常、非常不建议大家从0开始训练！如果一定要从0开始，可以了解imagenet数据集，首先训练分类模型，获得网络的主干部分权值，分类模型的 主干部分 和该模型通用，基于此进行训练。
289 | 网络修改了主干之后也是同样的问题，随机的权值效果很差。**
290 | 
291 | **问：怎么在模型上从0开始训练？
292 | 答：在算力不足与调参能力不足的情况下从0开始训练毫无意义。模型特征提取能力在随机初始化参数的情况下非常差。没有好的参数调节能力和算力，无法使得网络正常收敛。**
293 | 如果一定要从0开始，那么训练的时候请注意几点：
294 |  - 不载入预训练权重。 
295 |  - 不要进行冻结训练，注释冻结模型的代码。
296 | 
297 | **问：为什么我不使用预训练权重效果这么差啊？
298 | 答：因为随机初始化的权值不好，提取的特征不好，也就导致了模型训练的效果不好，voc07+12、coco+voc07+12效果都不一样，预训练权重还是非常重要的。**
299 | 
300 | ### p、你的权值都是哪里来的？
301 | **问：如果网络不能从0开始训练的话你的权值哪里来的？
302 | 答：有些权值是官方转换过来的，有些权值是自己训练出来的，我用到的主干的imagenet的权值都是官方的。**
303 | 
304 | ### q、视频检测与摄像头检测
305 | **问：怎么用摄像头检测呀？
306 | 答：predict.py修改参数可以进行摄像头检测，也有视频详细解释了摄像头检测的思路。**
307 | 
308 | **问：怎么用视频检测呀？
309 | 答：同上**
310 | 
311 | ### r、如何保存检测出的图片
312 | **问：检测完的图片怎么保存？
313 | 答：一般目标检测用的是Image，所以查询一下PIL库的Image如何进行保存。详细看看predict.py文件的注释。**
314 | 
315 | **问：怎么用视频保存呀？
316 | 答：详细看看predict.py文件的注释。**
317 | 
318 | ### s、遍历问题
319 | **问：如何对一个文件夹的图片进行遍历？
320 | 答：一般使用os.listdir先找出文件夹里面的所有图片，然后根据predict.py文件里面的执行思路检测图片就行了，详细看看predict.py文件的注释。**
321 | 
322 | **问：如何对一个文件夹的图片进行遍历？并且保存。
323 | 答：遍历的话一般使用os.listdir先找出文件夹里面的所有图片，然后根据predict.py文件里面的执行思路检测图片就行了。保存的话一般目标检测用的是Image，所以查询一下PIL库的Image如何进行保存。如果有些库用的是cv2，那就是查一下cv2怎么保存图片。详细看看predict.py文件的注释。**
324 | 
325 | ### t、路径问题（No such file or directory、StopIteration: [Errno 13] Permission denied: 'XXXXXX'）
326 | **问：我怎么出现了这样的错误呀：**
327 | ```python
328 | FileNotFoundError: [Errno 2] No such file or directory
329 | StopIteration: [Errno 13] Permission denied: 'D:\\Study\\Collection\\Dataset\\VOC07+12+test\\VOCdevkit/VOC2007'
330 | ……………………………………
331 | ……………………………………
332 | ```
333 | **答：去检查一下文件夹路径，查看是否有对应文件；并且检查一下2007_train.txt，其中文件路径是否有错。**
334 | 关于路径有几个重要的点：
335 | **文件夹名称中一定不要有空格。
336 | 注意相对路径和绝对路径。
337 | 多百度路径相关的知识。**
338 | 
339 | **所有的路径问题基本上都是根目录问题，好好查一下相对目录的概念！**
340 | ### u、和原版比较问题，你怎么和原版不一样啊？
341 | **问：原版的代码是XXX，为什么你的代码是XXX？
342 | 答：是啊……这要不怎么说我不是原版呢……**
343 | 
344 | **问：你这个代码和原版比怎么样，可以达到原版的效果么？
345 | 答：基本上可以达到，我都用voc数据测过，我没有好显卡，没有能力在coco上测试与训练。**
346 | 
347 | **问：你有没有实现yolov4所有的tricks，和原版差距多少？
348 | 答：并没有实现全部的改进部分，由于YOLOV4使用的改进实在太多了，很难完全实现与列出来，这里只列出来了一些我比较感兴趣，而且非常有效的改进。论文中提到的SAM（注意力机制模块），作者自己的源码也没有使用。还有其它很多的tricks，不是所有的tricks都有提升，我也没法实现全部的tricks。至于和原版的比较，我没有能力训练coco数据集，根据使用过的同学反映差距不大。**
349 | 
350 | ### v、我的检测速度是xxx正常吗？我的检测速度还能增快吗？
351 | **问：你这个FPS可以到达多少，可以到 XX FPS么？
352 | 答：FPS和机子的配置有关，配置高就快，配置低就慢。**
353 | 
354 | **问：我的检测速度是xxx正常吗？我的检测速度还能增快吗？
355 | 答：看配置，配置好速度就快，如果想要配置不变的情况下加快速度，就要修改网络了。**
356 | 
357 | **问：为什么我用服务器去测试yolov4（or others）的FPS只有十几？
358 | 答：检查是否正确安装了tensorflow-gpu或者pytorch的gpu版本，如果已经正确安装，可以去利用time.time()的方法查看detect_image里面，哪一段代码耗时更长（不仅只有网络耗时长，其它处理部分也会耗时，如绘图等）。**
359 | 
360 | **问：为什么论文中说速度可以达到XX，但是这里却没有？
361 | 答：检查是否正确安装了tensorflow-gpu或者pytorch的gpu版本，如果已经正确安装，可以去利用time.time()的方法查看detect_image里面，哪一段代码耗时更长（不仅只有网络耗时长，其它处理部分也会耗时，如绘图等）。有些论文还会使用多batch进行预测，我并没有去实现这个部分。**
362 | 
363 | ### w、预测图片不显示问题
364 | **问：为什么你的代码在预测完成后不显示图片？只是在命令行告诉我有什么目标。
365 | 答：给系统安装一个图片查看器就行了。**
366 | 
367 | ### x、算法评价问题（目标检测的map、PR曲线、Recall、Precision等）
368 | **问：怎么计算map？
369 | 答：看map视频，都一个流程。**
370 | 
371 | **问：计算map的时候，get_map.py里面有一个MINOVERLAP是什么用的，是iou吗？
372 | 答：是iou，它的作用是判断预测框和真实框的重合程度，如果重合程度大于MINOVERLAP，则预测正确。**
373 | 
374 | **问：为什么get_map.py里面的self.confidence（self.score）要设置的那么小？
375 | 答：看一下map的视频的原理部分，要知道所有的结果然后再进行pr曲线的绘制。**
376 | 
377 | **问：能不能说说怎么绘制PR曲线啥的呀。
378 | 答：可以看mAP视频，结果里面有PR曲线。**
379 | 
380 | **问：怎么计算Recall、Precision指标。
381 | 答：这俩指标应该是相对于特定的置信度的，计算map的时候也会获得。**
382 | 
383 | ### y、coco数据集训练问题
384 | **问：目标检测怎么训练COCO数据集啊？
385 | 答：coco数据训练所需要的txt文件可以参考qqwweee的yolo3的库，格式都是一样的。**
386 | 
387 | ### z、UP，怎么优化模型啊？我想提升效果
388 | **问：up，怎么修改模型啊，我想发个小论文！
389 | 答：建议看看yolov3和yolov4的区别，然后看看yolov4的论文，作为一个大型调参现场非常有参考意义，使用了很多tricks。我能给的建议就是多看一些经典模型，然后拆解里面的亮点结构并使用。**
390 | 
391 | ### aa、UP，有Focal LOSS的代码吗？怎么改啊？
392 | **问：up，YOLO系列使用Focal LOSS的代码你有吗，有提升吗？
393 | 答：很多人试过，提升效果也不大（甚至变的更Low），它自己有自己的正负样本的平衡方式**。改代码的事情，还是自己好好看看代码吧。
394 | 
395 | ### ab、部署问题（ONNX、TensorRT等）
396 | 我没有具体部署到手机等设备上过，所以很多部署问题我并不了解……
397 | 
398 | ## 4、语义分割库问题汇总
399 | ### a、shape不匹配问题
400 | #### 1）、训练时shape不匹配问题
401 | **问：up主，为什么运行train.py会提示shape不匹配啊？
402 | 答：在keras环境中，因为你训练的种类和原始的种类不同，网络结构会变化，所以最尾部的shape会有少量不匹配。**
403 | 
404 | #### 2）、预测时shape不匹配问题
405 | **问：为什么我运行predict.py会提示我说shape不匹配呀。**
406 | ##### i、copying a param with shape torch.Size([75, 704, 1, 1]) from checkpoint
407 | 在Pytorch里面是这样的：
408 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200722171631901.png)
409 | ##### ii、Shapes are [1,1,1024,75] and [255,1024,1,1]. for 'Assign_360' (op: 'Assign') with input shapes: [1,1,1024,75], [255,1024,1,1].
410 | 在Keras里面是这样的：
411 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200722171523380.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDc5MTk2NA==,size_16,color_FFFFFF,t_70)
412 | **答：原因主要有三：
413 | 1、train.py里面的num_classes没改。
414 | 2、预测时num_classes没改。
415 | 3、预测时model_path没改。
416 | 请检查清楚！训练和预测的时候用到的num_classes都需要检查！**
417 | 
418 | ### b、显存不足问题（OOM、RuntimeError: CUDA out of memory）。
419 | **问：为什么我运行train.py下面的命令行闪的贼快，还提示OOM啥的？ 
420 | 答：这是在keras中出现的，爆显存了，可以改小batch_size。**
421 | 
422 | **需要注意的是，受到BatchNorm2d影响，batch_size不可为1，至少为2。**
423 | 
424 | **问：为什么提示 RuntimeError: CUDA out of memory. Tried to allocate 52.00 MiB (GPU 0; 15.90 GiB total capacity; 14.85 GiB already allocated; 51.88 MiB free; 15.07 GiB reserved in total by PyTorch)？ 
425 | 答：这是pytorch中出现的，爆显存了，同上。**
426 | 
427 | **问：为什么我显存都没利用，就直接爆显存了？ 
428 | 答：都爆显存了，自然就不利用了，模型没有开始训练。**
429 | 
430 | ### c、为什么要进行冻结训练与解冻训练，不进行行吗？
431 | **问：为什么要冻结训练和解冻训练呀？
432 | 答：可以不进行，本质上是为了保证性能不足的同学的训练，如果电脑性能完全不够，可以将Freeze_Epoch和UnFreeze_Epoch设置成一样，只进行冻结训练。**
433 | 
434 | **同时这也是迁移学习的思想，因为神经网络主干特征提取部分所提取到的特征是通用的，我们冻结起来训练可以加快训练效率，也可以防止权值被破坏。**
435 | 在冻结阶段，模型的主干被冻结了，特征提取网络不发生改变。占用的显存较小，仅对网络进行微调。
436 | 在解冻阶段，模型的主干不被冻结了，特征提取网络会发生改变。占用的显存较大，网络所有的参数都会发生改变。
437 | 
438 | ### d、我的LOSS好大啊，有问题吗？（我的LOSS好小啊，有问题吗？）
439 | **问：为什么我的网络不收敛啊，LOSS是XXXX。
440 | 答：不同网络的LOSS不同，LOSS只是一个参考指标，用于查看网络是否收敛，而非评价网络好坏，我的yolo代码都没有归一化，所以LOSS值看起来比较高，LOSS的值不重要，重要的是是否在变小，预测是否有效果。**
441 | 
442 | ### e、为什么我训练出来的模型没有预测结果？
443 | **问：为什么我的训练效果不好？预测了没有框（框不准）。
444 | 答：**
445 | **考虑几个问题：
446 | 1、数据集问题，这是最重要的问题。小于500的自行考虑增加数据集；一定要检查数据集的标签，视频中详细解析了VOC数据集的格式，但并不是有输入图片有输出标签即可，还需要确认标签的每一个像素值是否为它对应的种类。很多同学的标签格式不对，最常见的错误格式就是标签的背景为黑，目标为白，此时目标的像素点值为255，无法正常训练，目标需要为1才行。
447 | 2、是否解冻训练，如果数据集分布与常规画面差距过大需要进一步解冻训练，调整主干，加强特征提取能力。
448 | 3、网络问题，可以尝试不同的网络。
449 | 4、训练时长问题，有些同学只训练了几代表示没有效果，按默认参数训练完。
450 | 5、确认自己是否按照步骤去做了。
451 | 6、不同网络的LOSS不同，LOSS只是一个参考指标，用于查看网络是否收敛，而非评价网络好坏，LOSS的值不重要，重要的是是否收敛。**
452 | 
453 | **问：为什么我的训练效果不好？对小目标预测不准确。
454 | 答：对于deeplab和pspnet而言，可以修改一下downsample_factor，当downsample_factor为16的时候下采样倍数过多，效果不太好，可以修改为8。**
455 | 
456 | ### f、为什么我计算出来的miou是0？
457 | **问：为什么我的训练效果不好？计算出来的miou是0？**
458 | 答：
459 | 与e类似，**考虑几个问题：
460 | 1、数据集问题，这是最重要的问题。小于500的自行考虑增加数据集；一定要检查数据集的标签，视频中详细解析了VOC数据集的格式，但并不是有输入图片有输出标签即可，还需要确认标签的每一个像素值是否为它对应的种类。很多同学的标签格式不对，最常见的错误格式就是标签的背景为黑，目标为白，此时目标的像素点值为255，无法正常训练，目标需要为1才行。
461 | 2、是否解冻训练，如果数据集分布与常规画面差距过大需要进一步解冻训练，调整主干，加强特征提取能力。
462 | 3、网络问题，可以尝试不同的网络。
463 | 4、训练时长问题，有些同学只训练了几代表示没有效果，按默认参数训练完。
464 | 5、确认自己是否按照步骤去做了。
465 | 6、不同网络的LOSS不同，LOSS只是一个参考指标，用于查看网络是否收敛，而非评价网络好坏，LOSS的值不重要，重要的是是否收敛。**
466 | 
467 | ### g、gbk编码错误（'gbk' codec can't decode byte）。
468 | **问：我怎么出现了gbk什么的编码错误啊：**
469 | ```python
470 | UnicodeDecodeError: 'gbk' codec can't decode byte 0xa6 in position 446: illegal multibyte sequence
471 | ```
472 | **答：标签和路径不要使用中文，如果一定要使用中文，请注意处理时的编码问题，将打开文件的encoding方式改为utf-8。**
473 | 
474 | ### h、我的图片是xxx*xxx的分辨率的，可以用吗？
475 | **问：我的图片是xxx*xxx的分辨率的，可以用吗！**
476 | **答：可以用，代码里面会自动进行resize与数据增强。**
477 | 
478 | ### i、我想进行数据增强！怎么增强？
479 | **问：我想要进行数据增强！怎么做呢？**
480 | **答：无需额外操作，代码里面会自动进行resize与数据增强。**
481 | 
482 | ### j、多GPU训练。
483 | **问：怎么进行多GPU训练？
484 | 答：pytorch的大多数代码可以直接使用多gpu训练，keras的话直接百度就好了，实现并不复杂，我没有多卡没法详细测试，还需要各位同学自己努力了。**
485 | 
486 | ### k、能不能训练灰度图？
487 | **问：能不能训练灰度图（预测灰度图）啊？
488 | 答：我的大多数库会将灰度图转化成RGB进行训练和预测，如果遇到代码不能训练或者预测灰度图的情况，可以尝试一下在get_random_data里面将Image.open后的结果转换成RGB，预测的时候也这样试试。（仅供参考）**
489 | 
490 | ### l、断点续练问题。
491 | **问：我已经训练过几个世代了，能不能从这个基础上继续开始训练
492 | 答：可以，你在训练前，和载入预训练权重一样载入训练过的权重就行了。一般训练好的权重会保存在logs文件夹里面，将model_path修改成你要开始的权值的路径即可。**
493 | 
494 | ### m、我要训练其它的数据集，预训练权重能不能用？
495 | **问：如果我要训练其它的数据集，预训练权重要怎么办啊？**
496 | **答：数据的预训练权重对不同数据集是通用的，因为特征是通用的，预训练权重对于99%的情况都必须要用，不用的话权值太过随机，特征提取效果不明显，网络训练的结果也不会好。**
497 | 
498 | ### n、网络如何从0开始训练？
499 | **问：我要怎么不使用预训练权重啊？
500 | 答：看一看注释、大多数代码是model_path = ''，Freeze_Train = False**，如果设置model_path无用，**那么把载入预训练权重的代码注释了就行。**
501 | 
502 | ### o、为什么从0开始训练效果这么差（修改了网络主干，效果不好怎么办）？
503 | **问：为什么我不使用预训练权重效果这么差啊？
504 | 答：因为随机初始化的权值不好，提取的特征不好，也就导致了模型训练的效果不好，预训练权重还是非常重要的。**
505 | 
506 | **问：up，我修改了网络，预训练权重还能用吗？
507 | 答：修改了主干的话，如果不是用的现有的网络，基本上预训练权重是不能用的，要么就自己判断权值里卷积核的shape然后自己匹配，要么只能自己预训练去了；修改了后半部分的话，前半部分的主干部分的预训练权重还是可以用的，如果是pytorch代码的话，需要自己修改一下载入权值的方式，判断shape后载入，如果是keras代码，直接by_name=True,skip_mismatch=True即可。**
508 | 权值匹配的方式可以参考如下：
509 | ```python
510 | # 加快模型训练的效率
511 | print('Loading weights into state dict...')
512 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
513 | model_dict = model.state_dict()
514 | pretrained_dict = torch.load(model_path, map_location=device)
515 | a = {}
516 | for k, v in pretrained_dict.items():
517 |     try:    
518 |         if np.shape(model_dict[k]) ==  np.shape(v):
519 |             a[k]=v
520 |     except:
521 |         pass
522 | model_dict.update(a)
523 | model.load_state_dict(model_dict)
524 | print('Finished!')
525 | ```
526 | 
527 | **问：为什么从0开始训练效果这么差（我修改了网络主干，效果不好怎么办）？
528 | 答：一般来讲，网络从0开始的训练效果会很差，因为权值太过随机，特征提取效果不明显，因此非常、非常、非常不建议大家从0开始训练！如果一定要从0开始，可以了解imagenet数据集，首先训练分类模型，获得网络的主干部分权值，分类模型的 主干部分 和该模型通用，基于此进行训练。
529 | 网络修改了主干之后也是同样的问题，随机的权值效果很差。**
530 | 
531 | **问：怎么在模型上从0开始训练？
532 | 答：在算力不足与调参能力不足的情况下从0开始训练毫无意义。模型特征提取能力在随机初始化参数的情况下非常差。没有好的参数调节能力和算力，无法使得网络正常收敛。**
533 | 如果一定要从0开始，那么训练的时候请注意几点：
534 |  - 不载入预训练权重。 
535 |  - 不要进行冻结训练，注释冻结模型的代码。
536 | 
537 | **问：为什么我不使用预训练权重效果这么差啊？
538 | 答：因为随机初始化的权值不好，提取的特征不好，也就导致了模型训练的效果不好，voc07+12、coco+voc07+12效果都不一样，预训练权重还是非常重要的。**
539 | 
540 | ### p、你的权值都是哪里来的？
541 | **问：如果网络不能从0开始训练的话你的权值哪里来的？
542 | 答：有些权值是官方转换过来的，有些权值是自己训练出来的，我用到的主干的imagenet的权值都是官方的。**
543 | 
544 | 
545 | ### q、视频检测与摄像头检测
546 | **问：怎么用摄像头检测呀？
547 | 答：predict.py修改参数可以进行摄像头检测，也有视频详细解释了摄像头检测的思路。**
548 | 
549 | **问：怎么用视频检测呀？
550 | 答：同上**
551 | 
552 | ### r、如何保存检测出的图片
553 | **问：检测完的图片怎么保存？
554 | 答：一般目标检测用的是Image，所以查询一下PIL库的Image如何进行保存。详细看看predict.py文件的注释。**
555 | 
556 | **问：怎么用视频保存呀？
557 | 答：详细看看predict.py文件的注释。**
558 | 
559 | ### s、遍历问题
560 | **问：如何对一个文件夹的图片进行遍历？
561 | 答：一般使用os.listdir先找出文件夹里面的所有图片，然后根据predict.py文件里面的执行思路检测图片就行了，详细看看predict.py文件的注释。**
562 | 
563 | **问：如何对一个文件夹的图片进行遍历？并且保存。
564 | 答：遍历的话一般使用os.listdir先找出文件夹里面的所有图片，然后根据predict.py文件里面的执行思路检测图片就行了。保存的话一般目标检测用的是Image，所以查询一下PIL库的Image如何进行保存。如果有些库用的是cv2，那就是查一下cv2怎么保存图片。详细看看predict.py文件的注释。**
565 | 
566 | ### t、路径问题（No such file or directory、StopIteration: [Errno 13] Permission denied: 'XXXXXX'）
567 | **问：我怎么出现了这样的错误呀：**
568 | ```python
569 | FileNotFoundError: [Errno 2] No such file or directory
570 | StopIteration: [Errno 13] Permission denied: 'D:\\Study\\Collection\\Dataset\\VOC07+12+test\\VOCdevkit/VOC2007'
571 | ……………………………………
572 | ……………………………………
573 | ```
574 | **答：去检查一下文件夹路径，查看是否有对应文件；并且检查一下2007_train.txt，其中文件路径是否有错。**
575 | 关于路径有几个重要的点：
576 | **文件夹名称中一定不要有空格。
577 | 注意相对路径和绝对路径。
578 | 多百度路径相关的知识。**
579 | 
580 | **所有的路径问题基本上都是根目录问题，好好查一下相对目录的概念！**
581 | ### u、和原版比较问题，你怎么和原版不一样啊？
582 | **问：原版的代码是XXX，为什么你的代码是XXX？
583 | 答：是啊……这要不怎么说我不是原版呢……**
584 | 
585 | **问：你这个代码和原版比怎么样，可以达到原版的效果么？
586 | 答：基本上可以达到，我都用voc数据测过，我没有好显卡，没有能力在coco上测试与训练。**
587 | 
588 | ### v、我的检测速度是xxx正常吗？我的检测速度还能增快吗？
589 | **问：你这个FPS可以到达多少，可以到 XX FPS么？
590 | 答：FPS和机子的配置有关，配置高就快，配置低就慢。**
591 | 
592 | **问：我的检测速度是xxx正常吗？我的检测速度还能增快吗？
593 | 答：看配置，配置好速度就快，如果想要配置不变的情况下加快速度，就要修改网络了。**
594 | 
595 | **问：为什么论文中说速度可以达到XX，但是这里却没有？
596 | 答：检查是否正确安装了tensorflow-gpu或者pytorch的gpu版本，如果已经正确安装，可以去利用time.time()的方法查看detect_image里面，哪一段代码耗时更长（不仅只有网络耗时长，其它处理部分也会耗时，如绘图等）。有些论文还会使用多batch进行预测，我并没有去实现这个部分。**
597 | 
598 | ### w、预测图片不显示问题
599 | **问：为什么你的代码在预测完成后不显示图片？只是在命令行告诉我有什么目标。
600 | 答：给系统安装一个图片查看器就行了。**
601 | 
602 | ### x、算法评价问题（miou）
603 | **问：怎么计算miou？
604 | 答：参考视频里的miou测量部分。**
605 | 
606 | **问：怎么计算Recall、Precision指标。
607 | 答：现有的代码还无法获得，需要各位同学理解一下混淆矩阵的概念，然后自行计算一下。**
608 | 
609 | ### y、UP，怎么优化模型啊？我想提升效果
610 | **问：up，怎么修改模型啊，我想发个小论文！
611 | 答：建议看看目标检测中的yolov4论文，作为一个大型调参现场非常有参考意义，使用了很多tricks。我能给的建议就是多看一些经典模型，然后拆解里面的亮点结构并使用。**
612 | 
613 | ### z、部署问题（ONNX、TensorRT等）
614 | 我没有具体部署到手机等设备上过，所以很多部署问题我并不了解……
615 | 
616 | ## 5、交流群问题
617 | **问：up，有没有QQ群啥的呢？
618 | 答：没有没有，我没有时间管理QQ群……**
619 | 
620 | ## 6、怎么学习的问题
621 | **问：up，你的学习路线怎么样的？我是个小白我要怎么学？
622 | 答：这里有几点需要注意哈
623 | 1、我不是高手，很多东西我也不会，我的学习路线也不一定适用所有人。
624 | 2、我实验室不做深度学习，所以我很多东西都是自学，自己摸索，正确与否我也不知道。
625 | 3、我个人觉得学习更靠自学**
626 | 学习路线的话，我是先学习了莫烦的python教程，从tensorflow、keras、pytorch入门，入门完之后学的SSD，YOLO，然后了解了很多经典的卷积网，后面就开始学很多不同的代码了，我的学习方法就是一行一行的看，了解整个代码的执行流程，特征层的shape变化等，花了很多时间也没有什么捷径，就是要花时间吧。
627 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | #-------------------------------------#
  2 | #       对数据集进行训练
  3 | #-------------------------------------#
  4 | import datetime
  5 | import os
  6 | from functools import partial
  7 | 
  8 | import numpy as np
  9 | import torch
 10 | import torch.backends.cudnn as cudnn
 11 | import torch.distributed as dist
 12 | import torch.nn as nn
 13 | import torch.optim as optim
 14 | from torch.utils.data import DataLoader
 15 | 
 16 | from nets.yolo import YoloBody
 17 | from nets.yolo_training import (ModelEMA, YOLOLoss, get_lr_scheduler,
 18 |                                 set_optimizer_lr, weights_init)
 19 | from utils.callbacks import EvalCallback, LossHistory
 20 | from utils.dataloader import YoloDataset, yolo_dataset_collate
 21 | from utils.utils import (download_weights, get_anchors, get_classes,
 22 |                          seed_everything, show_config, worker_init_fn)
 23 | from utils.utils_fit import fit_one_epoch
 24 | 
 25 | '''
 26 | 训练自己的目标检测模型一定需要注意以下几点：
 27 | 1、训练前仔细检查自己的格式是否满足要求，该库要求数据集格式为VOC格式，需要准备好的内容有输入图片和标签
 28 |    输入图片为.jpg图片，无需固定大小，传入训练前会自动进行resize。
 29 |    灰度图会自动转成RGB图片进行训练，无需自己修改。
 30 |    输入图片如果后缀非jpg，需要自己批量转成jpg后再开始训练。
 31 | 
 32 |    标签为.xml格式，文件中会有需要检测的目标信息，标签文件和输入图片文件相对应。
 33 | 
 34 | 2、损失值的大小用于判断是否收敛，比较重要的是有收敛的趋势，即验证集损失不断下降，如果验证集损失基本上不改变的话，模型基本上就收敛了。
 35 |    损失值的具体大小并没有什么意义，大和小只在于损失的计算方式，并不是接近于0才好。如果想要让损失好看点，可以直接到对应的损失函数里面除上10000。
 36 |    训练过程中的损失值会保存在logs文件夹下的loss_%Y_%m_%d_%H_%M_%S文件夹中
 37 |    
 38 | 3、训练好的权值文件保存在logs文件夹中，每个训练世代（Epoch）包含若干训练步长（Step），每个训练步长（Step）进行一次梯度下降。
 39 |    如果只是训练了几个Step是不会保存的，Epoch和Step的概念要捋清楚一下。
 40 | '''
 41 | if __name__ == "__main__":
 42 |     #---------------------------------#
 43 |     #   Cuda    是否使用Cuda
 44 |     #           没有GPU可以设置成False
 45 |     #---------------------------------#
 46 |     Cuda            = True
 47 |     #----------------------------------------------#
 48 |     #   Seed    用于固定随机种子
 49 |     #           使得每次独立训练都可以获得一样的结果
 50 |     #----------------------------------------------#
 51 |     seed            = 11
 52 |     #---------------------------------------------------------------------#
 53 |     #   distributed     用于指定是否使用单机多卡分布式运行
 54 |     #                   终端指令仅支持Ubuntu。CUDA_VISIBLE_DEVICES用于在Ubuntu下指定显卡。
 55 |     #                   Windows系统下默认使用DP模式调用所有显卡，不支持DDP。
 56 |     #   DP模式：
 57 |     #       设置            distributed = False
 58 |     #       在终端中输入    CUDA_VISIBLE_DEVICES=0,1 python train.py
 59 |     #   DDP模式：
 60 |     #       设置            distributed = True
 61 |     #       在终端中输入    CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train.py
 62 |     #---------------------------------------------------------------------#
 63 |     distributed     = False
 64 |     #---------------------------------------------------------------------#
 65 |     #   sync_bn     是否使用sync_bn，DDP模式多卡可用
 66 |     #---------------------------------------------------------------------#
 67 |     sync_bn         = False
 68 |     #---------------------------------------------------------------------#
 69 |     #   fp16        是否使用混合精度训练
 70 |     #               可减少约一半的显存、需要pytorch1.7.1以上
 71 |     #---------------------------------------------------------------------#
 72 |     fp16            = False
 73 |     #---------------------------------------------------------------------#
 74 |     #   classes_path    指向model_data下的txt，与自己训练的数据集相关 
 75 |     #                   训练前一定要修改classes_path，使其对应自己的数据集
 76 |     #---------------------------------------------------------------------#
 77 |     classes_path    = 'model_data/voc_classes.txt'
 78 |     #---------------------------------------------------------------------#
 79 |     #   anchors_path    代表先验框对应的txt文件，一般不修改。
 80 |     #   anchors_mask    用于帮助代码找到对应的先验框，一般不修改。
 81 |     #---------------------------------------------------------------------#
 82 |     anchors_path    = 'model_data/yolo_anchors.txt'
 83 |     anchors_mask    = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
 84 |     #----------------------------------------------------------------------------------------------------------------------------#
 85 |     #   权值文件的下载请看README，可以通过网盘下载。模型的 预训练权重 对不同数据集是通用的，因为特征是通用的。
 86 |     #   模型的 预训练权重 比较重要的部分是 主干特征提取网络的权值部分，用于进行特征提取。
 87 |     #   预训练权重对于99%的情况都必须要用，不用的话主干部分的权值太过随机，特征提取效果不明显，网络训练的结果也不会好
 88 |     #
 89 |     #   如果训练过程中存在中断训练的操作，可以将model_path设置成logs文件夹下的权值文件，将已经训练了一部分的权值再次载入。
 90 |     #   同时修改下方的 冻结阶段 或者 解冻阶段 的参数，来保证模型epoch的连续性。
 91 |     #   
 92 |     #   当model_path = ''的时候不加载整个模型的权值。
 93 |     #
 94 |     #   此处使用的是整个模型的权重，因此是在train.py进行加载的。
 95 |     #   如果想要让模型从0开始训练，则设置model_path = ''，下面的Freeze_Train = Fasle，此时从0开始训练，且没有冻结主干的过程。
 96 |     #   
 97 |     #   一般来讲，网络从0开始的训练效果会很差，因为权值太过随机，特征提取效果不明显，因此非常、非常、非常不建议大家从0开始训练！
 98 |     #   从0开始训练有两个方案：
 99 |     #   1、得益于Mosaic数据增强方法强大的数据增强能力，将UnFreeze_Epoch设置的较大（300及以上）、batch较大（16及以上）、数据较多（万以上）的情况下，
100 |     #      可以设置mosaic=True，直接随机初始化参数开始训练，但得到的效果仍然不如有预训练的情况。（像COCO这样的大数据集可以这样做）
101 |     #   2、了解imagenet数据集，首先训练分类模型，获得网络的主干部分权值，分类模型的 主干部分 和该模型通用，基于此进行训练。
102 |     #----------------------------------------------------------------------------------------------------------------------------#
103 |     model_path      = 'model_data/yolov7_weights.pth'
104 |     #------------------------------------------------------#
105 |     #   input_shape     输入的shape大小，一定要是32的倍数
106 |     #------------------------------------------------------#
107 |     input_shape     = [640, 640]
108 |     #------------------------------------------------------#
109 |     #   phi             所使用到的yolov7的版本，本仓库一共提供两个：
110 |     #                   l : 对应yolov7
111 |     #                   x : 对应yolov7_x
112 |     #------------------------------------------------------#
113 |     phi             = 'l'
114 |     #----------------------------------------------------------------------------------------------------------------------------#
115 |     #   pretrained      是否使用主干网络的预训练权重，此处使用的是主干的权重，因此是在模型构建的时候进行加载的。
116 |     #                   如果设置了model_path，则主干的权值无需加载，pretrained的值无意义。
117 |     #                   如果不设置model_path，pretrained = True，此时仅加载主干开始训练。
118 |     #                   如果不设置model_path，pretrained = False，Freeze_Train = Fasle，此时从0开始训练，且没有冻结主干的过程。
119 |     #----------------------------------------------------------------------------------------------------------------------------#
120 |     pretrained      = False
121 |     #------------------------------------------------------------------#
122 |     #   mosaic              马赛克数据增强。
123 |     #   mosaic_prob         每个step有多少概率使用mosaic数据增强，默认50%。
124 |     #
125 |     #   mixup               是否使用mixup数据增强，仅在mosaic=True时有效。
126 |     #                       只会对mosaic增强后的图片进行mixup的处理。
127 |     #   mixup_prob          有多少概率在mosaic后使用mixup数据增强，默认50%。
128 |     #                       总的mixup概率为mosaic_prob * mixup_prob。
129 |     #
130 |     #   special_aug_ratio   参考YoloX，由于Mosaic生成的训练图片，远远脱离自然图片的真实分布。
131 |     #                       当mosaic=True时，本代码会在special_aug_ratio范围内开启mosaic。
132 |     #                       默认为前70%个epoch，100个世代会开启70个世代。
133 |     #------------------------------------------------------------------#
134 |     mosaic              = True
135 |     mosaic_prob         = 0.5
136 |     mixup               = True
137 |     mixup_prob          = 0.5
138 |     special_aug_ratio   = 0.7
139 |     #------------------------------------------------------------------#
140 |     #   label_smoothing     标签平滑。一般0.01以下。如0.01、0.005。
141 |     #------------------------------------------------------------------#
142 |     label_smoothing     = 0
143 | 
144 |     #----------------------------------------------------------------------------------------------------------------------------#
145 |     #   训练分为两个阶段，分别是冻结阶段和解冻阶段。设置冻结阶段是为了满足机器性能不足的同学的训练需求。
146 |     #   冻结训练需要的显存较小，显卡非常差的情况下，可设置Freeze_Epoch等于UnFreeze_Epoch，Freeze_Train = True，此时仅仅进行冻结训练。
147 |     #      
148 |     #   在此提供若干参数设置建议，各位训练者根据自己的需求进行灵活调整：
149 |     #   （一）从整个模型的预训练权重开始训练： 
150 |     #       Adam：
151 |     #           Init_Epoch = 0，Freeze_Epoch = 50，UnFreeze_Epoch = 100，Freeze_Train = True，optimizer_type = 'adam'，Init_lr = 1e-3，weight_decay = 0。（冻结）
152 |     #           Init_Epoch = 0，UnFreeze_Epoch = 100，Freeze_Train = False，optimizer_type = 'adam'，Init_lr = 1e-3，weight_decay = 0。（不冻结）
153 |     #       SGD：
154 |     #           Init_Epoch = 0，Freeze_Epoch = 50，UnFreeze_Epoch = 300，Freeze_Train = True，optimizer_type = 'sgd'，Init_lr = 1e-2，weight_decay = 5e-4。（冻结）
155 |     #           Init_Epoch = 0，UnFreeze_Epoch = 300，Freeze_Train = False，optimizer_type = 'sgd'，Init_lr = 1e-2，weight_decay = 5e-4。（不冻结）
156 |     #       其中：UnFreeze_Epoch可以在100-300之间调整。
157 |     #   （二）从0开始训练：
158 |     #       Init_Epoch = 0，UnFreeze_Epoch >= 300，Unfreeze_batch_size >= 16，Freeze_Train = False（不冻结训练）
159 |     #       其中：UnFreeze_Epoch尽量不小于300。optimizer_type = 'sgd'，Init_lr = 1e-2，mosaic = True。
160 |     #   （三）batch_size的设置：
161 |     #       在显卡能够接受的范围内，以大为好。显存不足与数据集大小无关，提示显存不足（OOM或者CUDA out of memory）请调小batch_size。
162 |     #       受到BatchNorm层影响，batch_size最小为2，不能为1。
163 |     #       正常情况下Freeze_batch_size建议为Unfreeze_batch_size的1-2倍。不建议设置的差距过大，因为关系到学习率的自动调整。
164 |     #----------------------------------------------------------------------------------------------------------------------------#
165 |     #------------------------------------------------------------------#
166 |     #   冻结阶段训练参数
167 |     #   此时模型的主干被冻结了，特征提取网络不发生改变
168 |     #   占用的显存较小，仅对网络进行微调
169 |     #   Init_Epoch          模型当前开始的训练世代，其值可以大于Freeze_Epoch，如设置：
170 |     #                       Init_Epoch = 60、Freeze_Epoch = 50、UnFreeze_Epoch = 100
171 |     #                       会跳过冻结阶段，直接从60代开始，并调整对应的学习率。
172 |     #                       （断点续练时使用）
173 |     #   Freeze_Epoch        模型冻结训练的Freeze_Epoch
174 |     #                       (当Freeze_Train=False时失效)
175 |     #   Freeze_batch_size   模型冻结训练的batch_size
176 |     #                       (当Freeze_Train=False时失效)
177 |     #------------------------------------------------------------------#
178 |     Init_Epoch          = 0
179 |     Freeze_Epoch        = 50
180 |     Freeze_batch_size   = 8
181 |     #------------------------------------------------------------------#
182 |     #   解冻阶段训练参数
183 |     #   此时模型的主干不被冻结了，特征提取网络会发生改变
184 |     #   占用的显存较大，网络所有的参数都会发生改变
185 |     #   UnFreeze_Epoch          模型总共训练的epoch
186 |     #                           SGD需要更长的时间收敛，因此设置较大的UnFreeze_Epoch
187 |     #                           Adam可以使用相对较小的UnFreeze_Epoch
188 |     #   Unfreeze_batch_size     模型在解冻后的batch_size
189 |     #------------------------------------------------------------------#
190 |     UnFreeze_Epoch      = 300
191 |     Unfreeze_batch_size = 4
192 |     #------------------------------------------------------------------#
193 |     #   Freeze_Train    是否进行冻结训练
194 |     #                   默认先冻结主干训练后解冻训练。
195 |     #------------------------------------------------------------------#
196 |     Freeze_Train        = True
197 | 
198 |     #------------------------------------------------------------------#
199 |     #   其它训练参数：学习率、优化器、学习率下降有关
200 |     #------------------------------------------------------------------#
201 |     #------------------------------------------------------------------#
202 |     #   Init_lr         模型的最大学习率
203 |     #   Min_lr          模型的最小学习率，默认为最大学习率的0.01
204 |     #------------------------------------------------------------------#
205 |     Init_lr             = 1e-2
206 |     Min_lr              = Init_lr * 0.01
207 |     #------------------------------------------------------------------#
208 |     #   optimizer_type  使用到的优化器种类，可选的有adam、sgd
209 |     #                   当使用Adam优化器时建议设置  Init_lr=1e-3
210 |     #                   当使用SGD优化器时建议设置   Init_lr=1e-2
211 |     #   momentum        优化器内部使用到的momentum参数
212 |     #   weight_decay    权值衰减，可防止过拟合
213 |     #                   adam会导致weight_decay错误，使用adam时建议设置为0。
214 |     #------------------------------------------------------------------#
215 |     optimizer_type      = "sgd"
216 |     momentum            = 0.937
217 |     weight_decay        = 5e-4
218 |     #------------------------------------------------------------------#
219 |     #   lr_decay_type   使用到的学习率下降方式，可选的有step、cos
220 |     #------------------------------------------------------------------#
221 |     lr_decay_type       = "cos"
222 |     #------------------------------------------------------------------#
223 |     #   save_period     多少个epoch保存一次权值
224 |     #------------------------------------------------------------------#
225 |     save_period         = 10
226 |     #------------------------------------------------------------------#
227 |     #   save_dir        权值与日志文件保存的文件夹
228 |     #------------------------------------------------------------------#
229 |     save_dir            = 'logs'
230 |     #------------------------------------------------------------------#
231 |     #   eval_flag       是否在训练时进行评估，评估对象为验证集
232 |     #                   安装pycocotools库后，评估体验更佳。
233 |     #   eval_period     代表多少个epoch评估一次，不建议频繁的评估
234 |     #                   评估需要消耗较多的时间，频繁评估会导致训练非常慢
235 |     #   此处获得的mAP会与get_map.py获得的会有所不同，原因有二：
236 |     #   （一）此处获得的mAP为验证集的mAP。
237 |     #   （二）此处设置评估参数较为保守，目的是加快评估速度。
238 |     #------------------------------------------------------------------#
239 |     eval_flag           = True
240 |     eval_period         = 10
241 |     #------------------------------------------------------------------#
242 |     #   num_workers     用于设置是否使用多线程读取数据
243 |     #                   开启后会加快数据读取速度，但是会占用更多内存
244 |     #                   内存较小的电脑可以设置为2或者0  
245 |     #------------------------------------------------------------------#
246 |     num_workers         = 4
247 | 
248 |     #------------------------------------------------------#
249 |     #   train_annotation_path   训练图片路径和标签
250 |     #   val_annotation_path     验证图片路径和标签
251 |     #------------------------------------------------------#
252 |     train_annotation_path   = '2007_train.txt'
253 |     val_annotation_path     = '2007_val.txt'
254 | 
255 |     seed_everything(seed)
256 |     #------------------------------------------------------#
257 |     #   设置用到的显卡
258 |     #------------------------------------------------------#
259 |     ngpus_per_node  = torch.cuda.device_count()
260 |     if distributed:
261 |         dist.init_process_group(backend="nccl")
262 |         local_rank  = int(os.environ["LOCAL_RANK"])
263 |         rank        = int(os.environ["RANK"])
264 |         device      = torch.device("cuda", local_rank)
265 |         if local_rank == 0:
266 |             print(f"[{os.getpid()}] (rank = {rank}, local_rank = {local_rank}) training...")
267 |             print("Gpu Device Count : ", ngpus_per_node)
268 |     else:
269 |         device          = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
270 |         local_rank      = 0
271 |         rank            = 0
272 | 
273 |     #------------------------------------------------------#
274 |     #   获取classes和anchor
275 |     #------------------------------------------------------#
276 |     class_names, num_classes = get_classes(classes_path)
277 |     anchors, num_anchors     = get_anchors(anchors_path)
278 | 
279 |     #----------------------------------------------------#
280 |     #   下载预训练权重
281 |     #----------------------------------------------------#
282 |     if pretrained:
283 |         if distributed:
284 |             if local_rank == 0:
285 |                 download_weights(phi)  
286 |             dist.barrier()
287 |         else:
288 |             download_weights(phi)
289 |             
290 |     #------------------------------------------------------#
291 |     #   创建yolo模型
292 |     #------------------------------------------------------#
293 |     model = YoloBody(anchors_mask, num_classes, phi, pretrained=pretrained)
294 |     if not pretrained:
295 |         weights_init(model)
296 |     if model_path != '':
297 |         #------------------------------------------------------#
298 |         #   权值文件请看README，百度网盘下载
299 |         #------------------------------------------------------#
300 |         if local_rank == 0:
301 |             print('Load weights {}.'.format(model_path))
302 |         
303 |         #------------------------------------------------------#
304 |         #   根据预训练权重的Key和模型的Key进行加载
305 |         #------------------------------------------------------#
306 |         model_dict      = model.state_dict()
307 |         pretrained_dict = torch.load(model_path, map_location = device)
308 |         load_key, no_load_key, temp_dict = [], [], {}
309 |         for k, v in pretrained_dict.items():
310 |             if k in model_dict.keys() and np.shape(model_dict[k]) == np.shape(v):
311 |                 temp_dict[k] = v
312 |                 load_key.append(k)
313 |             else:
314 |                 no_load_key.append(k)
315 |         model_dict.update(temp_dict)
316 |         model.load_state_dict(model_dict)
317 |         #------------------------------------------------------#
318 |         #   显示没有匹配上的Key
319 |         #------------------------------------------------------#
320 |         if local_rank == 0:
321 |             print("\nSuccessful Load Key:", str(load_key)[:500], "……\nSuccessful Load Key Num:", len(load_key))
322 |             print("\nFail To Load Key:", str(no_load_key)[:500], "……\nFail To Load Key num:", len(no_load_key))
323 |             print("\n\033[1;33;44m温馨提示，head部分没有载入是正常现象，Backbone部分没有载入是错误的。\033[0m")
324 | 
325 |     #----------------------#
326 |     #   获得损失函数
327 |     #----------------------#
328 |     yolo_loss    = YOLOLoss(anchors, num_classes, input_shape, anchors_mask, label_smoothing)
329 |     #----------------------#
330 |     #   记录Loss
331 |     #----------------------#
332 |     if local_rank == 0:
333 |         time_str        = datetime.datetime.strftime(datetime.datetime.now(),'%Y_%m_%d_%H_%M_%S')
334 |         log_dir         = os.path.join(save_dir, "loss_" + str(time_str))
335 |         loss_history    = LossHistory(log_dir, model, input_shape=input_shape)
336 |     else:
337 |         loss_history    = None
338 |         
339 |     #------------------------------------------------------------------#
340 |     #   torch 1.2不支持amp，建议使用torch 1.7.1及以上正确使用fp16
341 |     #   因此torch1.2这里显示"could not be resolve"
342 |     #------------------------------------------------------------------#
343 |     if fp16:
344 |         from torch.cuda.amp import GradScaler as GradScaler
345 |         scaler = GradScaler()
346 |     else:
347 |         scaler = None
348 | 
349 |     model_train     = model.train()
350 |     #----------------------------#
351 |     #   多卡同步Bn
352 |     #----------------------------#
353 |     if sync_bn and ngpus_per_node > 1 and distributed:
354 |         model_train = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model_train)
355 |     elif sync_bn:
356 |         print("Sync_bn is not support in one gpu or not distributed.")
357 | 
358 |     if Cuda:
359 |         if distributed:
360 |             #----------------------------#
361 |             #   多卡平行运行
362 |             #----------------------------#
363 |             model_train = model_train.cuda(local_rank)
364 |             model_train = torch.nn.parallel.DistributedDataParallel(model_train, device_ids=[local_rank], find_unused_parameters=True)
365 |         else:
366 |             model_train = torch.nn.DataParallel(model)
367 |             cudnn.benchmark = True
368 |             model_train = model_train.cuda()
369 |             
370 |     #----------------------------#
371 |     #   权值平滑
372 |     #----------------------------#
373 |     ema = ModelEMA(model_train)
374 |     
375 |     #---------------------------#
376 |     #   读取数据集对应的txt
377 |     #---------------------------#
378 |     with open(train_annotation_path, encoding='utf-8') as f:
379 |         train_lines = f.readlines()
380 |     with open(val_annotation_path, encoding='utf-8') as f:
381 |         val_lines   = f.readlines()
382 |     num_train   = len(train_lines)
383 |     num_val     = len(val_lines)
384 | 
385 |     if local_rank == 0:
386 |         show_config(
387 |             classes_path = classes_path, anchors_path = anchors_path, anchors_mask = anchors_mask, model_path = model_path, input_shape = input_shape, \
388 |             Init_Epoch = Init_Epoch, Freeze_Epoch = Freeze_Epoch, UnFreeze_Epoch = UnFreeze_Epoch, Freeze_batch_size = Freeze_batch_size, Unfreeze_batch_size = Unfreeze_batch_size, Freeze_Train = Freeze_Train, \
389 |             Init_lr = Init_lr, Min_lr = Min_lr, optimizer_type = optimizer_type, momentum = momentum, lr_decay_type = lr_decay_type, \
390 |             save_period = save_period, save_dir = save_dir, num_workers = num_workers, num_train = num_train, num_val = num_val
391 |         )
392 |         #---------------------------------------------------------#
393 |         #   总训练世代指的是遍历全部数据的总次数
394 |         #   总训练步长指的是梯度下降的总次数 
395 |         #   每个训练世代包含若干训练步长，每个训练步长进行一次梯度下降。
396 |         #   此处仅建议最低训练世代，上不封顶，计算时只考虑了解冻部分
397 |         #----------------------------------------------------------#
398 |         wanted_step = 5e4 if optimizer_type == "sgd" else 1.5e4
399 |         total_step  = num_train // Unfreeze_batch_size * UnFreeze_Epoch
400 |         if total_step <= wanted_step:
401 |             if num_train // Unfreeze_batch_size == 0:
402 |                 raise ValueError('数据集过小，无法进行训练，请扩充数据集。')
403 |             wanted_epoch = wanted_step // (num_train // Unfreeze_batch_size) + 1
404 |             print("\n\033[1;33;44m[Warning] 使用%s优化器时，建议将训练总步长设置到%d以上。\033[0m"%(optimizer_type, wanted_step))
405 |             print("\033[1;33;44m[Warning] 本次运行的总训练数据量为%d，Unfreeze_batch_size为%d，共训练%d个Epoch，计算出总训练步长为%d。\033[0m"%(num_train, Unfreeze_batch_size, UnFreeze_Epoch, total_step))
406 |             print("\033[1;33;44m[Warning] 由于总训练步长为%d，小于建议总步长%d，建议设置总世代为%d。\033[0m"%(total_step, wanted_step, wanted_epoch))
407 | 
408 |     #------------------------------------------------------#
409 |     #   主干特征提取网络特征通用，冻结训练可以加快训练速度
410 |     #   也可以在训练初期防止权值被破坏。
411 |     #   Init_Epoch为起始世代
412 |     #   Freeze_Epoch为冻结训练的世代
413 |     #   UnFreeze_Epoch总训练世代
414 |     #   提示OOM或者显存不足请调小Batch_size
415 |     #------------------------------------------------------#
416 |     if True:
417 |         UnFreeze_flag = False
418 |         #------------------------------------#
419 |         #   冻结一定部分训练
420 |         #------------------------------------#
421 |         if Freeze_Train:
422 |             for param in model.backbone.parameters():
423 |                 param.requires_grad = False
424 | 
425 |         #-------------------------------------------------------------------#
426 |         #   如果不冻结训练的话，直接设置batch_size为Unfreeze_batch_size
427 |         #-------------------------------------------------------------------#
428 |         batch_size = Freeze_batch_size if Freeze_Train else Unfreeze_batch_size
429 | 
430 |         #-------------------------------------------------------------------#
431 |         #   判断当前batch_size，自适应调整学习率
432 |         #-------------------------------------------------------------------#
433 |         nbs             = 64
434 |         lr_limit_max    = 1e-3 if optimizer_type == 'adam' else 5e-2
435 |         lr_limit_min    = 3e-4 if optimizer_type == 'adam' else 5e-4
436 |         Init_lr_fit     = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
437 |         Min_lr_fit      = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
438 | 
439 |         #---------------------------------------#
440 |         #   根据optimizer_type选择优化器
441 |         #---------------------------------------#
442 |         pg0, pg1, pg2 = [], [], []  
443 |         for k, v in model.named_modules():
444 |             if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
445 |                 pg2.append(v.bias)    
446 |             if isinstance(v, nn.BatchNorm2d) or "bn" in k:
447 |                 pg0.append(v.weight)    
448 |             elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
449 |                 pg1.append(v.weight)   
450 |         optimizer = {
451 |             'adam'  : optim.Adam(pg0, Init_lr_fit, betas = (momentum, 0.999)),
452 |             'sgd'   : optim.SGD(pg0, Init_lr_fit, momentum = momentum, nesterov=True)
453 |         }[optimizer_type]
454 |         optimizer.add_param_group({"params": pg1, "weight_decay": weight_decay})
455 |         optimizer.add_param_group({"params": pg2})
456 | 
457 |         #---------------------------------------#
458 |         #   获得学习率下降的公式
459 |         #---------------------------------------#
460 |         lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
461 |         
462 |         #---------------------------------------#
463 |         #   判断每一个世代的长度
464 |         #---------------------------------------#
465 |         epoch_step      = num_train // batch_size
466 |         epoch_step_val  = num_val // batch_size
467 |         
468 |         if epoch_step == 0 or epoch_step_val == 0:
469 |             raise ValueError("数据集过小，无法继续进行训练，请扩充数据集。")
470 | 
471 |         if ema:
472 |             ema.updates     = epoch_step * Init_Epoch
473 |         
474 |         #---------------------------------------#
475 |         #   构建数据集加载器。
476 |         #---------------------------------------#
477 |         train_dataset   = YoloDataset(train_lines, input_shape, num_classes, anchors, anchors_mask, epoch_length=UnFreeze_Epoch, \
478 |                                         mosaic=mosaic, mixup=mixup, mosaic_prob=mosaic_prob, mixup_prob=mixup_prob, train=True, special_aug_ratio=special_aug_ratio)
479 |         val_dataset     = YoloDataset(val_lines, input_shape, num_classes, anchors, anchors_mask, epoch_length=UnFreeze_Epoch, \
480 |                                         mosaic=False, mixup=False, mosaic_prob=0, mixup_prob=0, train=False, special_aug_ratio=0)
481 |         
482 |         if distributed:
483 |             train_sampler   = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True,)
484 |             val_sampler     = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False,)
485 |             batch_size      = batch_size // ngpus_per_node
486 |             shuffle         = False
487 |         else:
488 |             train_sampler   = None
489 |             val_sampler     = None
490 |             shuffle         = True
491 | 
492 |         gen             = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
493 |                                     drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler, 
494 |                                     worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
495 |         gen_val         = DataLoader(val_dataset  , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, 
496 |                                     drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler, 
497 |                                     worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
498 | 
499 |         #----------------------#
500 |         #   记录eval的map曲线
501 |         #----------------------#
502 |         if local_rank == 0:
503 |             eval_callback   = EvalCallback(model, input_shape, anchors, anchors_mask, class_names, num_classes, val_lines, log_dir, Cuda, \
504 |                                             eval_flag=eval_flag, period=eval_period)
505 |         else:
506 |             eval_callback   = None
507 |         
508 |         #---------------------------------------#
509 |         #   开始模型训练
510 |         #---------------------------------------#
511 |         for epoch in range(Init_Epoch, UnFreeze_Epoch):
512 |             #---------------------------------------#
513 |             #   如果模型有冻结学习部分
514 |             #   则解冻，并设置参数
515 |             #---------------------------------------#
516 |             if epoch >= Freeze_Epoch and not UnFreeze_flag and Freeze_Train:
517 |                 batch_size = Unfreeze_batch_size
518 | 
519 |                 #-------------------------------------------------------------------#
520 |                 #   判断当前batch_size，自适应调整学习率
521 |                 #-------------------------------------------------------------------#
522 |                 nbs             = 64
523 |                 lr_limit_max    = 1e-3 if optimizer_type == 'adam' else 5e-2
524 |                 lr_limit_min    = 3e-4 if optimizer_type == 'adam' else 5e-4
525 |                 Init_lr_fit     = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
526 |                 Min_lr_fit      = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
527 |                 #---------------------------------------#
528 |                 #   获得学习率下降的公式
529 |                 #---------------------------------------#
530 |                 lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
531 | 
532 |                 for param in model.backbone.parameters():
533 |                     param.requires_grad = True
534 | 
535 |                 epoch_step      = num_train // batch_size
536 |                 epoch_step_val  = num_val // batch_size
537 | 
538 |                 if epoch_step == 0 or epoch_step_val == 0:
539 |                     raise ValueError("数据集过小，无法继续进行训练，请扩充数据集。")
540 |                     
541 |                 if ema:
542 |                     ema.updates     = epoch_step * epoch
543 | 
544 |                 if distributed:
545 |                     batch_size  = batch_size // ngpus_per_node
546 |                     
547 |                 gen             = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
548 |                                             drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler, 
549 |                                             worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
550 |                 gen_val         = DataLoader(val_dataset  , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, 
551 |                                             drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler, 
552 |                                             worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
553 | 
554 |                 UnFreeze_flag   = True
555 | 
556 |             gen.dataset.epoch_now       = epoch
557 |             gen_val.dataset.epoch_now   = epoch
558 | 
559 |             if distributed:
560 |                 train_sampler.set_epoch(epoch)
561 | 
562 |             set_optimizer_lr(optimizer, lr_scheduler_func, epoch)
563 | 
564 |             fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, fp16, scaler, save_period, save_dir, local_rank)
565 |             
566 |             if distributed:
567 |                 dist.barrier()
568 | 
569 |         if local_rank == 0:
570 |             loss_history.writer.close()
571 | 


--------------------------------------------------------------------------------
/nets/yolo_training.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | from copy import deepcopy
  3 | from functools import partial
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | import torch.nn as nn
  8 | import torch.nn.functional as F
  9 | 
 10 | 
 11 | def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
 12 |     # return positive, negative label smoothing BCE targets
 13 |     return 1.0 - 0.5 * eps, 0.5 * eps
 14 | 
 15 | class YOLOLoss(nn.Module):
 16 |     def __init__(self, anchors, num_classes, input_shape, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]], label_smoothing = 0):
 17 |         super(YOLOLoss, self).__init__()
 18 |         #-----------------------------------------------------------#
 19 |         #   13x13的特征层对应的anchor是[142, 110],[192, 243],[459, 401]
 20 |         #   26x26的特征层对应的anchor是[36, 75],[76, 55],[72, 146]
 21 |         #   52x52的特征层对应的anchor是[12, 16],[19, 36],[40, 28]
 22 |         #-----------------------------------------------------------#
 23 |         self.anchors        = [anchors[mask] for mask in anchors_mask]
 24 |         self.num_classes    = num_classes
 25 |         self.input_shape    = input_shape
 26 |         self.anchors_mask   = anchors_mask
 27 | 
 28 |         self.balance        = [0.4, 1.0, 4]
 29 |         self.stride         = [32, 16, 8]
 30 |         
 31 |         self.box_ratio      = 0.05
 32 |         self.obj_ratio      = 1 * (input_shape[0] * input_shape[1]) / (640 ** 2)
 33 |         self.cls_ratio      = 0.5 * (num_classes / 80)
 34 |         self.threshold      = 4
 35 | 
 36 |         self.cp, self.cn                    = smooth_BCE(eps=label_smoothing)  
 37 |         self.BCEcls, self.BCEobj, self.gr   = nn.BCEWithLogitsLoss(), nn.BCEWithLogitsLoss(), 1
 38 | 
 39 |     def bbox_iou(self, box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
 40 |         box2 = box2.T
 41 | 
 42 |         if x1y1x2y2:
 43 |             b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
 44 |             b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
 45 |         else:
 46 |             b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
 47 |             b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
 48 |             b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
 49 |             b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2
 50 | 
 51 |         inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
 52 |                 (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)
 53 | 
 54 |         w1, h1  = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
 55 |         w2, h2  = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
 56 |         union   = w1 * h1 + w2 * h2 - inter + eps
 57 | 
 58 |         iou = inter / union
 59 | 
 60 |         if GIoU or DIoU or CIoU:
 61 |             cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)  # convex (smallest enclosing box) width
 62 |             ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)  # convex height
 63 |             if CIoU or DIoU:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
 64 |                 c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
 65 |                 rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
 66 |                         (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center distance squared
 67 |                 if DIoU:
 68 |                     return iou - rho2 / c2  # DIoU
 69 |                 elif CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
 70 |                     v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
 71 |                     with torch.no_grad():
 72 |                         alpha = v / (v - iou + (1 + eps))
 73 |                     return iou - (rho2 / c2 + v * alpha)  # CIoU
 74 |             else:  # GIoU https://arxiv.org/pdf/1902.09630.pdf
 75 |                 c_area = cw * ch + eps  # convex area
 76 |                 return iou - (c_area - union) / c_area  # GIoU
 77 |         else:
 78 |             return iou  # IoU
 79 |     
 80 |     def __call__(self, predictions, targets, imgs): 
 81 |         #-------------------------------------------#
 82 |         #   对输入进来的预测结果进行reshape
 83 |         #   bs, 255, 20, 20 => bs, 3, 20, 20, 85
 84 |         #   bs, 255, 40, 40 => bs, 3, 40, 40, 85
 85 |         #   bs, 255, 80, 80 => bs, 3, 80, 80, 85
 86 |         #-------------------------------------------#
 87 |         for i in range(len(predictions)):
 88 |             bs, _, h, w = predictions[i].size()
 89 |             predictions[i] = predictions[i].view(bs, len(self.anchors_mask[i]), -1, h, w).permute(0, 1, 3, 4, 2).contiguous()
 90 |             
 91 |         #-------------------------------------------#
 92 |         #   获得工作的设备
 93 |         #-------------------------------------------#
 94 |         device              = targets.device
 95 |         #-------------------------------------------#
 96 |         #   初始化三个部分的损失
 97 |         #-------------------------------------------#
 98 |         cls_loss, box_loss, obj_loss    = torch.zeros(1, device = device), torch.zeros(1, device = device), torch.zeros(1, device = device)
 99 |         
100 |         #-------------------------------------------#
101 |         #   进行正样本的匹配
102 |         #-------------------------------------------#
103 |         bs, as_, gjs, gis, targets, anchors = self.build_targets(predictions, targets, imgs)
104 |         #-------------------------------------------#
105 |         #   计算获得对应特征层的高宽
106 |         #-------------------------------------------#
107 |         feature_map_sizes = [torch.tensor(prediction.shape, device=device)[[3, 2, 3, 2]].type_as(prediction) for prediction in predictions] 
108 |     
109 |         #-------------------------------------------#
110 |         #   计算损失，对三个特征层各自进行处理
111 |         #-------------------------------------------#
112 |         for i, prediction in enumerate(predictions): 
113 |             #-------------------------------------------#
114 |             #   image, anchor, gridy, gridx
115 |             #-------------------------------------------#
116 |             b, a, gj, gi    = bs[i], as_[i], gjs[i], gis[i]
117 |             tobj            = torch.zeros_like(prediction[..., 0], device=device)  # target obj
118 | 
119 |             #-------------------------------------------#
120 |             #   获得目标数量，如果目标大于0
121 |             #   则开始计算种类损失和回归损失
122 |             #-------------------------------------------#
123 |             n = b.shape[0]
124 |             if n:
125 |                 prediction_pos = prediction[b, a, gj, gi]  # prediction subset corresponding to targets
126 | 
127 |                 #-------------------------------------------#
128 |                 #   计算匹配上的正样本的回归损失
129 |                 #-------------------------------------------#
130 |                 #-------------------------------------------#
131 |                 #   grid 获得正样本的x、y轴坐标
132 |                 #-------------------------------------------#
133 |                 grid    = torch.stack([gi, gj], dim=1)
134 |                 #-------------------------------------------#
135 |                 #   进行解码，获得预测结果
136 |                 #-------------------------------------------#
137 |                 xy      = prediction_pos[:, :2].sigmoid() * 2. - 0.5
138 |                 wh      = (prediction_pos[:, 2:4].sigmoid() * 2) ** 2 * anchors[i]
139 |                 box     = torch.cat((xy, wh), 1)
140 |                 #-------------------------------------------#
141 |                 #   对真实框进行处理，映射到特征层上
142 |                 #-------------------------------------------#
143 |                 selected_tbox           = targets[i][:, 2:6] * feature_map_sizes[i]
144 |                 selected_tbox[:, :2]    -= grid.type_as(prediction)
145 |                 #-------------------------------------------#
146 |                 #   计算预测框和真实框的回归损失
147 |                 #-------------------------------------------#
148 |                 iou                     = self.bbox_iou(box.T, selected_tbox, x1y1x2y2=False, CIoU=True)
149 |                 box_loss                += (1.0 - iou).mean()
150 |                 #-------------------------------------------#
151 |                 #   根据预测结果的iou获得置信度损失的gt
152 |                 #-------------------------------------------#
153 |                 tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * iou.detach().clamp(0).type(tobj.dtype)  # iou ratio
154 | 
155 |                 #-------------------------------------------#
156 |                 #   计算匹配上的正样本的分类损失
157 |                 #-------------------------------------------#
158 |                 selected_tcls               = targets[i][:, 1].long()
159 |                 t                           = torch.full_like(prediction_pos[:, 5:], self.cn, device=device)  # targets
160 |                 t[range(n), selected_tcls]  = self.cp
161 |                 cls_loss                    += self.BCEcls(prediction_pos[:, 5:], t)  # BCE
162 | 
163 |             #-------------------------------------------#
164 |             #   计算目标是否存在的置信度损失
165 |             #   并且乘上每个特征层的比例
166 |             #-------------------------------------------#
167 |             obj_loss += self.BCEobj(prediction[..., 4], tobj) * self.balance[i]  # obj loss
168 |             
169 |         #-------------------------------------------#
170 |         #   将各个部分的损失乘上比例
171 |         #   全加起来后，乘上batch_size
172 |         #-------------------------------------------#
173 |         box_loss    *= self.box_ratio
174 |         obj_loss    *= self.obj_ratio
175 |         cls_loss    *= self.cls_ratio
176 |         bs          = tobj.shape[0]
177 |         
178 |         loss    = box_loss + obj_loss + cls_loss
179 |         return loss
180 |         
181 |     def xywh2xyxy(self, x):
182 |         # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2]
183 |         y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
184 |         y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
185 |         y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
186 |         y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
187 |         y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
188 |         return y
189 |     
190 |     def box_iou(self, box1, box2):
191 |         # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
192 |         """
193 |         Return intersection-over-union (Jaccard index) of boxes.
194 |         Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
195 |         Arguments:
196 |             box1 (Tensor[N, 4])
197 |             box2 (Tensor[M, 4])
198 |         Returns:
199 |             iou (Tensor[N, M]): the NxM matrix containing the pairwise
200 |                 IoU values for every element in boxes1 and boxes2
201 |         """
202 |         def box_area(box):
203 |             # box = 4xn
204 |             return (box[2] - box[0]) * (box[3] - box[1])
205 | 
206 |         area1 = box_area(box1.T)
207 |         area2 = box_area(box2.T)
208 | 
209 |         # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
210 |         inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
211 |         return inter / (area1[:, None] + area2 - inter)  # iou = inter / (area1 + area2 - inter)
212 | 
    def build_targets(self, predictions, targets, imgs):
        """Pick the final positive samples for every prediction layer.

        Candidates come from ``self.find_3_positive``. For each image in the
        batch, the candidates of all layers are pooled, a cost (classification
        loss + 3 * IoU loss) is computed between every ground-truth box and
        every candidate, each ground-truth box keeps its ``dynamic_k`` cheapest
        candidates, and a candidate claimed by several boxes is given to the
        cheapest one. The survivors are then split back per layer.

        Args:
            predictions: sequence of per-layer tensors, indexed below as
                ``prediction[b, a, gj, gi]``; the last dimension is read as
                ``[box(4), objectness(1), class scores...]``.
            targets: 2-D tensor; column 0 is compared with the batch index,
                column 1 is used as the class id and columns 2:6 as a box that
                is multiplied by the image size.
                NOTE(review): presumably normalised ``[x, y, w, h]`` - confirm
                against the data loader.
            imgs: indexable by batch index; only ``imgs[batch_idx].shape[1]``
                is read, as the scale factor for the target boxes.
                NOTE(review): a single scale is applied to x and y, so this
                looks like it assumes square inputs - confirm.

        Returns:
            Six lists with one entry per prediction layer:
            ``(matching_bs, matching_as, matching_gjs, matching_gis,
            matching_targets, matching_anchs)`` - image index, anchor index,
            grid y, grid x, matched target rows and matched anchors. A layer
            without any match gets an empty ``torch.Tensor`` for each entry.
        """
        #-------------------------------------------#
        #   Find the candidate positive samples
        #-------------------------------------------#
        indices, anch       = self.find_3_positive(predictions, targets)

        matching_bs         = [[] for _ in predictions]
        matching_as         = [[] for _ in predictions]
        matching_gjs        = [[] for _ in predictions]
        matching_gis        = [[] for _ in predictions]
        matching_targets    = [[] for _ in predictions]
        matching_anchs      = [[] for _ in predictions]

        #-------------------------------------------#
        #   Number of prediction layers
        #   (the original note says there are three)
        #-------------------------------------------#
        num_layer = len(predictions)
        #-------------------------------------------#
        #   Loop over the batch and run the OTA-style
        #   matching per image; the layers are looped
        #   over inside the batch loop
        #-------------------------------------------#
        for batch_idx in range(predictions[0].shape[0]):
            #-------------------------------------------#
            #   Select the ground-truth boxes that belong
            #   to this image
            #-------------------------------------------#
            b_idx       = targets[:, 0]==batch_idx
            this_target = targets[b_idx]
            #-------------------------------------------#
            #   Skip the image if it has no ground-truth box
            #-------------------------------------------#
            if this_target.shape[0] == 0:
                continue

            #-------------------------------------------#
            #   Scale the ground-truth box coordinates
            #   by the image size
            #-------------------------------------------#
            txywh = this_target[:, 2:6] * imgs[batch_idx].shape[1]
            #-------------------------------------------#
            #   Centre/width/height -> top-left/bottom-right
            #-------------------------------------------#
            txyxy = self.xywh2xyxy(txywh)

            pxyxys      = []
            p_cls       = []
            p_obj       = []
            from_which_layer = []
            all_b       = []
            all_a       = []
            all_gj      = []
            all_gi      = []
            all_anch    = []

            #-------------------------------------------#
            #   Loop over the prediction layers
            #-------------------------------------------#
            for i, prediction in enumerate(predictions):
                #-------------------------------------------#
                #   b  : image index   a  : anchor index
                #   gj : y axis        gi : x axis
                #   Only the candidates of the current image
                #   are kept
                #-------------------------------------------#
                b, a, gj, gi    = indices[i]
                idx             = (b == batch_idx)
                b, a, gj, gi    = b[idx], a[idx], gj[idx], gi[idx]

                all_b.append(b)
                all_a.append(a)
                all_gj.append(gj)
                all_gi.append(gi)
                all_anch.append(anch[i][idx])
                # Remember which layer every candidate came from (built
                # without a device argument; moved to the mask's device later).
                from_which_layer.append(torch.ones(size=(len(b),)) * i)

                #-------------------------------------------#
                #   Fetch the predictions at the candidate
                #   locations
                #-------------------------------------------#
                fg_pred = prediction[b, a, gj, gi]
                p_obj.append(fg_pred[:, 4:5])
                p_cls.append(fg_pred[:, 5:])

                #-------------------------------------------#
                #   Decode the boxes from the grid cells and
                #   scale them by the layer stride
                #-------------------------------------------#
                grid    = torch.stack([gi, gj], dim=1).type_as(fg_pred)
                pxy     = (fg_pred[:, :2].sigmoid() * 2. - 0.5 + grid) * self.stride[i]
                pwh     = (fg_pred[:, 2:4].sigmoid() * 2) ** 2 * anch[i][idx] * self.stride[i]
                pxywh   = torch.cat([pxy, pwh], dim=-1)
                pxyxy   = self.xywh2xyxy(pxywh)
                pxyxys.append(pxyxy)

            #-------------------------------------------#
            #   Skip the image if no candidate prediction
            #   exists for it
            #-------------------------------------------#
            pxyxys = torch.cat(pxyxys, dim=0)
            if pxyxys.shape[0] == 0:
                continue

            #-------------------------------------------#
            #   Concatenate the per-layer pieces
            #-------------------------------------------#
            p_obj       = torch.cat(p_obj, dim=0)
            p_cls       = torch.cat(p_cls, dim=0)
            from_which_layer = torch.cat(from_which_layer, dim=0)
            all_b       = torch.cat(all_b, dim=0)
            all_a       = torch.cat(all_a, dim=0)
            all_gj      = torch.cat(all_gj, dim=0)
            all_gi      = torch.cat(all_gi, dim=0)
            all_anch    = torch.cat(all_anch, dim=0)

            #-------------------------------------------------------------#
            #   Overlap between the ground-truth boxes and the candidate
            #   predictions of this image.
            #   IoU lies in [0, 1], so -log(IoU) lies in [0, inf):
            #   the larger the overlap, the smaller pair_wise_iou_loss.
            #-------------------------------------------------------------#
            pair_wise_iou       = self.box_iou(txyxy, pxyxys)
            pair_wise_iou_loss  = -torch.log(pair_wise_iou + 1e-8)

            #-------------------------------------------#
            #   Take at most the 20 largest IoUs of every
            #   ground-truth box and sum them; the integer
            #   part of the sum (at least 1) is how many
            #   predictions that box will be given
            #-------------------------------------------#
            top_k, _    = torch.topk(pair_wise_iou, min(20, pair_wise_iou.shape[1]), dim=1)
            dynamic_ks  = torch.clamp(top_k.sum(1).int(), min=1)

            #-------------------------------------------#
            #   gt_cls_per_image: one-hot class labels of
            #   the ground-truth boxes, repeated for every
            #   candidate prediction
            #-------------------------------------------#
            gt_cls_per_image = F.one_hot(this_target[:, 1].to(torch.int64), self.num_classes).float().unsqueeze(1).repeat(1, pxyxys.shape[0], 1)

            #-------------------------------------------#
            #   cls_preds_: predicted class confidence
            #       (class probability * objectness).
            #       The closer cls_preds_ is to 1, the
            #       closer y is to 1 and the larger
            #       y / (1 - y) becomes, i.e. the better
            #       the class prediction, the smaller
            #       pair_wise_cls_loss.
            #   The in-place ops act on the copies made by
            #   repeat(), and y aliases cls_preds_.
            #   NOTE(review): y == 0 or y == 1 makes the
            #   log-odds infinite - confirm this cannot
            #   produce NaN costs in practice.
            #-------------------------------------------#
            num_gt              = this_target.shape[0]
            cls_preds_          = p_cls.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() * p_obj.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
            y                   = cls_preds_.sqrt_()
            pair_wise_cls_loss  = F.binary_cross_entropy_with_logits(torch.log(y / (1 - y)), gt_cls_per_image, reduction="none").sum(-1)
            del cls_preds_

            #-------------------------------------------#
            #   Total cost
            #-------------------------------------------#
            cost = (
                pair_wise_cls_loss
                + 3.0 * pair_wise_iou_loss
            )

            #-------------------------------------------#
            #   For every ground-truth box, mark the k
            #   predictions with the smallest cost
            #-------------------------------------------#
            matching_matrix = torch.zeros_like(cost)
            for gt_idx in range(num_gt):
                _, pos_idx = torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)
                matching_matrix[gt_idx][pos_idx] = 1.0

            del top_k, dynamic_ks

            #-------------------------------------------#
            #   If a prediction was assigned to several
            #   ground-truth boxes, keep only the box with
            #   the smallest cost for that prediction
            #-------------------------------------------#
            anchor_matching_gt = matching_matrix.sum(0)
            if (anchor_matching_gt > 1).sum() > 0:
                _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
                matching_matrix[:, anchor_matching_gt > 1]          *= 0.0
                matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0
            fg_mask_inboxes = matching_matrix.sum(0) > 0.0
            matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)

            #-------------------------------------------#
            #   Keep only the predictions that were matched
            #   and the ground-truth row each one matched
            #-------------------------------------------#
            from_which_layer    = from_which_layer.to(fg_mask_inboxes.device)[fg_mask_inboxes]
            all_b               = all_b[fg_mask_inboxes]
            all_a               = all_a[fg_mask_inboxes]
            all_gj              = all_gj[fg_mask_inboxes]
            all_gi              = all_gi[fg_mask_inboxes]
            all_anch            = all_anch[fg_mask_inboxes]
            this_target         = this_target[matched_gt_inds]

            # Split the matched samples back into their source layers.
            for i in range(num_layer):
                layer_idx = from_which_layer == i
                matching_bs[i].append(all_b[layer_idx])
                matching_as[i].append(all_a[layer_idx])
                matching_gjs[i].append(all_gj[layer_idx])
                matching_gis[i].append(all_gi[layer_idx])
                matching_targets[i].append(this_target[layer_idx])
                matching_anchs[i].append(all_anch[layer_idx])

        # Merge the per-image pieces of every layer; a layer that never received
        # a piece falls back to an empty torch.Tensor built from the empty list.
        for i in range(num_layer):
            matching_bs[i]      = torch.cat(matching_bs[i], dim=0) if len(matching_bs[i]) != 0 else torch.Tensor(matching_bs[i])
            matching_as[i]      = torch.cat(matching_as[i], dim=0) if len(matching_as[i]) != 0 else torch.Tensor(matching_as[i])
            matching_gjs[i]     = torch.cat(matching_gjs[i], dim=0) if len(matching_gjs[i]) != 0 else torch.Tensor(matching_gjs[i])
            matching_gis[i]     = torch.cat(matching_gis[i], dim=0) if len(matching_gis[i]) != 0 else torch.Tensor(matching_gis[i])
            matching_targets[i] = torch.cat(matching_targets[i], dim=0) if len(matching_targets[i]) != 0 else torch.Tensor(matching_targets[i])
            matching_anchs[i]   = torch.cat(matching_anchs[i], dim=0) if len(matching_anchs[i]) != 0 else torch.Tensor(matching_anchs[i])

        return matching_bs, matching_as, matching_gjs, matching_gis, matching_targets, matching_anchs
413 | 
414 |     def find_3_positive(self, predictions, targets):
415 |         #------------------------------------#
416 |         #   获得每个特征层先验框的数量
417 |         #   与真实框的数量
418 |         #------------------------------------#
419 |         num_anchor, num_gt  = len(self.anchors_mask[0]), targets.shape[0] 
420 |         #------------------------------------#
421 |         #   创建空列表存放indices和anchors
422 |         #------------------------------------#
423 |         indices, anchors    = [], []
424 |         #------------------------------------#
425 |         #   创建7个1
426 |         #   序号0,1为1
427 |         #   序号2:6为特征层的高宽
428 |         #   序号6为1
429 |         #------------------------------------#
430 |         gain    = torch.ones(7, device=targets.device)
431 |         #------------------------------------#
432 |         #   ai      [num_anchor, num_gt]
433 |         #   targets [num_gt, 6] => [num_anchor, num_gt, 7]
434 |         #------------------------------------#
435 |         ai      = torch.arange(num_anchor, device=targets.device).float().view(num_anchor, 1).repeat(1, num_gt)
436 |         targets = torch.cat((targets.repeat(num_anchor, 1, 1), ai[:, :, None]), 2)  # append anchor indices
437 | 
438 |         g   = 0.5 # offsets
439 |         off = torch.tensor([
440 |             [0, 0],
441 |             [1, 0], [0, 1], [-1, 0], [0, -1],  # j,k,l,m
442 |             # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
443 |         ], device=targets.device).float() * g 
444 | 
445 |         for i in range(len(predictions)):
446 |             #----------------------------------------------------#
447 |             #   将先验框除以stride，获得相对于特征层的先验框。
448 |             #   anchors_i [num_anchor, 2]
449 |             #----------------------------------------------------#
450 |             anchors_i = torch.from_numpy(self.anchors[i] / self.stride[i]).type_as(predictions[i])
451 |             anchors_i, shape = torch.from_numpy(self.anchors[i] / self.stride[i]).type_as(predictions[i]), predictions[i].shape
452 |             #-------------------------------------------#
453 |             #   计算获得对应特征层的高宽
454 |             #-------------------------------------------#
455 |             gain[2:6] = torch.tensor(predictions[i].shape)[[3, 2, 3, 2]]
456 |             
457 |             #-------------------------------------------#
458 |             #   将真实框乘上gain，
459 |             #   其实就是将真实框映射到特征层上
460 |             #-------------------------------------------#
461 |             t = targets * gain
462 |             if num_gt:
463 |                 #-------------------------------------------#
464 |                 #   计算真实框与先验框高宽的比值
465 |                 #   然后根据比值大小进行判断，
466 |                 #   判断结果用于取出，获得所有先验框对应的真实框
467 |                 #   r   [num_anchor, num_gt, 2]
468 |                 #   t   [num_anchor, num_gt, 7] => [num_matched_anchor, 7]
469 |                 #-------------------------------------------#
470 |                 r = t[:, :, 4:6] / anchors_i[:, None]
471 |                 j = torch.max(r, 1. / r).max(2)[0] < self.threshold
472 |                 t = t[j]  # filter
473 |                 
474 |                 #-------------------------------------------#
475 |                 #   gxy 获得所有先验框对应的真实框的x轴y轴坐标
476 |                 #   gxi 取相对于该特征层的右小角的坐标
477 |                 #-------------------------------------------#
478 |                 gxy     = t[:, 2:4] # grid xy
479 |                 gxi     = gain[[2, 3]] - gxy # inverse
480 |                 j, k    = ((gxy % 1. < g) & (gxy > 1.)).T
481 |                 l, m    = ((gxi % 1. < g) & (gxi > 1.)).T
482 |                 j       = torch.stack((torch.ones_like(j), j, k, l, m))
483 |                 
484 |                 #-------------------------------------------#
485 |                 #   t   重复5次，使用满足条件的j进行框的提取
486 |                 #   j   一共五行，代表当前特征点在五个
487 |                 #       [0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]
488 |                 #       方向是否存在
489 |                 #-------------------------------------------#
490 |                 t       = t.repeat((5, 1, 1))[j]
491 |                 offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
492 |             else:
493 |                 t = targets[0]
494 |                 offsets = 0
495 | 
496 |             #-------------------------------------------#
497 |             #   b   代表属于第几个图片
498 |             #   gxy 代表该真实框所处的x、y中心坐标
499 |             #   gwh 代表该真实框的wh坐标
500 |             #   gij 代表真实框所属的特征点坐标
501 |             #-------------------------------------------#
502 |             b, c    = t[:, :2].long().T  # image, class
503 |             gxy     = t[:, 2:4]  # grid xy
504 |             gwh     = t[:, 4:6]  # grid wh
505 |             gij     = (gxy - offsets).long()
506 |             gi, gj  = gij.T  # grid xy indices
507 | 
508 |             #-------------------------------------------#
509 |             #   gj、gi不能超出特征层范围
510 |             #   a代表属于该特征点的第几个先验框
511 |             #-------------------------------------------#
512 |             a = t[:, 6].long()  # anchor indices
513 |             indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, anchor, grid indices
514 |             anchors.append(anchors_i[a])  # anchors
515 | 
516 |         return indices, anchors
517 | 
def is_parallel(model):
    """Tell whether ``model`` is a DataParallel / DistributedDataParallel wrapper.

    The comparison is made on the exact type, so subclasses of the two
    wrappers are not reported as parallel.
    """
    wrapper_types = (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
    return any(type(model) is wrapper for wrapper in wrapper_types)
521 | 
def de_parallel(model):
    """Strip a DP/DDP wrapper from ``model``.

    Returns ``model.module`` when ``is_parallel(model)`` is true and the model
    itself otherwise.
    """
    if is_parallel(model):
        return model.module
    return model
525 |     
def copy_attr(a, b, include=(), exclude=()):
    """Copy instance attributes from ``b`` onto ``a``.

    An attribute is skipped when ``include`` is non-empty and does not list
    it, when its name starts with an underscore, or when ``exclude`` lists it.
    """
    for name, value in b.__dict__.items():
        not_selected = len(include) and name not in include
        if not (not_selected or name.startswith('_') or name in exclude):
            setattr(a, name, value)
533 | 
class ModelEMA:
    """Exponential moving average (EMA) of a model's state.

    Updated EMA from https://github.com/rwightman/pytorch-image-models.
    A deep copy of the (de-parallelised) model is kept in eval mode and every
    floating-point entry of its state_dict (parameters and buffers) is blended
    towards the live model on each ``update`` call.
    For EMA details see
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
    """

    def __init__(self, model, decay=0.9999, tau=2000, updates=0):
        """Create the averaged copy.

        Args:
            model: model to track; a DP/DDP wrapper is removed first.
            decay: upper bound of the decay factor.
            tau: time constant of the exponential ramp applied to ``decay``.
            updates: number of EMA updates already performed.
        """
        def ramped_decay(x):
            # Exponential ramp of the decay factor (to help early epochs).
            return decay * (1 - math.exp(-x / tau))

        self.ema = deepcopy(de_parallel(model)).eval()  # FP32 EMA
        self.updates = updates  # number of EMA updates
        self.decay = ramped_decay
        # The averaged weights are never trained directly.
        for param in self.ema.parameters():
            param.requires_grad_(False)

    def update(self, model):
        """Blend the current state of ``model`` into the averaged copy."""
        with torch.no_grad():
            self.updates += 1
            d = self.decay(self.updates)

            live_state = de_parallel(model).state_dict()  # model state_dict
            for key, averaged in self.ema.state_dict().items():
                # Non floating-point entries are left as they are.
                if not averaged.dtype.is_floating_point:
                    continue
                averaged *= d
                averaged += (1 - d) * live_state[key].detach()

    def update_attr(self, model, include=(), exclude=('process_group', 'reducer')):
        """Copy plain attributes from ``model`` onto the averaged copy."""
        copy_attr(self.ema, model, include, exclude)
565 | 
def weights_init(net, init_type='normal', init_gain = 0.02):
    """Re-initialise the conv and BatchNorm2d weights of ``net`` in place.

    Modules whose class name contains ``Conv`` and that have a ``weight``
    attribute are initialised according to ``init_type`` (``normal``,
    ``xavier``, ``kaiming`` or ``orthogonal``). Modules whose class name
    contains ``BatchNorm2d`` get weights drawn from N(1.0, 0.02) and zero bias.

    Raises:
        NotImplementedError: when a conv module is reached and ``init_type``
            is not one of the supported names.
    """
    conv_initializers = {
        'normal':       lambda weight: torch.nn.init.normal_(weight, 0.0, init_gain),
        'xavier':       lambda weight: torch.nn.init.xavier_normal_(weight, gain=init_gain),
        'kaiming':      lambda weight: torch.nn.init.kaiming_normal_(weight, a=0, mode='fan_in'),
        'orthogonal':   lambda weight: torch.nn.init.orthogonal_(weight, gain=init_gain),
    }

    def init_func(m):
        classname = m.__class__.__name__
        if hasattr(m, 'weight') and 'Conv' in classname:
            # The name is only validated once a conv module is actually met.
            if init_type not in conv_initializers:
                raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
            conv_initializers[init_type](m.weight.data)
        elif 'BatchNorm2d' in classname:
            torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
            torch.nn.init.constant_(m.bias.data, 0.0)

    print('initialize network with %s type' % init_type)
    net.apply(init_func)
585 | 
def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio = 0.05, warmup_lr_ratio = 0.1, no_aug_iter_ratio = 0.05, step_num = 10):
    """Build a function that maps an iteration index to a learning rate.

    Args:
        lr_decay_type: ``"cos"`` selects the warm-up + cosine schedule; any
            other value selects the step schedule.
        lr: peak (initial) learning rate.
        min_lr: final learning rate.
        total_iters: total number of scheduler iterations.
        warmup_iters_ratio: share of ``total_iters`` used for warm-up
            (the result is limited to the range 1..3).
        warmup_lr_ratio: warm-up start rate as a fraction of ``lr``
            (never below 1e-6).
        no_aug_iter_ratio: share of ``total_iters`` at the end that is held
            at ``min_lr`` (the result is limited to the range 1..15).
        step_num: number of steps of the step schedule.

    Returns:
        A ``functools.partial`` that takes the current iteration and returns
        the learning rate for it.
    """
    def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters):
        # Quadratic warm-up from warmup_lr_start to lr.
        if iters <= warmup_total_iters:
            return (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2) + warmup_lr_start
        # Final iterations are held at the minimum rate.
        if iters >= total_iters - no_aug_iter:
            return min_lr
        # Cosine decay from lr to min_lr in between.
        angle = math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iter)
        return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(angle))

    def step_lr(lr, decay_rate, step_size, iters):
        if step_size < 1:
            raise ValueError("step_size must above 1.")
        steps_done = iters // step_size
        return lr * decay_rate ** steps_done

    if lr_decay_type != "cos":
        # Choose the per-step decay so that the last step lands on min_lr.
        decay_rate  = (min_lr / lr) ** (1 / (step_num - 1))
        step_size   = total_iters / step_num
        return partial(step_lr, lr, decay_rate, step_size)

    warmup_total_iters  = min(max(warmup_iters_ratio * total_iters, 1), 3)
    warmup_lr_start     = max(warmup_lr_ratio * lr, 1e-6)
    no_aug_iter         = min(max(no_aug_iter_ratio * total_iters, 1), 15)
    return partial(yolox_warm_cos_lr, lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter)
623 | 
def set_optimizer_lr(optimizer, lr_scheduler_func, epoch):
    """Write the scheduled learning rate for ``epoch`` into every param group.

    ``lr_scheduler_func`` is called once with ``epoch`` and its result is
    stored under the ``'lr'`` key of each entry in ``optimizer.param_groups``.
    """
    scheduled_lr = lr_scheduler_func(epoch)
    for group in optimizer.param_groups:
        group['lr'] = scheduled_lr
628 | 


--------------------------------------------------------------------------------