├── README.md ├── faster_rcnn ├── README.md ├── __pycache__ │ ├── my_dataset.cpython-37.pyc │ └── transforms.cpython-37.pyc ├── backbone │ ├── __pycache__ │ │ └── resnet50_fpn_model.cpython-37.pyc │ ├── mobilenetv2_model.py │ ├── resnet50_fpn_model.py │ └── vgg_model.py ├── draw_box_utils.py ├── fasterRCNN.png ├── my_dataset.py ├── network_files │ ├── __pycache__ │ │ ├── boxes.cpython-37.pyc │ │ ├── det_utils.cpython-37.pyc │ │ ├── faster_rcnn_framework.cpython-37.pyc │ │ ├── image_list.cpython-37.pyc │ │ ├── roi_head.cpython-37.pyc │ │ ├── rpn_function.cpython-37.pyc │ │ └── transform.cpython-37.pyc │ ├── boxes.py │ ├── det_utils.py │ ├── faster_rcnn_framework.py │ ├── image_list.py │ ├── roi_head.py │ ├── rpn_function.py │ └── transform.py ├── pascal_voc_classes.json ├── predict.py ├── spilt_data.py ├── train_mobilenet.py ├── train_multi_GPU.py ├── train_res50_fpn.py ├── train_utils │ ├── __pycache__ │ │ ├── coco_eval.cpython-37.pyc │ │ ├── coco_utils.cpython-37.pyc │ │ └── train_eval_utils.cpython-37.pyc │ ├── coco_eval.py │ ├── coco_utils.py │ ├── group_by_aspect_ratio.py │ └── train_eval_utils.py └── transforms.py └── ssd ├── README.md ├── __pycache__ ├── draw_box_utils.cpython-37.pyc ├── my_dataset.cpython-37.pyc └── transform.cpython-37.pyc ├── draw_box_utils.py ├── my_dataset.py ├── pascal_voc_classes.json ├── plot_curve.py ├── predict_test.py ├── res50_ssd.png ├── src ├── __pycache__ │ ├── res50_backbone.cpython-37.pyc │ ├── ssd_model.cpython-37.pyc │ └── utils.cpython-37.pyc ├── res50_backbone.py ├── ssd_model.py └── utils.py ├── train_multi_GPU.py ├── train_ssd300.py ├── train_utils ├── __pycache__ │ ├── coco_eval.cpython-37.pyc │ ├── coco_utils.cpython-37.pyc │ └── train_eval_utils.cpython-37.pyc ├── coco_eval.py ├── coco_utils.py ├── group_by_aspect_ratio.py └── train_eval_utils.py └── transform.py /README.md: -------------------------------------------------------------------------------- 1 | # object_detection 2 | -------------------------------------------------------------------------------- /faster_rcnn/README.md: -------------------------------------------------------------------------------- 1 | # Faster R-CNN 2 | ## 环境配置: 3 | * Python3.6或者3.7 4 | * Pytorch1.5(注意:是1.5) 5 | * pycocotools(Linux: pip install pycocotools; 6 | Windows:pip install pycocotools-windows(不需要额外安装vs)) 7 | * Ubuntu或Centos(不建议Windows) 8 | * 最好使用GPU训练 9 | 10 | ## 文件结构: 11 | ``` 12 | * ├── backbone: 特征提取网络,可以根据自己的要求选择 13 | * ├── network_files: Faster R-CNN网络(包括Fast R-CNN以及RPN等模块) 14 | * ├── train_utils: 训练验证相关模块(包括cocotools) 15 | * ├── my_dataset.py: 自定义dataset用于读取VOC数据集 16 | * ├── train_mobilenet.py: 以MobileNetV2做为backbone进行训练 17 | * ├── train_resnet50_fpn.py: 以resnet50+FPN做为backbone进行训练 18 | * ├── train_multi_GPU.py: 针对使用多GPU的用户使用 19 | * ├── predict.py: 简易的预测脚本,使用训练好的权重进行预测测试 20 | * ├── pascal_voc_classes.json: pascal_voc标签文件 21 | ``` 22 | 23 | ## 预训练权重下载地址(下载后放入backbone文件夹中): 24 | * MobileNetV2 backbone: https://download.pytorch.org/models/mobilenet_v2-b0353104.pth 25 | * ResNet50+FPN backbone: https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth 26 | 27 | 28 | ## 数据集,本例程使用的是PASCAL VOC2012数据集(下载后放入项目当前文件夹中) 29 | * Pascal VOC2012 train/val数据集下载地址:http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 30 | * 如果需要使用Pascal VOC2012 test数据集请参考:https://pjreddie.com/projects/pascal-voc-dataset-mirror/ 31 | 32 | 33 | ## 训练方法 34 | * 确保提前准备好数据集 35 | * 确保提前下载好对应预训练模型权重 36 | * 若要训练mobilenetv2+fasterrcnn,直接使用train_mobilenet.py训练脚本 37 | * 若要训练resnet50+fpn+fasterrcnn,直接使用train_resnet50_fpn.py训练脚本 38 | * 若要使用多GPU训练,使用 "python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py" 指令,nproc_per_node参数为使用GPU数量 39 | 40 | ## Faster RCNN框架图 41 | ![Faster R-CNN](https://github.com/WZMIAOMIAO/deep-learning-for-image-processing/raw/master/pytorch_object_detection/faster_rcnn/fasterRCNN.png) 42 | -------------------------------------------------------------------------------- /faster_rcnn/__pycache__/my_dataset.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/__pycache__/my_dataset.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/__pycache__/transforms.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/__pycache__/transforms.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/backbone/__pycache__/resnet50_fpn_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/backbone/__pycache__/resnet50_fpn_model.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/backbone/mobilenetv2_model.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from torchvision.ops import misc 4 | 5 | 6 | def _make_divisible(ch, divisor=8, min_ch=None): 7 | """ 8 | This function is taken from the original tf repo. 9 | It ensures that all layers have a channel number that is divisible by 8 10 | It can be seen here: 11 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 12 | """ 13 | if min_ch is None: 14 | min_ch = divisor 15 | new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor) 16 | # Make sure that round down does not go down by more than 10%. 17 | if new_ch < 0.9 * ch: 18 | new_ch += divisor 19 | return new_ch 20 | 21 | 22 | class ConvBNReLU(nn.Sequential): 23 | def __init__(self, in_channel, out_channel, kernel_size=3, stride=1, groups=1, norm_layer=None): 24 | padding = (kernel_size - 1) // 2 25 | if norm_layer is None: 26 | norm_layer = nn.BatchNorm2d 27 | super(ConvBNReLU, self).__init__( 28 | nn.Conv2d(in_channel, out_channel, kernel_size, stride, padding, groups=groups, bias=False), 29 | norm_layer(out_channel), 30 | nn.ReLU6(inplace=True) 31 | ) 32 | 33 | 34 | class InvertedResidual(nn.Module): 35 | def __init__(self, in_channel, out_channel, stride, expand_ratio, norm_layer=None): 36 | super(InvertedResidual, self).__init__() 37 | hidden_channel = in_channel * expand_ratio 38 | self.use_shortcut = stride == 1 and in_channel == out_channel 39 | if norm_layer is None: 40 | norm_layer = nn.BatchNorm2d 41 | 42 | layers = [] 43 | if expand_ratio != 1: 44 | # 1x1 pointwise conv 45 | layers.append(ConvBNReLU(in_channel, hidden_channel, kernel_size=1, norm_layer=norm_layer)) 46 | layers.extend([ 47 | # 3x3 depthwise conv 48 | ConvBNReLU(hidden_channel, hidden_channel, stride=stride, groups=hidden_channel, norm_layer=norm_layer), 49 | # 1x1 pointwise conv(linear) 50 | nn.Conv2d(hidden_channel, out_channel, kernel_size=1, bias=False), 51 | norm_layer(out_channel), 52 | ]) 53 | 54 | self.conv = nn.Sequential(*layers) 55 | 56 | def forward(self, x): 57 | if self.use_shortcut: 58 | return x + self.conv(x) 59 | else: 60 | return self.conv(x) 61 | 62 | 63 | class MobileNetV2(nn.Module): 64 | def __init__(self, num_classes=1000, alpha=1.0, round_nearest=8, weights_path=None, norm_layer=None): 65 | super(MobileNetV2, self).__init__() 66 | block = InvertedResidual 67 | input_channel = _make_divisible(32 * alpha, round_nearest) 68 | last_channel = _make_divisible(1280 * alpha, round_nearest) 69 | 70 | if norm_layer is None: 71 | norm_layer = nn.BatchNorm2d 72 | 73 | inverted_residual_setting = [ 74 | # t, c, n, s 75 | [1, 16, 1, 1], 76 | [6, 24, 2, 2], 77 | [6, 32, 3, 2], 78 | [6, 64, 4, 2], 79 | [6, 96, 3, 1], 80 | [6, 160, 3, 2], 81 | [6, 320, 1, 1], 82 | ] 83 | 84 | features = [] 85 | # conv1 layer 86 | features.append(ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)) 87 | # building inverted residual residual blockes 88 | for t, c, n, s in inverted_residual_setting: 89 | output_channel = _make_divisible(c * alpha, round_nearest) 90 | for i in range(n): 91 | stride = s if i == 0 else 1 92 | features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer)) 93 | input_channel = output_channel 94 | # building last several layers 95 | features.append(ConvBNReLU(input_channel, last_channel, 1, norm_layer=norm_layer)) 96 | # combine feature layers 97 | self.features = nn.Sequential(*features) 98 | 99 | # building classifier 100 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 101 | self.classifier = nn.Sequential( 102 | nn.Dropout(0.2), 103 | nn.Linear(last_channel, num_classes) 104 | ) 105 | 106 | if weights_path is None: 107 | # weight initialization 108 | for m in self.modules(): 109 | if isinstance(m, nn.Conv2d): 110 | nn.init.kaiming_normal_(m.weight, mode='fan_out') 111 | if m.bias is not None: 112 | nn.init.zeros_(m.bias) 113 | elif isinstance(m, nn.BatchNorm2d): 114 | nn.init.ones_(m.weight) 115 | nn.init.zeros_(m.bias) 116 | elif isinstance(m, nn.Linear): 117 | nn.init.normal_(m.weight, 0, 0.01) 118 | nn.init.zeros_(m.bias) 119 | else: 120 | self.load_state_dict(torch.load(weights_path)) 121 | 122 | def forward(self, x): 123 | x = self.features(x) 124 | x = self.avgpool(x) 125 | x = torch.flatten(x, 1) 126 | x = self.classifier(x) 127 | return x 128 | -------------------------------------------------------------------------------- /faster_rcnn/backbone/resnet50_fpn_model.py: -------------------------------------------------------------------------------- 1 | from torchvision.ops import misc 2 | import torch.nn as nn 3 | import torch 4 | from torch import Tensor 5 | from collections import OrderedDict 6 | import torch.nn.functional as F 7 | 8 | from torch.jit.annotations import Tuple, List, Dict 9 | 10 | 11 | class Bottleneck(nn.Module): 12 | expansion = 4 13 | 14 | def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None): 15 | super(Bottleneck, self).__init__() 16 | if norm_layer is None: 17 | norm_layer = nn.BatchNorm2d 18 | 19 | self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel, 20 | kernel_size=1, stride=1, bias=False) # squeeze channels 21 | self.bn1 = norm_layer(out_channel) 22 | # ----------------------------------------- 23 | self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel, 24 | kernel_size=3, stride=stride, bias=False, padding=1) 25 | self.bn2 = norm_layer(out_channel) 26 | # ----------------------------------------- 27 | self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel * self.expansion, 28 | kernel_size=1, stride=1, bias=False) # unsqueeze channels 29 | self.bn3 = norm_layer(out_channel * self.expansion) 30 | self.relu = nn.ReLU(inplace=True) 31 | self.downsample = downsample 32 | 33 | def forward(self, x): 34 | identity = x 35 | if self.downsample is not None: 36 | identity = self.downsample(x) 37 | 38 | out = self.conv1(x) 39 | out = self.bn1(out) 40 | out = self.relu(out) 41 | 42 | out = self.conv2(out) 43 | out = self.bn2(out) 44 | out = self.relu(out) 45 | 46 | out = self.conv3(out) 47 | out = self.bn3(out) 48 | 49 | out += identity 50 | out = self.relu(out) 51 | 52 | return out 53 | 54 | 55 | class ResNet(nn.Module): 56 | 57 | def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None): 58 | super(ResNet, self).__init__() 59 | if norm_layer is None: 60 | norm_layer = nn.BatchNorm2d 61 | self._norm_layer = norm_layer 62 | 63 | self.include_top = include_top 64 | self.in_channel = 64 65 | 66 | self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2, 67 | padding=3, bias=False) 68 | self.bn1 = norm_layer(self.in_channel) 69 | self.relu = nn.ReLU(inplace=True) 70 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 71 | self.layer1 = self._make_layer(block, 64, blocks_num[0]) 72 | self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2) 73 | self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2) 74 | self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2) 75 | if self.include_top: 76 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) # output size = (1, 1) 77 | self.fc = nn.Linear(512 * block.expansion, num_classes) 78 | 79 | for m in self.modules(): 80 | if isinstance(m, nn.Conv2d): 81 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 82 | 83 | def _make_layer(self, block, channel, block_num, stride=1): 84 | norm_layer = self._norm_layer 85 | downsample = None 86 | if stride != 1 or self.in_channel != channel * block.expansion: 87 | downsample = nn.Sequential( 88 | nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False), 89 | norm_layer(channel * block.expansion)) 90 | 91 | layers = [] 92 | layers.append(block(self.in_channel, channel, downsample=downsample, 93 | stride=stride, norm_layer=norm_layer)) 94 | self.in_channel = channel * block.expansion 95 | 96 | for _ in range(1, block_num): 97 | layers.append(block(self.in_channel, channel, norm_layer=norm_layer)) 98 | 99 | return nn.Sequential(*layers) 100 | 101 | def forward(self, x): 102 | x = self.conv1(x) 103 | x = self.bn1(x) 104 | x = self.relu(x) 105 | x = self.maxpool(x) 106 | 107 | x = self.layer1(x) 108 | x = self.layer2(x) 109 | x = self.layer3(x) 110 | x = self.layer4(x) 111 | 112 | if self.include_top: 113 | x = self.avgpool(x) 114 | x = torch.flatten(x, 1) 115 | x = self.fc(x) 116 | 117 | return x 118 | 119 | 120 | class IntermediateLayerGetter(nn.ModuleDict): 121 | """ 122 | Module wrapper that returns intermediate layers from a model 123 | It has a strong assumption that the modules have been registered 124 | into the model in the same order as they are used. 125 | This means that one should **not** reuse the same nn.Module 126 | twice in the forward if you want this to work. 127 | Additionally, it is only able to query submodules that are directly 128 | assigned to the model. So if `model` is passed, `model.feature1` can 129 | be returned, but not `model.feature1.layer2`. 130 | Arguments: 131 | model (nn.Module): model on which we will extract the features 132 | return_layers (Dict[name, new_name]): a dict containing the names 133 | of the modules for which the activations will be returned as 134 | the key of the dict, and the value of the dict is the name 135 | of the returned activation (which the user can specify). 136 | """ 137 | __annotations__ = { 138 | "return_layers": Dict[str, str], 139 | } 140 | 141 | def __init__(self, model, return_layers): 142 | if not set(return_layers).issubset([name for name, _ in model.named_children()]): 143 | raise ValueError("return_layers are not present in model") 144 | 145 | orig_return_layers = return_layers 146 | return_layers = {k: v for k, v in return_layers.items()} 147 | layers = OrderedDict() 148 | 149 | # 遍历模型子模块按顺序存入有序字典 150 | # 只保存layer4及其之前的结构,舍去之后不用的结构 151 | for name, module in model.named_children(): 152 | layers[name] = module 153 | if name in return_layers: 154 | del return_layers[name] 155 | # 如果return_layers是空的 156 | if not return_layers: 157 | break 158 | 159 | super(IntermediateLayerGetter, self).__init__(layers) 160 | self.return_layers = orig_return_layers 161 | 162 | def forward(self, x): 163 | out = OrderedDict() 164 | # 依次遍历模型的所有子模块,并进行正向传播, 165 | # 收集layer1, layer2, layer3, layer4的输出 166 | for name, module in self.named_children(): 167 | x = module(x) 168 | if name in self.return_layers: 169 | out_name = self.return_layers[name] 170 | out[out_name] = x 171 | return out 172 | 173 | 174 | class FeaturePyramidNetwork(nn.Module): 175 | """ 176 | Module that adds a FPN from on top of a set of feature maps. This is based on 177 | `"Feature Pyramid Network for Object Detection" `_. 178 | The feature maps are currently supposed to be in increasing depth 179 | order. 180 | The input to the model is expected to be an OrderedDict[Tensor], containing 181 | the feature maps on top of which the FPN will be added. 182 | Arguments: 183 | in_channels_list (list[int]): number of channels for each feature map that 184 | is passed to the module 185 | out_channels (int): number of channels of the FPN representation 186 | extra_blocks (ExtraFPNBlock or None): if provided, extra operations will 187 | be performed. It is expected to take the fpn features, the original 188 | features and the names of the original features as input, and returns 189 | a new list of feature maps and their corresponding names 190 | """ 191 | 192 | def __init__(self, in_channels_list, out_channels, extra_blocks=None): 193 | super(FeaturePyramidNetwork, self).__init__() 194 | # 用来调整resnet特征矩阵(layer1,2,3,4)的channel(kernel_size=1) 195 | self.inner_blocks = nn.ModuleList() 196 | # 对调整后的特征矩阵使用3x3的卷积核来得到对应的预测特征矩阵 197 | self.layer_blocks = nn.ModuleList() 198 | for in_channels in in_channels_list: 199 | if in_channels == 0: 200 | continue 201 | inner_block_module = nn.Conv2d(in_channels, out_channels, 1) 202 | layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1) 203 | self.inner_blocks.append(inner_block_module) 204 | self.layer_blocks.append(layer_block_module) 205 | 206 | # initialize parameters now to avoid modifying the initialization of top_blocks 207 | for m in self.children(): 208 | if isinstance(m, nn.Conv2d): 209 | nn.init.kaiming_uniform_(m.weight, a=1) 210 | nn.init.constant_(m.bias, 0) 211 | 212 | self.extra_blocks = extra_blocks 213 | 214 | def get_result_from_inner_blocks(self, x, idx): 215 | # type: (Tensor, int) 216 | """ 217 | This is equivalent to self.inner_blocks[idx](x), 218 | but torchscript doesn't support this yet 219 | """ 220 | num_blocks = 0 221 | for m in self.inner_blocks: 222 | num_blocks += 1 223 | if idx < 0: 224 | idx += num_blocks 225 | i = 0 226 | out = x 227 | for module in self.inner_blocks: 228 | if i == idx: 229 | out = module(x) 230 | i += 1 231 | return out 232 | 233 | def get_result_from_layer_blocks(self, x, idx): 234 | # type: (Tensor, int) 235 | """ 236 | This is equivalent to self.layer_blocks[idx](x), 237 | but torchscript doesn't support this yet 238 | """ 239 | num_blocks = 0 240 | for m in self.layer_blocks: 241 | num_blocks += 1 242 | if idx < 0: 243 | idx += num_blocks 244 | i = 0 245 | out = x 246 | for module in self.layer_blocks: 247 | if i == idx: 248 | out = module(x) 249 | i += 1 250 | return out 251 | 252 | def forward(self, x): 253 | # type: (Dict[str, Tensor]) 254 | """ 255 | Computes the FPN for a set of feature maps. 256 | Arguments: 257 | x (OrderedDict[Tensor]): feature maps for each feature level. 258 | Returns: 259 | results (OrderedDict[Tensor]): feature maps after FPN layers. 260 | They are ordered from highest resolution first. 261 | """ 262 | # unpack OrderedDict into two lists for easier handling 263 | names = list(x.keys()) 264 | x = list(x.values()) 265 | 266 | # 将resnet layer4的channel调整到指定的out_channels 267 | last_inner = self.inner_blocks[-1](x[-1]) 268 | 269 | # result中保存着每个预测特征层 270 | results = [] 271 | # 将layer4调整channel后的特征矩阵,通过3x3卷积后得到对应的预测特征矩阵 272 | results.append(self.layer_blocks[-1](last_inner)) 273 | 274 | # 倒序遍历resenet输出特征层,以及对应inner_block和layer_block 275 | # layer3 -> layer2 -> layer1 (layer4已经处理过了) 276 | # for feature, inner_block, layer_block in zip( 277 | # x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] 278 | # ): 279 | # if not inner_block: 280 | # continue 281 | # inner_lateral = inner_block(feature) 282 | # feat_shape = inner_lateral.shape[-2:] 283 | # inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest") 284 | # last_inner = inner_lateral + inner_top_down 285 | # results.insert(0, layer_block(last_inner)) 286 | 287 | for idx in range(len(x) - 2, -1, -1): 288 | inner_lateral = self.get_result_from_inner_blocks(x[idx], idx) 289 | feat_shape = inner_lateral.shape[-2:] 290 | inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest") 291 | last_inner = inner_lateral + inner_top_down 292 | results.insert(0, self.get_result_from_layer_blocks(last_inner, idx)) 293 | 294 | # 在layer4对应的预测特征层基础上生成预测特征矩阵5 295 | if self.extra_blocks is not None: 296 | results, names = self.extra_blocks(results, names) 297 | 298 | # make it back an OrderedDict 299 | out = OrderedDict([(k, v) for k, v in zip(names, results)]) 300 | 301 | return out 302 | 303 | 304 | class LastLevelMaxPool(torch.nn.Module): 305 | """ 306 | Applies a max_pool2d on top of the last feature map 307 | """ 308 | 309 | def forward(self, x, names): 310 | # type: (List[Tensor], List[str]) 311 | names.append("pool") 312 | x.append(F.max_pool2d(x[-1], 1, 2, 0)) 313 | return x, names 314 | 315 | 316 | class BackboneWithFPN(nn.Module): 317 | """ 318 | Adds a FPN on top of a model. 319 | Internally, it uses torchvision.models._utils.IntermediateLayerGetter to 320 | extract a submodel that returns the feature maps specified in return_layers. 321 | The same limitations of IntermediatLayerGetter apply here. 322 | Arguments: 323 | backbone (nn.Module) 324 | return_layers (Dict[name, new_name]): a dict containing the names 325 | of the modules for which the activations will be returned as 326 | the key of the dict, and the value of the dict is the name 327 | of the returned activation (which the user can specify). 328 | in_channels_list (List[int]): number of channels for each feature map 329 | that is returned, in the order they are present in the OrderedDict 330 | out_channels (int): number of channels in the FPN. 331 | Attributes: 332 | out_channels (int): the number of channels in the FPN 333 | """ 334 | 335 | def __init__(self, backbone, return_layers, in_channels_list, out_channels): 336 | super(BackboneWithFPN, self).__init__() 337 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 338 | self.fpn = FeaturePyramidNetwork( 339 | in_channels_list=in_channels_list, 340 | out_channels=out_channels, 341 | extra_blocks=LastLevelMaxPool(), 342 | ) 343 | # super(BackboneWithFPN, self).__init__(OrderedDict( 344 | # [("body", body), ("fpn", fpn)])) 345 | self.out_channels = out_channels 346 | 347 | def forward(self, x): 348 | x = self.body(x) 349 | x = self.fpn(x) 350 | return x 351 | 352 | 353 | def resnet50_fpn_backbone(): 354 | # FrozenBatchNorm2d的功能与BatchNorm2d类似,但参数无法更新 355 | # norm_layer=misc.FrozenBatchNorm2d 356 | resnet_backbone = ResNet(Bottleneck, [3, 4, 6, 3], 357 | include_top=False) 358 | 359 | # freeze layers 360 | # 冻结layer1及其之前的所有底层权重(基础通用特征) 361 | for name, parameter in resnet_backbone.named_parameters(): 362 | if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 363 | parameter.requires_grad_(False) 364 | 365 | return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'} 366 | 367 | # in_channel 为layer4的输出特征矩阵channel = 2048 368 | in_channels_stage2 = resnet_backbone.in_channel // 8 369 | in_channels_list = [ 370 | in_channels_stage2, # layer1 out_channel=256 371 | in_channels_stage2 * 2, # layer2 out_channel=512 372 | in_channels_stage2 * 4, # layer3 out_channel=1024 373 | in_channels_stage2 * 8, # layer4 out_channel=2048 374 | ] 375 | out_channels = 256 376 | return BackboneWithFPN(resnet_backbone, return_layers, in_channels_list, out_channels) 377 | -------------------------------------------------------------------------------- /faster_rcnn/backbone/vgg_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | 4 | 5 | class VGG(nn.Module): 6 | def __init__(self, features, class_num=1000, init_weights=False, weights_path=None): 7 | super(VGG, self).__init__() 8 | self.features = features 9 | self.classifier = nn.Sequential( 10 | nn.Dropout(p=0.5), 11 | nn.Linear(512*7*7, 2048), 12 | nn.ReLU(True), 13 | nn.Dropout(p=0.5), 14 | nn.Linear(2048, 2048), 15 | nn.ReLU(True), 16 | nn.Linear(2048, class_num) 17 | ) 18 | if init_weights and weights_path is None: 19 | self._initialize_weights() 20 | 21 | if weights_path is not None: 22 | self.load_state_dict(torch.load(weights_path), strict=False) 23 | 24 | def forward(self, x): 25 | # N x 3 x 224 x 224 26 | x = self.features(x) 27 | # N x 512 x 7 x 7 28 | x = torch.flatten(x, start_dim=1) 29 | # N x 512*7*7 30 | x = self.classifier(x) 31 | return x 32 | 33 | def _initialize_weights(self): 34 | for m in self.modules(): 35 | if isinstance(m, nn.Conv2d): 36 | # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 37 | nn.init.xavier_uniform_(m.weight) 38 | if m.bias is not None: 39 | nn.init.constant_(m.bias, 0) 40 | elif isinstance(m, nn.Linear): 41 | nn.init.xavier_uniform_(m.weight) 42 | # nn.init.normal_(m.weight, 0, 0.01) 43 | nn.init.constant_(m.bias, 0) 44 | 45 | 46 | def make_features(cfg: list): 47 | layers = [] 48 | in_channels = 3 49 | for v in cfg: 50 | if v == "M": 51 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 52 | else: 53 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 54 | layers += [conv2d, nn.ReLU(True)] 55 | in_channels = v 56 | return nn.Sequential(*layers) 57 | 58 | 59 | cfgs = { 60 | 'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 61 | 'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 62 | 'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 63 | 'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], 64 | } 65 | 66 | 67 | def vgg(model_name="vgg16", weights_path=None): 68 | try: 69 | cfg = cfgs[model_name] 70 | except: 71 | print("Warning: model number {} not in cfgs dict!".format(model_name)) 72 | exit(-1) 73 | model = VGG(make_features(cfg), weights_path=weights_path) 74 | return model 75 | -------------------------------------------------------------------------------- /faster_rcnn/draw_box_utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import PIL.ImageDraw as ImageDraw 3 | import PIL.ImageFont as ImageFont 4 | import numpy as np 5 | 6 | STANDARD_COLORS = [ 7 | 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque', 8 | 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite', 9 | 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan', 10 | 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange', 11 | 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet', 12 | 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite', 13 | 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod', 14 | 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki', 15 | 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue', 16 | 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey', 17 | 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue', 18 | 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime', 19 | 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid', 20 | 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen', 21 | 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin', 22 | 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed', 23 | 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed', 24 | 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple', 25 | 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown', 26 | 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue', 27 | 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow', 28 | 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White', 29 | 'WhiteSmoke', 'Yellow', 'YellowGreen' 30 | ] 31 | 32 | 33 | def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map): 34 | for i in range(boxes.shape[0]): 35 | if scores[i] > thresh: 36 | box = tuple(boxes[i].tolist()) # numpy -> list -> tuple 37 | if classes[i] in category_index.keys(): 38 | class_name = category_index[classes[i]] 39 | else: 40 | class_name = 'N/A' 41 | display_str = str(class_name) 42 | display_str = '{}: {}%'.format(display_str, int(100 * scores[i])) 43 | box_to_display_str_map[box].append(display_str) 44 | box_to_color_map[box] = STANDARD_COLORS[ 45 | classes[i] % len(STANDARD_COLORS)] 46 | else: 47 | break # 网络输出概率已经排序过,当遇到一个不满足后面的肯定不满足 48 | 49 | 50 | def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color): 51 | try: 52 | font = ImageFont.truetype('arial.ttf', 24) 53 | except IOError: 54 | font = ImageFont.load_default() 55 | 56 | # If the total height of the display strings added to the top of the bounding 57 | # box exceeds the top of the image, stack the strings below the bounding box 58 | # instead of above. 59 | display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]] 60 | # Each display_str has a top and bottom margin of 0.05x. 61 | total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights) 62 | 63 | if top > total_display_str_height: 64 | text_bottom = top 65 | else: 66 | text_bottom = bottom + total_display_str_height 67 | # Reverse list and print from bottom to top. 68 | for display_str in box_to_display_str_map[box][::-1]: 69 | text_width, text_height = font.getsize(display_str) 70 | margin = np.ceil(0.05 * text_height) 71 | draw.rectangle([(left, text_bottom - text_height - 2 * margin), 72 | (left + text_width, text_bottom)], fill=color) 73 | draw.text((left + margin, text_bottom - text_height - margin), 74 | display_str, 75 | fill='black', 76 | font=font) 77 | text_bottom -= text_height - 2 * margin 78 | 79 | 80 | def draw_box(image, boxes, classes, scores, category_index, thresh=0.5, line_thickness=8): 81 | box_to_display_str_map = collections.defaultdict(list) 82 | box_to_color_map = collections.defaultdict(str) 83 | 84 | filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map) 85 | 86 | # Draw all boxes onto image. 87 | draw = ImageDraw.Draw(image) 88 | im_width, im_height = image.size 89 | for box, color in box_to_color_map.items(): 90 | xmin, ymin, xmax, ymax = box 91 | (left, right, top, bottom) = (xmin * 1, xmax * 1, 92 | ymin * 1, ymax * 1) 93 | draw.line([(left, top), (left, bottom), (right, bottom), 94 | (right, top), (left, top)], width=line_thickness, fill=color) 95 | draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color) 96 | -------------------------------------------------------------------------------- /faster_rcnn/fasterRCNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/fasterRCNN.png -------------------------------------------------------------------------------- /faster_rcnn/my_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import os 3 | import torch 4 | import json 5 | from PIL import Image 6 | from lxml import etree 7 | 8 | 9 | class VOC2012DataSet(Dataset): 10 | """读取解析PASCAL VOC2012数据集""" 11 | 12 | def __init__(self, voc_root, transforms, train_set=True): 13 | self.root = os.path.join(voc_root, "VOCdevkit", "VOC2012") 14 | self.img_root = os.path.join(self.root, "JPEGImages") 15 | self.annotations_root = os.path.join(self.root, "Annotations") 16 | 17 | # read train.txt or val.txt file 18 | if train_set: 19 | txt_list = os.path.join(self.root, "ImageSets", "Main", "train.txt") 20 | else: 21 | txt_list = os.path.join(self.root, "ImageSets", "Main", "val.txt") 22 | with open(txt_list) as read: 23 | self.xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml") 24 | for line in read.readlines()] 25 | 26 | # read class_indict 27 | try: 28 | json_file = open('./pascal_voc_classes.json', 'r') 29 | self.class_dict = json.load(json_file) 30 | except Exception as e: 31 | print(e) 32 | exit(-1) 33 | 34 | self.transforms = transforms 35 | 36 | def __len__(self): 37 | return len(self.xml_list) 38 | 39 | def __getitem__(self, idx): 40 | # read xml 41 | xml_path = self.xml_list[idx] 42 | with open(xml_path) as fid: 43 | xml_str = fid.read() 44 | xml = etree.fromstring(xml_str) 45 | data = self.parse_xml_to_dict(xml)["annotation"] 46 | img_path = os.path.join(self.img_root, data["filename"]) 47 | image = Image.open(img_path) 48 | if image.format != "JPEG": 49 | raise ValueError("Image format not JPEG") 50 | boxes = [] 51 | labels = [] 52 | iscrowd = [] 53 | for obj in data["object"]: 54 | xmin = float(obj["bndbox"]["xmin"]) 55 | xmax = float(obj["bndbox"]["xmax"]) 56 | ymin = float(obj["bndbox"]["ymin"]) 57 | ymax = float(obj["bndbox"]["ymax"]) 58 | boxes.append([xmin, ymin, xmax, ymax]) 59 | labels.append(self.class_dict[obj["name"]]) 60 | iscrowd.append(int(obj["difficult"])) 61 | 62 | # convert everything into a torch.Tensor 63 | boxes = torch.as_tensor(boxes, dtype=torch.float32) 64 | labels = torch.as_tensor(labels, dtype=torch.int64) 65 | iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64) 66 | image_id = torch.tensor([idx]) 67 | area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) 68 | 69 | target = {} 70 | target["boxes"] = boxes 71 | target["labels"] = labels 72 | target["image_id"] = image_id 73 | target["area"] = area 74 | target["iscrowd"] = iscrowd 75 | 76 | if self.transforms is not None: 77 | image, target = self.transforms(image, target) 78 | 79 | return image, target 80 | 81 | def get_height_and_width(self, idx): 82 | # read xml 83 | xml_path = self.xml_list[idx] 84 | with open(xml_path) as fid: 85 | xml_str = fid.read() 86 | xml = etree.fromstring(xml_str) 87 | data = self.parse_xml_to_dict(xml)["annotation"] 88 | data_height = int(data["size"]["height"]) 89 | data_width = int(data["size"]["width"]) 90 | return data_height, data_width 91 | 92 | def parse_xml_to_dict(self, xml): 93 | """ 94 | 将xml文件解析成字典形式,参考tensorflow的recursive_parse_xml_to_dict 95 | Args: 96 | xml: xml tree obtained by parsing XML file contents using lxml.etree 97 | 98 | Returns: 99 | Python dictionary holding XML contents. 100 | """ 101 | 102 | if len(xml) == 0: # 遍历到底层,直接返回tag对应的信息 103 | return {xml.tag: xml.text} 104 | 105 | result = {} 106 | for child in xml: 107 | child_result = self.parse_xml_to_dict(child) # 递归遍历标签信息 108 | if child.tag != 'object': 109 | result[child.tag] = child_result[child.tag] 110 | else: 111 | if child.tag not in result: # 因为object可能有多个,所以需要放入列表里 112 | result[child.tag] = [] 113 | result[child.tag].append(child_result[child.tag]) 114 | return {xml.tag: result} 115 | 116 | 117 | # import transforms 118 | # from draw_box_utils import draw_box 119 | # from PIL import Image 120 | # import json 121 | # import matplotlib.pyplot as plt 122 | # import torchvision.transforms as ts 123 | # import random 124 | # 125 | # # read class_indict 126 | # category_index = {} 127 | # try: 128 | # json_file = open('./pascal_voc_classes.json', 'r') 129 | # class_dict = json.load(json_file) 130 | # category_index = {v: k for k, v in class_dict.items()} 131 | # except Exception as e: 132 | # print(e) 133 | # exit(-1) 134 | # 135 | # data_transform = { 136 | # "train": transforms.Compose([transforms.ToTensor(), 137 | # transforms.RandomHorizontalFlip(0.5)]), 138 | # "val": transforms.Compose([transforms.ToTensor()]) 139 | # } 140 | # 141 | # # load train data set 142 | # train_data_set = VOC2012DataSet(os.getcwd(), data_transform["train"], True) 143 | # print(len(train_data_set)) 144 | # for index in random.sample(range(0, len(train_data_set)), k=5): 145 | # img, target = train_data_set[index] 146 | # img = ts.ToPILImage()(img) 147 | # draw_box(img, 148 | # target["boxes"].numpy(), 149 | # target["labels"].numpy(), 150 | # [1 for i in range(len(target["labels"].numpy()))], 151 | # category_index, 152 | # thresh=0.5, 153 | # line_thickness=5) 154 | # plt.imshow(img) 155 | # plt.show() 156 | -------------------------------------------------------------------------------- /faster_rcnn/network_files/__pycache__/boxes.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/network_files/__pycache__/boxes.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/network_files/__pycache__/det_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/network_files/__pycache__/det_utils.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/network_files/__pycache__/faster_rcnn_framework.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/network_files/__pycache__/faster_rcnn_framework.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/network_files/__pycache__/image_list.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/network_files/__pycache__/image_list.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/network_files/__pycache__/roi_head.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/network_files/__pycache__/roi_head.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/network_files/__pycache__/rpn_function.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/network_files/__pycache__/rpn_function.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/network_files/__pycache__/transform.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/network_files/__pycache__/transform.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/network_files/boxes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.jit.annotations import Tuple 3 | from torch import Tensor 4 | import torchvision 5 | 6 | 7 | def nms(boxes, scores, iou_threshold): 8 | # type: (Tensor, Tensor, float) 9 | """ 10 | Performs non-maximum suppression (NMS) on the boxes according 11 | to their intersection-over-union (IoU). 12 | 13 | NMS iteratively removes lower scoring boxes which have an 14 | IoU greater than iou_threshold with another (higher scoring) 15 | box. 16 | 17 | Parameters 18 | ---------- 19 | boxes : Tensor[N, 4]) 20 | boxes to perform NMS on. They 21 | are expected to be in (x1, y1, x2, y2) format 22 | scores : Tensor[N] 23 | scores for each one of the boxes 24 | iou_threshold : float 25 | discards all overlapping 26 | boxes with IoU < iou_threshold 27 | 28 | Returns 29 | ------- 30 | keep : Tensor 31 | int64 tensor with the indices 32 | of the elements that have been kept 33 | by NMS, sorted in decreasing order of scores 34 | """ 35 | return torch.ops.torchvision.nms(boxes, scores, iou_threshold) 36 | 37 | 38 | def batched_nms(boxes, scores, idxs, iou_threshold): 39 | # type: (Tensor, Tensor, Tensor, float) 40 | """ 41 | Performs non-maximum suppression in a batched fashion. 42 | 43 | Each index value correspond to a category, and NMS 44 | will not be applied between elements of different categories. 45 | 46 | Parameters 47 | ---------- 48 | boxes : Tensor[N, 4] 49 | boxes where NMS will be performed. They 50 | are expected to be in (x1, y1, x2, y2) format 51 | scores : Tensor[N] 52 | scores for each one of the boxes 53 | idxs : Tensor[N] 54 | indices of the categories for each one of the boxes. 55 | iou_threshold : float 56 | discards all overlapping boxes 57 | with IoU < iou_threshold 58 | 59 | Returns 60 | ------- 61 | keep : Tensor 62 | int64 tensor with the indices of 63 | the elements that have been kept by NMS, sorted 64 | in decreasing order of scores 65 | """ 66 | if boxes.numel() == 0: 67 | return torch.empty((0,), dtype=torch.int64, device=boxes.device) 68 | 69 | # strategy: in order to perform NMS independently per class. 70 | # we add an offset to all the boxes. The offset is dependent 71 | # only on the class idx, and is large enough so that boxes 72 | # from different classes do not overlap 73 | # 获取所有boxes中最大的坐标值(xmin, ymin, xmax, ymax) 74 | max_coordinate = boxes.max() 75 | 76 | # to(): Performs Tensor dtype and/or device conversion 77 | # 为每一个类别生成一个很大的偏移量 78 | # 这里的to只是让生成tensor的dytpe和device与boxes保持一致 79 | offsets = idxs.to(boxes) * (max_coordinate + 1) 80 | # boxes加上对应层的偏移量后,保证不同类别之间boxes不会有重合的现象 81 | boxes_for_nms = boxes + offsets[:, None] 82 | keep = nms(boxes_for_nms, scores, iou_threshold) 83 | return keep 84 | 85 | 86 | def remove_small_boxes(boxes, min_size): 87 | # type: (Tensor, float) 88 | """ 89 | Remove boxes which contains at least one side smaller than min_size. 90 | 移除宽高小于指定阈值的索引 91 | Arguments: 92 | boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format 93 | min_size (float): minimum size 94 | 95 | Returns: 96 | keep (Tensor[K]): indices of the boxes that have both sides 97 | larger than min_size 98 | """ 99 | ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1] # 预测boxes的宽和高 100 | keep = (ws >= min_size) & (hs >= min_size) # 当满足宽,高都大于给定阈值时为True 101 | # nonzero(): Returns a tensor containing the indices of all non-zero elements of input 102 | keep = keep.nonzero().squeeze(1) 103 | return keep 104 | 105 | 106 | def clip_boxes_to_image(boxes, size): 107 | # type: (Tensor, Tuple[int, int]) 108 | """ 109 | Clip boxes so that they lie inside an image of size `size`. 110 | 裁剪预测的boxes信息,将越界的坐标调整到图片边界上 111 | 112 | Arguments: 113 | boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format 114 | size (Tuple[height, width]): size of the image 115 | 116 | Returns: 117 | clipped_boxes (Tensor[N, 4]) 118 | """ 119 | dim = boxes.dim() 120 | boxes_x = boxes[..., 0::2] # x1, x2 121 | boxes_y = boxes[..., 1::2] # y1, y2 122 | height, width = size 123 | 124 | if torchvision._is_tracing(): 125 | boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device)) 126 | boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device)) 127 | boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device)) 128 | boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device)) 129 | else: 130 | boxes_x = boxes_x.clamp(min=0, max=width) # 限制x坐标范围在[0,width]之间 131 | boxes_y = boxes_y.clamp(min=0, max=height) # 限制y坐标范围在[0,height]之间 132 | 133 | clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim) 134 | return clipped_boxes.reshape(boxes.shape) 135 | 136 | 137 | def box_area(boxes): 138 | """ 139 | Computes the area of a set of bounding boxes, which are specified by its 140 | (x1, y1, x2, y2) coordinates. 141 | 142 | Arguments: 143 | boxes (Tensor[N, 4]): boxes for which the area will be computed. They 144 | are expected to be in (x1, y1, x2, y2) format 145 | 146 | Returns: 147 | area (Tensor[N]): area for each box 148 | """ 149 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 150 | 151 | 152 | def box_iou(boxes1, boxes2): 153 | """ 154 | Return intersection-over-union (Jaccard index) of boxes. 155 | 156 | Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 157 | 158 | Arguments: 159 | boxes1 (Tensor[N, 4]) 160 | boxes2 (Tensor[M, 4]) 161 | 162 | Returns: 163 | iou (Tensor[N, M]): the NxM matrix containing the pairwise 164 | IoU values for every element in boxes1 and boxes2 165 | """ 166 | area1 = box_area(boxes1) 167 | area2 = box_area(boxes2) 168 | 169 | # When the shapes do not match, 170 | # the shape of the returned output tensor follows the broadcasting rules 171 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # left-top [N,M,2] 172 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # right-bottom [N,M,2] 173 | 174 | wh = (rb - lt).clamp(min=0) # [N,M,2] 175 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 176 | 177 | iou = inter / (area1[:, None] + area2 - inter) 178 | return iou 179 | 180 | -------------------------------------------------------------------------------- /faster_rcnn/network_files/image_list.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.jit.annotations import List, Tuple 3 | from torch import Tensor 4 | 5 | 6 | @torch.jit.script 7 | class ImageList(object): 8 | """ 9 | Structure that holds a list of images (of possibly 10 | varying sizes) as a single tensor. 11 | This works by padding the images to the same size, 12 | and storing in a field the original sizes of each image 13 | """ 14 | 15 | def __init__(self, tensors, image_sizes): 16 | # type: (Tensor, List[Tuple[int, int]]) 17 | """ 18 | Arguments: 19 | tensors (tensor) padding后的图像数据 20 | image_sizes (list[tuple[int, int]]) padding前的图像尺寸 21 | """ 22 | self.tensors = tensors 23 | self.image_sizes = image_sizes 24 | 25 | def to(self, device): 26 | # type: (Device) # noqa 27 | cast_tensor = self.tensors.to(device) 28 | return ImageList(cast_tensor, self.image_sizes) 29 | 30 | -------------------------------------------------------------------------------- /faster_rcnn/network_files/roi_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | from network_files import boxes as box_ops, det_utils 5 | from torch import nn, Tensor 6 | from torch.jit.annotations import Optional, List, Dict, Tuple 7 | 8 | 9 | def fastrcnn_loss(class_logits, box_regression, labels, regression_targets): 10 | # type: (Tensor, Tensor, List[Tensor], List[Tensor]) 11 | """ 12 | Computes the loss for Faster R-CNN. 13 | 14 | Arguments: 15 | class_logits (Tensor): 预测类别概率信息,shape=[num_anchors, num_classes] 16 | box_regression (Tensor): 预测边目标界框回归信息 17 | labels (list[BoxList]): 真实类别信息 18 | regression_targets (Tensor): 真实目标边界框信息 19 | 20 | Returns: 21 | classification_loss (Tensor) 22 | box_loss (Tensor) 23 | """ 24 | 25 | labels = torch.cat(labels, dim=0) 26 | regression_targets = torch.cat(regression_targets, dim=0) 27 | 28 | # 计算类别损失信息 29 | classification_loss = F.cross_entropy(class_logits, labels) 30 | 31 | # get indices that correspond to the regression targets for 32 | # the corresponding ground truth labels, to be used with 33 | # advanced indexing 34 | # 返回标签类别大于0的索引 35 | sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1) 36 | 37 | # 返回标签类别大于0位置的类别信息 38 | labels_pos = labels[sampled_pos_inds_subset] 39 | 40 | # shape=[num_proposal, num_classes] 41 | N, num_classes = class_logits.shape 42 | box_regression = box_regression.reshape(N, -1, 4) 43 | 44 | # 计算边界框损失信息 45 | box_loss = det_utils.smooth_l1_loss( 46 | # 获取指定索引proposal的指定类别box信息 47 | box_regression[sampled_pos_inds_subset, labels_pos], 48 | regression_targets[sampled_pos_inds_subset], 49 | beta=1 / 9, 50 | size_average=False, 51 | ) / labels.numel() 52 | 53 | return classification_loss, box_loss 54 | 55 | 56 | class RoIHeads(torch.nn.Module): 57 | __annotations__ = { 58 | 'box_coder': det_utils.BoxCoder, 59 | 'proposal_matcher': det_utils.Matcher, 60 | 'fg_bg_sampler': det_utils.BalancedPositiveNegativeSampler, 61 | } 62 | 63 | def __init__(self, 64 | box_roi_pool, 65 | box_head, 66 | box_predictor, 67 | # Faster R-CNN training 68 | fg_iou_thresh, bg_iou_thresh, 69 | batch_size_per_image, positive_fraction, 70 | bbox_reg_weights, 71 | # Faster R-CNN inference 72 | score_thresh, 73 | nms_thresh, 74 | detection_per_img): 75 | super(RoIHeads, self).__init__() 76 | 77 | self.box_similarity = box_ops.box_iou 78 | # assign ground-truth boxes for each proposal 79 | self.proposal_matcher = det_utils.Matcher( 80 | fg_iou_thresh, # 0.5 81 | bg_iou_thresh, # 0.5 82 | allow_low_quality_matches=False) 83 | 84 | self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler( 85 | batch_size_per_image, # 512 86 | positive_fraction) # 0.25 87 | 88 | if bbox_reg_weights is None: 89 | bbox_reg_weights = (10., 10., 5., 5.) 90 | self.box_coder = det_utils.BoxCoder(bbox_reg_weights) 91 | 92 | self.box_roi_pool = box_roi_pool 93 | self.box_head = box_head 94 | self.box_predictor = box_predictor 95 | 96 | self.score_thresh = score_thresh 97 | self.nms_thresh = nms_thresh 98 | self.detection_per_img = detection_per_img 99 | 100 | def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels): 101 | # type: (List[Tensor], List[Tensor], List[Tensor]) 102 | """ 103 | 为每个proposal匹配对应的gt_box,并划分到正负样本中 104 | Args: 105 | proposals: 106 | gt_boxes: 107 | gt_labels: 108 | 109 | Returns: 110 | 111 | """ 112 | matched_idxs = [] 113 | labels = [] 114 | # 遍历每张图像的proposals, gt_boxes, gt_labels信息 115 | for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels): 116 | if gt_boxes_in_image.numel() == 0: # 该张图像中没有gt框,为背景 117 | # background image 118 | device = proposals_in_image.device 119 | clamped_matched_idxs_in_image = torch.zeros( 120 | (proposals_in_image.shape[0],), dtype=torch.int64, device=device 121 | ) 122 | labels_in_image = torch.zeros( 123 | (proposals_in_image.shape[0],), dtype=torch.int64, device=device 124 | ) 125 | else: 126 | # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands 127 | # 计算proposal与每个gt_box的iou重合度 128 | match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image) 129 | 130 | # 计算proposal与每个gt_box匹配的iou最大值,并记录索引, 131 | # iou < low_threshold索引值为 -1, low_threshold <= iou < high_threshold索引值为 -2 132 | matched_idxs_in_image = self.proposal_matcher(match_quality_matrix) 133 | 134 | # 限制最小值,防止匹配标签时出现越界的情况 135 | # 注意-1, -2对应的gt索引会调整到0,获取的标签类别为第0个gt的类别(实际上并不是),后续会进一步处理 136 | clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0) 137 | # 获取proposal匹配到的gt对应标签 138 | labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image] 139 | labels_in_image = labels_in_image.to(dtype=torch.int64) 140 | 141 | # label background (below the low threshold) 142 | # 将gt索引为-1的类别设置为0,即背景,负样本 143 | bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD # -1 144 | labels_in_image[bg_inds] = torch.tensor(0) 145 | 146 | # label ignore proposals (between low and high threshold) 147 | # 将gt索引为-2的类别设置为-1, 即废弃样本 148 | ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS # -2 149 | labels_in_image[ignore_inds] = torch.tensor(-1) # -1 is ignored by sampler 150 | 151 | matched_idxs.append(clamped_matched_idxs_in_image) 152 | labels.append(labels_in_image) 153 | return matched_idxs, labels 154 | 155 | def subsample(self, labels): 156 | # type: (List[Tensor]) 157 | # BalancedPositiveNegativeSampler 158 | sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) 159 | sampled_inds = [] 160 | # 遍历每张图片的正负样本索引 161 | for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)): 162 | # 记录所有采集样本索引(包括正样本和负样本) 163 | img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1) 164 | sampled_inds.append(img_sampled_inds) 165 | return sampled_inds 166 | 167 | def add_gt_proposals(self, proposals, gt_boxes): 168 | # type: (List[Tensor], List[Tensor]) 169 | """ 170 | 将gt_boxes拼接到proposal后面 171 | Args: 172 | proposals: 一个batch中每张图像rpn预测的boxes 173 | gt_boxes: 一个batch中每张图像对应的真实目标边界框 174 | 175 | Returns: 176 | 177 | """ 178 | proposals = [ 179 | torch.cat((proposal, gt_box)) 180 | for proposal, gt_box in zip(proposals, gt_boxes) 181 | ] 182 | return proposals 183 | 184 | def DELTEME_all(self, the_list): 185 | # type: (List[bool]) 186 | for i in the_list: 187 | if not i: 188 | return False 189 | return True 190 | 191 | def check_targets(self, targets): 192 | # type: (Optional[List[Dict[str, Tensor]]]) 193 | assert targets is not None 194 | assert self.DELTEME_all(["boxes" in t for t in targets]) 195 | assert self.DELTEME_all(["labels" in t for t in targets]) 196 | 197 | def select_training_samples(self, proposals, targets): 198 | # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) 199 | """ 200 | 划分正负样本,统计对应gt的标签以及边界框回归信息 201 | list元素个数为batch_size 202 | Args: 203 | proposals: rpn预测的boxes list[[2000,4], [2000,4]], [2000,4],[2000,4]] 204 | list列表中每个元素代表一张图片经过rpn生成2000个建议框 205 | targets: list 每个元素是个dict 代表每张图片的标注信息 206 | 207 | Returns: 208 | 209 | """ 210 | 211 | # 检查target数据是否为空 212 | self.check_targets(targets) 213 | assert targets is not None 214 | dtype = proposals[0].dtype 215 | device = proposals[0].device 216 | 217 | gt_boxes = [t["boxes"].to(dtype) for t in targets] 218 | gt_labels = [t["labels"] for t in targets] 219 | 220 | # append ground-truth bboxes to proposal 221 | # 将gt_boxes拼接到proposal后面 把每张图片的gt_boxes拼接到proposals对应元素的后面 222 | proposals = self.add_gt_proposals(proposals, gt_boxes) 223 | 224 | # get matching gt indices for each proposal 225 | # 为每个proposal匹配对应的gt_box,并划分到正负样本中 226 | matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels) 227 | # sample a fixed proportion of positive-negative proposals 228 | # 按给定数量和比例采样正负样本 229 | sampled_inds = self.subsample(labels) 230 | matched_gt_boxes = [] 231 | num_images = len(proposals) 232 | 233 | # 遍历每张图像 234 | for img_id in range(num_images): 235 | # 获取每张图像的正负样本索引 236 | img_sampled_inds = sampled_inds[img_id] 237 | # 获取对应正负样本的proposals信息 238 | proposals[img_id] = proposals[img_id][img_sampled_inds] 239 | # 获取对应正负样本的预测类别信息 240 | labels[img_id] = labels[img_id][img_sampled_inds] 241 | # 获取对应正负样本的真实类别信息 242 | matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds] 243 | 244 | gt_boxes_in_image = gt_boxes[img_id] 245 | if gt_boxes_in_image.numel() == 0: 246 | gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device) 247 | # 获取对应正负样本的gt box信息 248 | matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]]) 249 | 250 | # 根据gt和proposal计算边框回归参数(针对gt的) 251 | regression_targets = self.box_coder.encode(matched_gt_boxes, proposals) 252 | return proposals, matched_idxs, labels, regression_targets 253 | 254 | def postprocess_detections(self, class_logits, box_regression, proposals, image_shapes): 255 | # type: (Tensor, Tensor, List[Tensor], List[Tuple[int, int]]) 256 | """ 257 | 对网络的预测数据进行后处理,包括 258 | (1)根据proposal以及预测的回归参数计算出最终bbox坐标 259 | (2)对预测类别结果进行softmax处理 260 | (3)裁剪预测的boxes信息,将越界的坐标调整到图片边界上 261 | (4)移除所有背景信息 262 | (5)移除低概率目标 263 | (6)移除小尺寸目标 264 | (7)执行nms处理,并按scores进行排序 265 | (8)根据scores排序返回前topk个目标 266 | Args: 267 | class_logits: 网络预测类别概率信息 268 | box_regression: 网络预测的边界框回归参数 269 | proposals: rpn输出的proposal 270 | image_shapes: 打包成batch前每张图像的宽高 271 | 272 | Returns: 273 | 274 | """ 275 | device = class_logits.device 276 | # 预测目标类别数 277 | num_classes = class_logits.shape[-1] 278 | 279 | # 获取每张图像的预测bbox数量 280 | boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals] 281 | # 根据proposal以及预测的回归参数计算出最终bbox坐标 282 | pred_boxes = self.box_coder.decode(box_regression, proposals) 283 | 284 | # 对预测类别结果进行softmax处理 285 | pred_scores = F.softmax(class_logits, -1) 286 | 287 | # split boxes and scores per image 288 | # 根据每张图像的预测bbox数量分割结果 289 | pred_boxes_list = pred_boxes.split(boxes_per_image, 0) 290 | pred_scores_list = pred_scores.split(boxes_per_image, 0) 291 | 292 | all_boxes = [] 293 | all_scores = [] 294 | all_labels = [] 295 | # 遍历每张图像预测信息 296 | for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes): 297 | # 裁剪预测的boxes信息,将越界的坐标调整到图片边界上 298 | boxes = box_ops.clip_boxes_to_image(boxes, image_shape) 299 | 300 | # create labels for each prediction 301 | labels = torch.arange(num_classes, device=device) 302 | labels = labels.view(1, -1).expand_as(scores) 303 | 304 | # remove prediction with the background label 305 | # 移除索引为0的所有信息(0代表背景) 306 | boxes = boxes[:, 1:] 307 | scores = scores[:, 1:] 308 | labels = labels[:, 1:] 309 | 310 | # batch everything, by making every class prediction be a separate instance 311 | boxes = boxes.reshape(-1, 4) 312 | scores = scores.reshape(-1) 313 | labels = labels.reshape(-1) 314 | 315 | # remove low scoring boxes 316 | # 移除低概率目标,self.scores_thresh=0.05 317 | inds = torch.nonzero(scores > self.score_thresh).squeeze(1) 318 | boxes, scores, labels = boxes[inds], scores[inds], labels[inds] 319 | 320 | # remove empty boxes 321 | # 移除小目标 322 | keep = box_ops.remove_small_boxes(boxes, min_size=1e-2) 323 | boxes, scores, labels = boxes[keep], scores[keep], labels[keep] 324 | 325 | # non-maximun suppression, independently done per class 326 | # 执行nms处理,执行后的结果会按照scores从大到小进行排序返回 327 | keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh) 328 | 329 | # keep only topk scoring predictions 330 | # 获取scores排在前topk个预测目标 331 | keep = keep[:self.detection_per_img] 332 | boxes, scores, labels = boxes[keep], scores[keep], labels[keep] 333 | 334 | all_boxes.append(boxes) 335 | all_scores.append(scores) 336 | all_labels.append(labels) 337 | 338 | return all_boxes, all_scores, all_labels 339 | 340 | def forward(self, features, proposals, image_shapes, targets=None): 341 | # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]], Optional[List[Dict[str, Tensor]]]) 342 | """ 343 | Arguments: 344 | features (List[Tensor]) 345 | proposals (List[Tensor[N, 4]]) 346 | image_shapes (List[Tuple[H, W]]) 347 | targets (List[Dict]) 348 | """ 349 | 350 | # 检查targets的数据类型是否正确 351 | if targets is not None: 352 | for t in targets: 353 | floating_point_types = (torch.float, torch.double, torch.half) 354 | assert t["boxes"].dtype in floating_point_types, "target boxes must of float type" 355 | assert t["labels"].dtype == torch.int64, "target labels must of int64 type" 356 | 357 | if self.training: 358 | # 划分正负样本,统计对应gt的标签以及边界框回归信息 359 | proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets) 360 | else: 361 | labels = None 362 | regression_targets = None 363 | matched_idxs = None 364 | 365 | # 将采集样本通过roi_pooling层 366 | box_features = self.box_roi_pool(features, proposals, image_shapes) 367 | # 通过roi_pooling后的两层全连接层 368 | box_features = self.box_head(box_features) 369 | # 接着分别预测目标类别和边界框回归参数 class_logits(2048, 21) box_regression(2048,84) 370 | class_logits, box_regression = self.box_predictor(box_features) 371 | 372 | result = torch.jit.annotate(List[Dict[str, torch.Tensor]], []) 373 | losses = {} 374 | if self.training: 375 | assert labels is not None and regression_targets is not None 376 | loss_classifier, loss_box_reg = fastrcnn_loss( 377 | # labels:List[(512,), (512,), (512,), (512,)] regression_targets:Tuple((512,4),(512,4),(512,4),(512,4)) 378 | class_logits, box_regression, labels, regression_targets) 379 | losses = { 380 | "loss_classifier": loss_classifier, 381 | "loss_box_reg": loss_box_reg 382 | } 383 | else: 384 | boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes) 385 | num_images = len(boxes) 386 | for i in range(num_images): 387 | result.append( 388 | { 389 | "boxes": boxes[i], 390 | "labels": labels[i], 391 | "scores": scores[i], 392 | } 393 | ) 394 | 395 | return result, losses 396 | -------------------------------------------------------------------------------- /faster_rcnn/network_files/transform.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | import random 4 | import math 5 | from network_files.image_list import ImageList 6 | from torch.jit.annotations import List, Tuple, Dict, Optional 7 | import torchvision 8 | 9 | 10 | class GeneralizedRCNNTransform(nn.Module): 11 | """ 12 | Performs input / target transformation before feeding the data to a GeneralizedRCNN 13 | model. 14 | 15 | The transformations it perform are: 16 | - input normalization (mean subtraction and std division) 17 | - input / target resizing to match min_size / max_size 18 | 19 | It returns a ImageList for the inputs, and a List[Dict[Tensor]] for the targets 20 | """ 21 | 22 | def __init__(self, min_size, max_size, image_mean, image_std): 23 | super(GeneralizedRCNNTransform, self).__init__() 24 | if not isinstance(min_size, (list, tuple)): 25 | min_size = (min_size,) 26 | self.min_size = min_size # 指定图像的最小边长范围 27 | self.max_size = max_size # 指定图像的最大边长范围 28 | self.image_mean = image_mean # 指定图像在标准化处理中的均值 29 | self.image_std = image_std # 指定图像在标准化处理中的方差 30 | 31 | def normalize(self, image): 32 | """标准化处理""" 33 | dtype, device = image.dtype, image.device 34 | mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device) 35 | std = torch.as_tensor(self.image_std, dtype=dtype, device=device) 36 | # [:, None, None]: shape [3] -> [3, 1, 1] 37 | return (image - mean[:, None, None]) / std[:, None, None] 38 | 39 | def torch_choice(self, l): 40 | # type: (List[int]) 41 | """ 42 | Implements `random.choice` via torch ops so it can be compiled with 43 | TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803 44 | is fixed. 45 | """ 46 | index = int(torch.empty(1).uniform_(0., float(len(l))).item()) 47 | return l[index] 48 | 49 | def resize(self, image, target): 50 | # type: (Tensor, Optional[Dict[str, Tensor]]) 51 | """ 52 | 将图片缩放到指定的大小范围内,并对应缩放bboxes信息 53 | Args: 54 | image: 输入的图片 55 | target: 输入图片的相关信息(包括bboxes信息) 56 | 57 | Returns: 58 | image: 缩放后的图片 59 | target: 缩放bboxes后的图片相关信息 60 | """ 61 | # image shape is [channel, height, width] 62 | # 得到image的高度和宽度 63 | h, w = image.shape[-2:] 64 | im_shape = torch.tensor(image.shape[-2:]) 65 | min_size = float(torch.min(im_shape)) # 获取高宽中的最小值 66 | max_size = float(torch.max(im_shape)) # 获取高宽中的最大值 67 | if self.training: 68 | size = float(self.torch_choice(self.min_size)) # 指定输入图片的最小边长,注意是self.min_size不是min_size 69 | else: 70 | # FIXME assume for now that testing uses the largest scale 71 | size = float(self.min_size[-1]) # 指定输入图片的最小边长,注意是self.min_size不是min_size 72 | scale_factor = size / min_size # 根据指定最小边长和图片最小边长计算缩放比例 73 | 74 | # 如果使用该缩放比例计算的图片最大边长大于指定的最大边长 75 | if max_size * scale_factor > self.max_size: 76 | scale_factor = self.max_size / max_size # 将缩放比例设为指定最大边长和图片最大边长之比 77 | 78 | # interpolate利用插值的方法缩放图片 79 | # image[None]操作是在最前面添加batch维度[C, H, W] -> [1, C, H, W] 80 | # bilinear只支持4D Tensor 81 | image = torch.nn.functional.interpolate( 82 | image[None], scale_factor=scale_factor, mode='bilinear', align_corners=False)[0] 83 | 84 | if target is None: 85 | return image, target 86 | 87 | bbox = target["boxes"] 88 | # 根据图像的缩放比例来缩放bbox 89 | bbox = resize_boxes(bbox, (h, w), image.shape[-2:]) 90 | target["boxes"] = bbox 91 | 92 | return image, target 93 | 94 | # _onnx_batch_images() is an implementation of 95 | # batch_images() that is supported by ONNX tracing. 96 | @torch.jit.unused 97 | def _onnx_batch_images(self, images, size_divisible=32): 98 | # type: (List[Tensor], int) -> Tensor 99 | max_size = [] 100 | for i in range(images[0].dim()): 101 | max_size_i = torch.max(torch.stack([img.shape[i] for img in images]).to(torch.float32)).to(torch.int64) 102 | max_size.append(max_size_i) 103 | stride = size_divisible 104 | max_size[1] = (torch.ceil((max_size[1].to(torch.float32)) / stride) * stride).to(torch.int64) 105 | max_size[2] = (torch.ceil((max_size[2].to(torch.float32)) / stride) * stride).to(torch.int64) 106 | max_size = tuple(max_size) 107 | 108 | # work around for 109 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 110 | # which is not yet supported in onnx 111 | padded_imgs = [] 112 | for img in images: 113 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 114 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 115 | padded_imgs.append(padded_img) 116 | 117 | return torch.stack(padded_imgs) 118 | 119 | def max_by_axis(self, the_list): 120 | # type: (List[List[int]]) -> List[int] 121 | maxes = the_list[0] 122 | for sublist in the_list[1:]: 123 | for index, item in enumerate(sublist): 124 | maxes[index] = max(maxes[index], item) 125 | return maxes 126 | 127 | def batch_images(self, images, size_divisible=32): 128 | # type: (List[Tensor], int) 129 | """ 130 | 将一批图像打包成一个batch返回(注意batch中每个tensor的shape是相同的) 131 | Args: 132 | images: 输入的一批图片 133 | size_divisible: 将图像高和宽调整到该数的整数倍 134 | 135 | Returns: 136 | batched_imgs: 打包成一个batch后的tensor数据 137 | """ 138 | 139 | if torchvision._is_tracing(): 140 | # batch_images() does not export well to ONNX 141 | # call _onnx_batch_images() instead 142 | return self._onnx_batch_images(images, size_divisible) 143 | 144 | # 分别计算一个batch中所有图片中的最大height, width 145 | max_size = self.max_by_axis([list(img.shape) for img in images]) 146 | 147 | stride = float(size_divisible) 148 | # max_size = list(max_size) 149 | # 将height向上调整到stride的整数倍 150 | max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride) 151 | # 将width向上调整到stride的整数倍 152 | max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride) 153 | 154 | # [batch, channel, height, width] 155 | batch_shape = [len(images)] + max_size 156 | 157 | # 创建shape为batch_shape且值全部为0的tensor 158 | # images[0]就是一个tensor 为了调用tensor的new_full方法 返回全0的shape为batch_shape的tensor 159 | batched_imgs = images[0].new_full(batch_shape, 0) 160 | for img, pad_img in zip(images, batched_imgs): 161 | # 将输入images中的每张图片复制到新的batched_imgs的每张图片中,对齐左上角,保证bboxes的坐标不变 162 | # 这样保证输入到网络中一个batch的每张图片的shape相同 163 | # copy_: Copies the elements from src into self tensor and returns self 164 | # 把img的像素值复制到pad_img的相同位置处 165 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 166 | 167 | return batched_imgs 168 | 169 | def postprocess(self, result, image_shapes, original_image_sizes): 170 | # type: (List[Dict[str, Tensor]], List[Tuple[int, int]], List[Tuple[int, int]]) 171 | """ 172 | 对网络的预测结果进行后处理(主要将bboxes还原到原图像尺度上) 173 | Args: 174 | result: list(dict), 网络的预测结果, len(result) == batch_size 175 | image_shapes: list(torch.Size), 图像预处理缩放后的尺寸, len(image_shapes) == batch_size 176 | original_image_sizes: list(torch.Size), 图像的原始尺寸, len(original_image_sizes) == batch_size 177 | 178 | Returns: 179 | 180 | """ 181 | if self.training: 182 | return result 183 | for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)): 184 | boxes = pred["boxes"] 185 | boxes = resize_boxes(boxes, im_s, o_im_s) # 将bboxes缩放回原图像尺度上 186 | result[i]["boxes"] = boxes 187 | return result 188 | 189 | def __repr__(self): 190 | """自定义输出实例化对象的信息,可通过print打印实例信息""" 191 | format_string = self.__class__.__name__ + '(' 192 | _indent = '\n ' 193 | format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std) 194 | format_string += "{0}Resize(min_size={1}, max_size={2}, mode='bilinear')".format(_indent, self.min_size, 195 | self.max_size) 196 | format_string += '\n)' 197 | return format_string 198 | 199 | def forward(self, images, targets=None): 200 | # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) 201 | images = [img for img in images] 202 | for i in range(len(images)): 203 | image = images[i] 204 | target_index = targets[i] if targets is not None else None 205 | 206 | if image.dim() != 3: 207 | raise ValueError("images is expected to be a list of 3d tensors " 208 | "of shape [C, H, W], got {}".format(image.shape)) 209 | image = self.normalize(image) # 对图像进行标准化处理 210 | image, target_index = self.resize(image, target_index) # 对图像和对应的bboxes缩放到指定范围 211 | images[i] = image 212 | if targets is not None and target_index is not None: 213 | targets[i] = target_index 214 | 215 | # 记录resize后的图像尺寸 216 | image_sizes = [img.shape[-2:] for img in images] 217 | images = self.batch_images(images) # 将images打包成一个batch 218 | image_sizes_list = torch.jit.annotate(List[Tuple[int, int]], []) 219 | 220 | for image_size in image_sizes: 221 | assert len(image_size) == 2 222 | image_sizes_list.append((image_size[0], image_size[1])) 223 | 224 | image_list = ImageList(images, image_sizes_list) 225 | return image_list, targets 226 | 227 | 228 | def resize_boxes(boxes, original_size, new_size): 229 | # type: (Tensor, List[int], List[int]) -> Tensor 230 | """ 231 | 将boxes参数根据图像的缩放情况进行相应缩放 232 | 233 | Arguments: 234 | original_size: 图像缩放前的尺寸 235 | new_size: 图像缩放后的尺寸 236 | """ 237 | ratios = [ 238 | torch.tensor(s, dtype=torch.float32, device=boxes.device) / 239 | torch.tensor(s_orig, dtype=torch.float32, device=boxes.device) 240 | for s, s_orig in zip(new_size, original_size) 241 | ] 242 | ratios_height, ratios_width = ratios 243 | # Removes a tensor dimension, boxes [minibatch, 4] 244 | # Returns a tuple of all slices along a given dimension, already without it. 245 | xmin, ymin, xmax, ymax = boxes.unbind(1) 246 | xmin = xmin * ratios_width 247 | xmax = xmax * ratios_width 248 | ymin = ymin * ratios_height 249 | ymax = ymax * ratios_height 250 | return torch.stack((xmin, ymin, xmax, ymax), dim=1) 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /faster_rcnn/pascal_voc_classes.json: -------------------------------------------------------------------------------- 1 | { 2 | "aeroplane": 1, 3 | "bicycle": 2, 4 | "bird": 3, 5 | "boat": 4, 6 | "bottle": 5, 7 | "bus": 6, 8 | "car": 7, 9 | "cat": 8, 10 | "chair": 9, 11 | "cow": 10, 12 | "diningtable": 11, 13 | "dog": 12, 14 | "horse": 13, 15 | "motorbike": 14, 16 | "person": 15, 17 | "pottedplant": 16, 18 | "sheep": 17, 19 | "sofa": 18, 20 | "train": 19, 21 | "tvmonitor": 20 22 | } -------------------------------------------------------------------------------- /faster_rcnn/predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from torchvision import transforms 4 | from network_files.faster_rcnn_framework import FasterRCNN, FastRCNNPredictor 5 | from backbone.resnet50_fpn_model import resnet50_fpn_backbone 6 | from network_files.rpn_function import AnchorsGenerator 7 | from backbone.mobilenetv2_model import MobileNetV2 8 | from draw_box_utils import draw_box 9 | from PIL import Image 10 | import json 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | def create_model(num_classes): 15 | # mobileNetv2+faster_RCNN 16 | # backbone = MobileNetV2().features 17 | # backbone.out_channels = 1280 18 | # 19 | # anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),), 20 | # aspect_ratios=((0.5, 1.0, 2.0),)) 21 | # 22 | # roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], 23 | # output_size=[7, 7], 24 | # sampling_ratio=2) 25 | # 26 | # model = FasterRCNN(backbone=backbone, 27 | # num_classes=num_classes, 28 | # rpn_anchor_generator=anchor_generator, 29 | # box_roi_pool=roi_pooler) 30 | 31 | # resNet50+fpn+faster_RCNN 32 | backbone = resnet50_fpn_backbone() 33 | model = FasterRCNN(backbone=backbone, num_classes=num_classes) 34 | 35 | return model 36 | 37 | 38 | # get devices 39 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 40 | print(device) 41 | 42 | # create model 43 | model = create_model(num_classes=21) 44 | 45 | # load train weights 46 | train_weights = "./save_weights/model.pth" 47 | model.load_state_dict(torch.load(train_weights)["model"]) 48 | model.to(device) 49 | 50 | # read class_indict 51 | category_index = {} 52 | try: 53 | json_file = open('./pascal_voc_classes.json', 'r') 54 | class_dict = json.load(json_file) 55 | category_index = {v: k for k, v in class_dict.items()} 56 | except Exception as e: 57 | print(e) 58 | exit(-1) 59 | 60 | # load image 61 | original_img = Image.open("./test.jpg") 62 | 63 | # from pil image to tensor, do not normalize image 64 | data_transform = transforms.Compose([transforms.ToTensor()]) 65 | img = data_transform(original_img) 66 | # expand batch dimension 67 | img = torch.unsqueeze(img, dim=0) 68 | 69 | model.eval() 70 | with torch.no_grad(): 71 | predictions = model(img.to(device))[0] 72 | predict_boxes = predictions["boxes"].to("cpu").numpy() 73 | predict_classes = predictions["labels"].to("cpu").numpy() 74 | predict_scores = predictions["scores"].to("cpu").numpy() 75 | 76 | if len(predict_boxes) == 0: 77 | print("没有检测到任何目标!") 78 | 79 | draw_box(original_img, 80 | predict_boxes, 81 | predict_classes, 82 | predict_scores, 83 | category_index, 84 | thresh=0.5, 85 | line_thickness=5) 86 | plt.imshow(original_img) 87 | plt.show() 88 | -------------------------------------------------------------------------------- /faster_rcnn/spilt_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | 5 | files_path = "./VOCdevkit/VOC2012/Annotations" 6 | if not os.path.exists(files_path): 7 | print("文件夹不存在") 8 | exit(1) 9 | val_rate = 0.5 10 | 11 | files_name = sorted([file.split(".")[0] for file in os.listdir(files_path)]) 12 | files_num = len(files_name) 13 | val_index = random.sample(range(0, files_num), k=int(files_num*val_rate)) 14 | train_files = [] 15 | val_files = [] 16 | for index, file_name in enumerate(files_name): 17 | if index in val_index: 18 | val_files.append(file_name) 19 | else: 20 | train_files.append(file_name) 21 | 22 | try: 23 | train_f = open("train.txt", "x") 24 | eval_f = open("val.txt", "x") 25 | train_f.write("\n".join(train_files)) 26 | eval_f.write("\n".join(val_files)) 27 | except FileExistsError as e: 28 | print(e) 29 | exit(1) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /faster_rcnn/train_mobilenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import transforms 4 | from network_files.faster_rcnn_framework import FasterRCNN 5 | from network_files.rpn_function import AnchorsGenerator 6 | from backbone.mobilenetv2_model import MobileNetV2 7 | from torchvision.ops import misc 8 | from my_dataset import VOC2012DataSet 9 | from train_utils import train_eval_utils as utils 10 | import os 11 | 12 | 13 | def create_model(num_classes): 14 | backbone = MobileNetV2(weights_path="./backbone/mobilenet_v2.pth").features 15 | backbone.out_channels = 1280 16 | 17 | anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),), 18 | aspect_ratios=((0.5, 1.0, 2.0),)) 19 | 20 | roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], # 在哪些特征层上进行roi pooling 21 | output_size=[7, 7], # roi_pooling输出特征矩阵尺寸 22 | sampling_ratio=2) # 采样率 23 | 24 | model = FasterRCNN(backbone=backbone, 25 | num_classes=num_classes, 26 | rpn_anchor_generator=anchor_generator, 27 | box_roi_pool=roi_pooler) 28 | 29 | return model 30 | 31 | 32 | def main(): 33 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 34 | print(device) 35 | 36 | # 检查保存权重文件夹是否存在,不存在则创建 37 | if not os.path.exists("save_weights"): 38 | os.makedirs("save_weights") 39 | 40 | data_transform = { 41 | "train": transforms.Compose([transforms.ToTensor(), 42 | transforms.RandomHorizontalFlip(0.5)]), 43 | "val": transforms.Compose([transforms.ToTensor()]) 44 | } 45 | 46 | VOC_root = os.getcwd() 47 | # load train data set 48 | train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True) 49 | # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch 50 | train_data_loader = torch.utils.data.DataLoader(train_data_set, 51 | batch_size=8, 52 | shuffle=True, 53 | num_workers=0, 54 | collate_fn=utils.collate_fn) 55 | 56 | # load validation data set 57 | val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False) 58 | val_data_set_loader = torch.utils.data.DataLoader(val_data_set, 59 | batch_size=1, 60 | shuffle=False, 61 | num_workers=0, 62 | collate_fn=utils.collate_fn) 63 | 64 | # create model num_classes equal background + 20 classes 65 | model = create_model(num_classes=21) 66 | print(model) 67 | 68 | model.to(device) 69 | 70 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 71 | # first frozen backbone and train 5 epochs # 72 | # 首先冻结前置特征提取网络权重(backbone),训练rpn以及最终预测网络部分 # 73 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 74 | for param in model.backbone.parameters(): 75 | param.requires_grad = False 76 | 77 | # define optimizer 78 | params = [p for p in model.parameters() if p.requires_grad] 79 | optimizer = torch.optim.SGD(params, lr=0.005, 80 | momentum=0.9, weight_decay=0.0005) 81 | 82 | num_epochs = 5 83 | for epoch in range(num_epochs): 84 | # train for one epoch, printing every 10 iterations 85 | utils.train_one_epoch(model, optimizer, train_data_loader, 86 | device, epoch, print_freq=50) 87 | 88 | # evaluate on the test dataset 89 | utils.evaluate(model, val_data_set_loader, device=device) 90 | 91 | torch.save(model.state_dict(), "./save_weights/pretrain.pth") 92 | 93 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # 94 | # second unfrozen backbone and train all network # 95 | # 解冻前置特征提取网络权重(backbone),接着训练整个网络权重 # 96 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # 97 | 98 | # 冻结backbone部分底层权重 99 | for name, parameter in model.backbone.named_parameters(): 100 | split_name = name.split(".")[0] 101 | if split_name in ["0", "1", "2", "3"]: 102 | parameter.requires_grad = False 103 | else: 104 | parameter.requires_grad = True 105 | 106 | # define optimizer 107 | params = [p for p in model.parameters() if p.requires_grad] 108 | optimizer = torch.optim.SGD(params, lr=0.005, 109 | momentum=0.9, weight_decay=0.0005) 110 | # learning rate scheduler 111 | lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 112 | step_size=5, 113 | gamma=0.33) 114 | num_epochs = 20 115 | for epoch in range(num_epochs): 116 | # train for one epoch, printing every 50 iterations 117 | utils.train_one_epoch(model, optimizer, train_data_loader, 118 | device, epoch, print_freq=50, warmup=True) 119 | # update the learning rate 120 | lr_scheduler.step() 121 | 122 | # evaluate on the test dataset 123 | utils.evaluate(model, val_data_set_loader, device=device) 124 | 125 | # save weights 126 | if epoch > 10: 127 | save_files = { 128 | 'model': model.state_dict(), 129 | 'optimizer': optimizer.state_dict(), 130 | 'lr_scheduler': lr_scheduler.state_dict(), 131 | 'epoch': epoch} 132 | torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch)) 133 | 134 | # model.eval() 135 | # x = [torch.rand(3, 300, 400), torch.rand(3, 400, 400)] 136 | # predictions = model(x) 137 | # print(predictions) 138 | 139 | 140 | if __name__ == "__main__": 141 | main() 142 | -------------------------------------------------------------------------------- /faster_rcnn/train_multi_GPU.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import train_utils.train_eval_utils as utils 3 | import time 4 | import os 5 | import datetime 6 | from my_dataset import VOC2012DataSet 7 | import transforms 8 | from train_utils.group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups 9 | from backbone.resnet50_fpn_model import resnet50_fpn_backbone 10 | from network_files.faster_rcnn_framework import FasterRCNN, FastRCNNPredictor 11 | import torch.multiprocessing as mp 12 | 13 | 14 | def create_model(num_classes): 15 | backbone = resnet50_fpn_backbone() 16 | model = FasterRCNN(backbone=backbone, num_classes=91) 17 | # 载入预训练模型权重 18 | weights_dict = torch.load("./backbone/fasterrcnn_resnet50_fpn_coco.pth") 19 | missing_keys, unexpected_keys = model.load_state_dict(weights_dict, strict=False) 20 | if len(missing_keys) != 0 or len(unexpected_keys) != 0: 21 | print("missing_keys: ", missing_keys) 22 | print("unexpected_keys: ", unexpected_keys) 23 | 24 | # get number of input features for the classifier 25 | in_features = model.roi_heads.box_predictor.cls_score.in_features 26 | # replace the pre-trained head with a new one 27 | model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) 28 | 29 | return model 30 | 31 | 32 | # def main_worker(args): 33 | def main(args): 34 | print(args) 35 | # mp.spawn(main_worker, args=(args,), nprocs=args.world_size, join=True) 36 | utils.init_distributed_mode(args) 37 | 38 | device = torch.device(args.device) 39 | 40 | # Data loading code 41 | print("Loading data") 42 | 43 | data_transform = { 44 | "train": transforms.Compose([transforms.ToTensor(), 45 | transforms.RandomHorizontalFlip(0.5)]), 46 | "val": transforms.Compose([transforms.ToTensor()]) 47 | } 48 | 49 | VOC_root = args.data_path 50 | # load train data set 51 | train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True) 52 | 53 | # load validation data set 54 | val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False) 55 | 56 | print("Creating data loaders") 57 | if args.distributed: 58 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set) 59 | test_sampler = torch.utils.data.distributed.DistributedSampler(val_data_set) 60 | else: 61 | train_sampler = torch.utils.data.RandomSampler(train_data_set) 62 | test_sampler = torch.utils.data.SequentialSampler(val_data_set) 63 | 64 | if args.aspect_ratio_group_factor >= 0: 65 | # 统计所有图像比例在bins区间中的位置索引 66 | group_ids = create_aspect_ratio_groups(train_data_set, k=args.aspect_ratio_group_factor) 67 | train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) 68 | else: 69 | train_batch_sampler = torch.utils.data.BatchSampler( 70 | train_sampler, args.batch_size, drop_last=True) 71 | 72 | data_loader = torch.utils.data.DataLoader( 73 | train_data_set, batch_sampler=train_batch_sampler, num_workers=args.workers, 74 | collate_fn=utils.collate_fn) 75 | 76 | data_loader_test = torch.utils.data.DataLoader( 77 | val_data_set, batch_size=1, 78 | sampler=test_sampler, num_workers=args.workers, 79 | collate_fn=utils.collate_fn) 80 | 81 | print("Creating model") 82 | model = create_model(num_classes=21) 83 | model.to(device) 84 | 85 | model_without_ddp = model 86 | if args.distributed: 87 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 88 | model_without_ddp = model.module 89 | 90 | params = [p for p in model.parameters() if p.requires_grad] 91 | optimizer = torch.optim.SGD( 92 | params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) 93 | 94 | # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) 95 | lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) 96 | 97 | # 如果传入resume参数,即上次训练的权重地址,则接着上次的参数训练 98 | if args.resume: 99 | # If map_location is missing, torch.load will first load the module to CPU 100 | # and then copy each parameter to where it was saved, 101 | # which would result in all processes on the same machine using the same set of devices. 102 | checkpoint = torch.load(args.resume, map_location='cpu') # 读取之前保存的权重文件(包括优化器以及学习率策略) 103 | model_without_ddp.load_state_dict(checkpoint['model']) 104 | optimizer.load_state_dict(checkpoint['optimizer']) 105 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 106 | args.start_epoch = checkpoint['epoch'] + 1 107 | 108 | if args.test_only: 109 | utils.evaluate(model, data_loader_test, device=device) 110 | return 111 | 112 | print("Start training") 113 | start_time = time.time() 114 | for epoch in range(args.start_epoch, args.epochs): 115 | if args.distributed: 116 | train_sampler.set_epoch(epoch) 117 | utils.train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq) 118 | lr_scheduler.step() 119 | if args.output_dir: 120 | # 只在主节点上执行保存权重操作 121 | utils.save_on_master({ 122 | 'model': model_without_ddp.state_dict(), 123 | 'optimizer': optimizer.state_dict(), 124 | 'lr_scheduler': lr_scheduler.state_dict(), 125 | 'args': args, 126 | 'epoch': epoch}, 127 | os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) 128 | 129 | # evaluate after every epoch 130 | utils.evaluate(model, data_loader_test, device=device) 131 | 132 | total_time = time.time() - start_time 133 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 134 | print('Training time {}'.format(total_time_str)) 135 | 136 | 137 | if __name__ == "__main__": 138 | import argparse 139 | parser = argparse.ArgumentParser( 140 | description=__doc__) 141 | 142 | # 训练文件的根目录 143 | parser.add_argument('--data-path', default='./', help='dataset') 144 | # 训练设备类型 145 | parser.add_argument('--device', default='cuda', help='device') 146 | # 每块GPU上的batch_size 147 | parser.add_argument('-b', '--batch-size', default=2, type=int, 148 | help='images per gpu, the total batch size is $NGPU x batch_size') 149 | # 指定接着从哪个epoch数开始训练 150 | parser.add_argument('--start_epoch', default=0, type=int, help='start epoch') 151 | # 训练的总epoch数 152 | parser.add_argument('--epochs', default=20, type=int, metavar='N', 153 | help='number of total epochs to run') 154 | # 数据加载以及预处理的线程数 155 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 156 | help='number of data loading workers (default: 4)') 157 | # 学习率,这个需要根据gpu的数量以及batch_size进行设置0.02 / 8 * num_GPU 158 | parser.add_argument('--lr', default=0.02, type=float, 159 | help='initial learning rate, 0.02 is the default value for training ' 160 | 'on 8 gpus and 2 images_per_gpu') 161 | # SGD的momentum参数 162 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 163 | help='momentum') 164 | # SGD的weight_decay参数 165 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 166 | metavar='W', help='weight decay (default: 1e-4)', 167 | dest='weight_decay') 168 | # 针对torch.optim.lr_scheduler.StepLR的参数 169 | parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs') 170 | # 针对torch.optim.lr_scheduler.MultiStepLR的参数 171 | parser.add_argument('--lr-steps', default=[7, 12], nargs='+', type=int, help='decrease lr every step-size epochs') 172 | # 针对torch.optim.lr_scheduler.MultiStepLR的参数 173 | parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma') 174 | # 训练过程打印信息的频率 175 | parser.add_argument('--print-freq', default=20, type=int, help='print frequency') 176 | # 文件保存地址 177 | parser.add_argument('--output-dir', default='./multi_train', help='path where to save') 178 | # 基于上次的训练结果接着训练 179 | parser.add_argument('--resume', default='', help='resume from checkpoint') 180 | parser.add_argument('--aspect-ratio-group-factor', default=3, type=int) 181 | # 不训练,仅测试 182 | parser.add_argument( 183 | "--test-only", 184 | dest="test_only", 185 | help="Only test the model", 186 | action="store_true", 187 | ) 188 | 189 | # 开启的进程数(注意不是线程) 190 | parser.add_argument('--world-size', default=4, type=int, 191 | help='number of distributed processes') 192 | parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training') 193 | 194 | args = parser.parse_args() 195 | 196 | # 如果指定了保存文件地址,检查文件夹是否存在,若不存在,则创建 197 | if args.output_dir: 198 | utils.mkdir(args.output_dir) 199 | 200 | main(args) 201 | -------------------------------------------------------------------------------- /faster_rcnn/train_res50_fpn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transforms 3 | from network_files.faster_rcnn_framework import FasterRCNN, FastRCNNPredictor 4 | from backbone.resnet50_fpn_model import resnet50_fpn_backbone 5 | from my_dataset import VOC2012DataSet 6 | from train_utils import train_eval_utils as utils 7 | import os 8 | 9 | 10 | def create_model(num_classes): 11 | backbone = resnet50_fpn_backbone() 12 | model = FasterRCNN(backbone=backbone, num_classes=91) 13 | # 载入预训练模型权重 14 | weights_dict = torch.load("./backbone/fasterrcnn_resnet50_fpn_coco.pth") 15 | missing_keys, unexpected_keys = model.load_state_dict(weights_dict, strict=False) 16 | if len(missing_keys) != 0 or len(unexpected_keys) != 0: 17 | print("missing_keys: ", missing_keys) 18 | print("unexpected_keys: ", unexpected_keys) 19 | 20 | # get number of input features for the classifier 21 | in_features = model.roi_heads.box_predictor.cls_score.in_features 22 | # replace the pre-trained head with a new one 23 | model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) 24 | 25 | return model 26 | 27 | 28 | def main(parser_data): 29 | device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu") 30 | print(device) 31 | 32 | data_transform = { 33 | "train": transforms.Compose([transforms.ToTensor(), 34 | transforms.RandomHorizontalFlip(0.5)]), 35 | "val": transforms.Compose([transforms.ToTensor()]) 36 | } 37 | 38 | VOC_root = parser_data.data_path 39 | # load train data set 40 | train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True) 41 | # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch 42 | train_data_loader = torch.utils.data.DataLoader(train_data_set, 43 | batch_size=4, 44 | shuffle=True, 45 | num_workers=0, 46 | collate_fn=utils.collate_fn) 47 | 48 | # load validation data set 49 | val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False) 50 | val_data_set_loader = torch.utils.data.DataLoader(val_data_set, 51 | batch_size=2, 52 | shuffle=False, 53 | num_workers=0, 54 | collate_fn=utils.collate_fn) 55 | 56 | # create model num_classes equal background + 20 classes 57 | model = create_model(num_classes=21) 58 | print(model) 59 | 60 | model.to(device) 61 | 62 | # define optimizer 63 | params = [p for p in model.parameters() if p.requires_grad] 64 | optimizer = torch.optim.SGD(params, lr=0.005, 65 | momentum=0.9, weight_decay=0.0005) 66 | # learning rate scheduler 67 | lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 68 | step_size=5, 69 | gamma=0.33) 70 | 71 | # 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练 72 | if parser_data.resume != "": 73 | checkpoint = torch.load(parser_data.resume) 74 | model.load_state_dict(checkpoint['model']) 75 | optimizer.load_state_dict(checkpoint['optimizer']) 76 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 77 | parser_data.start_epoch = checkpoint['epoch'] + 1 78 | print("the training process from epoch{}...".format(parser_data.start_epoch)) 79 | 80 | for epoch in range(parser_data.start_epoch, parser_data.epochs): 81 | # train for one epoch, printing every 10 iterations 82 | utils.train_one_epoch(model, optimizer, train_data_loader, 83 | device, epoch, print_freq=50, warmup=True) 84 | # update the learning rate 85 | lr_scheduler.step() 86 | 87 | # evaluate on the test dataset 88 | utils.evaluate(model, val_data_set_loader, device=device) 89 | 90 | # save weights 91 | save_files = { 92 | 'model': model.state_dict(), 93 | 'optimizer': optimizer.state_dict(), 94 | 'lr_scheduler': lr_scheduler.state_dict(), 95 | 'epoch': epoch} 96 | torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch)) 97 | 98 | # model.eval() 99 | # x = [torch.rand(3, 300, 400), torch.rand(3, 400, 400)] 100 | # predictions = model(x) 101 | # print(predictions) 102 | 103 | 104 | if __name__ == "__main__": 105 | import argparse 106 | 107 | parser = argparse.ArgumentParser( 108 | description=__doc__) 109 | 110 | # 训练设备类型 111 | parser.add_argument('--device', default='cuda:0', help='device') 112 | # 训练数据集的根目录 113 | parser.add_argument('--data-path', default='../ssd', help='dataset') 114 | # 文件保存地址 115 | parser.add_argument('--output-dir', default='./save_weights', help='path where to save') 116 | # 若需要接着上次训练,则指定上次训练保存权重文件地址 117 | parser.add_argument('--resume', default='', type=str, help='resume from checkpoint') 118 | # 指定接着从哪个epoch数开始训练 119 | parser.add_argument('--start_epoch', default=0, type=int, help='start epoch') 120 | # 训练的总epoch数 121 | parser.add_argument('--epochs', default=15, type=int, metavar='N', 122 | help='number of total epochs to run') 123 | 124 | args = parser.parse_args() 125 | print(args) 126 | 127 | # 检查保存权重文件夹是否存在,不存在则创建 128 | if not os.path.exists(args.output_dir): 129 | os.makedirs(args.output_dir) 130 | 131 | main(args) 132 | -------------------------------------------------------------------------------- /faster_rcnn/train_utils/__pycache__/coco_eval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/train_utils/__pycache__/coco_eval.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/train_utils/__pycache__/coco_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/train_utils/__pycache__/coco_utils.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/train_utils/__pycache__/train_eval_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/faster_rcnn/train_utils/__pycache__/train_eval_utils.cpython-37.pyc -------------------------------------------------------------------------------- /faster_rcnn/train_utils/coco_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | 4 | import numpy as np 5 | import copy 6 | import time 7 | import torch 8 | import torch._six 9 | 10 | from pycocotools.cocoeval import COCOeval 11 | from pycocotools.coco import COCO 12 | import pycocotools.mask as mask_util 13 | 14 | from collections import defaultdict 15 | 16 | from train_utils import train_eval_utils as utils 17 | 18 | 19 | class CocoEvaluator(object): 20 | def __init__(self, coco_gt, iou_types): 21 | assert isinstance(iou_types, (list, tuple)) 22 | coco_gt = copy.deepcopy(coco_gt) 23 | self.coco_gt = coco_gt 24 | 25 | self.iou_types = iou_types 26 | self.coco_eval = {} 27 | for iou_type in iou_types: 28 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) 29 | 30 | self.img_ids = [] 31 | self.eval_imgs = {k: [] for k in iou_types} 32 | 33 | def update(self, predictions): 34 | img_ids = list(np.unique(list(predictions.keys()))) 35 | self.img_ids.extend(img_ids) 36 | 37 | for iou_type in self.iou_types: 38 | results = self.prepare(predictions, iou_type) 39 | coco_dt = loadRes(self.coco_gt, results) if results else COCO() 40 | coco_eval = self.coco_eval[iou_type] 41 | 42 | coco_eval.cocoDt = coco_dt 43 | coco_eval.params.imgIds = list(img_ids) 44 | img_ids, eval_imgs = evaluate(coco_eval) 45 | 46 | self.eval_imgs[iou_type].append(eval_imgs) 47 | 48 | def synchronize_between_processes(self): 49 | for iou_type in self.iou_types: 50 | self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) 51 | create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) 52 | 53 | def accumulate(self): 54 | for coco_eval in self.coco_eval.values(): 55 | coco_eval.accumulate() 56 | 57 | def summarize(self): 58 | for iou_type, coco_eval in self.coco_eval.items(): 59 | print("IoU metric: {}".format(iou_type)) 60 | coco_eval.summarize() 61 | 62 | def prepare(self, predictions, iou_type): 63 | if iou_type == "bbox": 64 | return self.prepare_for_coco_detection(predictions) 65 | elif iou_type == "segm": 66 | return self.prepare_for_coco_segmentation(predictions) 67 | elif iou_type == "keypoints": 68 | return self.prepare_for_coco_keypoint(predictions) 69 | else: 70 | raise ValueError("Unknown iou type {}".format(iou_type)) 71 | 72 | def prepare_for_coco_detection(self, predictions): 73 | coco_results = [] 74 | for original_id, prediction in predictions.items(): 75 | if len(prediction) == 0: 76 | continue 77 | 78 | boxes = prediction["boxes"] 79 | boxes = convert_to_xywh(boxes).tolist() 80 | scores = prediction["scores"].tolist() 81 | labels = prediction["labels"].tolist() 82 | 83 | coco_results.extend( 84 | [ 85 | { 86 | "image_id": original_id, 87 | "category_id": labels[k], 88 | "bbox": box, 89 | "score": scores[k], 90 | } 91 | for k, box in enumerate(boxes) 92 | ] 93 | ) 94 | return coco_results 95 | 96 | def prepare_for_coco_segmentation(self, predictions): 97 | coco_results = [] 98 | for original_id, prediction in predictions.items(): 99 | if len(prediction) == 0: 100 | continue 101 | 102 | scores = prediction["scores"] 103 | labels = prediction["labels"] 104 | masks = prediction["masks"] 105 | 106 | masks = masks > 0.5 107 | 108 | scores = prediction["scores"].tolist() 109 | labels = prediction["labels"].tolist() 110 | 111 | rles = [ 112 | mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] 113 | for mask in masks 114 | ] 115 | for rle in rles: 116 | rle["counts"] = rle["counts"].decode("utf-8") 117 | 118 | coco_results.extend( 119 | [ 120 | { 121 | "image_id": original_id, 122 | "category_id": labels[k], 123 | "segmentation": rle, 124 | "score": scores[k], 125 | } 126 | for k, rle in enumerate(rles) 127 | ] 128 | ) 129 | return coco_results 130 | 131 | def prepare_for_coco_keypoint(self, predictions): 132 | coco_results = [] 133 | for original_id, prediction in predictions.items(): 134 | if len(prediction) == 0: 135 | continue 136 | 137 | boxes = prediction["boxes"] 138 | boxes = convert_to_xywh(boxes).tolist() 139 | scores = prediction["scores"].tolist() 140 | labels = prediction["labels"].tolist() 141 | keypoints = prediction["keypoints"] 142 | keypoints = keypoints.flatten(start_dim=1).tolist() 143 | 144 | coco_results.extend( 145 | [ 146 | { 147 | "image_id": original_id, 148 | "category_id": labels[k], 149 | 'keypoints': keypoint, 150 | "score": scores[k], 151 | } 152 | for k, keypoint in enumerate(keypoints) 153 | ] 154 | ) 155 | return coco_results 156 | 157 | 158 | def convert_to_xywh(boxes): 159 | xmin, ymin, xmax, ymax = boxes.unbind(1) 160 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) 161 | 162 | 163 | def merge(img_ids, eval_imgs): 164 | all_img_ids = utils.all_gather(img_ids) 165 | all_eval_imgs = utils.all_gather(eval_imgs) 166 | 167 | merged_img_ids = [] 168 | for p in all_img_ids: 169 | merged_img_ids.extend(p) 170 | 171 | merged_eval_imgs = [] 172 | for p in all_eval_imgs: 173 | merged_eval_imgs.append(p) 174 | 175 | merged_img_ids = np.array(merged_img_ids) 176 | merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) 177 | 178 | # keep only unique (and in sorted order) images 179 | merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) 180 | merged_eval_imgs = merged_eval_imgs[..., idx] 181 | 182 | return merged_img_ids, merged_eval_imgs 183 | 184 | 185 | def create_common_coco_eval(coco_eval, img_ids, eval_imgs): 186 | img_ids, eval_imgs = merge(img_ids, eval_imgs) 187 | img_ids = list(img_ids) 188 | eval_imgs = list(eval_imgs.flatten()) 189 | 190 | coco_eval.evalImgs = eval_imgs 191 | coco_eval.params.imgIds = img_ids 192 | coco_eval._paramsEval = copy.deepcopy(coco_eval.params) 193 | 194 | 195 | ################################################################# 196 | # From pycocotools, just removed the prints and fixed 197 | # a Python3 bug about unicode not defined 198 | ################################################################# 199 | 200 | # Ideally, pycocotools wouldn't have hard-coded prints 201 | # so that we could avoid copy-pasting those two functions 202 | 203 | def createIndex(self): 204 | # create index 205 | # print('creating index...') 206 | anns, cats, imgs = {}, {}, {} 207 | imgToAnns, catToImgs = defaultdict(list), defaultdict(list) 208 | if 'annotations' in self.dataset: 209 | for ann in self.dataset['annotations']: 210 | imgToAnns[ann['image_id']].append(ann) 211 | anns[ann['id']] = ann 212 | 213 | if 'images' in self.dataset: 214 | for img in self.dataset['images']: 215 | imgs[img['id']] = img 216 | 217 | if 'categories' in self.dataset: 218 | for cat in self.dataset['categories']: 219 | cats[cat['id']] = cat 220 | 221 | if 'annotations' in self.dataset and 'categories' in self.dataset: 222 | for ann in self.dataset['annotations']: 223 | catToImgs[ann['category_id']].append(ann['image_id']) 224 | 225 | # print('index created!') 226 | 227 | # create class members 228 | self.anns = anns 229 | self.imgToAnns = imgToAnns 230 | self.catToImgs = catToImgs 231 | self.imgs = imgs 232 | self.cats = cats 233 | 234 | 235 | maskUtils = mask_util 236 | 237 | 238 | def loadRes(self, resFile): 239 | """ 240 | Load result file and return a result api object. 241 | :param resFile (str) : file name of result file 242 | :return: res (obj) : result api object 243 | """ 244 | res = COCO() 245 | res.dataset['images'] = [img for img in self.dataset['images']] 246 | 247 | # print('Loading and preparing results...') 248 | # tic = time.time() 249 | if isinstance(resFile, torch._six.string_classes): 250 | anns = json.load(open(resFile)) 251 | elif type(resFile) == np.ndarray: 252 | anns = self.loadNumpyAnnotations(resFile) 253 | else: 254 | anns = resFile 255 | assert type(anns) == list, 'results in not an array of objects' 256 | annsImgIds = [ann['image_id'] for ann in anns] 257 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 258 | 'Results do not correspond to current coco set' 259 | if 'caption' in anns[0]: 260 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 261 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 262 | for id, ann in enumerate(anns): 263 | ann['id'] = id + 1 264 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 265 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 266 | for id, ann in enumerate(anns): 267 | bb = ann['bbox'] 268 | x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] 269 | if 'segmentation' not in ann: 270 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 271 | ann['area'] = bb[2] * bb[3] 272 | ann['id'] = id + 1 273 | ann['iscrowd'] = 0 274 | elif 'segmentation' in anns[0]: 275 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 276 | for id, ann in enumerate(anns): 277 | # now only support compressed RLE format as segmentation results 278 | ann['area'] = maskUtils.area(ann['segmentation']) 279 | if 'bbox' not in ann: 280 | ann['bbox'] = maskUtils.toBbox(ann['segmentation']) 281 | ann['id'] = id + 1 282 | ann['iscrowd'] = 0 283 | elif 'keypoints' in anns[0]: 284 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 285 | for id, ann in enumerate(anns): 286 | s = ann['keypoints'] 287 | x = s[0::3] 288 | y = s[1::3] 289 | x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y) 290 | ann['area'] = (x2 - x1) * (y2 - y1) 291 | ann['id'] = id + 1 292 | ann['bbox'] = [x1, y1, x2 - x1, y2 - y1] 293 | # print('DONE (t={:0.2f}s)'.format(time.time()- tic)) 294 | 295 | res.dataset['annotations'] = anns 296 | createIndex(res) 297 | return res 298 | 299 | 300 | def evaluate(self): 301 | ''' 302 | Run per image evaluation on given images and store results (a list of dict) in self.evalImgs 303 | :return: None 304 | ''' 305 | # tic = time.time() 306 | # print('Running per image evaluation...') 307 | p = self.params 308 | # add backward compatibility if useSegm is specified in params 309 | if p.useSegm is not None: 310 | p.iouType = 'segm' if p.useSegm == 1 else 'bbox' 311 | print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) 312 | # print('Evaluate annotation type *{}*'.format(p.iouType)) 313 | p.imgIds = list(np.unique(p.imgIds)) 314 | if p.useCats: 315 | p.catIds = list(np.unique(p.catIds)) 316 | p.maxDets = sorted(p.maxDets) 317 | self.params = p 318 | 319 | self._prepare() 320 | # loop through images, area range, max detection number 321 | catIds = p.catIds if p.useCats else [-1] 322 | 323 | if p.iouType == 'segm' or p.iouType == 'bbox': 324 | computeIoU = self.computeIoU 325 | elif p.iouType == 'keypoints': 326 | computeIoU = self.computeOks 327 | self.ious = { 328 | (imgId, catId): computeIoU(imgId, catId) 329 | for imgId in p.imgIds 330 | for catId in catIds} 331 | 332 | evaluateImg = self.evaluateImg 333 | maxDet = p.maxDets[-1] 334 | evalImgs = [ 335 | evaluateImg(imgId, catId, areaRng, maxDet) 336 | for catId in catIds 337 | for areaRng in p.areaRng 338 | for imgId in p.imgIds 339 | ] 340 | # this is NOT in the pycocotools code, but could be done outside 341 | evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) 342 | self._paramsEval = copy.deepcopy(self.params) 343 | # toc = time.time() 344 | # print('DONE (t={:0.2f}s).'.format(toc-tic)) 345 | return p.imgIds, evalImgs 346 | 347 | ################################################################# 348 | # end of straight copy from pycocotools, just removing the prints 349 | ################################################################# 350 | -------------------------------------------------------------------------------- /faster_rcnn/train_utils/coco_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data 4 | from pycocotools.coco import COCO 5 | 6 | 7 | def convert_to_coco_api(ds): 8 | coco_ds = COCO() 9 | # annotation IDs need to start at 1, not 0 10 | ann_id = 1 11 | dataset = {'images': [], 'categories': [], 'annotations': []} 12 | categories = set() 13 | for img_idx in range(len(ds)): 14 | # find better way to get target 15 | img, targets = ds[img_idx] 16 | image_id = targets["image_id"].item() 17 | img_dict = {} 18 | img_dict['id'] = image_id 19 | img_dict['height'] = img.shape[-2] 20 | img_dict['width'] = img.shape[-1] 21 | dataset['images'].append(img_dict) 22 | bboxes = targets["boxes"] 23 | bboxes[:, 2:] -= bboxes[:, :2] 24 | bboxes = bboxes.tolist() 25 | labels = targets['labels'].tolist() 26 | areas = targets['area'].tolist() 27 | iscrowd = targets['iscrowd'].tolist() 28 | num_objs = len(bboxes) 29 | for i in range(num_objs): 30 | ann = {} 31 | ann['image_id'] = image_id 32 | ann['bbox'] = bboxes[i] 33 | ann['category_id'] = labels[i] 34 | categories.add(labels[i]) 35 | ann['area'] = areas[i] 36 | ann['iscrowd'] = iscrowd[i] 37 | ann['id'] = ann_id 38 | dataset['annotations'].append(ann) 39 | ann_id += 1 40 | dataset['categories'] = [{'id': i} for i in sorted(categories)] 41 | coco_ds.dataset = dataset 42 | coco_ds.createIndex() 43 | return coco_ds 44 | 45 | 46 | def get_coco_api_from_dataset(dataset): 47 | for _ in range(10): 48 | if isinstance(dataset, torchvision.datasets.CocoDetection): 49 | break 50 | if isinstance(dataset, torch.utils.data.Subset): 51 | dataset = dataset.dataset 52 | if isinstance(dataset, torchvision.datasets.CocoDetection): 53 | return dataset.coco 54 | return convert_to_coco_api(dataset) 55 | -------------------------------------------------------------------------------- /faster_rcnn/train_utils/group_by_aspect_ratio.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | from collections import defaultdict 3 | import copy 4 | from itertools import repeat, chain 5 | import math 6 | import numpy as np 7 | 8 | import torch 9 | import torch.utils.data 10 | from torch.utils.data.sampler import BatchSampler, Sampler 11 | from torch.utils.model_zoo import tqdm 12 | import torchvision 13 | 14 | from PIL import Image 15 | 16 | 17 | def _repeat_to_at_least(iterable, n): 18 | repeat_times = math.ceil(n / len(iterable)) 19 | repeated = chain.from_iterable(repeat(iterable, repeat_times)) 20 | return list(repeated) 21 | 22 | 23 | class GroupedBatchSampler(BatchSampler): 24 | """ 25 | Wraps another sampler to yield a mini-batch of indices. 26 | It enforces that the batch only contain elements from the same group. 27 | It also tries to provide mini-batches which follows an ordering which is 28 | as close as possible to the ordering from the original sampler. 29 | Arguments: 30 | sampler (Sampler): Base sampler. 31 | group_ids (list[int]): If the sampler produces indices in range [0, N), 32 | `group_ids` must be a list of `N` ints which contains the group id of each sample. 33 | The group ids must be a continuous set of integers starting from 34 | 0, i.e. they must be in the range [0, num_groups). 35 | batch_size (int): Size of mini-batch. 36 | """ 37 | def __init__(self, sampler, group_ids, batch_size): 38 | if not isinstance(sampler, Sampler): 39 | raise ValueError( 40 | "sampler should be an instance of " 41 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 42 | ) 43 | self.sampler = sampler 44 | self.group_ids = group_ids 45 | self.batch_size = batch_size 46 | 47 | def __iter__(self): 48 | buffer_per_group = defaultdict(list) 49 | samples_per_group = defaultdict(list) 50 | 51 | num_batches = 0 52 | for idx in self.sampler: 53 | group_id = self.group_ids[idx] 54 | buffer_per_group[group_id].append(idx) 55 | samples_per_group[group_id].append(idx) 56 | if len(buffer_per_group[group_id]) == self.batch_size: 57 | yield buffer_per_group[group_id] 58 | num_batches += 1 59 | del buffer_per_group[group_id] 60 | assert len(buffer_per_group[group_id]) < self.batch_size 61 | 62 | # now we have run out of elements that satisfy 63 | # the group criteria, let's return the remaining 64 | # elements so that the size of the sampler is 65 | # deterministic 66 | expected_num_batches = len(self) 67 | num_remaining = expected_num_batches - num_batches 68 | if num_remaining > 0: 69 | # for the remaining batches, take first the buffers with largest number 70 | # of elements 71 | for group_id, _ in sorted(buffer_per_group.items(), 72 | key=lambda x: len(x[1]), reverse=True): 73 | remaining = self.batch_size - len(buffer_per_group[group_id]) 74 | samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining) 75 | buffer_per_group[group_id].extend(samples_from_group_id[:remaining]) 76 | assert len(buffer_per_group[group_id]) == self.batch_size 77 | yield buffer_per_group[group_id] 78 | num_remaining -= 1 79 | if num_remaining == 0: 80 | break 81 | assert num_remaining == 0 82 | 83 | def __len__(self): 84 | return len(self.sampler) // self.batch_size 85 | 86 | 87 | def _compute_aspect_ratios_slow(dataset, indices=None): 88 | print("Your dataset doesn't support the fast path for " 89 | "computing the aspect ratios, so will iterate over " 90 | "the full dataset and load every image instead. " 91 | "This might take some time...") 92 | if indices is None: 93 | indices = range(len(dataset)) 94 | 95 | class SubsetSampler(Sampler): 96 | def __init__(self, indices): 97 | self.indices = indices 98 | 99 | def __iter__(self): 100 | return iter(self.indices) 101 | 102 | def __len__(self): 103 | return len(self.indices) 104 | 105 | sampler = SubsetSampler(indices) 106 | data_loader = torch.utils.data.DataLoader( 107 | dataset, batch_size=1, sampler=sampler, 108 | num_workers=14, # you might want to increase it for faster processing 109 | collate_fn=lambda x: x[0]) 110 | aspect_ratios = [] 111 | with tqdm(total=len(dataset)) as pbar: 112 | for _i, (img, _) in enumerate(data_loader): 113 | pbar.update(1) 114 | height, width = img.shape[-2:] 115 | aspect_ratio = float(width) / float(height) 116 | aspect_ratios.append(aspect_ratio) 117 | return aspect_ratios 118 | 119 | 120 | def _compute_aspect_ratios_custom_dataset(dataset, indices=None): 121 | if indices is None: 122 | indices = range(len(dataset)) 123 | aspect_ratios = [] 124 | for i in indices: 125 | height, width = dataset.get_height_and_width(i) 126 | aspect_ratio = float(width) / float(height) 127 | aspect_ratios.append(aspect_ratio) 128 | return aspect_ratios 129 | 130 | 131 | def _compute_aspect_ratios_coco_dataset(dataset, indices=None): 132 | if indices is None: 133 | indices = range(len(dataset)) 134 | aspect_ratios = [] 135 | for i in indices: 136 | img_info = dataset.coco.imgs[dataset.ids[i]] 137 | aspect_ratio = float(img_info["width"]) / float(img_info["height"]) 138 | aspect_ratios.append(aspect_ratio) 139 | return aspect_ratios 140 | 141 | 142 | def _compute_aspect_ratios_voc_dataset(dataset, indices=None): 143 | if indices is None: 144 | indices = range(len(dataset)) 145 | aspect_ratios = [] 146 | for i in indices: 147 | # this doesn't load the data into memory, because PIL loads it lazily 148 | width, height = Image.open(dataset.images[i]).size 149 | aspect_ratio = float(width) / float(height) 150 | aspect_ratios.append(aspect_ratio) 151 | return aspect_ratios 152 | 153 | 154 | def _compute_aspect_ratios_subset_dataset(dataset, indices=None): 155 | if indices is None: 156 | indices = range(len(dataset)) 157 | 158 | ds_indices = [dataset.indices[i] for i in indices] 159 | return compute_aspect_ratios(dataset.dataset, ds_indices) 160 | 161 | 162 | def compute_aspect_ratios(dataset, indices=None): 163 | if hasattr(dataset, "get_height_and_width"): 164 | return _compute_aspect_ratios_custom_dataset(dataset, indices) 165 | 166 | if isinstance(dataset, torchvision.datasets.CocoDetection): 167 | return _compute_aspect_ratios_coco_dataset(dataset, indices) 168 | 169 | if isinstance(dataset, torchvision.datasets.VOCDetection): 170 | return _compute_aspect_ratios_voc_dataset(dataset, indices) 171 | 172 | if isinstance(dataset, torch.utils.data.Subset): 173 | return _compute_aspect_ratios_subset_dataset(dataset, indices) 174 | 175 | # slow path 176 | return _compute_aspect_ratios_slow(dataset, indices) 177 | 178 | 179 | def _quantize(x, bins): 180 | bins = copy.deepcopy(bins) 181 | bins = sorted(bins) 182 | # bisect_right:寻找y元素按顺序应该排在bins中哪个元素的右边,返回的是索引 183 | quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) 184 | return quantized 185 | 186 | 187 | def create_aspect_ratio_groups(dataset, k=0): 188 | # 计算所有数据集中的图片width/height比例 189 | aspect_ratios = compute_aspect_ratios(dataset) 190 | # 将[0.5, 2]区间划分成2*k+1等份 191 | bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0] 192 | 193 | # 统计所有图像比例在bins区间中的位置索引 194 | groups = _quantize(aspect_ratios, bins) 195 | # count number of elements per group 196 | # 统计每个区间的频次 197 | counts = np.unique(groups, return_counts=True)[1] 198 | fbins = [0] + bins + [np.inf] 199 | print("Using {} as bins for aspect ratio quantization".format(fbins)) 200 | print("Count of instances per bin: {}".format(counts)) 201 | return groups 202 | -------------------------------------------------------------------------------- /faster_rcnn/train_utils/train_eval_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import sys 3 | import time 4 | import datetime 5 | import pickle 6 | import os 7 | import torch 8 | import errno 9 | from collections import defaultdict, deque 10 | import torch.distributed as dist 11 | from train_utils.coco_utils import get_coco_api_from_dataset 12 | from train_utils.coco_eval import CocoEvaluator 13 | 14 | 15 | def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, warmup=False): 16 | model.train() 17 | metric_logger = MetricLogger(delimiter=" ") 18 | metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}')) 19 | header = 'Epoch: [{}]'.format(epoch) 20 | 21 | lr_scheduler = None 22 | if epoch == 0 and warmup is True: # 当训练第一轮(epoch=0)时,启用warmup训练方式,可理解为热身训练 23 | warmup_factor = 1.0 / 1000 24 | warmup_iters = min(1000, len(data_loader) - 1) 25 | 26 | lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) 27 | 28 | for images, targets in metric_logger.log_every(data_loader, print_freq, header): 29 | images = list(image.to(device) for image in images) 30 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 31 | 32 | loss_dict = model(images, targets) 33 | 34 | losses = sum(loss for loss in loss_dict.values()) 35 | 36 | # reduce losses over all GPUs for logging purpose 37 | loss_dict_reduced = reduce_dict(loss_dict) 38 | losses_reduced = sum(loss for loss in loss_dict_reduced.values()) 39 | 40 | loss_value = losses_reduced.item() 41 | 42 | if not math.isfinite(loss_value): # 当计算的损失为无穷大时停止训练 43 | print("Loss is {}, stopping training".format(loss_value)) 44 | print(loss_dict_reduced) 45 | sys.exit(1) 46 | 47 | optimizer.zero_grad() 48 | losses.backward() 49 | optimizer.step() 50 | 51 | if lr_scheduler is not None: # 第一轮使用warmup训练方式 52 | lr_scheduler.step() 53 | 54 | metric_logger.update(loss=losses_reduced, **loss_dict_reduced) 55 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 56 | 57 | 58 | @torch.no_grad() 59 | def evaluate(model, data_loader, device): 60 | n_threads = torch.get_num_threads() 61 | # FIXME remove this and make paste_masks_in_image run on the GPU 62 | torch.set_num_threads(1) 63 | cpu_device = torch.device("cpu") 64 | model.eval() 65 | metric_logger = MetricLogger(delimiter=" ") 66 | header = "Test: " 67 | 68 | coco = get_coco_api_from_dataset(data_loader.dataset) 69 | iou_types = _get_iou_types(model) 70 | coco_evaluator = CocoEvaluator(coco, iou_types) 71 | 72 | for image, targets in metric_logger.log_every(data_loader, 100, header): 73 | image = list(img.to(device) for img in image) 74 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 75 | 76 | # 当使用CPU时,跳过GPU相关指令 77 | if device != torch.device("cpu"): 78 | torch.cuda.synchronize(device) 79 | 80 | model_time = time.time() 81 | outputs = model(image) 82 | 83 | outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] 84 | model_time = time.time() - model_time 85 | 86 | res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} 87 | 88 | evaluator_time = time.time() 89 | coco_evaluator.update(res) 90 | evaluator_time = time.time() - evaluator_time 91 | metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) 92 | 93 | # gather the stats from all processes 94 | metric_logger.synchronize_between_processes() 95 | print("Averaged stats:", metric_logger) 96 | coco_evaluator.synchronize_between_processes() 97 | 98 | # accumulate predictions from all images 99 | coco_evaluator.accumulate() 100 | coco_evaluator.summarize() 101 | torch.set_num_threads(n_threads) 102 | return coco_evaluator 103 | 104 | 105 | def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor): 106 | 107 | def f(x): 108 | """根据step数返回一个学习率倍率因子""" 109 | if x >= warmup_iters: # 当迭代数大于给定的warmup_iters时,倍率因子为1 110 | return 1 111 | alpha = float(x) / warmup_iters 112 | # 迭代过程中倍率因子从warmup_factor -> 1 113 | return warmup_factor * (1 - alpha) + alpha 114 | 115 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f) 116 | 117 | 118 | class SmoothedValue(object): 119 | """Track a series of values and provide access to smoothed values over a 120 | window or the global series average. 121 | """ 122 | def __init__(self, window_size=20, fmt=None): 123 | if fmt is None: 124 | fmt = "{median:.4f} ({global_avg:.4f})" 125 | self.deque = deque(maxlen=window_size) # deque简单理解成加强版list 126 | self.total = 0.0 127 | self.count = 0 128 | self.fmt = fmt 129 | 130 | def update(self, value, n=1): 131 | self.deque.append(value) 132 | self.count += n 133 | self.total += value * n 134 | 135 | def synchronize_between_processes(self): 136 | """ 137 | Warning: does not synchronize the deque! 138 | """ 139 | if not is_dist_avail_and_initialized(): 140 | return 141 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") 142 | dist.barrier() 143 | dist.all_reduce(t) 144 | t = t.tolist() 145 | self.count = int(t[0]) 146 | self.total = t[1] 147 | 148 | @property 149 | def median(self): # @property 是装饰器,这里可简单理解为增加median属性(只读) 150 | d = torch.tensor(list(self.deque)) 151 | return d.median().item() 152 | 153 | @property 154 | def avg(self): 155 | d = torch.tensor(list(self.deque), dtype=torch.float32) 156 | return d.mean().item() 157 | 158 | @property 159 | def global_avg(self): 160 | return self.total / self.count 161 | 162 | @property 163 | def max(self): 164 | return max(self.deque) 165 | 166 | @property 167 | def value(self): 168 | return self.deque[-1] 169 | 170 | def __str__(self): 171 | return self.fmt.format( 172 | median=self.median, 173 | avg=self.avg, 174 | global_avg=self.global_avg, 175 | max=self.max, 176 | value=self.value) 177 | 178 | 179 | def is_dist_avail_and_initialized(): 180 | """检查是否支持分布式环境""" 181 | if not dist.is_available(): 182 | return False 183 | if not dist.is_initialized(): 184 | return False 185 | return True 186 | 187 | 188 | class MetricLogger(object): 189 | def __init__(self, delimiter="\t"): 190 | self.meters = defaultdict(SmoothedValue) 191 | self.delimiter = delimiter 192 | 193 | def update(self, **kwargs): 194 | for k, v in kwargs.items(): 195 | if isinstance(v, torch.Tensor): 196 | v = v.item() 197 | assert isinstance(v, (float, int)) 198 | self.meters[k].update(v) 199 | 200 | def __getattr__(self, attr): 201 | if attr in self.meters: 202 | return self.meters[attr] 203 | if attr in self.__dict__: 204 | return self.__dict__[attr] 205 | raise AttributeError("'{}' object has no attribute '{}'".format( 206 | type(self).__name__, attr)) 207 | 208 | def __str__(self): 209 | loss_str = [] 210 | for name, meter in self.meters.items(): 211 | loss_str.append( 212 | "{}: {}".format(name, str(meter)) 213 | ) 214 | return self.delimiter.join(loss_str) 215 | 216 | def synchronize_between_processes(self): 217 | for meter in self.meters.values(): 218 | meter.synchronize_between_processes() 219 | 220 | def add_meter(self, name, meter): 221 | self.meters[name] = meter 222 | 223 | def log_every(self, iterable, print_freq, header=None): 224 | i = 0 225 | if not header: 226 | header = "" 227 | start_time = time.time() 228 | end = time.time() 229 | iter_time = SmoothedValue(fmt='{avg:.4f}') 230 | data_time = SmoothedValue(fmt='{avg:.4f}') 231 | space_fmt = ":" + str(len(str(len(iterable)))) + "d" 232 | if torch.cuda.is_available(): 233 | log_msg = self.delimiter.join([header, 234 | '[{0' + space_fmt + '}/{1}]', 235 | 'eta: {eta}', 236 | '{meters}', 237 | 'time: {time}', 238 | 'data: {data}', 239 | 'max mem: {memory:.0f}']) 240 | else: 241 | log_msg = self.delimiter.join([header, 242 | '[{0' + space_fmt + '}/{1}]', 243 | 'eta: {eta}', 244 | '{meters}', 245 | 'time: {time}', 246 | 'data: {data}']) 247 | MB = 1024.0 * 1024.0 248 | for obj in iterable: 249 | data_time.update(time.time() - end) 250 | yield obj 251 | iter_time.update(time.time() - end) 252 | if i % print_freq == 0 or i == len(iterable) - 1: 253 | eta_second = iter_time.global_avg * (len(iterable) - i) 254 | eta_string = str(datetime.timedelta(seconds=eta_second)) 255 | if torch.cuda.is_available(): 256 | print(log_msg.format(i, len(iterable), 257 | eta=eta_string, 258 | meters=str(self), 259 | time=str(iter_time), 260 | data=str(data_time), 261 | memory=torch.cuda.max_memory_allocated() / MB)) 262 | else: 263 | print(log_msg.format(i, len(iterable), 264 | eta=eta_string, 265 | meters=str(self), 266 | time=str(iter_time), 267 | data=str(data_time))) 268 | i += 1 269 | end = time.time() 270 | total_time = time.time() - start_time 271 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 272 | print('{} Total time: {} ({:.4f} s / it)'.format(header, 273 | total_time_str, 274 | 275 | total_time / len(iterable))) 276 | 277 | 278 | def collate_fn(batch): 279 | return tuple(zip(*batch)) 280 | 281 | 282 | def mkdir(path): 283 | try: 284 | os.makedirs(path) 285 | except OSError as e: 286 | if e.errno != errno.EEXIST: 287 | raise 288 | 289 | 290 | def get_world_size(): 291 | if not is_dist_avail_and_initialized(): 292 | return 1 293 | return dist.get_world_size() 294 | 295 | 296 | def reduce_dict(input_dict, average=True): 297 | """ 298 | Args: 299 | input_dict (dict): all the values will be reduced 300 | average (bool): whether to do average or sum 301 | Reduce the values in the dictionary from all processes so that all processes 302 | have the averaged results. Returns a dict with the same fields as 303 | input_dict, after reduction. 304 | """ 305 | world_size = get_world_size() 306 | if world_size < 2: # 单GPU的情况 307 | return input_dict 308 | with torch.no_grad(): # 多GPU的情况 309 | names = [] 310 | values = [] 311 | # sort the keys so that they are consistent across processes 312 | for k in sorted(input_dict.keys()): 313 | names.append(k) 314 | values.append(input_dict[k]) 315 | values = torch.stack(values, dim=0) 316 | dist.all_reduce(values) 317 | if average: 318 | values /= world_size 319 | 320 | reduced_dict = {k: v for k, v in zip(names, values)} 321 | return reduced_dict 322 | 323 | 324 | def _get_iou_types(model): 325 | model_without_ddp = model 326 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 327 | model_without_ddp = model.module 328 | iou_types = ["bbox"] 329 | return iou_types 330 | 331 | 332 | def all_gather(data): 333 | """ 334 | Run all_gather on arbitrary picklable data (not necessarily tensors) 335 | Args: 336 | data: any picklable object 337 | Returns: 338 | list[data]: list of data gathered from each rank 339 | """ 340 | world_size = get_world_size() 341 | if world_size == 1: 342 | return [data] 343 | 344 | # serialized to a Tensor 345 | buffer = pickle.dumps(data) 346 | storage = torch.ByteStorage.from_buffer(buffer) 347 | tensor = torch.ByteTensor(storage).to("cuda") 348 | 349 | # obtain Tensor size of each rank 350 | local_size = torch.tensor([tensor.numel()], device="cuda") 351 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] 352 | dist.all_gather(size_list, local_size) 353 | size_list = [int(size.item()) for size in size_list] 354 | max_size = max(size_list) 355 | 356 | # receiving Tensor from all ranks 357 | # we pad the tensor because torch all_gather does not support 358 | # gathering tensors of different shapes 359 | tensor_list = [] 360 | for _ in size_list: 361 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) 362 | if local_size != max_size: 363 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") 364 | tensor = torch.cat((tensor, padding), dim=0) 365 | dist.all_gather(tensor_list, tensor) 366 | 367 | data_list = [] 368 | for size, tensor in zip(size_list, tensor_list): 369 | buffer = tensor.cpu().numpy().tobytes()[:size] 370 | data_list.append(pickle.loads(buffer)) 371 | 372 | return data_list 373 | 374 | 375 | def setup_for_distributed(is_master): 376 | """ 377 | This function disables when not in master process 378 | """ 379 | import builtins as __builtin__ 380 | builtin_print = __builtin__.print 381 | 382 | def print(*args, **kwargs): 383 | force = kwargs.pop('force', False) 384 | if is_master or force: 385 | builtin_print(*args, **kwargs) 386 | 387 | __builtin__.print = print 388 | 389 | 390 | def get_rank(): 391 | if not is_dist_avail_and_initialized(): 392 | return 0 393 | return dist.get_rank() 394 | 395 | 396 | def is_main_process(): 397 | return get_rank() == 0 398 | 399 | 400 | def save_on_master(*args, **kwargs): 401 | if is_main_process(): 402 | torch.save(*args, **kwargs) 403 | 404 | 405 | def init_distributed_mode(args): 406 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 407 | args.rank = int(os.environ["RANK"]) 408 | args.world_size = int(os.environ['WORLD_SIZE']) 409 | args.gpu = int(os.environ['LOCAL_RANK']) 410 | elif 'SLURM_PROCID' in os.environ: 411 | args.rank = int(os.environ['SLURM_PROCID']) 412 | args.gpu = args.rank % torch.cuda.device_count() 413 | else: 414 | print('Not using distributed mode') 415 | args.distributed = False 416 | return 417 | 418 | args.distributed = True 419 | 420 | torch.cuda.set_device(args.gpu) 421 | args.dist_backend = 'nccl' 422 | print('| distributed init (rank {}): {}'.format( 423 | args.rank, args.dist_url), flush=True) 424 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 425 | world_size=args.world_size, rank=args.rank) 426 | torch.distributed.barrier() 427 | setup_for_distributed(args.rank == 0) 428 | -------------------------------------------------------------------------------- /faster_rcnn/transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | from torchvision.transforms import functional as F 5 | 6 | 7 | class Compose(object): 8 | """组合多个transform函数""" 9 | def __init__(self, transforms): 10 | self.transforms = transforms 11 | 12 | def __call__(self, image, target): 13 | for t in self.transforms: 14 | image, target = t(image, target) 15 | return image, target 16 | 17 | 18 | class ToTensor(object): 19 | """将PIL图像转为Tensor""" 20 | def __call__(self, image, target): 21 | image = F.to_tensor(image) 22 | return image, target 23 | 24 | 25 | class RandomHorizontalFlip(object): 26 | """随机水平翻转图像以及bboxes""" 27 | def __init__(self, prob=0.5): 28 | self.prob = prob 29 | 30 | def __call__(self, image, target): 31 | if random.random() < self.prob: 32 | height, width = image.shape[-2:] 33 | image = image.flip(-1) # 水平翻转图片 34 | bbox = target["boxes"] 35 | # bbox: xmin, ymin, xmax, ymax 36 | bbox[:, [0, 2]] = width - bbox[:, [2, 0]] # 翻转对应bbox坐标信息 37 | target["boxes"] = bbox 38 | return image, target 39 | -------------------------------------------------------------------------------- /ssd/README.md: -------------------------------------------------------------------------------- 1 | # SSD: Single Shot MultiBox Detector 2 | 3 | ## 环境配置: 4 | * Python 3.6或者3.7 5 | * Pytorch 1.5(注意:是1.5) 6 | * pycocotools(Linux: pip install pycocotools; 7 | Windows:pip install pycocotools-windows(不需要额外安装vs)) 8 | * Ubuntu或Centos(不建议Windows) 9 | * 最好使用GPU训练 10 | 11 | ## 文件结构: 12 | ``` 13 | ├── src: 实现SSD模型的相关模块 14 | │ ├── resnet50_backbone.py 使用resnet50网络作为SSD的backbone 15 | │ ├── ssd_model.py SSD网络结构文件 16 | │ └── utils.py 训练过程中使用到的一些功能实现 17 | ├── train_utils: 训练验证相关模块(包括cocotools) 18 | ├── my_dataset.py: 自定义dataset用于读取VOC数据集 19 | ├── train_ssd300.py: 以resnet50做为backbone的SSD网络进行训练 20 | ├── train_multi_GPU.py: 针对使用多GPU的用户使用 21 | ├── predict_test.py: 简易的预测脚本,使用训练好的权重进行预测测试 22 | ├── pascal_voc_classes.json: pascal_voc标签文件 23 | ├── plot_curve.py: 用于绘制训练过程的损失以及验证集的mAP 24 | ``` 25 | 26 | ## 预训练权重下载地址(下载后放入src文件夹中): 27 | * ResNet50+SSD: https://ngc.nvidia.com/catalog/models 28 | `搜索ssd -> 找到SSD for PyTorch(FP32) -> download FP32 -> 解压文件` 29 | 30 | ## 数据集,本例程使用的是PASCAL VOC2012数据集(下载后放入项目当前文件夹中) 31 | * Pascal VOC2012 train/val数据集下载地址:http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 32 | * Pascal VOC2007 test数据集请参考:http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 33 | 34 | ## 训练方法 35 | * 确保提前准备好数据集 36 | * 确保提前下载好对应预训练模型权重 37 | * 单GPU训练或CPU,直接使用train_ssd300.py训练脚本 38 | * 若要使用多GPU训练,使用 "python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py" 指令,nproc_per_node参数为使用GPU数量 39 | 40 | ## Resnet50 + SSD算法框架图 41 | ![Resnet50 SSD](https://github.com/WZMIAOMIAO/deep-learning-for-image-processing/raw/master/pytorch_object_detection/ssd/res50_ssd.png) 42 | -------------------------------------------------------------------------------- /ssd/__pycache__/draw_box_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/__pycache__/draw_box_utils.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/__pycache__/my_dataset.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/__pycache__/my_dataset.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/__pycache__/transform.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/__pycache__/transform.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/draw_box_utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import PIL.ImageDraw as ImageDraw 3 | import PIL.ImageFont as ImageFont 4 | import numpy as np 5 | 6 | STANDARD_COLORS = [ 7 | 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque', 8 | 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite', 9 | 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan', 10 | 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange', 11 | 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet', 12 | 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite', 13 | 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod', 14 | 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki', 15 | 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue', 16 | 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey', 17 | 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue', 18 | 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime', 19 | 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid', 20 | 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen', 21 | 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin', 22 | 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed', 23 | 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed', 24 | 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple', 25 | 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown', 26 | 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue', 27 | 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow', 28 | 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White', 29 | 'WhiteSmoke', 'Yellow', 'YellowGreen' 30 | ] 31 | 32 | 33 | def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map): 34 | for i in range(boxes.shape[0]): 35 | if scores[i] > thresh: 36 | box = tuple(boxes[i].tolist()) # numpy -> list -> tuple 37 | if classes[i] in category_index.keys(): 38 | class_name = category_index[classes[i]] 39 | else: 40 | class_name = 'N/A' 41 | display_str = str(class_name) 42 | display_str = '{}: {}%'.format(display_str, int(100 * scores[i])) 43 | box_to_display_str_map[box].append(display_str) 44 | box_to_color_map[box] = STANDARD_COLORS[ 45 | classes[i] % len(STANDARD_COLORS)] 46 | else: 47 | break # 网络输出概率已经排序过,当遇到一个不满足后面的肯定不满足 48 | 49 | 50 | def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color): 51 | try: 52 | font = ImageFont.truetype('arial.ttf', 24) 53 | except IOError: 54 | font = ImageFont.load_default() 55 | 56 | # If the total height of the display strings added to the top of the bounding 57 | # box exceeds the top of the image, stack the strings below the bounding box 58 | # instead of above. 59 | display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]] 60 | # Each display_str has a top and bottom margin of 0.05x. 61 | total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights) 62 | 63 | if top > total_display_str_height: 64 | text_bottom = top 65 | else: 66 | text_bottom = bottom + total_display_str_height 67 | # Reverse list and print from bottom to top. 68 | for display_str in box_to_display_str_map[box][::-1]: 69 | text_width, text_height = font.getsize(display_str) 70 | margin = np.ceil(0.05 * text_height) 71 | draw.rectangle([(left, text_bottom - text_height - 2 * margin), 72 | (left + text_width, text_bottom)], fill=color) 73 | draw.text((left + margin, text_bottom - text_height - margin), 74 | display_str, 75 | fill='black', 76 | font=font) 77 | text_bottom -= text_height - 2 * margin 78 | 79 | 80 | def draw_box(image, boxes, classes, scores, category_index, thresh=0.5, line_thickness=8): 81 | box_to_display_str_map = collections.defaultdict(list) 82 | box_to_color_map = collections.defaultdict(str) 83 | 84 | filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map) 85 | 86 | # Draw all boxes onto image. 87 | draw = ImageDraw.Draw(image) 88 | im_width, im_height = image.size 89 | for box, color in box_to_color_map.items(): 90 | xmin, ymin, xmax, ymax = box 91 | (left, right, top, bottom) = (xmin * 1, xmax * 1, 92 | ymin * 1, ymax * 1) 93 | draw.line([(left, top), (left, bottom), (right, bottom), 94 | (right, top), (left, top)], width=line_thickness, fill=color) 95 | draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color) 96 | -------------------------------------------------------------------------------- /ssd/my_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import os 3 | import torch 4 | import json 5 | from PIL import Image 6 | from lxml import etree 7 | 8 | 9 | class VOC2012DataSet(Dataset): 10 | """读取解析PASCAL VOC2012数据集""" 11 | 12 | def __init__(self, voc_root, transforms, year='VOC2012', train_set='train.txt'): 13 | self.root = os.path.join(voc_root, "VOCdevkit", year) 14 | self.img_root = os.path.join(self.root, "JPEGImages") 15 | self.annotations_root = os.path.join(self.root, "Annotations") 16 | 17 | txt_list = os.path.join(self.root, "ImageSets", "Main", train_set) 18 | 19 | with open(txt_list) as read: 20 | self.xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml") 21 | for line in read.readlines()] 22 | 23 | # read class_indict 24 | try: 25 | json_file = open('./pascal_voc_classes.json', 'r') 26 | self.class_dict = json.load(json_file) 27 | except Exception as e: 28 | print(e) 29 | exit(-1) 30 | 31 | self.transforms = transforms 32 | 33 | def __len__(self): 34 | return len(self.xml_list) 35 | 36 | def __getitem__(self, idx): 37 | # read xml 38 | xml_path = self.xml_list[idx] 39 | with open(xml_path) as fid: 40 | xml_str = fid.read() 41 | xml = etree.fromstring(xml_str) 42 | data = self.parse_xml_to_dict(xml)["annotation"] 43 | data_height = int(data["size"]["height"]) 44 | data_width = int(data["size"]["width"]) 45 | height_width = [data_height, data_width] 46 | img_path = os.path.join(self.img_root, data["filename"]) 47 | image = Image.open(img_path) 48 | if image.format != "JPEG": 49 | raise ValueError("Image format not JPEG") 50 | boxes = [] 51 | labels = [] 52 | iscrowd = [] 53 | for obj in data["object"]: 54 | # 将所有的gt box信息转换成相对值0-1之间 55 | xmin = float(obj["bndbox"]["xmin"]) / data_width 56 | xmax = float(obj["bndbox"]["xmax"]) / data_width 57 | ymin = float(obj["bndbox"]["ymin"]) / data_height 58 | ymax = float(obj["bndbox"]["ymax"]) / data_height 59 | boxes.append([xmin, ymin, xmax, ymax]) 60 | labels.append(self.class_dict[obj["name"]]) 61 | iscrowd.append(int(obj["difficult"])) 62 | 63 | # convert everything into a torch.Tensor 64 | boxes = torch.as_tensor(boxes, dtype=torch.float32) 65 | labels = torch.as_tensor(labels, dtype=torch.int64) 66 | iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64) 67 | height_width = torch.as_tensor(height_width, dtype=torch.int64) 68 | image_id = torch.tensor([idx]) 69 | area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) 70 | 71 | target = {} 72 | target["boxes"] = boxes 73 | target["labels"] = labels 74 | target["image_id"] = image_id 75 | target["area"] = area 76 | target["iscrowd"] = iscrowd 77 | target["height_width"] = height_width 78 | 79 | if self.transforms is not None: 80 | image, target = self.transforms(image, target) 81 | 82 | return image, target 83 | 84 | def get_height_and_width(self, idx): 85 | # read xml 86 | xml_path = self.xml_list[idx] 87 | with open(xml_path) as fid: 88 | xml_str = fid.read() 89 | xml = etree.fromstring(xml_str) 90 | data = self.parse_xml_to_dict(xml)["annotation"] 91 | data_height = int(data["size"]["height"]) 92 | data_width = int(data["size"]["width"]) 93 | return data_height, data_width 94 | 95 | def parse_xml_to_dict(self, xml): 96 | """ 97 | 将xml文件解析成字典形式,参考tensorflow的recursive_parse_xml_to_dict 98 | Args: 99 | xml: xml tree obtained by parsing XML file contents using lxml.etree 100 | 101 | Returns: 102 | Python dictionary holding XML contents. 103 | """ 104 | 105 | if len(xml) == 0: # 遍历到底层,直接返回tag对应的信息 106 | return {xml.tag: xml.text} 107 | 108 | result = {} 109 | for child in xml: 110 | child_result = self.parse_xml_to_dict(child) # 递归遍历标签信息 111 | if child.tag != 'object': 112 | result[child.tag] = child_result[child.tag] 113 | else: 114 | if child.tag not in result: # 因为object可能有多个,所以需要放入列表里 115 | result[child.tag] = [] 116 | result[child.tag].append(child_result[child.tag]) 117 | return {xml.tag: result} 118 | 119 | 120 | # import transforms 121 | # from draw_box_utils import draw_box 122 | # from PIL import Image 123 | # import json 124 | # import matplotlib.pyplot as plt 125 | # import torchvision.transforms as ts 126 | # import random 127 | # 128 | # # read class_indict 129 | # category_index = {} 130 | # try: 131 | # json_file = open('./pascal_voc_classes.json', 'r') 132 | # class_dict = json.load(json_file) 133 | # category_index = {v: k for k, v in class_dict.items()} 134 | # except Exception as e: 135 | # print(e) 136 | # exit(-1) 137 | # 138 | # data_transform = { 139 | # "train": transforms.Compose([transforms.ToTensor(), 140 | # transforms.RandomHorizontalFlip(0.5)]), 141 | # "val": transforms.Compose([transforms.ToTensor()]) 142 | # } 143 | # 144 | # # load train data set 145 | # train_data_set = VOC2012DataSet(os.getcwd(), data_transform["train"], True) 146 | # print(len(train_data_set)) 147 | # for index in random.sample(range(0, len(train_data_set)), k=5): 148 | # img, target = train_data_set[index] 149 | # img = ts.ToPILImage()(img) 150 | # draw_box(img, 151 | # target["boxes"].numpy(), 152 | # target["labels"].numpy(), 153 | # [1 for i in range(len(target["labels"].numpy()))], 154 | # category_index, 155 | # thresh=0.5, 156 | # line_thickness=5) 157 | # plt.imshow(img) 158 | # plt.show() 159 | -------------------------------------------------------------------------------- /ssd/pascal_voc_classes.json: -------------------------------------------------------------------------------- 1 | { 2 | "aeroplane": 1, 3 | "bicycle": 2, 4 | "bird": 3, 5 | "boat": 4, 6 | "bottle": 5, 7 | "bus": 6, 8 | "car": 7, 9 | "cat": 8, 10 | "chair": 9, 11 | "cow": 10, 12 | "diningtable": 11, 13 | "dog": 12, 14 | "horse": 13, 15 | "motorbike": 14, 16 | "person": 15, 17 | "pottedplant": 16, 18 | "sheep": 17, 19 | "sofa": 18, 20 | "train": 19, 21 | "tvmonitor": 20 22 | } -------------------------------------------------------------------------------- /ssd/plot_curve.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def plot_loss_and_lr(train_loss, learning_rate): 5 | try: 6 | x = list(range(len(train_loss))) 7 | fig, ax1 = plt.subplots(1, 1) 8 | ax1.plot(x, train_loss, 'r', label='loss') 9 | ax1.set_xlabel("step") 10 | ax1.set_ylabel("loss") 11 | ax1.set_title("Train Loss and lr") 12 | plt.legend(loc='best') 13 | 14 | ax2 = ax1.twinx() 15 | ax2.plot(x, learning_rate, label='lr') 16 | ax2.set_ylabel("learning rate") 17 | ax2.set_xlim(0, len(train_loss)) # 设置横坐标整数间隔 18 | plt.legend(loc='best') 19 | 20 | handles1, labels1 = ax1.get_legend_handles_labels() 21 | handles2, labels2 = ax2.get_legend_handles_labels() 22 | plt.legend(handles1 + handles2, labels1 + labels2, loc='upper right') 23 | 24 | fig.subplots_adjust(right=0.8) # 防止出现保存图片显示不全的情况 25 | fig.savefig('./loss_and_lr.png') 26 | plt.close() 27 | print("successful save loss curve! ") 28 | except Exception as e: 29 | print(e) 30 | 31 | 32 | def plot_map(mAP): 33 | try: 34 | x = list(range(len(mAP))) 35 | plt.plot(x, mAP, label='mAp') 36 | plt.xlabel('epoch') 37 | plt.ylabel('mAP') 38 | plt.title('Eval mAP') 39 | plt.xlim(0, len(mAP)) 40 | plt.legend(loc='best') 41 | plt.savefig('./mAP.png') 42 | plt.close() 43 | print("successful save mAP curve!") 44 | except Exception as e: 45 | print(e) 46 | -------------------------------------------------------------------------------- /ssd/predict_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from draw_box_utils import draw_box 3 | from PIL import Image 4 | import json 5 | import matplotlib.pyplot as plt 6 | from src.ssd_model import SSD300, Backbone 7 | import transform 8 | 9 | 10 | def create_model(num_classes): 11 | backbone = Backbone() 12 | model = SSD300(backbone=backbone, num_classes=num_classes) 13 | 14 | return model 15 | 16 | 17 | # get devices 18 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 19 | print(device) 20 | 21 | # create model 22 | model = create_model(num_classes=21) 23 | 24 | # load train weights 25 | train_weights = "./save_weights/ssd300-14.pth" 26 | train_weights_dict = torch.load(train_weights, map_location=device)['model'] 27 | 28 | model.load_state_dict(train_weights_dict, strict=False) 29 | model.to(device) 30 | 31 | # read class_indict 32 | category_index = {} 33 | try: 34 | json_file = open('./pascal_voc_classes.json', 'r') 35 | class_dict = json.load(json_file) 36 | category_index = {v: k for k, v in class_dict.items()} 37 | except Exception as e: 38 | print(e) 39 | exit(-1) 40 | 41 | # load image 42 | original_img = Image.open("./test/test21.jpeg") 43 | 44 | # from pil image to tensor, do not normalize image 45 | data_transform = transform.Compose([transform.Resize(), 46 | transform.ToTensor(), 47 | transform.Normalization()]) 48 | img, _ = data_transform(original_img) 49 | # expand batch dimension 50 | img = torch.unsqueeze(img, dim=0) 51 | 52 | model.eval() 53 | with torch.no_grad(): 54 | predictions = model(img.to(device))[0] # bboxes_out, labels_out, scores_out 55 | predict_boxes = predictions[0].to("cpu").numpy() 56 | predict_boxes[:, [0, 2]] = predict_boxes[:, [0, 2]] * original_img.size[0] 57 | predict_boxes[:, [1, 3]] = predict_boxes[:, [1, 3]] * original_img.size[1] 58 | predict_classes = predictions[1].to("cpu").numpy() 59 | predict_scores = predictions[2].to("cpu").numpy() 60 | 61 | if len(predict_boxes) == 0: 62 | print("没有检测到任何目标!") 63 | 64 | draw_box(original_img, 65 | predict_boxes, 66 | predict_classes, 67 | predict_scores, 68 | category_index, 69 | thresh=0.5, 70 | line_thickness=5) 71 | plt.figure(dpi=300) 72 | plt.imshow(original_img) 73 | plt.show() 74 | -------------------------------------------------------------------------------- /ssd/res50_ssd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/res50_ssd.png -------------------------------------------------------------------------------- /ssd/src/__pycache__/res50_backbone.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/src/__pycache__/res50_backbone.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/src/__pycache__/ssd_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/src/__pycache__/ssd_model.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/src/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/src/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/src/res50_backbone.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | 4 | 5 | class Bottleneck(nn.Module): 6 | expansion = 4 7 | 8 | def __init__(self, in_channel, out_channel, stride=1, downsample=None): 9 | super(Bottleneck, self).__init__() 10 | self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel, 11 | kernel_size=1, stride=1, bias=False) # squeeze channels 12 | self.bn1 = nn.BatchNorm2d(out_channel) 13 | # ----------------------------------------- 14 | self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel, 15 | kernel_size=3, stride=stride, bias=False, padding=1) 16 | self.bn2 = nn.BatchNorm2d(out_channel) 17 | # ----------------------------------------- 18 | self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel*self.expansion, 19 | kernel_size=1, stride=1, bias=False) # unsqueeze channels 20 | self.bn3 = nn.BatchNorm2d(out_channel*self.expansion) 21 | self.relu = nn.ReLU(inplace=True) 22 | self.downsample = downsample 23 | 24 | def forward(self, x): 25 | identity = x 26 | if self.downsample is not None: 27 | identity = self.downsample(x) 28 | 29 | out = self.conv1(x) 30 | out = self.bn1(out) 31 | out = self.relu(out) 32 | 33 | out = self.conv2(out) 34 | out = self.bn2(out) 35 | out = self.relu(out) 36 | 37 | out = self.conv3(out) 38 | out = self.bn3(out) 39 | 40 | out += identity 41 | out = self.relu(out) 42 | 43 | return out 44 | 45 | 46 | class ResNet(nn.Module): 47 | 48 | def __init__(self, block, blocks_num, num_classes=1000, include_top=True): 49 | super(ResNet, self).__init__() 50 | self.include_top = include_top 51 | self.in_channel = 64 52 | 53 | self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2, 54 | padding=3, bias=False) 55 | self.bn1 = nn.BatchNorm2d(self.in_channel) 56 | self.relu = nn.ReLU(inplace=True) 57 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 58 | self.layer1 = self._make_layer(block, 64, blocks_num[0]) 59 | self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2) 60 | self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2) 61 | self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2) 62 | if self.include_top: 63 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) # output size = (1, 1) 64 | self.fc = nn.Linear(512 * block.expansion, num_classes) 65 | 66 | for m in self.modules(): 67 | if isinstance(m, nn.Conv2d): 68 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 69 | 70 | def _make_layer(self, block, channel, block_num, stride=1): 71 | downsample = None 72 | if stride != 1 or self.in_channel != channel * block.expansion: 73 | downsample = nn.Sequential( 74 | nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False), 75 | nn.BatchNorm2d(channel * block.expansion)) 76 | 77 | layers = [] 78 | layers.append(block(self.in_channel, channel, downsample=downsample, stride=stride)) 79 | self.in_channel = channel * block.expansion 80 | 81 | for _ in range(1, block_num): 82 | layers.append(block(self.in_channel, channel)) 83 | 84 | return nn.Sequential(*layers) 85 | 86 | def forward(self, x): 87 | x = self.conv1(x) 88 | x = self.bn1(x) 89 | x = self.relu(x) 90 | x = self.maxpool(x) 91 | 92 | x = self.layer1(x) 93 | x = self.layer2(x) 94 | x = self.layer3(x) 95 | x = self.layer4(x) 96 | 97 | if self.include_top: 98 | x = self.avgpool(x) 99 | x = torch.flatten(x, 1) 100 | x = self.fc(x) 101 | 102 | return x 103 | 104 | 105 | def resnet50(num_classes=1000, include_top=True): 106 | return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, include_top=include_top) 107 | -------------------------------------------------------------------------------- /ssd/src/ssd_model.py: -------------------------------------------------------------------------------- 1 | from src.res50_backbone import resnet50 2 | from torch import nn, Tensor 3 | import torch 4 | from torch.jit.annotations import Optional, List, Dict, Tuple, Module 5 | from src.utils import dboxes300_coco, Encoder, PostProcess 6 | 7 | 8 | class Backbone(nn.Module): 9 | def __init__(self, pretrain_path=None): 10 | super(Backbone, self).__init__() 11 | net = resnet50() 12 | self.out_channels = [1024, 512, 512, 256, 256, 256] 13 | 14 | if pretrain_path is not None: 15 | net.load_state_dict(torch.load(pretrain_path)) 16 | 17 | self.feature_extractor = nn.Sequential(*list(net.children())[:7]) 18 | 19 | conv4_block1 = self.feature_extractor[-1][0] 20 | 21 | # 修改conv4_block1的步距,从2->1 22 | conv4_block1.conv1.stride = (1, 1) 23 | conv4_block1.conv2.stride = (1, 1) 24 | conv4_block1.downsample[0].stride = (1, 1) 25 | 26 | def forward(self, x): 27 | x = self.feature_extractor(x) 28 | return x 29 | 30 | 31 | class SSD300(nn.Module): 32 | def __init__(self, backbone=None, num_classes=21): 33 | super(SSD300, self).__init__() 34 | if backbone is None: 35 | raise Exception("backbone is None") 36 | if not hasattr(backbone, "out_channels"): 37 | raise Exception("the backbone not has attribute: out_channel") 38 | self.feature_extractor = backbone # 把传入的backbone定义给ssd的feature_extractor 39 | 40 | self.num_classes = num_classes 41 | # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50,添加一系列卷积层做为特征提取层 42 | self._build_additional_features(self.feature_extractor.out_channels) 43 | # 第一个特征层层和最后两个特征层默认每个点生成四个大小的框, 其他层每个点默认生成6个 44 | self.num_defaults = [4, 6, 6, 6, 4, 4] 45 | location_extractors = [] # 定位预测器,使用3*3卷积来预测定位 46 | confidence_extractors = [] # 置信度预测器, 同样使用3*3卷积来预测 47 | 48 | # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50 49 | for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels): 50 | # nd is number_default_boxes, oc is output_channel 51 | location_extractors.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1)) 52 | confidence_extractors.append(nn.Conv2d(oc, nd * self.num_classes, kernel_size=3, padding=1)) 53 | 54 | self.loc = nn.ModuleList(location_extractors) 55 | self.conf = nn.ModuleList(confidence_extractors) 56 | self._init_weights() 57 | 58 | # default_box.dboxes (8732*4) 59 | default_box = dboxes300_coco() # 通过该函数生成默认的8732个default box 60 | self.compute_loss = Loss(default_box) 61 | self.encoder = Encoder(default_box) 62 | self.postprocess = PostProcess(default_box) 63 | 64 | def _build_additional_features(self, input_size): 65 | """ 66 | 为backbone(resnet50)添加额外的一系列卷积层,得到相应的一系列特征提取器 67 | :param input_size: 68 | :return: 69 | """ 70 | additional_blocks = [] 71 | # input_size = [1024, 512, 512, 256, 256, 256] for resnet50 72 | middle_channels = [256, 256, 128, 128, 128] 73 | # input_size[:-1]=[1024, 512, 512, 256, 256], input_size[1:]=[512, 512, 256, 256, 256] 74 | for i, (input_ch, output_ch, middle_ch) in enumerate(zip(input_size[:-1], input_size[1:], middle_channels)): 75 | padding, stride = (1, 2) if i < 3 else (0, 1) 76 | layer = nn.Sequential( 77 | nn.Conv2d(input_ch, middle_ch, kernel_size=1, bias=False), 78 | nn.BatchNorm2d(middle_ch), 79 | nn.ReLU(inplace=True), 80 | nn.Conv2d(middle_ch, output_ch, kernel_size=3, padding=padding, stride=stride, bias=False), 81 | nn.BatchNorm2d(output_ch), 82 | nn.ReLU(inplace=True), 83 | ) 84 | additional_blocks.append(layer) 85 | self.additional_blocks = nn.ModuleList(additional_blocks) 86 | 87 | def _init_weights(self): 88 | layers = [*self.additional_blocks, *self.loc, *self.conf] 89 | for layer in layers: 90 | for param in layer.parameters(): 91 | if param.dim() > 1: 92 | nn.init.xavier_uniform_(param) 93 | 94 | # Shape the classifier to the view of bboxes 95 | def bbox_view(self, features, loc_extractor, conf_extractor): 96 | locs = [] 97 | confs = [] 98 | for f, l, c in zip(features, loc_extractor, conf_extractor): 99 | # [batch, n*4, feat_size, feat_size] -> [batch, 4, -1] 100 | locs.append(l(f).view(f.size(0), 4, -1)) # size([batch_size, 4, -1]) 101 | # [batch, n*classes, feat_size, feat_size] -> [batch, classes, -1] 102 | confs.append(c(f).view(f.size(0), self.num_classes, -1)) # size([batch_size, num_classes, -1]) 103 | 104 | # locs:(batch_size, 4, 8732), confs:(batch_size, 21, 8732) 105 | locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous() 106 | return locs, confs 107 | 108 | def forward(self, image, targets=None): 109 | x = self.feature_extractor(image) 110 | 111 | # Feature Map 38x38x1024, 19x19x512, 10x10x512, 5x5x256, 3x3x256, 1x1x256 112 | detection_features = torch.jit.annotate(List[Tensor], []) # [x] 113 | detection_features.append(x) 114 | for layer in self.additional_blocks: 115 | x = layer(x) 116 | detection_features.append(x) 117 | 118 | # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 119 | # locs:(batch_size, 4, 8732), confs:(batch_size, 21, 8732) 120 | locs, confs = self.bbox_view(detection_features, self.loc, self.conf) 121 | 122 | # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results 123 | # 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732 124 | 125 | if self.training: 126 | if targets is None: 127 | raise ValueError("In training mode, targets should be passed") 128 | # bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732) 129 | bboxes_out = targets['boxes'] 130 | bboxes_out = bboxes_out.transpose(1, 2).contiguous() # batch_size*4*8732 131 | # print(bboxes_out.is_contiguous()) 132 | labels_out = targets['labels'] # batch_size*8732 133 | # print(labels_out.is_contiguous()) 134 | 135 | # ploc, plabel, gloc, glabel 136 | loss = self.compute_loss(locs, confs, bboxes_out, labels_out) 137 | return {"total_losses": loss} 138 | 139 | # 将预测回归参数叠加到default box上得到最终预测box,并执行非极大值抑制虑除重叠框 140 | # results = self.encoder.decode_batch(locs, confs) 141 | results = self.postprocess(locs, confs) 142 | return results 143 | 144 | 145 | class Loss(nn.Module): 146 | """ 147 | Implements the loss as the sum of the followings: 148 | 1. Confidence Loss: All labels, with hard negative mining 149 | 2. Localization Loss: Only on positive labels 150 | Suppose input dboxes has the shape 8732x4 151 | """ 152 | def __init__(self, dboxes): 153 | super(Loss, self).__init__() 154 | # Two factor are from following links 155 | # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html 156 | self.scale_xy = 1.0 / dboxes.scale_xy # 10 157 | self.scale_wh = 1.0 / dboxes.scale_wh # 5 158 | 159 | self.location_loss = nn.SmoothL1Loss(reduction='none') 160 | # [num_anchors, 4] -> [4, num_anchors] -> [1, 4, num_anchors] 161 | self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0), 162 | requires_grad=False) 163 | 164 | self.confidence_loss = nn.CrossEntropyLoss(reduction='none') 165 | 166 | def _location_vec(self, loc): 167 | # type: (Tensor) 168 | """ 169 | Generate Location Vectors 170 | 计算ground truth相对anchors的回归参数 171 | :param loc: anchor匹配到的对应GTBOX Nx4x8732 172 | :return: 173 | """ 174 | gxy = self.scale_xy * (loc[:, :2, :] - self.dboxes[:, :2, :]) / self.dboxes[:, 2:, :] # Nx2x8732 175 | gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log() # Nx2x8732 176 | return torch.cat((gxy, gwh), dim=1).contiguous() 177 | 178 | def forward(self, ploc, plabel, gloc, glabel): 179 | # type: (Tensor, Tensor, Tensor, Tensor) 180 | """ 181 | ploc, plabel: Nx4x8732, Nxlabel_numx8732 182 | predicted location and labels 183 | 184 | gloc, glabel: Nx4x8732, Nx8732 185 | ground truth location and labels 186 | """ 187 | # 获取正样本的mask Tensor: [N, 8732] 188 | mask = glabel > 0 189 | # mask1 = torch.nonzero(glabel) 190 | # 计算一个batch中的每张图片的正样本个数 Tensor: [N] 191 | pos_num = mask.sum(dim=1) 192 | 193 | # 计算gt的location回归参数 Tensor: [N, 4, 8732] 194 | vec_gd = self._location_vec(gloc) 195 | 196 | # sum on four coordinates, and mask 197 | # 计算定位损失(只有正样本) 198 | loc_loss = self.location_loss(ploc, vec_gd).sum(dim=1) # Tensor: [N, 8732] 199 | loc_loss = (mask.float() * loc_loss).sum(dim=1) # Tenosr: [N] 200 | 201 | # hard negative mining Tenosr: [N, 8732] 202 | con = self.confidence_loss(plabel, glabel) 203 | 204 | # positive mask will never selected 205 | # 获取负样本 206 | con_neg = con.clone() 207 | con_neg[mask] = torch.tensor(0.0) 208 | # 按照confidence_loss降序排列 con_idx(Tensor: [N, 8732]) 209 | _, con_idx = con_neg.sort(dim=1, descending=True) 210 | _, con_rank = con_idx.sort(dim=1) # 这个步骤比较巧妙 211 | 212 | # number of negative three times positive 213 | # 用于损失计算的负样本数是正样本的3倍(在原论文Hard negative mining部分), 214 | # 但不能超过总样本数8732 215 | neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1) 216 | neg_mask = con_rank < neg_num # Tensor [N, 8732] 217 | 218 | # confidence最终loss使用选取的正样本loss+选取的负样本loss 219 | con_loss = (con * (mask.float() + neg_mask.float())).sum(dim=1) # Tensor [N] 220 | 221 | # avoid no object detected 222 | # 避免出现图像中没有GTBOX的情况 223 | total_loss = loc_loss + con_loss 224 | # eg. [15, 3, 5, 0] -> [1.0, 1.0, 1.0, 0.0] 225 | num_mask = (pos_num > 0).float() # 统计一个batch中的每张图像中是否存在正样本 226 | pos_num = pos_num.float().clamp(min=1e-6) # 防止出现分母为零的情况 227 | ret = (total_loss * num_mask / pos_num).mean(dim=0) # 只计算存在正样本的图像损失 228 | return ret 229 | 230 | -------------------------------------------------------------------------------- /ssd/train_multi_GPU.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import train_utils.train_eval_utils as utils 3 | import time 4 | import os 5 | import datetime 6 | from my_dataset import VOC2012DataSet 7 | from train_utils.group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups 8 | from src.ssd_model import SSD300, Backbone 9 | import transform 10 | import torch.multiprocessing as mp 11 | 12 | 13 | def create_model(num_classes, device=torch.device('cpu')): 14 | # https://download.pytorch.org/models/resnet50-19c8e357.pth 15 | pre_train_path = "./src/resnet50.pth" 16 | backbone = Backbone(pretrain_path=pre_train_path) 17 | model = SSD300(backbone=backbone, num_classes=num_classes) 18 | 19 | pre_ssd_path = "./src/nvidia_ssdpyt_fp32.pt" 20 | pre_model_dict = torch.load(pre_ssd_path, map_location=device) 21 | pre_weights_dict = pre_model_dict["model"] 22 | 23 | # 删除类别预测器权重,注意,回归预测器的权重可以重用,因为不涉及num_classes 24 | del_conf_loc_dict = {} 25 | for k, v in pre_weights_dict.items(): 26 | split_key = k.split(".") 27 | if "conf" in split_key: 28 | continue 29 | del_conf_loc_dict.update({k: v}) 30 | 31 | missing_keys, unexpected_keys = model.load_state_dict(del_conf_loc_dict, strict=False) 32 | if len(missing_keys) != 0 or len(unexpected_keys) != 0: 33 | print("missing_keys: ", missing_keys) 34 | print("unexpected_keys: ", unexpected_keys) 35 | 36 | return model 37 | 38 | 39 | # def main_worker(args): 40 | def main(args): 41 | print(args) 42 | # mp.spawn(main_worker, args=(args,), nprocs=args.world_size, join=True) 43 | utils.init_distributed_mode(args) 44 | 45 | device = torch.device(args.device) 46 | 47 | # Data loading code 48 | print("Loading data") 49 | 50 | data_transform = { 51 | "train": transform.Compose([transform.SSDCropping(), 52 | transform.Resize(), 53 | transform.ColorJitter(), 54 | transform.ToTensor(), 55 | transform.RandomHorizontalFlip(), 56 | transform.Normalization(), 57 | transform.AssignGTtoDefaultBox()]), 58 | "val": transform.Compose([transform.Resize(), 59 | transform.ToTensor(), 60 | transform.Normalization()]) 61 | } 62 | 63 | VOC_root = args.data_path 64 | # load train data set 65 | train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], train_set='train.txt') 66 | 67 | # load validation data set 68 | val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], train_set='val.txt') 69 | 70 | print("Creating data loaders") 71 | if args.distributed: 72 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set) 73 | test_sampler = torch.utils.data.distributed.DistributedSampler(val_data_set) 74 | else: 75 | train_sampler = torch.utils.data.RandomSampler(train_data_set) 76 | test_sampler = torch.utils.data.SequentialSampler(val_data_set) 77 | 78 | if args.aspect_ratio_group_factor >= 0: 79 | # 统计所有图像比例在bins区间中的位置索引 80 | group_ids = create_aspect_ratio_groups(train_data_set, k=args.aspect_ratio_group_factor) 81 | train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) 82 | else: 83 | train_batch_sampler = torch.utils.data.BatchSampler( 84 | train_sampler, args.batch_size, drop_last=True) 85 | 86 | data_loader = torch.utils.data.DataLoader( 87 | train_data_set, batch_sampler=train_batch_sampler, num_workers=args.workers, 88 | collate_fn=utils.collate_fn) 89 | 90 | data_loader_test = torch.utils.data.DataLoader( 91 | val_data_set, batch_size=4, 92 | sampler=test_sampler, num_workers=args.workers, 93 | collate_fn=utils.collate_fn) 94 | 95 | print("Creating model") 96 | model = create_model(num_classes=21) 97 | model.to(device) 98 | 99 | model_without_ddp = model 100 | if args.distributed: 101 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 102 | model_without_ddp = model.module 103 | 104 | params = [p for p in model.parameters() if p.requires_grad] 105 | optimizer = torch.optim.SGD( 106 | params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) 107 | 108 | lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) 109 | # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) 110 | 111 | # 如果传入resume参数,即上次训练的权重地址,则接着上次的参数训练 112 | if args.resume: 113 | # If map_location is missing, torch.load will first load the module to CPU 114 | # and then copy each parameter to where it was saved, 115 | # which would result in all processes on the same machine using the same set of devices. 116 | checkpoint = torch.load(args.resume, map_location='cpu') # 读取之前保存的权重文件(包括优化器以及学习率策略) 117 | model_without_ddp.load_state_dict(checkpoint['model']) 118 | optimizer.load_state_dict(checkpoint['optimizer']) 119 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 120 | args.start_epoch = checkpoint['epoch'] + 1 121 | 122 | if args.test_only: 123 | utils.evaluate(model, data_loader_test, device=device) 124 | return 125 | 126 | print("Start training") 127 | start_time = time.time() 128 | for epoch in range(args.start_epoch, args.epochs): 129 | if args.distributed: 130 | train_sampler.set_epoch(epoch) 131 | utils.train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq) 132 | lr_scheduler.step() 133 | if args.output_dir: 134 | # 只在主节点上执行保存权重操作 135 | utils.save_on_master({ 136 | 'model': model_without_ddp.state_dict(), 137 | 'optimizer': optimizer.state_dict(), 138 | 'lr_scheduler': lr_scheduler.state_dict(), 139 | 'args': args, 140 | 'epoch': epoch}, 141 | os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) 142 | 143 | # evaluate after every epoch 144 | utils.evaluate(model, data_loader_test, device=device) 145 | 146 | total_time = time.time() - start_time 147 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 148 | print('Training time {}'.format(total_time_str)) 149 | 150 | 151 | if __name__ == "__main__": 152 | import argparse 153 | parser = argparse.ArgumentParser( 154 | description=__doc__) 155 | 156 | # 训练文件的根目录 157 | parser.add_argument('--data-path', default='./', help='dataset') 158 | # 训练设备类型 159 | parser.add_argument('--device', default='cuda', help='device') 160 | # 每块GPU上的batch_size 161 | parser.add_argument('-b', '--batch-size', default=8, type=int, 162 | help='images per gpu, the total batch size is $NGPU x batch_size') 163 | # 指定接着从哪个epoch数开始训练 164 | parser.add_argument('--start_epoch', default=0, type=int, help='start epoch') 165 | # 训练的总epoch数 166 | parser.add_argument('--epochs', default=20, type=int, metavar='N', 167 | help='number of total epochs to run') 168 | # 数据加载以及预处理的线程数 169 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 170 | help='number of data loading workers (default: 4)') 171 | # 学习率,这个需要根据gpu的数量以及batch_size进行设置0.005 / 8 * num_GPU 172 | parser.add_argument('--lr', default=0.005, type=float, 173 | help='initial learning rate, 0.005 is the default value for training ' 174 | 'on 8 gpus and 2 images_per_gpu') 175 | # SGD的momentum参数 176 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 177 | help='momentum') 178 | # SGD的weight_decay参数 179 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 180 | metavar='W', help='weight decay (default: 1e-4)', 181 | dest='weight_decay') 182 | # 针对torch.optim.lr_scheduler.StepLR的参数 183 | parser.add_argument('--lr-step-size', default=5, type=int, help='decrease lr every step-size epochs') 184 | # 针对torch.optim.lr_scheduler.MultiStepLR的参数 185 | parser.add_argument('--lr-steps', default=[7, 12], nargs='+', type=int, help='decrease lr every step-size epochs') 186 | # 针对torch.optim.lr_scheduler.MultiStepLR的参数 187 | parser.add_argument('--lr-gamma', default=0.3, type=float, help='decrease lr by a factor of lr-gamma') 188 | # 训练过程打印信息的频率 189 | parser.add_argument('--print-freq', default=20, type=int, help='print frequency') 190 | # 文件保存地址 191 | parser.add_argument('--output-dir', default='./multi_train', help='path where to save') 192 | # 基于上次的训练结果接着训练 193 | parser.add_argument('--resume', default='', help='resume from checkpoint') 194 | parser.add_argument('--aspect-ratio-group-factor', default=3, type=int) 195 | # 不训练,仅测试 196 | parser.add_argument( 197 | "--test-only", 198 | dest="test_only", 199 | help="Only test the model", 200 | action="store_true", 201 | ) 202 | 203 | # 开启的进程数(注意不是线程) 204 | parser.add_argument('--world-size', default=4, type=int, 205 | help='number of distributed processes') 206 | parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training') 207 | 208 | args = parser.parse_args() 209 | 210 | # 如果指定了保存文件地址,检查文件夹是否存在,若不存在,则创建 211 | if args.output_dir: 212 | utils.mkdir(args.output_dir) 213 | 214 | main(args) 215 | -------------------------------------------------------------------------------- /ssd/train_ssd300.py: -------------------------------------------------------------------------------- 1 | from src.ssd_model import SSD300, Backbone 2 | import torch 3 | import transform 4 | from my_dataset import VOC2012DataSet 5 | import os 6 | import train_utils.train_eval_utils as utils 7 | from train_utils.coco_utils import get_coco_api_from_dataset 8 | 9 | 10 | def create_model(num_classes=21, device=torch.device('cpu')): 11 | # https://download.pytorch.org/models/resnet50-19c8e357.pth 12 | # pre_train_path = "./src/resnet50.pth" 13 | backbone = Backbone() 14 | model = SSD300(backbone=backbone, num_classes=num_classes) 15 | 16 | # https://ngc.nvidia.com/catalog/models -> search ssd -> download FP32 17 | pre_ssd_path = "./src/nvidia_ssdpyt_fp32.pt" 18 | pre_model_dict = torch.load(pre_ssd_path, map_location=device) 19 | pre_weights_dict = pre_model_dict["model"] 20 | 21 | # 删除类别预测器权重,注意,回归预测器的权重可以重用,因为不涉及num_classes 22 | del_conf_loc_dict = {} 23 | for k, v in pre_weights_dict.items(): 24 | split_key = k.split(".") 25 | if "conf" in split_key: 26 | continue 27 | del_conf_loc_dict.update({k: v}) 28 | 29 | missing_keys, unexpected_keys = model.load_state_dict(del_conf_loc_dict, strict=False) 30 | if len(missing_keys) != 0 or len(unexpected_keys) != 0: 31 | print("missing_keys: ", missing_keys) 32 | print("unexpected_keys: ", unexpected_keys) 33 | 34 | return model 35 | 36 | 37 | def main(parser_data): 38 | device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu") 39 | print(device) 40 | 41 | if not os.path.exists("save_weights"): 42 | os.mkdir("save_weights") 43 | 44 | data_transform = { 45 | "train": transform.Compose([transform.SSDCropping(), 46 | transform.Resize(), 47 | transform.ColorJitter(), 48 | transform.ToTensor(), 49 | transform.RandomHorizontalFlip(), 50 | transform.Normalization(), 51 | transform.AssignGTtoDefaultBox()]), 52 | "val": transform.Compose([transform.Resize(), 53 | transform.ToTensor(), 54 | transform.Normalization()]) 55 | } 56 | 57 | VOC_root = parser_data.data_path 58 | train_dataset = VOC2012DataSet(VOC_root, data_transform['train'], train_set='train.txt') 59 | # 注意训练时,batch_size必须大于1 60 | train_data_loader = torch.utils.data.DataLoader(train_dataset, 61 | batch_size=2, 62 | shuffle=True, 63 | num_workers=0, 64 | collate_fn=utils.collate_fn) 65 | 66 | val_dataset = VOC2012DataSet(VOC_root, data_transform['val'], train_set='val.txt') 67 | val_data_loader = torch.utils.data.DataLoader(val_dataset, 68 | batch_size=1, 69 | shuffle=False, 70 | num_workers=0, 71 | collate_fn=utils.collate_fn) 72 | 73 | model = create_model(num_classes=21, device=device) 74 | model.to(device) 75 | 76 | # define optimizer 77 | params = [p for p in model.parameters() if p.requires_grad] 78 | optimizer = torch.optim.SGD(params, lr=0.0005, 79 | momentum=0.9, weight_decay=0.0005) 80 | # learning rate scheduler 81 | lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 82 | step_size=5, 83 | gamma=0.3) 84 | 85 | # 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练 86 | if parser_data.resume != "": 87 | checkpoint = torch.load(parser_data.resume, map_location=torch.device('cpu')) 88 | model.load_state_dict(checkpoint['model']) 89 | optimizer.load_state_dict(checkpoint['optimizer']) 90 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 91 | parser_data.start_epoch = checkpoint['epoch'] + 1 92 | print("the training process from epoch{}...".format(parser_data.start_epoch)) 93 | 94 | train_loss = [] 95 | learning_rate = [] 96 | val_map = [] 97 | 98 | val_data = None 99 | # 如果电脑内存充裕,可提前加载验证集数据,以免每次验证时都要重新加载一次数据,节省时间 100 | # val_data = get_coco_api_from_dataset(val_data_loader.dataset) 101 | for epoch in range(parser_data.start_epoch, parser_data.epochs): 102 | utils.train_one_epoch(model=model, optimizer=optimizer, 103 | data_loader=train_data_loader, 104 | device=device, epoch=epoch, 105 | print_freq=50, train_loss=train_loss, 106 | train_lr=learning_rate) 107 | 108 | lr_scheduler.step() 109 | 110 | utils.evaluate(model=model, data_loader=val_data_loader, 111 | device=device, data_set=val_data, mAP_list=val_map) 112 | 113 | # save weights 114 | save_files = { 115 | 'model': model.state_dict(), 116 | 'optimizer': optimizer.state_dict(), 117 | 'lr_scheduler': lr_scheduler.state_dict(), 118 | 'epoch': epoch} 119 | torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch)) 120 | 121 | # plot loss and lr curve 122 | if len(train_loss) != 0 and len(learning_rate) != 0: 123 | from plot_curve import plot_loss_and_lr 124 | plot_loss_and_lr(train_loss, learning_rate) 125 | 126 | # plot mAP curve 127 | if len(val_map) != 0: 128 | from plot_curve import plot_map 129 | plot_map(val_map) 130 | 131 | # inputs = torch.rand(size=(2, 3, 300, 300)) 132 | # output = model(inputs) 133 | # print(output) 134 | 135 | 136 | if __name__ == '__main__': 137 | import argparse 138 | 139 | parser = argparse.ArgumentParser( 140 | description=__doc__) 141 | 142 | # 训练设备类型 143 | parser.add_argument('--device', default='cuda:0', help='device') 144 | # 训练数据集的根目录 145 | parser.add_argument('--data-path', default='./', help='dataset') 146 | # 文件保存地址 147 | parser.add_argument('--output-dir', default='./save_weights', help='path where to save') 148 | # 若需要接着上次训练,则指定上次训练保存权重文件地址 149 | parser.add_argument('--resume', default='', type=str, help='resume from checkpoint') 150 | # 指定接着从哪个epoch数开始训练 151 | parser.add_argument('--start_epoch', default=0, type=int, help='start epoch') 152 | # 训练的总epoch数 153 | parser.add_argument('--epochs', default=15, type=int, metavar='N', 154 | help='number of total epochs to run') 155 | 156 | args = parser.parse_args() 157 | print(args) 158 | 159 | # 检查保存权重文件夹是否存在,不存在则创建 160 | if not os.path.exists(args.output_dir): 161 | os.makedirs(args.output_dir) 162 | 163 | main(args) 164 | -------------------------------------------------------------------------------- /ssd/train_utils/__pycache__/coco_eval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/train_utils/__pycache__/coco_eval.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/train_utils/__pycache__/coco_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/train_utils/__pycache__/coco_utils.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/train_utils/__pycache__/train_eval_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AEProgrammer/object_detection/6d315d23d72f34a7a8c4f44a9550137623e72f36/ssd/train_utils/__pycache__/train_eval_utils.cpython-37.pyc -------------------------------------------------------------------------------- /ssd/train_utils/coco_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | 4 | import numpy as np 5 | import copy 6 | import time 7 | import torch 8 | import torch._six 9 | 10 | from pycocotools.cocoeval import COCOeval 11 | from pycocotools.coco import COCO 12 | import pycocotools.mask as mask_util 13 | 14 | from collections import defaultdict 15 | 16 | from train_utils import train_eval_utils as utils 17 | 18 | 19 | class CocoEvaluator(object): 20 | def __init__(self, coco_gt, iou_types): 21 | assert isinstance(iou_types, (list, tuple)) 22 | coco_gt = copy.deepcopy(coco_gt) 23 | self.coco_gt = coco_gt 24 | 25 | self.iou_types = iou_types 26 | self.coco_eval = {} 27 | for iou_type in iou_types: 28 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) 29 | 30 | self.img_ids = [] 31 | self.eval_imgs = {k: [] for k in iou_types} 32 | 33 | def update(self, predictions): 34 | img_ids = list(np.unique(list(predictions.keys()))) 35 | self.img_ids.extend(img_ids) 36 | 37 | for iou_type in self.iou_types: 38 | results = self.prepare(predictions, iou_type) 39 | coco_dt = loadRes(self.coco_gt, results) if results else COCO() 40 | coco_eval = self.coco_eval[iou_type] 41 | 42 | coco_eval.cocoDt = coco_dt 43 | coco_eval.params.imgIds = list(img_ids) 44 | img_ids, eval_imgs = evaluate(coco_eval) 45 | 46 | self.eval_imgs[iou_type].append(eval_imgs) 47 | 48 | def synchronize_between_processes(self): 49 | for iou_type in self.iou_types: 50 | self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) 51 | create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) 52 | 53 | def accumulate(self): 54 | for coco_eval in self.coco_eval.values(): 55 | coco_eval.accumulate() 56 | 57 | def summarize(self): 58 | for iou_type, coco_eval in self.coco_eval.items(): 59 | print("IoU metric: {}".format(iou_type)) 60 | coco_eval.summarize() 61 | 62 | def prepare(self, predictions, iou_type): 63 | if iou_type == "bbox": 64 | return self.prepare_for_coco_detection(predictions) 65 | elif iou_type == "segm": 66 | return self.prepare_for_coco_segmentation(predictions) 67 | elif iou_type == "keypoints": 68 | return self.prepare_for_coco_keypoint(predictions) 69 | else: 70 | raise ValueError("Unknown iou type {}".format(iou_type)) 71 | 72 | def prepare_for_coco_detection(self, predictions): 73 | coco_results = [] 74 | for original_id, prediction in predictions.items(): 75 | if len(prediction) == 0: 76 | continue 77 | 78 | # xmin, ymin, xmax, ymax 79 | boxes = prediction["boxes"] 80 | # 将box的相对坐标信息(0-1)转为绝对值坐标 81 | height_width = prediction["height_width"] 82 | # height_width = [300, 300] 83 | boxes[:, [0, 2]] = boxes[:, [0, 2]] * height_width[1] 84 | boxes[:, [1, 3]] = boxes[:, [1, 3]] * height_width[0] 85 | boxes = convert_to_xywh(boxes) 86 | boxes = boxes.tolist() 87 | scores = prediction["scores"].tolist() 88 | labels = prediction["labels"].tolist() 89 | 90 | coco_results.extend( 91 | [ 92 | { 93 | "image_id": original_id, 94 | "category_id": labels[k], 95 | "bbox": box, 96 | "score": scores[k], 97 | } 98 | for k, box in enumerate(boxes) 99 | ] 100 | ) 101 | return coco_results 102 | 103 | def prepare_for_coco_segmentation(self, predictions): 104 | coco_results = [] 105 | for original_id, prediction in predictions.items(): 106 | if len(prediction) == 0: 107 | continue 108 | 109 | scores = prediction["scores"] 110 | labels = prediction["labels"] 111 | masks = prediction["masks"] 112 | 113 | masks = masks > 0.5 114 | 115 | scores = prediction["scores"].tolist() 116 | labels = prediction["labels"].tolist() 117 | 118 | rles = [ 119 | mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] 120 | for mask in masks 121 | ] 122 | for rle in rles: 123 | rle["counts"] = rle["counts"].decode("utf-8") 124 | 125 | coco_results.extend( 126 | [ 127 | { 128 | "image_id": original_id, 129 | "category_id": labels[k], 130 | "segmentation": rle, 131 | "score": scores[k], 132 | } 133 | for k, rle in enumerate(rles) 134 | ] 135 | ) 136 | return coco_results 137 | 138 | def prepare_for_coco_keypoint(self, predictions): 139 | coco_results = [] 140 | for original_id, prediction in predictions.items(): 141 | if len(prediction) == 0: 142 | continue 143 | 144 | boxes = prediction["boxes"] 145 | boxes = convert_to_xywh(boxes).tolist() 146 | scores = prediction["scores"].tolist() 147 | labels = prediction["labels"].tolist() 148 | keypoints = prediction["keypoints"] 149 | keypoints = keypoints.flatten(start_dim=1).tolist() 150 | 151 | coco_results.extend( 152 | [ 153 | { 154 | "image_id": original_id, 155 | "category_id": labels[k], 156 | 'keypoints': keypoint, 157 | "score": scores[k], 158 | } 159 | for k, keypoint in enumerate(keypoints) 160 | ] 161 | ) 162 | return coco_results 163 | 164 | 165 | def convert_to_xywh(boxes): 166 | xmin, ymin, xmax, ymax = boxes.unbind(1) 167 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) 168 | 169 | 170 | def merge(img_ids, eval_imgs): 171 | all_img_ids = utils.all_gather(img_ids) 172 | all_eval_imgs = utils.all_gather(eval_imgs) 173 | 174 | merged_img_ids = [] 175 | for p in all_img_ids: 176 | merged_img_ids.extend(p) 177 | 178 | merged_eval_imgs = [] 179 | for p in all_eval_imgs: 180 | merged_eval_imgs.append(p) 181 | 182 | merged_img_ids = np.array(merged_img_ids) 183 | merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) 184 | 185 | # keep only unique (and in sorted order) images 186 | merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) 187 | merged_eval_imgs = merged_eval_imgs[..., idx] 188 | 189 | return merged_img_ids, merged_eval_imgs 190 | 191 | 192 | def create_common_coco_eval(coco_eval, img_ids, eval_imgs): 193 | img_ids, eval_imgs = merge(img_ids, eval_imgs) 194 | img_ids = list(img_ids) 195 | eval_imgs = list(eval_imgs.flatten()) 196 | 197 | coco_eval.evalImgs = eval_imgs 198 | coco_eval.params.imgIds = img_ids 199 | coco_eval._paramsEval = copy.deepcopy(coco_eval.params) 200 | 201 | 202 | ################################################################# 203 | # From pycocotools, just removed the prints and fixed 204 | # a Python3 bug about unicode not defined 205 | ################################################################# 206 | 207 | # Ideally, pycocotools wouldn't have hard-coded prints 208 | # so that we could avoid copy-pasting those two functions 209 | 210 | def createIndex(self): 211 | # create index 212 | # print('creating index...') 213 | anns, cats, imgs = {}, {}, {} 214 | imgToAnns, catToImgs = defaultdict(list), defaultdict(list) 215 | if 'annotations' in self.dataset: 216 | for ann in self.dataset['annotations']: 217 | imgToAnns[ann['image_id']].append(ann) 218 | anns[ann['id']] = ann 219 | 220 | if 'images' in self.dataset: 221 | for img in self.dataset['images']: 222 | imgs[img['id']] = img 223 | 224 | if 'categories' in self.dataset: 225 | for cat in self.dataset['categories']: 226 | cats[cat['id']] = cat 227 | 228 | if 'annotations' in self.dataset and 'categories' in self.dataset: 229 | for ann in self.dataset['annotations']: 230 | catToImgs[ann['category_id']].append(ann['image_id']) 231 | 232 | # print('index created!') 233 | 234 | # create class members 235 | self.anns = anns 236 | self.imgToAnns = imgToAnns 237 | self.catToImgs = catToImgs 238 | self.imgs = imgs 239 | self.cats = cats 240 | 241 | 242 | maskUtils = mask_util 243 | 244 | 245 | def loadRes(self, resFile): 246 | """ 247 | Load result file and return a result api object. 248 | :param resFile (str) : file name of result file 249 | :return: res (obj) : result api object 250 | """ 251 | res = COCO() 252 | res.dataset['images'] = [img for img in self.dataset['images']] 253 | 254 | # print('Loading and preparing results...') 255 | # tic = time.time() 256 | if isinstance(resFile, torch._six.string_classes): 257 | anns = json.load(open(resFile)) 258 | elif type(resFile) == np.ndarray: 259 | anns = self.loadNumpyAnnotations(resFile) 260 | else: 261 | anns = resFile 262 | assert type(anns) == list, 'results in not an array of objects' 263 | annsImgIds = [ann['image_id'] for ann in anns] 264 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 265 | 'Results do not correspond to current coco set' 266 | if 'caption' in anns[0]: 267 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 268 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 269 | for id, ann in enumerate(anns): 270 | ann['id'] = id + 1 271 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 272 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 273 | for id, ann in enumerate(anns): 274 | bb = ann['bbox'] 275 | x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] 276 | if 'segmentation' not in ann: 277 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 278 | ann['area'] = bb[2] * bb[3] 279 | ann['id'] = id + 1 280 | ann['iscrowd'] = 0 281 | elif 'segmentation' in anns[0]: 282 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 283 | for id, ann in enumerate(anns): 284 | # now only support compressed RLE format as segmentation results 285 | ann['area'] = maskUtils.area(ann['segmentation']) 286 | if 'bbox' not in ann: 287 | ann['bbox'] = maskUtils.toBbox(ann['segmentation']) 288 | ann['id'] = id + 1 289 | ann['iscrowd'] = 0 290 | elif 'keypoints' in anns[0]: 291 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 292 | for id, ann in enumerate(anns): 293 | s = ann['keypoints'] 294 | x = s[0::3] 295 | y = s[1::3] 296 | x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y) 297 | ann['area'] = (x2 - x1) * (y2 - y1) 298 | ann['id'] = id + 1 299 | ann['bbox'] = [x1, y1, x2 - x1, y2 - y1] 300 | # print('DONE (t={:0.2f}s)'.format(time.time()- tic)) 301 | 302 | res.dataset['annotations'] = anns 303 | createIndex(res) 304 | return res 305 | 306 | 307 | def evaluate(self): 308 | ''' 309 | Run per image evaluation on given images and store results (a list of dict) in self.evalImgs 310 | :return: None 311 | ''' 312 | # tic = time.time() 313 | # print('Running per image evaluation...') 314 | p = self.params 315 | # add backward compatibility if useSegm is specified in params 316 | if p.useSegm is not None: 317 | p.iouType = 'segm' if p.useSegm == 1 else 'bbox' 318 | print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) 319 | # print('Evaluate annotation type *{}*'.format(p.iouType)) 320 | p.imgIds = list(np.unique(p.imgIds)) 321 | if p.useCats: 322 | p.catIds = list(np.unique(p.catIds)) 323 | p.maxDets = sorted(p.maxDets) 324 | self.params = p 325 | 326 | self._prepare() 327 | # loop through images, area range, max detection number 328 | catIds = p.catIds if p.useCats else [-1] 329 | 330 | if p.iouType == 'segm' or p.iouType == 'bbox': 331 | computeIoU = self.computeIoU 332 | elif p.iouType == 'keypoints': 333 | computeIoU = self.computeOks 334 | self.ious = { 335 | (imgId, catId): computeIoU(imgId, catId) 336 | for imgId in p.imgIds 337 | for catId in catIds} 338 | 339 | evaluateImg = self.evaluateImg 340 | maxDet = p.maxDets[-1] 341 | evalImgs = [ 342 | evaluateImg(imgId, catId, areaRng, maxDet) 343 | for catId in catIds 344 | for areaRng in p.areaRng 345 | for imgId in p.imgIds 346 | ] 347 | # this is NOT in the pycocotools code, but could be done outside 348 | evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) 349 | self._paramsEval = copy.deepcopy(self.params) 350 | # toc = time.time() 351 | # print('DONE (t={:0.2f}s).'.format(toc-tic)) 352 | return p.imgIds, evalImgs 353 | 354 | ################################################################# 355 | # end of straight copy from pycocotools, just removing the prints 356 | ################################################################# 357 | -------------------------------------------------------------------------------- /ssd/train_utils/coco_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data 4 | from pycocotools.coco import COCO 5 | 6 | 7 | def convert_to_coco_api(ds): 8 | coco_ds = COCO() 9 | # annotation IDs need to start at 1, not 0 10 | ann_id = 1 11 | dataset = {'images': [], 'categories': [], 'annotations': []} 12 | categories = set() 13 | for img_idx in range(len(ds)): 14 | # find better way to get target 15 | img, targets = ds[img_idx] 16 | image_id = targets["image_id"].item() 17 | img_dict = {} 18 | img_dict['id'] = image_id 19 | # img_dict['height'] = img.shape[-2] 20 | # img_dict['width'] = img.shape[-1] 21 | img_dict['height'] = targets["height_width"][0] 22 | img_dict['width'] = targets["height_width"][1] 23 | dataset['images'].append(img_dict) 24 | bboxes = targets["boxes"] 25 | bboxes[:, 2:] -= bboxes[:, :2] 26 | # 将box的相对坐标信息(0-1)转为绝对值坐标 27 | bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * img_dict["width"] 28 | bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * img_dict["height"] 29 | bboxes = bboxes.tolist() 30 | labels = targets['labels'].tolist() 31 | # 注意这里的boxes area也要进行转换,否则导致(small, medium, large)计算错误 32 | areas = (targets['area'] * img_dict["width"] * img_dict["height"]).tolist() 33 | iscrowd = targets['iscrowd'].tolist() 34 | num_objs = len(bboxes) 35 | for i in range(num_objs): 36 | ann = {} 37 | ann['image_id'] = image_id 38 | ann['bbox'] = bboxes[i] 39 | ann['category_id'] = labels[i] 40 | categories.add(labels[i]) 41 | ann['area'] = areas[i] 42 | ann['iscrowd'] = iscrowd[i] 43 | ann['id'] = ann_id 44 | dataset['annotations'].append(ann) 45 | ann_id += 1 46 | dataset['categories'] = [{'id': i} for i in sorted(categories)] 47 | coco_ds.dataset = dataset 48 | coco_ds.createIndex() 49 | return coco_ds 50 | 51 | 52 | def get_coco_api_from_dataset(dataset): 53 | for _ in range(10): 54 | if isinstance(dataset, torchvision.datasets.CocoDetection): 55 | break 56 | if isinstance(dataset, torch.utils.data.Subset): 57 | dataset = dataset.dataset 58 | if isinstance(dataset, torchvision.datasets.CocoDetection): 59 | return dataset.coco 60 | return convert_to_coco_api(dataset) 61 | -------------------------------------------------------------------------------- /ssd/train_utils/group_by_aspect_ratio.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | from collections import defaultdict 3 | import copy 4 | from itertools import repeat, chain 5 | import math 6 | import numpy as np 7 | 8 | import torch 9 | import torch.utils.data 10 | from torch.utils.data.sampler import BatchSampler, Sampler 11 | from torch.utils.model_zoo import tqdm 12 | import torchvision 13 | 14 | from PIL import Image 15 | 16 | 17 | def _repeat_to_at_least(iterable, n): 18 | repeat_times = math.ceil(n / len(iterable)) 19 | repeated = chain.from_iterable(repeat(iterable, repeat_times)) 20 | return list(repeated) 21 | 22 | 23 | class GroupedBatchSampler(BatchSampler): 24 | """ 25 | Wraps another sampler to yield a mini-batch of indices. 26 | It enforces that the batch only contain elements from the same group. 27 | It also tries to provide mini-batches which follows an ordering which is 28 | as close as possible to the ordering from the original sampler. 29 | Arguments: 30 | sampler (Sampler): Base sampler. 31 | group_ids (list[int]): If the sampler produces indices in range [0, N), 32 | `group_ids` must be a list of `N` ints which contains the group id of each sample. 33 | The group ids must be a continuous set of integers starting from 34 | 0, i.e. they must be in the range [0, num_groups). 35 | batch_size (int): Size of mini-batch. 36 | """ 37 | def __init__(self, sampler, group_ids, batch_size): 38 | if not isinstance(sampler, Sampler): 39 | raise ValueError( 40 | "sampler should be an instance of " 41 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 42 | ) 43 | self.sampler = sampler 44 | self.group_ids = group_ids 45 | self.batch_size = batch_size 46 | 47 | def __iter__(self): 48 | buffer_per_group = defaultdict(list) 49 | samples_per_group = defaultdict(list) 50 | 51 | num_batches = 0 52 | for idx in self.sampler: 53 | group_id = self.group_ids[idx] 54 | buffer_per_group[group_id].append(idx) 55 | samples_per_group[group_id].append(idx) 56 | if len(buffer_per_group[group_id]) == self.batch_size: 57 | yield buffer_per_group[group_id] 58 | num_batches += 1 59 | del buffer_per_group[group_id] 60 | assert len(buffer_per_group[group_id]) < self.batch_size 61 | 62 | # now we have run out of elements that satisfy 63 | # the group criteria, let's return the remaining 64 | # elements so that the size of the sampler is 65 | # deterministic 66 | expected_num_batches = len(self) 67 | num_remaining = expected_num_batches - num_batches 68 | if num_remaining > 0: 69 | # for the remaining batches, take first the buffers with largest number 70 | # of elements 71 | for group_id, _ in sorted(buffer_per_group.items(), 72 | key=lambda x: len(x[1]), reverse=True): 73 | remaining = self.batch_size - len(buffer_per_group[group_id]) 74 | samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining) 75 | buffer_per_group[group_id].extend(samples_from_group_id[:remaining]) 76 | assert len(buffer_per_group[group_id]) == self.batch_size 77 | yield buffer_per_group[group_id] 78 | num_remaining -= 1 79 | if num_remaining == 0: 80 | break 81 | assert num_remaining == 0 82 | 83 | def __len__(self): 84 | return len(self.sampler) // self.batch_size 85 | 86 | 87 | def _compute_aspect_ratios_slow(dataset, indices=None): 88 | print("Your dataset doesn't support the fast path for " 89 | "computing the aspect ratios, so will iterate over " 90 | "the full dataset and load every image instead. " 91 | "This might take some time...") 92 | if indices is None: 93 | indices = range(len(dataset)) 94 | 95 | class SubsetSampler(Sampler): 96 | def __init__(self, indices): 97 | self.indices = indices 98 | 99 | def __iter__(self): 100 | return iter(self.indices) 101 | 102 | def __len__(self): 103 | return len(self.indices) 104 | 105 | sampler = SubsetSampler(indices) 106 | data_loader = torch.utils.data.DataLoader( 107 | dataset, batch_size=1, sampler=sampler, 108 | num_workers=14, # you might want to increase it for faster processing 109 | collate_fn=lambda x: x[0]) 110 | aspect_ratios = [] 111 | with tqdm(total=len(dataset)) as pbar: 112 | for _i, (img, _) in enumerate(data_loader): 113 | pbar.update(1) 114 | height, width = img.shape[-2:] 115 | aspect_ratio = float(width) / float(height) 116 | aspect_ratios.append(aspect_ratio) 117 | return aspect_ratios 118 | 119 | 120 | def _compute_aspect_ratios_custom_dataset(dataset, indices=None): 121 | if indices is None: 122 | indices = range(len(dataset)) 123 | aspect_ratios = [] 124 | for i in indices: 125 | height, width = dataset.get_height_and_width(i) 126 | aspect_ratio = float(width) / float(height) 127 | aspect_ratios.append(aspect_ratio) 128 | return aspect_ratios 129 | 130 | 131 | def _compute_aspect_ratios_coco_dataset(dataset, indices=None): 132 | if indices is None: 133 | indices = range(len(dataset)) 134 | aspect_ratios = [] 135 | for i in indices: 136 | img_info = dataset.coco.imgs[dataset.ids[i]] 137 | aspect_ratio = float(img_info["width"]) / float(img_info["height"]) 138 | aspect_ratios.append(aspect_ratio) 139 | return aspect_ratios 140 | 141 | 142 | def _compute_aspect_ratios_voc_dataset(dataset, indices=None): 143 | if indices is None: 144 | indices = range(len(dataset)) 145 | aspect_ratios = [] 146 | for i in indices: 147 | # this doesn't load the data into memory, because PIL loads it lazily 148 | width, height = Image.open(dataset.images[i]).size 149 | aspect_ratio = float(width) / float(height) 150 | aspect_ratios.append(aspect_ratio) 151 | return aspect_ratios 152 | 153 | 154 | def _compute_aspect_ratios_subset_dataset(dataset, indices=None): 155 | if indices is None: 156 | indices = range(len(dataset)) 157 | 158 | ds_indices = [dataset.indices[i] for i in indices] 159 | return compute_aspect_ratios(dataset.dataset, ds_indices) 160 | 161 | 162 | def compute_aspect_ratios(dataset, indices=None): 163 | if hasattr(dataset, "get_height_and_width"): 164 | return _compute_aspect_ratios_custom_dataset(dataset, indices) 165 | 166 | if isinstance(dataset, torchvision.datasets.CocoDetection): 167 | return _compute_aspect_ratios_coco_dataset(dataset, indices) 168 | 169 | if isinstance(dataset, torchvision.datasets.VOCDetection): 170 | return _compute_aspect_ratios_voc_dataset(dataset, indices) 171 | 172 | if isinstance(dataset, torch.utils.data.Subset): 173 | return _compute_aspect_ratios_subset_dataset(dataset, indices) 174 | 175 | # slow path 176 | return _compute_aspect_ratios_slow(dataset, indices) 177 | 178 | 179 | def _quantize(x, bins): 180 | bins = copy.deepcopy(bins) 181 | bins = sorted(bins) 182 | # bisect_right:寻找y元素按顺序应该排在bins中哪个元素的右边,返回的是索引 183 | quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) 184 | return quantized 185 | 186 | 187 | def create_aspect_ratio_groups(dataset, k=0): 188 | # 计算所有数据集中的图片width/height比例 189 | aspect_ratios = compute_aspect_ratios(dataset) 190 | # 将[0.5, 2]区间划分成2*k+1等份 191 | bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0] 192 | 193 | # 统计所有图像比例在bins区间中的位置索引 194 | groups = _quantize(aspect_ratios, bins) 195 | # count number of elements per group 196 | # 统计每个区间的频次 197 | counts = np.unique(groups, return_counts=True)[1] 198 | fbins = [0] + bins + [np.inf] 199 | print("Using {} as bins for aspect ratio quantization".format(fbins)) 200 | print("Count of instances per bin: {}".format(counts)) 201 | return groups 202 | -------------------------------------------------------------------------------- /ssd/transform.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torchvision.transforms as t 3 | from torchvision.transforms import functional as F 4 | from src.utils import dboxes300_coco, calc_iou_tensor, Encoder 5 | import torch 6 | 7 | 8 | class Compose(object): 9 | """组合多个transform函数""" 10 | def __init__(self, transforms): 11 | self.transforms = transforms 12 | 13 | def __call__(self, image, target=None): 14 | for trans in self.transforms: 15 | image, target = trans(image, target) 16 | return image, target 17 | 18 | 19 | class ToTensor(object): 20 | """将PIL图像转为Tensor""" 21 | def __call__(self, image, target): 22 | image = F.to_tensor(image).contiguous() 23 | return image, target 24 | 25 | 26 | class RandomHorizontalFlip(object): 27 | """随机水平翻转图像以及bboxes,该方法应放在ToTensor后""" 28 | def __init__(self, prob=0.5): 29 | self.prob = prob 30 | 31 | def __call__(self, image, target): 32 | if random.random() < self.prob: 33 | # height, width = image.shape[-2:] 34 | image = image.flip(-1) # 水平翻转图片 35 | bbox = target["boxes"] 36 | # bbox: xmin, ymin, xmax, ymax 37 | # bbox[:, [0, 2]] = width - bbox[:, [2, 0]] # 翻转对应bbox坐标信息 38 | bbox[:, [0, 2]] = 1.0 - bbox[:, [2, 0]] # 翻转对应bbox坐标信息 39 | target["boxes"] = bbox 40 | return image, target 41 | 42 | 43 | # This function is from https://github.com/chauhan-utk/ssd.DomainAdaptation. 44 | class SSDCropping(object): 45 | """ 46 | 根据原文,对图像进行裁剪,该方法应放在ToTensor前 47 | Cropping for SSD, according to original paper 48 | Choose between following 3 conditions: 49 | 1. Preserve the original image 50 | 2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9 51 | 3. Random crop 52 | Reference to https://github.com/chauhan-utk/src.DomainAdaptation 53 | """ 54 | def __init__(self): 55 | self.sample_options = ( 56 | # Do nothing 57 | None, 58 | # min IoU, max IoU 59 | (0.1, None), 60 | (0.3, None), 61 | (0.5, None), 62 | (0.7, None), 63 | (0.9, None), 64 | # no IoU requirements 65 | (None, None), 66 | ) 67 | self.dboxes = dboxes300_coco() 68 | 69 | def __call__(self, image, target): 70 | # Ensure always return cropped image 71 | while True: 72 | mode = random.choice(self.sample_options) 73 | if mode is None: # 不做随机裁剪处理 74 | return image, target 75 | 76 | htot, wtot = target['height_width'] 77 | 78 | min_iou, max_iou = mode 79 | min_iou = float('-inf') if min_iou is None else min_iou 80 | max_iou = float('+inf') if max_iou is None else max_iou 81 | 82 | # Implementation use 5 iteration to find possible candidate 83 | for _ in range(5): 84 | # 0.3*0.3 approx. 0.1 85 | w = random.uniform(0.3, 1.0) 86 | h = random.uniform(0.3, 1.0) 87 | 88 | if w/h < 0.5 or w/h > 2: # 保证宽高比例在0.5-2之间 89 | continue 90 | 91 | # left 0 ~ wtot - w, top 0 ~ htot - h 92 | left = random.uniform(0, 1.0 - w) 93 | top = random.uniform(0, 1.0 - h) 94 | 95 | right = left + w 96 | bottom = top + h 97 | 98 | # boxes的坐标是在0-1之间的 99 | bboxes = target["boxes"] 100 | ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]])) 101 | 102 | # tailor all the bboxes and return 103 | # all(): Returns True if all elements in the tensor are True, False otherwise. 104 | if not ((ious > min_iou) & (ious < max_iou)).all(): 105 | continue 106 | 107 | # discard any bboxes whose center not in the cropped image 108 | xc = 0.5 * (bboxes[:, 0] + bboxes[:, 2]) 109 | yc = 0.5 * (bboxes[:, 1] + bboxes[:, 3]) 110 | 111 | # 查找所有的gt box的中心点有没有在采样patch中的 112 | masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom) 113 | 114 | # if no such boxes, continue searching again 115 | # 如果所有的gt box的中心点都不在采样的patch中,则重新找 116 | if not masks.any(): 117 | continue 118 | 119 | # 修改采样patch中的所有gt box的坐标(防止出现越界的情况) 120 | bboxes[bboxes[:, 0] < left, 0] = left 121 | bboxes[bboxes[:, 1] < top, 1] = top 122 | bboxes[bboxes[:, 2] > right, 2] = right 123 | bboxes[bboxes[:, 3] > bottom, 3] = bottom 124 | 125 | # 虑除不在采样patch中的gt box 126 | bboxes = bboxes[masks, :] 127 | # 获取在采样patch中的gt box的标签 128 | labels = target['labels'] 129 | labels = labels[masks] 130 | 131 | # 裁剪patch 132 | left_idx = int(left * wtot) 133 | top_idx = int(top * htot) 134 | right_idx = int(right * wtot) 135 | bottom_idx = int(bottom * htot) 136 | image = image.crop((left_idx, top_idx, right_idx, bottom_idx)) 137 | 138 | # 调整裁剪后的bboxes坐标信息 139 | bboxes[:, 0] = (bboxes[:, 0] - left) / w 140 | bboxes[:, 1] = (bboxes[:, 1] - top) / h 141 | bboxes[:, 2] = (bboxes[:, 2] - left) / w 142 | bboxes[:, 3] = (bboxes[:, 3] - top) / h 143 | 144 | # 更新crop后的gt box坐标信息以及标签信息 145 | target['boxes'] = bboxes 146 | target['labels'] = labels 147 | 148 | return image, target 149 | 150 | 151 | class Resize(object): 152 | """对图像进行resize处理,该方法应放在ToTensor前""" 153 | def __init__(self, size=(300, 300)): 154 | self.resize = t.Resize(size) 155 | 156 | def __call__(self, image, target): 157 | image = self.resize(image) 158 | return image, target 159 | 160 | 161 | class ColorJitter(object): 162 | """对图像颜色信息进行随机调整,该方法应放在ToTensor前""" 163 | def __init__(self, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05): 164 | self.trans = t.ColorJitter(brightness, contrast, saturation, hue) 165 | 166 | def __call__(self, image, target): 167 | image = self.trans(image) 168 | return image, target 169 | 170 | 171 | class Normalization(object): 172 | """对图像标准化处理,该方法应放在ToTensor后""" 173 | def __init__(self, mean=None, std=None): 174 | if mean is None: 175 | mean = [0.485, 0.456, 0.406] 176 | if std is None: 177 | std = [0.229, 0.224, 0.225] 178 | self.normalize = t.Normalize(mean=mean, std=std) 179 | 180 | def __call__(self, image, target): 181 | image = self.normalize(image) 182 | return image, target 183 | 184 | 185 | class AssignGTtoDefaultBox(object): 186 | def __init__(self): 187 | self.default_box = dboxes300_coco() 188 | self.encoder = Encoder(self.default_box) 189 | 190 | def __call__(self, image, target): 191 | boxes = target['boxes'] 192 | labels = target["labels"] 193 | # bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732) 194 | bboxes_out, labels_out = self.encoder.encode(boxes, labels) 195 | target['boxes'] = bboxes_out 196 | target['labels'] = labels_out 197 | 198 | return image, target 199 | --------------------------------------------------------------------------------