├── .gitignore ├── README.md ├── backbone ├── fpn101.py ├── hrnet.py ├── mobilenet.py ├── resnet50_fpn_model.py └── vgg16.py ├── config ├── test_config.py └── train_config.py ├── dataloader └── coco_dataset.py ├── imgs └── demo1.png ├── requirements.txt ├── test.py ├── test └── anchor_utils_test.py ├── train.py └── utils ├── anchor_utils.py ├── boxes_utils.py ├── coco_utils.py ├── det_utils.py ├── draw_box_utils.py ├── evaluate_utils.py ├── faster_rcnn_utils.py ├── im_utils.py ├── plot_utils.py ├── roi_header_util.py ├── rpn_utils.py ├── train_utils.py └── transform_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | .idea/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 
19 | # pytorch-faster-rcnn 
20 | ## 1. Introduction 
21 | A PyTorch-based implementation of the Faster R-CNN framework. For details about Faster R-CNN, please refer to the paper [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497) by Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 
22 | 
23 | 
24 | This detection framework has the following features: 
25 | * It runs as pure Python code on top of the PyTorch framework, with nothing to build or compile 
26 | * It is trained by simply running the train.py script; you only need to set the data root dir 
27 | * It supports several backbone networks: VGG, ResNet-FPN, MobileNet, and High-Resolution Net (HRNet) 
28 | * It works as a general detection framework: change the hyperparameters in the config file to train and compare different models 
29 | * It's memory-efficient (about 3GB for vgg16) 
30 | ## 2. Installation 
31 | ### 2.1 Prerequisites 
32 | * Python 3.5+ (PyTorch 1.5 no longer supports Python 2.7) 
33 | * PyTorch 1.5.1 
34 | * torchvision 0.6.1 
35 | * numpy 1.15.4 
36 | * Pillow 6.1.0 
37 | * pycocotools 2.0 
38 | * matplotlib 3.0.2 
39 | * tensorboardX 2.0 
40 | ```Shell 
41 | pip install -r requirements.txt 
42 | ``` 
43 | ### 2.2 Code Preparation 
44 | ```Shell 
45 | git clone https://github.com/AlphaJia/pytorch-faster-rcnn.git 
46 | ``` 
47 | ## 3. Data Preparation 
48 | ### COCO 
49 | ##### 3.1 Download the training, validation, test data and annotations 
50 | ```Shell 
51 | wget http://images.cocodataset.org/zips/train2017.zip 
52 | wget http://images.cocodataset.org/zips/val2017.zip 
53 | wget http://images.cocodataset.org/zips/test2017.zip 
54 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip 
55 | ``` 
56 | ##### 3.2 Extract all of these archives into one directory named COCODevKit 
57 | ```Shell 
58 | unzip train2017.zip 
59 | unzip val2017.zip 
60 | unzip test2017.zip 
61 | unzip annotations_trainval2017.zip 
62 | ``` 
63 | ##### 3.3 The data directory should look like this 
64 | ``` 
65 | COCODevKit 
66 | |-- train2017 
67 | |-- [xxxxxxxxxxxx].jpg 
68 | |-- val2017 
69 | |-- [xxxxxxxxxxxx].jpg 
70 | |-- test2017 
71 | |-- [xxxxxxxxxxxx].jpg 
72 | |-- annotations 
73 | |-- instances_train2017.json 
74 | |-- instances_val2017.json 
75 | |-- image_info_test2017.json 
76 | ``` 
77 | ##### 3.4 Set the data_root_dir cfg item in config/train_config.py to /path/COCODevKit/ 
78 | 
79 | ## 4. Train 
80 | Set the model_save_dir cfg item in config/train_config.py to your own save path, and device_name to your own device. 
81 | * Train with [mobilenet](https://arxiv.org/abs/1801.04381) 
82 | Set the backbone cfg item in config/train_config.py to mobilenet, download the pretrained weights [here](https://download.pytorch.org/models/mobilenet_v2-b0353104.pth), and set backbone_pretrained_weights in config/train_config.py to the downloaded path. 
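The two config edits amount to the following (an illustrative excerpt of config/train_config.py; the weights path is a placeholder for wherever you saved the download):

```python
# config/train_config.py (excerpt)
class Config:
    backbone = 'mobilenet'  # which backbone network to build
    backbone_pretrained_weights = '/path/to/mobilenet_v2-b0353104.pth'  # path or None
```

Then start training: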
```Shell 
84 | python train.py 
85 | ``` 
86 | * Train with [resnet-fpn](https://arxiv.org/abs/1512.03385) 
87 | Set the backbone cfg item in config/train_config.py to resnet50_fpn, download the pretrained weights [here](https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth), and set backbone_pretrained_weights in config/train_config.py to the downloaded path. 
88 | ```Shell 
89 | python train.py 
90 | ``` 
91 | * Train with [vgg16](https://arxiv.org/abs/1409.1556) 
92 | Set the backbone cfg item in config/train_config.py to vgg16. 
93 | ```Shell 
94 | python train.py 
95 | ``` 
96 | * Train with [HRNet](https://arxiv.org/abs/1908.07919) 
97 | Set the backbone cfg item in config/train_config.py to HRNet. 
98 | ```Shell 
99 | python train.py 
100 | ``` 
101 | 
102 | Weights and TensorBoard logs will be saved in your model_save_dir directory. 
103 | You may refer to config/train_config.py for more arguments. 
104 | Some key arguments: 
105 | `--backbone`: feature extraction backbone network 
106 | `--backbone_pretrained_weights`: backbone pretrained weights, None or a path 
107 | `--train_horizon_flip_prob`: data horizontal flip probability 
108 | `--num_class`: number of classes, including background 
109 | `--data_root_dir`: COCO dataset root dir 
110 | `--model_save_dir`: training weights save path 
111 | `--device_name`: training device 
112 | `--num_epochs`: training epochs 
113 | ## 5. Test 
114 | Set the model_weights cfg item in config/test_config.py to your trained weights path, and gpu_id to your own CUDA device ID. 
115 | You may refer to config/test_config.py for more arguments. 
116 | Some key arguments: 
117 | `--model_weights`: trained model weights path 
118 | `--image_path`: path of the image to predict on 
119 | `--gpu_id`: CUDA device GPU ID 
120 | `--num_classes`: number of classes, including background 
121 | `--data_root_dir`: COCO dataset root dir 
122 | 
123 | ```Shell 
124 | python test.py 
125 | ``` 
126 | ## 6. Demo 
127 | ![img](imgs/demo1.png) 
128 | ## 7. Framework Structure 
129 | #### backbone 
130 | This module includes the backbone feature extraction networks: 
131 | * vgg16: VGG16 network ([Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556)) 
132 | * fpn101: ResNet-101 FPN network ([Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)) ([Feature Pyramid Networks for Object Detection](https://arxiv.org/abs/1612.03144)) 
133 | * hrnet: High-Resolution Net ([Deep High-Resolution Representation Learning for Visual Recognition](https://arxiv.org/abs/1908.07919)) 
134 | * mobile_net: MobileNetV2 network ([MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381)) 
135 | #### config 
136 | This module holds the config parameters for the training and testing periods: 
137 | * test_config: specifies testing parameters such as model_file, image_path_dir, save_dir, etc. 
138 | * train_config: specifies training parameters such as the backbone network, batch_size, image_path_dir, anchor_size, etc. 
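Since these configs are plain Python classes, you can also override values in code for quick experiments (a minimal sketch, assuming the import paths of this repo):

```python
from config.train_config import cfg

cfg.backbone = 'vgg16'  # switch the backbone for this run
cfg.batch_size = 2      # e.g. to fit a smaller GPU
cfg.num_epochs = 20
print(cfg.backbone, cfg.lr)  # all other attributes keep their defaults
```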
139 | #### dataloader 
140 | This module wraps the PyTorch dataset/dataloader classes for dataset IO. You can also write your own dataset dataloader IO and put it in this module 
141 | * coco_dataset: COCO ([Common Objects in Context](https://cocodataset.org/#home)) dataset dataloader IO 
142 | #### test 
143 | This module contains the unit tests (UTs) for the utils functions 
144 | * anchor_utils_test: unit tests for utils/anchor_utils.py 
145 | #### utils 
146 | This module includes utilities for image processing, network architecture building, anchor generation, loss functions, etc. 
147 | * anchor_utils: basic functions for building anchors 
148 | * im_utils: basic functions for image processing 
149 | 
150 | 
-------------------------------------------------------------------------------- 
/backbone/fpn101.py: 
-------------------------------------------------------------------------------- 
1 | import math 
2 | 
3 | import torch.nn as nn 
4 | import torch.nn.functional as F 
5 | 
6 | 
7 | class Bottleneck(nn.Module): 
8 |     expansion = 4 
9 | 
10 |     def __init__(self, in_planes, planes, stride=1, downsample=None): 
11 |         super(Bottleneck, self).__init__() 
12 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 
13 |         self.bn1 = nn.BatchNorm2d(planes) 
14 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 
15 |         self.bn2 = nn.BatchNorm2d(planes) 
16 |         self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 
17 |         self.bn3 = nn.BatchNorm2d(self.expansion * planes) 
18 |         self.relu = nn.ReLU(inplace=True) 
19 |         self.downsample = downsample 
20 |         self.stride = stride 
21 | 
22 |     def forward(self, x): 
23 |         residual = x 
24 | 
25 |         out = self.conv1(x) 
26 |         out = self.bn1(out) 
27 |         out = self.relu(out) 
28 | 
29 |         out = self.conv2(out) 
30 |         out = self.bn2(out) 
31 |         out = self.relu(out) 
32 | 
33 |         out = self.conv3(out) 
34 |         out = self.bn3(out) 
35 | 
36 |         if self.downsample is not None: 
37 |             residual = self.downsample(x) 
38 | 
39 |         out += residual 
40 |         out = self.relu(out) 
41 | 
42 |         return out 
43 | 
44 | 
45 | class FPN(nn.Module): 
46 |     def __init__(self, block, layers): 
47 |         super(FPN, self).__init__() 
48 |         self.inplanes = 64 
49 | 
50 |         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 
51 |         self.bn1 = nn.BatchNorm2d(64) 
52 | 
53 |         self.relu = nn.ReLU(inplace=True) 
54 |         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 
55 |         # Bottom-up layers 
56 |         self.layer1 = self._make_layer(block, 64, layers[0]) 
57 |         self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 
58 |         self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 
59 |         self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 
60 | 
61 |         # Top layer 
62 |         self.toplayer = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0)  # Reduce channels 
63 | 
64 |         # Smooth layers 
65 |         self.smooth1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 
66 |         self.smooth2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 
67 |         self.smooth3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 
68 | 
69 |         # Lateral layers 
70 |         self.latlayer1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0) 
71 |         self.latlayer2 = nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0) 
72 |         self.latlayer3 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0) 
73 | 
74 |         for m in self.modules(): 
75 |             if isinstance(m, nn.Conv2d): 
76 |                 n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 
77 |                 m.weight.data.normal_(0, 
math.sqrt(2. / n)) 
78 |             elif isinstance(m, nn.BatchNorm2d): 
79 |                 m.weight.data.fill_(1) 
80 |                 m.bias.data.zero_() 
81 | 
82 |     def _make_layer(self, block, planes, blocks, stride=1): 
83 |         downsample = None 
84 |         if stride != 1 or self.inplanes != block.expansion * planes: 
85 |             downsample = nn.Sequential( 
86 |                 nn.Conv2d(self.inplanes, block.expansion * planes, kernel_size=1, stride=stride, bias=False), 
87 |                 nn.BatchNorm2d(block.expansion * planes) 
88 |             ) 
89 |         layers = [] 
90 |         layers.append(block(self.inplanes, planes, stride, downsample)) 
91 |         self.inplanes = planes * block.expansion 
92 |         for i in range(1, blocks): 
93 |             layers.append(block(self.inplanes, planes)) 
94 | 
95 |         return nn.Sequential(*layers) 
96 | 
97 |     def _upsample_add(self, x, y): 
98 |         _, _, H, W = y.size() 
99 |         return F.interpolate(x, size=(H, W), mode='bilinear', align_corners=False) + y  # F.upsample is deprecated 
100 | 
101 |     def forward(self, x): 
102 |         # Bottom-up 
103 |         x = self.conv1(x) 
104 |         x = self.bn1(x) 
105 |         x = self.relu(x) 
106 |         c1 = self.maxpool(x) 
107 | 
108 |         c2 = self.layer1(c1) 
109 |         c3 = self.layer2(c2) 
110 |         c4 = self.layer3(c3) 
111 |         c5 = self.layer4(c4) 
112 |         # Top-down 
113 |         p5 = self.toplayer(c5) 
114 |         p4 = self._upsample_add(p5, self.latlayer1(c4)) 
115 |         p3 = self._upsample_add(p4, self.latlayer2(c3)) 
116 |         p2 = self._upsample_add(p3, self.latlayer3(c2)) 
117 |         # Smooth 
118 |         p4 = self.smooth1(p4) 
119 |         p3 = self.smooth2(p3) 
120 |         p2 = self.smooth3(p2) 
121 |         return p2, p3, p4, p5 
122 | 
123 | 
124 | def FPN101(): 
125 |     return FPN(Bottleneck, [3, 4, 23, 3])  # ResNet-101 block counts; [2, 2, 2, 2] built a much shallower net 
126 | 
-------------------------------------------------------------------------------- 
/backbone/hrnet.py: 
-------------------------------------------------------------------------------- 
1 | from torch import nn 
2 | 
3 | 
4 | class Bottleneck(nn.Module): 
5 |     expansion = 4 
6 | 
7 |     def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1): 
8 |         super(Bottleneck, self).__init__() 
9 |         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 
10 |         self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum) 
11 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 
12 |         self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum) 
13 |         self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False) 
14 |         self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=bn_momentum) 
15 |         self.relu = nn.ReLU(inplace=True) 
16 |         self.downsample = downsample 
17 |         self.stride = stride 
18 | 
19 |     def forward(self, x): 
20 |         residual = x 
21 | 
22 |         out = self.conv1(x) 
23 |         out = self.bn1(out) 
24 |         out = self.relu(out) 
25 | 
26 |         out = self.conv2(out) 
27 |         out = self.bn2(out) 
28 |         out = self.relu(out) 
29 | 
30 |         out = self.conv3(out) 
31 |         out = self.bn3(out) 
32 | 
33 |         if self.downsample is not None: 
34 |             residual = self.downsample(x) 
35 | 
36 |         out += residual 
37 |         out = self.relu(out) 
38 | 
39 |         return out 
40 | 
41 | 
42 | class BasicBlock(nn.Module): 
43 |     expansion = 1 
44 | 
45 |     def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1): 
46 |         super(BasicBlock, self).__init__() 
47 |         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 
48 |         self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum) 
49 |         self.relu = nn.ReLU(inplace=True) 
50 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)  # in_channels must be planes, the output of conv1 (was inplanes) 
51 |         self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum) 
52 |         self.downsample = downsample 
53 |         self.stride = stride 
54 | 
55 |     def forward(self, x): 
56 |         residual = x 
57 | 
58 | 
out = self.conv1(x) 59 | out = self.bn1(out) 60 | out = self.relu(out) 61 | 62 | out = self.conv2(out) 63 | out = self.bn2(out) 64 | 65 | if self.downsample is not None: 66 | residual = self.downsample(x) 67 | 68 | out += residual 69 | out = self.relu(out) 70 | 71 | return out 72 | 73 | 74 | class StageModule(nn.Module): 75 | def __init__(self, stage, output_branches, c, bn_momentum): 76 | super(StageModule, self).__init__() 77 | self.stage = stage 78 | self.output_branches = output_branches 79 | 80 | self.branches = nn.ModuleList() 81 | for i in range(self.stage): 82 | w = c * (2 ** i) 83 | branch = nn.Sequential( 84 | BasicBlock(w, w, bn_momentum=bn_momentum), 85 | BasicBlock(w, w, bn_momentum=bn_momentum), 86 | BasicBlock(w, w, bn_momentum=bn_momentum), 87 | BasicBlock(w, w, bn_momentum=bn_momentum), 88 | ) 89 | self.branches.append(branch) 90 | 91 | self.fuse_layers = nn.ModuleList() 92 | # for each output_branches (i.e. each branch in all cases but the very last one) 93 | for i in range(self.output_branches): 94 | self.fuse_layers.append(nn.ModuleList()) 95 | for j in range(self.stage): # for each branch 96 | if i == j: 97 | self.fuse_layers[-1].append(nn.Sequential()) # Used in place of "None" because it is callable 98 | elif i < j: 99 | self.fuse_layers[-1].append(nn.Sequential( 100 | nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(1, 1), stride=(1, 1), bias=False), 101 | nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), 102 | nn.Upsample(scale_factor=(2.0 ** (j - i)), mode='nearest'), 103 | )) 104 | elif i > j: 105 | ops = [] 106 | for k in range(i - j - 1): 107 | ops.append(nn.Sequential( 108 | nn.Conv2d(c * (2 ** j), c * (2 ** j), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), 109 | bias=False), 110 | nn.BatchNorm2d(c * (2 ** j), eps=1e-05, momentum=0.1, affine=True, 111 | track_running_stats=True), 112 | nn.ReLU(inplace=True), 113 | )) 114 | ops.append(nn.Sequential( 115 | nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), 116 | bias=False), 117 | nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), 118 | )) 119 | self.fuse_layers[-1].append(nn.Sequential(*ops)) 120 | 121 | self.relu = nn.ReLU(inplace=True) 122 | 123 | def forward(self, x): 124 | assert len(self.branches) == len(x) 125 | 126 | x = [branch(b) for branch, b in zip(self.branches, x)] 127 | 128 | x_fused = [] 129 | for i in range(len(self.fuse_layers)): 130 | for j in range(0, len(self.branches)): 131 | if j == 0: 132 | x_fused.append(self.fuse_layers[i][0](x[0])) 133 | else: 134 | x_fused[i] = x_fused[i] + self.fuse_layers[i][j](x[j]) 135 | 136 | for i in range(len(x_fused)): 137 | x_fused[i] = self.relu(x_fused[i]) 138 | 139 | return x_fused 140 | 141 | 142 | class HRNet(nn.Module): 143 | def __init__(self, c=48, nof_joints=17, bn_momentum=0.1): 144 | super(HRNet, self).__init__() 145 | 146 | # Input (stem net) 147 | self.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) 148 | self.bn1 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True) 149 | self.conv2 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) 150 | self.bn2 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True) 151 | self.relu = nn.ReLU(inplace=True) 152 | 153 | # Stage 1 (layer1) - First group of bottleneck (resnet) modules 154 | downsample = nn.Sequential( 155 | nn.Conv2d(64, 
256, kernel_size=(1, 1), stride=(1, 1), bias=False), 156 | nn.BatchNorm2d(256, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), 157 | ) 158 | self.layer1 = nn.Sequential( 159 | Bottleneck(64, 64, downsample=downsample), 160 | Bottleneck(256, 64), 161 | Bottleneck(256, 64), 162 | Bottleneck(256, 64), 163 | ) 164 | 165 | # Fusion layer 1 (transition1) - Creation of the first two branches (one full and one half resolution) 166 | self.transition1 = nn.ModuleList([ 167 | nn.Sequential( 168 | nn.Conv2d(256, c, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), 169 | nn.BatchNorm2d(c, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), 170 | nn.ReLU(inplace=True), 171 | ), 172 | nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights 173 | nn.Conv2d(256, c * (2 ** 1), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), 174 | nn.BatchNorm2d(c * (2 ** 1), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), 175 | nn.ReLU(inplace=True), 176 | )), 177 | ]) 178 | 179 | # Stage 2 (stage2) - Second module with 1 group of bottleneck (resnet) modules. This has 2 branches 180 | self.stage2 = nn.Sequential( 181 | StageModule(stage=2, output_branches=2, c=c, bn_momentum=bn_momentum), 182 | ) 183 | 184 | # Fusion layer 2 (transition2) - Creation of the third branch (1/4 resolution) 185 | self.transition2 = nn.ModuleList([ 186 | nn.Sequential(), # None, - Used in place of "None" because it is callable 187 | nn.Sequential(), # None, - Used in place of "None" because it is callable 188 | nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights 189 | nn.Conv2d(c * (2 ** 1), c * (2 ** 2), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), 190 | nn.BatchNorm2d(c * (2 ** 2), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), 191 | nn.ReLU(inplace=True), 192 | )), # ToDo Why the new branch derives from the "upper" branch only? 193 | ]) 194 | 195 | # Stage 3 (stage3) - Third module with 4 groups of bottleneck (resnet) modules. This has 3 branches 196 | self.stage3 = nn.Sequential( 197 | StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), 198 | StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), 199 | StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), 200 | StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum), 201 | ) 202 | 203 | # Fusion layer 3 (transition3) - Creation of the fourth branch (1/8 resolution) 204 | self.transition3 = nn.ModuleList([ 205 | nn.Sequential(), # None, - Used in place of "None" because it is callable 206 | nn.Sequential(), # None, - Used in place of "None" because it is callable 207 | nn.Sequential(), # None, - Used in place of "None" because it is callable 208 | nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights 209 | nn.Conv2d(c * (2 ** 2), c * (2 ** 3), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False), 210 | nn.BatchNorm2d(c * (2 ** 3), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True), 211 | nn.ReLU(inplace=True), 212 | )), 213 | ]) 214 | 215 | # Stage 4 (stage4) - Fourth module with 3 groups of bottleneck (resnet) modules. 
This has 4 branches 216 | self.stage4 = nn.Sequential( 217 | StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum), 218 | StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum), 219 | StageModule(stage=4, output_branches=1, c=c, bn_momentum=bn_momentum), 220 | ) 221 | 222 | # Final layer (final_layer) 223 | self.final_layer = nn.Conv2d(c, nof_joints, kernel_size=(1, 1), stride=(1, 1)) 224 | 225 | def forward(self, x): 226 | x = self.conv1(x) 227 | x = self.bn1(x) 228 | x = self.relu(x) 229 | x = self.conv2(x) 230 | x = self.bn2(x) 231 | x = self.relu(x) 232 | 233 | x = self.layer1(x) 234 | x = [trans(x) for trans in self.transition1] # Since now, x is a list (# == nof branches) 235 | 236 | x = self.stage2(x) 237 | # x = [trans(x[-1]) for trans in self.transition2] # New branch derives from the "upper" branch only 238 | x = [ 239 | self.transition2[0](x[0]), 240 | self.transition2[1](x[1]), 241 | self.transition2[2](x[-1]) 242 | ] # New branch derives from the "upper" branch only 243 | 244 | x = self.stage3(x) 245 | # x = [trans(x) for trans in self.transition3] # New branch derives from the "upper" branch only 246 | x = [ 247 | self.transition3[0](x[0]), 248 | self.transition3[1](x[1]), 249 | self.transition3[2](x[2]), 250 | self.transition3[3](x[-1]) 251 | ] # New branch derives from the "upper" branch only 252 | 253 | x = self.stage4(x) 254 | 255 | x = self.final_layer(x[0]) 256 | 257 | return x 258 | -------------------------------------------------------------------------------- /backbone/mobilenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | def _make_divisible(ch, divisor=8, min_ch=None): 6 | if min_ch is None: 7 | min_ch = divisor 8 | new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor) 9 | # Make sure that round down does not go down by more than 10%. 
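    # Worked example (hypothetical input): _make_divisible(11, 8) first rounds to
    # int(11 + 8 / 2) // 8 * 8 = 8; since 8 < 0.9 * 11, it is bumped up to 8 + 8 = 16.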
10 | if new_ch < 0.9 * ch: 11 | new_ch += divisor 12 | return new_ch 13 | 14 | 15 | class ConvBNReLU(nn.Sequential): 16 | def __init__(self, in_channel, out_channel, kernel_size=3, stride=1, groups=1, norm_layer=None): 17 | padding = (kernel_size - 1) // 2 18 | if norm_layer is None: 19 | norm_layer = nn.BatchNorm2d 20 | super(ConvBNReLU, self).__init__( 21 | nn.Conv2d(in_channel, out_channel, kernel_size, stride, padding, groups=groups, bias=False), 22 | norm_layer(out_channel), 23 | nn.ReLU6(inplace=True) 24 | ) 25 | 26 | 27 | class InvertedResidual(nn.Module): 28 | def __init__(self, in_channel, out_channel, stride, expand_ratio, norm_layer=None): 29 | super(InvertedResidual, self).__init__() 30 | hidden_channel = in_channel * expand_ratio 31 | self.use_shortcut = stride == 1 and in_channel == out_channel 32 | if norm_layer is None: 33 | norm_layer = nn.BatchNorm2d 34 | 35 | layers = [] 36 | if expand_ratio != 1: 37 | # 1x1 pointwise conv 38 | layers.append(ConvBNReLU(in_channel, hidden_channel, kernel_size=1, norm_layer=norm_layer)) 39 | layers.extend([ 40 | # 3x3 depthwise conv 41 | ConvBNReLU(hidden_channel, hidden_channel, stride=stride, groups=hidden_channel, norm_layer=norm_layer), 42 | # 1x1 pointwise conv(linear) 43 | nn.Conv2d(hidden_channel, out_channel, kernel_size=1, bias=False), 44 | norm_layer(out_channel), 45 | ]) 46 | 47 | self.conv = nn.Sequential(*layers) 48 | 49 | def forward(self, x): 50 | if self.use_shortcut: 51 | return x + self.conv(x) 52 | else: 53 | return self.conv(x) 54 | 55 | 56 | class MobileNetV2(nn.Module): 57 | def __init__(self, num_classes=1000, alpha=1.0, round_nearest=8, weights_path=None, norm_layer=None): 58 | super(MobileNetV2, self).__init__() 59 | block = InvertedResidual 60 | input_channel = _make_divisible(32 * alpha, round_nearest) 61 | last_channel = _make_divisible(1280 * alpha, round_nearest) 62 | 63 | if norm_layer is None: 64 | norm_layer = nn.BatchNorm2d 65 | 66 | inverted_residual_setting = [ 67 | # t, c, n, s 68 | [1, 16, 1, 1], 69 | [6, 24, 2, 2], 70 | [6, 32, 3, 2], 71 | [6, 64, 4, 2], 72 | [6, 96, 3, 1], 73 | [6, 160, 3, 2], 74 | [6, 320, 1, 1], 75 | ] 76 | 77 | features = [] 78 | # conv1 layer 79 | features.append(ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)) 80 | # building inverted residual residual blockes 81 | for t, c, n, s in inverted_residual_setting: 82 | output_channel = _make_divisible(c * alpha, round_nearest) 83 | for i in range(n): 84 | stride = s if i == 0 else 1 85 | features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer)) 86 | input_channel = output_channel 87 | # building last several layers 88 | features.append(ConvBNReLU(input_channel, last_channel, 1, norm_layer=norm_layer)) 89 | # combine feature layers 90 | self.features = nn.Sequential(*features) 91 | 92 | # building classifier 93 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 94 | self.classifier = nn.Sequential( 95 | nn.Dropout(0.2), 96 | nn.Linear(last_channel, num_classes) 97 | ) 98 | 99 | if weights_path is None: 100 | # weight initialization 101 | for m in self.modules(): 102 | if isinstance(m, nn.Conv2d): 103 | nn.init.kaiming_normal_(m.weight, mode='fan_out') 104 | if m.bias is not None: 105 | nn.init.zeros_(m.bias) 106 | elif isinstance(m, nn.BatchNorm2d): 107 | nn.init.ones_(m.weight) 108 | nn.init.zeros_(m.bias) 109 | elif isinstance(m, nn.Linear): 110 | nn.init.normal_(m.weight, 0, 0.01) 111 | nn.init.zeros_(m.bias) 112 | else: 113 | self.load_state_dict(torch.load(weights_path)) 114 | 
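    # Note (an assumption, not verified against every torchvision release): the
    # MobileNetV2 checkpoint linked in the README (mobilenet_v2-b0353104.pth)
    # follows this same features/classifier layout at alpha=1.0, so it should be
    # loadable here by passing its path as weights_path.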
115 | def forward(self, x): 116 | x = self.features(x) 117 | x = self.avgpool(x) 118 | x = torch.flatten(x, 1) 119 | x = self.classifier(x) 120 | return x 121 | -------------------------------------------------------------------------------- /backbone/resnet50_fpn_model.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch import Tensor 7 | from torch.jit.annotations import Tuple, List, Dict 8 | 9 | 10 | class Bottleneck(nn.Module): 11 | expansion = 4 12 | 13 | def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None): 14 | super(Bottleneck, self).__init__() 15 | if norm_layer is None: 16 | norm_layer = nn.BatchNorm2d 17 | 18 | self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel, 19 | kernel_size=1, stride=1, bias=False) # squeeze channels 20 | self.bn1 = norm_layer(out_channel) 21 | # ----------------------------------------- 22 | self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel, 23 | kernel_size=3, stride=stride, bias=False, padding=1) 24 | self.bn2 = norm_layer(out_channel) 25 | # ----------------------------------------- 26 | self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel * self.expansion, 27 | kernel_size=1, stride=1, bias=False) # unsqueeze channels 28 | self.bn3 = norm_layer(out_channel * self.expansion) 29 | self.relu = nn.ReLU(inplace=True) 30 | self.downsample = downsample 31 | 32 | def forward(self, x): 33 | identity = x 34 | if self.downsample is not None: 35 | identity = self.downsample(x) 36 | 37 | out = self.conv1(x) 38 | out = self.bn1(out) 39 | out = self.relu(out) 40 | 41 | out = self.conv2(out) 42 | out = self.bn2(out) 43 | out = self.relu(out) 44 | 45 | out = self.conv3(out) 46 | out = self.bn3(out) 47 | 48 | out += identity 49 | out = self.relu(out) 50 | 51 | return out 52 | 53 | 54 | class ResNet(nn.Module): 55 | 56 | def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None): 57 | super(ResNet, self).__init__() 58 | if norm_layer is None: 59 | norm_layer = nn.BatchNorm2d 60 | self._norm_layer = norm_layer 61 | 62 | self.include_top = include_top 63 | self.in_channel = 64 64 | 65 | self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2, 66 | padding=3, bias=False) 67 | self.bn1 = norm_layer(self.in_channel) 68 | self.relu = nn.ReLU(inplace=True) 69 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 70 | self.layer1 = self._make_layer(block, 64, blocks_num[0]) 71 | self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2) 72 | self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2) 73 | self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2) 74 | if self.include_top: 75 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) # output size = (1, 1) 76 | self.fc = nn.Linear(512 * block.expansion, num_classes) 77 | 78 | for m in self.modules(): 79 | if isinstance(m, nn.Conv2d): 80 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 81 | 82 | def _make_layer(self, block, channel, block_num, stride=1): 83 | norm_layer = self._norm_layer 84 | downsample = None 85 | if stride != 1 or self.in_channel != channel * block.expansion: 86 | downsample = nn.Sequential( 87 | nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False), 88 | norm_layer(channel * block.expansion)) 89 | 
90 | layers = [] 91 | layers.append(block(self.in_channel, channel, downsample=downsample, 92 | stride=stride, norm_layer=norm_layer)) 93 | self.in_channel = channel * block.expansion 94 | 95 | for _ in range(1, block_num): 96 | layers.append(block(self.in_channel, channel, norm_layer=norm_layer)) 97 | 98 | return nn.Sequential(*layers) 99 | 100 | def forward(self, x): 101 | x = self.conv1(x) 102 | x = self.bn1(x) 103 | x = self.relu(x) 104 | x = self.maxpool(x) 105 | 106 | x = self.layer1(x) 107 | x = self.layer2(x) 108 | x = self.layer3(x) 109 | x = self.layer4(x) 110 | 111 | if self.include_top: 112 | x = self.avgpool(x) 113 | x = torch.flatten(x, 1) 114 | x = self.fc(x) 115 | 116 | return x 117 | 118 | 119 | class IntermediateLayerGetter(nn.ModuleDict): 120 | """ 121 | Module wrapper that returns intermediate layers from a model 122 | It has a strong assumption that the modules have been registered 123 | into the model in the same order as they are used. 124 | This means that one should **not** reuse the same nn.Module 125 | twice in the forward if you want this to work. 126 | Additionally, it is only able to query submodules that are directly 127 | assigned to the model. So if `model` is passed, `model.feature1` can 128 | be returned, but not `model.feature1.layer2`. 129 | Arguments: 130 | model (nn.Module): model on which we will extract the features 131 | return_layers (Dict[name, new_name]): a dict containing the names 132 | of the modules for which the activations will be returned as 133 | the key of the dict, and the value of the dict is the name 134 | of the returned activation (which the user can specify). 135 | """ 136 | __annotations__ = { 137 | "return_layers": Dict[str, str], 138 | } 139 | 140 | def __init__(self, model, return_layers): 141 | if not set(return_layers).issubset([name for name, _ in model.named_children()]): 142 | raise ValueError("return_layers are not present in model") 143 | 144 | orig_return_layers = return_layers 145 | return_layers = {k: v for k, v in return_layers.items()} 146 | layers = OrderedDict() 147 | 148 | for name, module in model.named_children(): 149 | layers[name] = module 150 | if name in return_layers: 151 | del return_layers[name] 152 | if not return_layers: 153 | break 154 | 155 | super(IntermediateLayerGetter, self).__init__(layers) 156 | self.return_layers = orig_return_layers 157 | 158 | def forward(self, x): 159 | out = OrderedDict() 160 | for name, module in self.named_children(): 161 | x = module(x) 162 | if name in self.return_layers: 163 | out_name = self.return_layers[name] 164 | out[out_name] = x 165 | return out 166 | 167 | 168 | class FeaturePyramidNetwork(nn.Module): 169 | """ 170 | Module that adds a FPN from on top of a set of feature maps. This is based on 171 | `"Feature Pyramid Network for Object Detection" `_. 172 | The feature maps are currently supposed to be in increasing depth 173 | order. 174 | The input to the model is expected to be an OrderedDict[Tensor], containing 175 | the feature maps on top of which the FPN will be added. 176 | Arguments: 177 | in_channels_list (list[int]): number of channels for each feature map that 178 | is passed to the module 179 | out_channels (int): number of channels of the FPN representation 180 | extra_blocks (ExtraFPNBlock or None): if provided, extra operations will 181 | be performed. 
It is expected to take the fpn features, the original 182 | features and the names of the original features as input, and returns 183 | a new list of feature maps and their corresponding names 184 | """ 185 | 186 | def __init__(self, in_channels_list, out_channels, extra_blocks=None): 187 | super(FeaturePyramidNetwork, self).__init__() 188 | self.inner_blocks = nn.ModuleList() 189 | self.layer_blocks = nn.ModuleList() 190 | for in_channels in in_channels_list: 191 | if in_channels == 0: 192 | continue 193 | inner_block_module = nn.Conv2d(in_channels, out_channels, 1) 194 | layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1) 195 | self.inner_blocks.append(inner_block_module) 196 | self.layer_blocks.append(layer_block_module) 197 | 198 | # initialize parameters now to avoid modifying the initialization of top_blocks 199 | for m in self.children(): 200 | if isinstance(m, nn.Conv2d): 201 | nn.init.kaiming_uniform_(m.weight, a=1) 202 | nn.init.constant_(m.bias, 0) 203 | 204 | self.extra_blocks = extra_blocks 205 | 206 | def get_result_from_inner_blocks(self, x, idx): 207 | # type: (Tensor, int) -> Tensor 208 | """ 209 | This is equivalent to self.inner_blocks[idx](x), 210 | but torchscript doesn't support this yet 211 | """ 212 | num_blocks = 0 213 | for m in self.inner_blocks: 214 | num_blocks += 1 215 | if idx < 0: 216 | idx += num_blocks 217 | i = 0 218 | out = x 219 | for module in self.inner_blocks: 220 | if i == idx: 221 | out = module(x) 222 | i += 1 223 | return out 224 | 225 | def get_result_from_layer_blocks(self, x, idx): 226 | # type: (Tensor, int) -> Tensor 227 | """ 228 | This is equivalent to self.layer_blocks[idx](x), 229 | but torchscript doesn't support this yet 230 | """ 231 | num_blocks = 0 232 | for m in self.layer_blocks: 233 | num_blocks += 1 234 | if idx < 0: 235 | idx += num_blocks 236 | i = 0 237 | out = x 238 | for module in self.layer_blocks: 239 | if i == idx: 240 | out = module(x) 241 | i += 1 242 | return out 243 | 244 | def forward(self, x): 245 | # type: (Dict[str, Tensor]) -> Dict[str, Tensor] 246 | """ 247 | Computes the FPN for a set of feature maps. 248 | Arguments: 249 | x (OrderedDict[Tensor]): feature maps for each feature level. 250 | Returns: 251 | results (OrderedDict[Tensor]): feature maps after FPN layers. 252 | They are ordered from highest resolution first. 
253 | """ 254 | names = list(x.keys()) 255 | x = list(x.values()) 256 | 257 | last_inner = self.get_result_from_inner_blocks(x[-1], -1) 258 | 259 | results = [] 260 | results.append(self.get_result_from_layer_blocks(last_inner, -1)) 261 | 262 | for idx in range(len(x) - 2, -1, -1): 263 | inner_lateral = self.get_result_from_inner_blocks(x[idx], idx) 264 | feat_shape = inner_lateral.shape[-2:] 265 | inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest") 266 | last_inner = inner_lateral + inner_top_down 267 | results.insert(0, self.get_result_from_layer_blocks(last_inner, idx)) 268 | 269 | if self.extra_blocks is not None: 270 | results, names = self.extra_blocks(results, names) 271 | 272 | # make it back an OrderedDict 273 | out = OrderedDict([(k, v) for k, v in zip(names, results)]) 274 | 275 | return out 276 | 277 | 278 | class LastLevelMaxPool(torch.nn.Module): 279 | """ 280 | Applies a max_pool2d on top of the last feature map 281 | """ 282 | 283 | def forward(self, x, names): 284 | names.append("pool") 285 | x.append(F.max_pool2d(x[-1], 1, 2, 0)) 286 | return x, names 287 | 288 | 289 | class BackboneWithFPN(nn.Module): 290 | """ 291 | Adds a FPN on top of a model. 292 | Internally, it uses torchvision.models._utils.IntermediateLayerGetter to 293 | extract a submodel that returns the feature maps specified in return_layers. 294 | The same limitations of IntermediatLayerGetter apply here. 295 | Arguments: 296 | backbone (nn.Module) 297 | return_layers (Dict[name, new_name]): a dict containing the names 298 | of the modules for which the activations will be returned as 299 | the key of the dict, and the value of the dict is the name 300 | of the returned activation (which the user can specify). 301 | in_channels_list (List[int]): number of channels for each feature map 302 | that is returned, in the order they are present in the OrderedDict 303 | out_channels (int): number of channels in the FPN. 
304 | Attributes: 305 | out_channels (int): the number of channels in the FPN 306 | """ 307 | 308 | def __init__(self, backbone, return_layers, in_channels_list, out_channels): 309 | super(BackboneWithFPN, self).__init__() 310 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 311 | self.fpn = FeaturePyramidNetwork( 312 | in_channels_list=in_channels_list, 313 | out_channels=out_channels, 314 | extra_blocks=LastLevelMaxPool(), 315 | ) 316 | self.out_channels = out_channels 317 | 318 | def forward(self, x): 319 | x = self.body(x) 320 | x = self.fpn(x) 321 | return x 322 | 323 | 324 | def resnet50_fpn_backbone(): 325 | 326 | resnet_backbone = ResNet(Bottleneck, [3, 4, 6, 3], 327 | include_top=False) 328 | 329 | for name, parameter in resnet_backbone.named_parameters(): 330 | if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 331 | parameter.requires_grad_(False) 332 | 333 | return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'} 334 | 335 | in_channels_stage2 = resnet_backbone.in_channel // 8 336 | in_channels_list = [ 337 | in_channels_stage2, # layer1 out_channel=256 338 | in_channels_stage2 * 2, # layer2 out_channel=512 339 | in_channels_stage2 * 4, # layer3 out_channel=1024 340 | in_channels_stage2 * 8, # layer4 out_channel=2048 341 | ] 342 | out_channels = 256 343 | return BackboneWithFPN(resnet_backbone, return_layers, in_channels_list, out_channels) 344 | -------------------------------------------------------------------------------- /backbone/vgg16.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | channels_cfgs = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'] 5 | 6 | 7 | def vgg16(weights_path=None): 8 | model = VGG(make_features(channels_cfgs), weights_path=weights_path) 9 | return model 10 | 11 | 12 | class VGG(nn.Module): 13 | def __init__(self, features, class_num=1000, init_weights=False, weights_path=None): 14 | super(VGG, self).__init__() 15 | self.features = features 16 | self.classifier = nn.Sequential( 17 | nn.Dropout(p=0.5), 18 | nn.Linear(512 * 7 * 7, 2048), 19 | nn.ReLU(True), 20 | nn.Dropout(p=0.5), 21 | nn.Linear(2048, 2048), 22 | nn.ReLU(True), 23 | nn.Linear(2048, class_num) 24 | ) 25 | if init_weights and weights_path is None: 26 | self._initialize_weights() 27 | 28 | if weights_path is not None: 29 | self.load_state_dict(torch.load(weights_path), strict=False) 30 | 31 | def forward(self, x): 32 | # N x 3 x 224 x 224 33 | x = self.features(x) 34 | # N x 512 x 7 x 7 35 | x = torch.flatten(x, start_dim=1) 36 | # N x 512*7*7 37 | x = self.classifier(x) 38 | return x 39 | 40 | def _initialize_weights(self): 41 | for m in self.modules(): 42 | if isinstance(m, nn.Conv2d): 43 | nn.init.xavier_uniform_(m.weight) 44 | if m.bias is not None: 45 | nn.init.constant_(m.bias, 0) 46 | elif isinstance(m, nn.Linear): 47 | nn.init.xavier_uniform_(m.weight) 48 | nn.init.constant_(m.bias, 0) 49 | 50 | 51 | def make_features(ch_cfgs): 52 | layers = [] 53 | in_channels = 3 54 | for v in ch_cfgs: 55 | if v == "M": 56 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 57 | else: 58 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 59 | layers += [conv2d, nn.ReLU(True)] 60 | in_channels = v 61 | return nn.Sequential(*layers) 62 | -------------------------------------------------------------------------------- /config/test_config.py: 
-------------------------------------------------------------------------------- 1 | class Config: 2 | model_weights = " " 3 | image_path = " " 4 | gpu_id = '2' 5 | num_classes = 80 + 1 6 | data_root_dir = " " 7 | 8 | 9 | test_cfg = Config() 10 | -------------------------------------------------------------------------------- /config/train_config.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Config: 4 | backbone = 'mobilenet' # [vgg16, resnet-fpn, mobilenet, resnet50_fpn] 5 | backbone_pretrained_weights = None # [path or None] 6 | 7 | # data transform parameter 8 | train_horizon_flip_prob = 0.0 # data horizon flip probility in train transform 9 | min_size = 800 10 | max_size = 1000 11 | image_mean = [0.485, 0.456, 0.406] 12 | image_std = [0.229, 0.224, 0.225] 13 | 14 | # anchor parameters 15 | anchor_size = [64, 128, 256] 16 | anchor_ratio = [0.5, 1, 2.0] 17 | 18 | # roi align parameters 19 | roi_out_size = [7, 7] 20 | roi_sample_rate = 2 21 | 22 | # rpn process parameters 23 | rpn_pre_nms_top_n_train = 2000 24 | rpn_post_nms_top_n_train = 2000 25 | 26 | rpn_pre_nms_top_n_test = 1000 27 | rpn_post_nms_top_n_test = 1000 28 | 29 | rpn_nms_thresh = 0.7 30 | rpn_fg_iou_thresh = 0.7 31 | rpn_bg_iou_thresh = 0.3 32 | rpn_batch_size_per_image = 256 33 | rpn_positive_fraction = 0.5 34 | 35 | # remove low threshold target 36 | box_score_thresh = 0.05 37 | box_nms_thresh = 0.5 38 | box_detections_per_img = 100 39 | box_fg_iou_thresh = 0.5 40 | box_bg_iou_thresh = 0.5 41 | box_batch_size_per_image = 512 42 | box_positive_fraction = 0.25 43 | bbox_reg_weights = None 44 | 45 | device_name = 'cuda:7' 46 | 47 | resume = '' # pretrained_weights 48 | start_epoch = 0 # start epoch 49 | num_epochs = 5000 # train epochs 50 | 51 | # learning rate parameters 52 | lr = 5e-3 53 | momentum = 0.9 54 | weight_decay = 0.0005 55 | 56 | # learning rate schedule 57 | lr_gamma = 0.33 58 | lr_dec_step_size = 100 59 | 60 | batch_size = 6 61 | 62 | num_class = 80 + 1 # foreground + 1 background 63 | data_root_dir = " " 64 | model_save_dir = " " 65 | 66 | 67 | cfg = Config() 68 | -------------------------------------------------------------------------------- /dataloader/coco_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | from pycocotools.coco import COCO 12 | from torch.utils.data import Dataset 13 | 14 | 15 | class coco(Dataset): 16 | def __init__(self, root_dir, image_set, year, transforms=None): 17 | 18 | self._root_dir = root_dir 19 | self._year = year 20 | self._image_set = image_set 21 | self._data_name = image_set + year 22 | self._json_path = self._get_ann_file() 23 | self._transforms = transforms 24 | 25 | # load COCO API 26 | self._COCO = COCO(self._json_path) 27 | 28 | with open(self._json_path) as anno_file: 29 | self.anno = json.load(anno_file) 30 | 31 | cats = self._COCO.loadCats(self._COCO.getCatIds()) 32 | self._classes = tuple(['__background__'] + [c['name'] for c in cats]) 33 | 34 | self.classes = self._classes 35 | self.num_classes = len(self.classes) 36 | self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes))))) 37 | self._class_to_coco_cat_id = dict(list(zip([c['name'] for c in cats], 38 | self._COCO.getCatIds()))) 39 | 40 | 
self.coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls], 
41 |                                                self._class_to_ind[cls]) 
42 |                                               for cls in self._classes[1:]]) 
43 | 
44 |     def __len__(self): 
45 |         return len(self.anno['images']) 
46 | 
47 |     def _get_ann_file(self): 
48 |         prefix = 'instances' if self._image_set.find('test') == -1 else 'image_info' 
49 |         return os.path.join(self._root_dir, 'annotations', prefix + '_' + self._image_set + self._year + '.json') 
50 | 
51 |     def _image_path_from_index(self, index): 
52 |         """ 
53 |         Construct an image path from the image's "index" identifier. 
54 |         """ 
55 |         # Example image path for index=119993 (2017 naming scheme): 
56 |         # <data_root_dir>/train2017/000000119993.jpg 
57 |         file_name = (str(index).zfill(12) + '.jpg') 
58 |         image_path = os.path.join(self._root_dir, self._data_name, file_name) 
59 |         assert os.path.exists(image_path), 'Path does not exist: {}'.format(image_path) 
60 |         return image_path 
61 | 
62 |     def __getitem__(self, idx): 
63 |         a = self.anno['images'][idx] 
64 |         image_idx = a['id'] 
65 |         img_path = self._image_path_from_index(image_idx)  # already joins root_dir and data_name; joining again duplicated the prefix 
66 |         image = Image.open(img_path).convert('RGB')  # some COCO images are grayscale 
67 | 
68 |         width = a['width'] 
69 |         height = a['height'] 
70 | 
71 |         annIds = self._COCO.getAnnIds(imgIds=image_idx, iscrowd=None) 
72 |         objs = self._COCO.loadAnns(annIds) 
73 | 
74 |         # Sanitize bboxes -- some are invalid 
75 |         valid_objs = [] 
76 |         for obj in objs: 
77 |             x1 = np.max((0, obj['bbox'][0])) 
78 |             y1 = np.max((0, obj['bbox'][1])) 
79 |             x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) 
80 |             y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) 
81 |             if obj['area'] > 0 and x2 > x1 and y2 > y1: 
82 |                 obj['clean_bbox'] = [x1, y1, x2, y2] 
83 |                 valid_objs.append(obj) 
84 |         objs = valid_objs 
85 |         num_objs = len(objs) 
86 | 
87 |         boxes = np.zeros((num_objs, 4), dtype=np.float32) 
88 |         gt_classes = np.zeros((num_objs), dtype=np.int32) 
89 | 
90 |         iscrowd = [] 
91 |         for ix, obj in enumerate(objs): 
92 |             cls = self.coco_cat_id_to_class_ind[obj['category_id']] 
93 |             boxes[ix, :] = obj['clean_bbox'] 
94 |             gt_classes[ix] = cls 
95 |             iscrowd.append(int(obj["iscrowd"])) 
96 | 
97 |         # convert everything into a torch.Tensor 
98 |         image_id = torch.tensor([image_idx]) 
99 |         boxes = torch.as_tensor(boxes, dtype=torch.float32) 
100 |         gt_classes = torch.as_tensor(gt_classes, dtype=torch.int64)  # class labels must be int64 for the loss 
101 |         iscrowd = torch.as_tensor(iscrowd, dtype=torch.int32) 
102 | 
103 |         area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) 
104 | 
105 |         target = {"boxes": boxes, "labels": gt_classes, "image_id": image_id, "area": area, "iscrowd": iscrowd} 
106 | 
107 |         if self._transforms is not None: 
108 |             image, target = self._transforms(image, target) 
109 | 
110 |         return image, target 
111 | 
112 |     @staticmethod 
113 |     def collate_fn(batch): 
114 |         return tuple(zip(*batch)) 
115 | 
116 |     @property 
117 |     def class_to_coco_cat_id(self): 
118 |         return self._class_to_coco_cat_id 
119 | 
-------------------------------------------------------------------------------- 
/imgs/demo1.png: 
-------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlphaJia/pytorch-faster-rcnn/943ef668facaacf77a4822fe79331343a6ebca2d/imgs/demo1.png 
-------------------------------------------------------------------------------- 
/requirements.txt: 
-------------------------------------------------------------------------------- 
1 | torch~=1.5.1 
2 | torchvision~=0.6.1 
3 | numpy~=1.15.4 
4 | Pillow~=6.1.0 
5 | pycocotools~=2.0 
6 | matplotlib~=3.0.2 
7 | tensorboardX~=2.0 
8 | 
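A note on the dataloader above: COCO stores boxes as [x, y, width, height] with non-contiguous category ids (1-90 for the 80-class set); dataloader/coco_dataset.py maps the ids to contiguous labels (0 is background) and converts each box to clipped corner format. A standalone sketch of that box arithmetic (the sample values below are hypothetical):

```python
import numpy as np

def clean_bbox(bbox, width, height):
    # [x, y, w, h] -> [x1, y1, x2, y2], clipped to the image bounds
    x1 = np.max((0, bbox[0]))
    y1 = np.max((0, bbox[1]))
    x2 = np.min((width - 1, x1 + np.max((0, bbox[2] - 1))))
    y2 = np.min((height - 1, y1 + np.max((0, bbox[3] - 1))))
    # boxes that collapse to zero width/height after clipping are dropped
    return [x1, y1, x2, y2] if x2 > x1 and y2 > y1 else None

print(clean_bbox([10, 20, 30, 40], 640, 480))  # -> [10, 20, 39, 59]
```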
-------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib.pyplot as plt 4 | import torch 5 | from PIL import Image 6 | from torchvision import transforms 7 | 8 | from config.test_config import test_cfg 9 | from dataloader.coco_dataset import coco 10 | from utils.draw_box_utils import draw_box 11 | from utils.train_utils import create_model 12 | 13 | 14 | def test(): 15 | model = create_model(num_classes=test_cfg.num_classes) 16 | 17 | model.cuda() 18 | weights = test_cfg.model_weights 19 | 20 | checkpoint = torch.load(weights, map_location='cpu') 21 | model.load_state_dict(checkpoint['model']) 22 | 23 | # read class_indict 24 | data_transform = transforms.Compose([transforms.ToTensor()]) 25 | test_data_set = coco(test_cfg.data_root_dir, 'test', '2017', data_transform) 26 | category_index = test_data_set.class_to_coco_cat_id 27 | 28 | index_category = dict(zip(category_index.values(), category_index.keys())) 29 | 30 | original_img = Image.open(test_cfg.image_path) 31 | img = data_transform(original_img) 32 | img = torch.unsqueeze(img, dim=0) 33 | 34 | model.eval() 35 | with torch.no_grad(): 36 | predictions = model(img.cuda())[0] 37 | predict_boxes = predictions["boxes"].to("cpu").numpy() 38 | predict_classes = predictions["labels"].to("cpu").numpy() 39 | predict_scores = predictions["scores"].to("cpu").numpy() 40 | 41 | if len(predict_boxes) == 0: 42 | print("No target detected!") 43 | 44 | draw_box(original_img, 45 | predict_boxes, 46 | predict_classes, 47 | predict_scores, 48 | index_category, 49 | thresh=0.3, 50 | line_thickness=3) 51 | plt.imshow(original_img) 52 | plt.show() 53 | 54 | 55 | if __name__ == "__main__": 56 | version = torch.version.__version__[:5] 57 | print('torch version is {}'.format(version)) 58 | os.environ["CUDA_VISIBLE_DEVICES"] = test_cfg.gpu_id 59 | test() 60 | -------------------------------------------------------------------------------- /test/anchor_utils_test.py: -------------------------------------------------------------------------------- 1 | from utils.anchor_utils import generate_anchors 2 | 3 | 4 | def generate_anchors_test(): 5 | scales = [64, 128, 256] 6 | ratios = [0.5, 1.0, 2.0] 7 | generate_anchors(scales, ratios) 8 | 9 | 10 | if __name__ == '__main__': 11 | generate_anchors_test() 12 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from tensorboardX import SummaryWriter 5 | 6 | from config.train_config import cfg 7 | from dataloader.coco_dataset import coco 8 | from utils.evaluate_utils import evaluate 9 | from utils.im_utils import Compose, ToTensor, RandomHorizontalFlip 10 | from utils.plot_utils import plot_loss_and_lr, plot_map 11 | from utils.train_utils import train_one_epoch, write_tb, create_model 12 | 13 | 14 | def main(): 15 | device = torch.device(cfg.device_name) 16 | print("Using {} device training.".format(device.type)) 17 | 18 | if not os.path.exists(cfg.model_save_dir): 19 | os.makedirs(cfg.model_save_dir) 20 | 21 | # tensorboard writer 22 | writer = SummaryWriter(os.path.join(cfg.model_save_dir, 'epoch_log')) 23 | 24 | data_transform = { 25 | "train": Compose([ToTensor(), RandomHorizontalFlip(cfg.train_horizon_flip_prob)]), 26 | "val": Compose([ToTensor()]) 27 | } 28 | 29 | if not os.path.exists(cfg.data_root_dir): 
30 | raise FileNotFoundError("dataset root dir not exist!") 31 | 32 | # load train data set 33 | train_data_set = coco(cfg.data_root_dir, 'train', '2017', data_transform["train"]) 34 | batch_size = cfg.batch_size 35 | nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) 36 | print('Using {} dataloader workers'.format(nw)) 37 | train_data_loader = torch.utils.data.DataLoader(train_data_set, 38 | batch_size=batch_size, 39 | shuffle=True, 40 | num_workers=nw, 41 | collate_fn=train_data_set.collate_fn) 42 | 43 | # load validation data set 44 | val_data_set = coco(cfg.data_root_dir, 'val', '2017', data_transform["val"]) 45 | val_data_set_loader = torch.utils.data.DataLoader(val_data_set, 46 | batch_size=batch_size, 47 | shuffle=False, 48 | num_workers=nw, 49 | collate_fn=train_data_set.collate_fn) 50 | 51 | # create model num_classes equal background + 80 classes 52 | model = create_model(num_classes=cfg.num_class) 53 | 54 | model.to(device) 55 | 56 | # define optimizer 57 | params = [p for p in model.parameters() if p.requires_grad] 58 | optimizer = torch.optim.SGD(params, lr=cfg.lr, 59 | momentum=cfg.momentum, weight_decay=cfg.weight_decay) 60 | 61 | # learning rate scheduler 62 | lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 63 | step_size=cfg.lr_dec_step_size, 64 | gamma=cfg.lr_gamma) 65 | 66 | # train from pretrained weights 67 | if cfg.resume != "": 68 | checkpoint = torch.load(cfg.resume) 69 | model.load_state_dict(checkpoint['model']) 70 | optimizer.load_state_dict(checkpoint['optimizer']) 71 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 72 | cfg.start_epoch = checkpoint['epoch'] + 1 73 | print("the training process from epoch{}...".format(cfg.start_epoch)) 74 | 75 | train_loss = [] 76 | learning_rate = [] 77 | train_mAP_list = [] 78 | val_mAP = [] 79 | 80 | best_mAP = 0 81 | for epoch in range(cfg.start_epoch, cfg.num_epochs): 82 | loss_dict, total_loss = train_one_epoch(model, optimizer, train_data_loader, 83 | device, epoch, train_loss=train_loss, train_lr=learning_rate, 84 | print_freq=50, warmup=False) 85 | 86 | lr_scheduler.step() 87 | 88 | print("------>Starting training data valid") 89 | _, train_mAP = evaluate(model, train_data_loader, device=device, mAP_list=train_mAP_list) 90 | 91 | print("------>Starting validation data valid") 92 | _, mAP = evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP) 93 | print('training mAp is {}'.format(train_mAP)) 94 | print('validation mAp is {}'.format(mAP)) 95 | print('best mAp is {}'.format(best_mAP)) 96 | 97 | board_info = {'lr': optimizer.param_groups[0]['lr'], 98 | 'train_mAP': train_mAP, 99 | 'val_mAP': mAP} 100 | 101 | for k, v in loss_dict.items(): 102 | board_info[k] = v.item() 103 | board_info['total loss'] = total_loss.item() 104 | write_tb(writer, epoch, board_info) 105 | 106 | if mAP > best_mAP: 107 | best_mAP = mAP 108 | # save weights 109 | save_files = { 110 | 'model': model.state_dict(), 111 | 'optimizer': optimizer.state_dict(), 112 | 'lr_scheduler': lr_scheduler.state_dict(), 113 | 'epoch': epoch} 114 | model_save_dir = cfg.model_save_dir 115 | if not os.path.exists(model_save_dir): 116 | os.makedirs(model_save_dir) 117 | torch.save(save_files, 118 | os.path.join(model_save_dir, "{}-model-{}-mAp-{}.pth".format(cfg.backbone, epoch, mAP))) 119 | writer.close() 120 | # plot loss and lr curve 121 | if len(train_loss) != 0 and len(learning_rate) != 0: 122 | plot_loss_and_lr(train_loss, learning_rate, cfg.model_save_dir) 123 | 124 | # plot mAP curve 125 | if len(val_mAP) != 
0:
126 |             plot_map(val_mAP, cfg.model_save_dir)
127 | 
128 | 
129 | if __name__ == "__main__":
130 |     version = torch.version.__version__[:5]
131 |     print('torch version is {}'.format(version))
132 |     main()
133 | 
--------------------------------------------------------------------------------
/utils/anchor_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | 
4 | 
5 | def generate_anchors(scales, aspect_ratios, dtype=torch.float32, device="cpu"):
6 |     """
7 |     generate anchor templates based on sizes and ratios; the generated templates are centered at [0, 0]
8 |     :param scales: anchor sizes, in tuple[int]
9 |     :param aspect_ratios: anchor ratios, in tuple[float]
10 |     :param dtype: data type
11 |     :param device: data device
12 |     :return: anchor templates, shape [len(ratios) * len(scales), 4]
13 |     """
14 | 
15 |     scales = torch.as_tensor(scales, dtype=dtype, device=device)
16 |     aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
17 |     h_ratios = torch.sqrt(aspect_ratios)
18 |     w_ratios = 1.0 / h_ratios
19 | 
20 |     # [r1, r2, r3]' * [s1, s2, s3]
21 |     # number of elements is len(ratios)*len(scales)
22 |     ws = (w_ratios[:, None] * scales[None, :]).view(-1)
23 |     hs = (h_ratios[:, None] * scales[None, :]).view(-1)
24 | 
25 |     # left-top, right-bottom coordinate relative to anchor center(0, 0)
26 |     # anchor template is centered at [0, 0], shape [len(ratios)*len(scales), 4]
27 |     base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
28 | 
29 |     return base_anchors.round()  # anchors will lose some precision here
30 | 
31 | 
32 | class AnchorsGenerator(nn.Module):
33 |     """
34 |     anchor generator for feature maps according to anchor sizes and ratios
35 |     :param sizes: anchor sizes, in tuple[int]
36 |     :param aspect_ratios: anchor ratios, in tuple[float]
37 |     :return:
38 |     """
39 | 
40 |     def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
41 |         super(AnchorsGenerator, self).__init__()
42 | 
43 |         # assert len(sizes) == len(aspect_ratios), 'anchor sizes must equal to anchor ratios!'
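        # editor's note: set_cell_anchors() below zips sizes with aspect_ratios one
        # feature-map level at a time, so callers normally pass nested tuples, e.g.
        # (illustrative values, not taken from this repo's configs):
        #     sizes=((32,), (64,), (128,), (256,), (512,))
        #     aspect_ratios=((0.5, 1.0, 2.0),) * 5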
44 | 
45 |         self.sizes = sizes
46 |         self.aspect_ratios = aspect_ratios
47 |         self.cell_anchors = None
48 |         self._cache = {}
49 | 
50 |     def set_cell_anchors(self, dtype, device):
51 |         """
52 |         generate anchor templates
53 |         :param dtype: data type
54 |         :param device: data device
55 |         :return:
56 |         """
57 |         if self.cell_anchors is not None:
58 |             # anchor templates are already cached; only regenerate if the device changed
59 |             if self.cell_anchors[0].device == device:
60 |                 return
61 |         # generate anchor template
62 |         cell_anchors = [generate_anchors(sizes, aspect_ratios, dtype, device)
63 |                         for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios)]
64 |         self.cell_anchors = cell_anchors
65 | 
66 |     def num_anchors_per_location(self):
67 |         # calculate the number of anchors per feature map, for k in origin paper
68 |         return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
69 | 
70 |     def grid_anchors(self, feature_map_sizes, strides):
71 |         """
72 |         compute anchor coordinate list in origin image, mapped from feature map
73 |         :param feature_map_sizes: feature map sizes
74 |         :param strides: strides between origin image and feature map
75 |         :return:
76 |         """
77 | 
78 |         anchors = []
79 |         cell_anchors = self.cell_anchors  # anchor template
80 |         assert cell_anchors is not None
81 | 
82 |         # for every resolution feature map, like fpn
83 |         for size, stride, base_anchors in zip(feature_map_sizes, strides, cell_anchors):
84 |             f_p_height, f_p_width = size
85 |             stride_height, stride_width = stride
86 |             device = base_anchors.device
87 | 
88 |             # For output anchor, compute [x_center, y_center, x_center, y_center...]
89 |             # x_center in origin image
90 |             shifts_x = torch.arange(0, f_p_width, dtype=torch.float32, device=device) * stride_width
91 | 
92 |             # y_center in origin image
93 |             shifts_y = torch.arange(0, f_p_height, dtype=torch.float32, device=device) * stride_height
94 | 
95 |             # torch.meshgrid will output grid
96 |             # shape: [grid_height, grid_width]
97 |             shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
98 |             shift_x = shift_x.reshape(-1)
99 |             shift_y = shift_y.reshape(-1)
100 | 
101 |             shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
102 | 
103 |             # For every (base anchor, output anchor) pair,
104 |             # offset each zero-centered base anchor by the center of the output anchor
105 |             shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
106 |             anchors.append(shifts_anchor.reshape(-1, 4))
107 | 
108 |         return anchors  # List[Tensor(all_num_anchors, 4)]
109 | 
110 |     def cached_grid_anchors(self, feature_map_size, strides):
111 |         """
112 |         cache all anchor information
113 |         :param feature_map_size: feature map size after backbone feature extractor
114 |         :param strides: strides between origin image size and feature map size
115 |         :return:
116 |         """
117 | 
118 |         key = str(feature_map_size) + str(strides)
119 |         # self._cache is a dictionary type
120 |         if key in self._cache:
121 |             return self._cache[key]
122 |         anchors = self.grid_anchors(feature_map_size, strides)
123 |         self._cache[key] = anchors
124 |         return anchors
125 | 
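    # editor's sketch of the grid_anchors arithmetic above (numbers assumed for
    # illustration): for a 2x2 feature map with stride (16, 16) and one base anchor
    # [-32, -32, 32, 32], shifts_x = shifts_y = [0, 16], giving grid centers
    # (0, 0), (16, 0), (0, 16), (16, 16); the result is a [4, 4] tensor where, e.g.,
    # the anchor centered at (16, 16) is [-16, -16, 48, 48].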
126 |     def forward(self, image_list, feature_maps):
127 |         """
128 |         generate anchors for every image in the batch, according to feature map sizes
129 |         :param image_list: ImageList, batched images and their original sizes
130 |         :param feature_maps: feature maps output by the backbone
131 |         :return: List[Tensor], anchors for every image in the batch
132 |         """
133 | 
134 |         feature_map_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
135 | 
136 |         # get input image sizes
137 |         image_size = image_list.tensors.shape[-2:]
138 | 
139 |         # get dtype and device
140 |         dtype, device = feature_maps[0].dtype, feature_maps[0].device
141 | 
142 |         # compute map stride between feature_maps and input images
143 |         strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
144 |                     torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in feature_map_sizes]
145 | 
146 |         # get anchors template according size and aspect_ratios
147 |         self.set_cell_anchors(dtype, device)
148 | 
149 |         # get anchor coordinate list in origin image, according to map
150 |         anchors_over_all_feature_maps = self.cached_grid_anchors(feature_map_sizes, strides)
151 | 
152 |         anchors = []
153 |         # for every image and feature map in a batch
154 |         for i, (_, _) in enumerate(image_list.image_sizes):
155 |             anchors_in_image = []
156 |             # for every resolution feature map like fpn
157 |             for anchors_per_feature_map in anchors_over_all_feature_maps:
158 |                 anchors_in_image.append(anchors_per_feature_map)
159 |             anchors.append(anchors_in_image)
160 | 
161 |         # concat every resolution anchors, like fpn
162 |         anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
163 | 
164 |         self._cache.clear()
165 |         return anchors
166 | 
--------------------------------------------------------------------------------
/utils/boxes_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | def nms(boxes, scores, iou_threshold):
5 |     """
6 |     Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union (IoU).
7 | 
8 |     NMS iteratively removes lower scoring boxes which have an IoU greater than iou_threshold with another (higher scoring)
9 |     box.
10 |     :param boxes: Tensor[N, 4], boxes to perform NMS on. They are expected to be in (x1, y1, x2, y2) format
11 |     :param scores: Tensor[N], scores for each one of the boxes
12 |     :param iou_threshold: float, discards all overlapping boxes with IoU > iou_threshold
13 |     :return: int64 tensor with the indices of the elements that have been kept by NMS, sorted in decreasing order of scores
14 |     """
15 | 
16 |     return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
17 | 
18 | 
19 | def batched_nms(boxes, scores, idxs, iou_threshold):
20 |     """
21 |     Performs non-maximum suppression in a batched fashion.
22 |     Each index value corresponds to a category, and NMS
23 |     will not be applied between elements of different categories
24 |     :param boxes: Tensor[N, 4], boxes where NMS will be performed. They are expected to be in (x1, y1, x2, y2) format
25 |     :param scores: Tensor[N], scores for each one of the boxes
26 |     :param idxs: Tensor[N], indices of the categories for each one of the boxes.
27 |     :param iou_threshold: float, discards all overlapping boxes with IoU > iou_threshold
28 |     :return: int64 tensor with the indices of the elements that have been kept by NMS, sorted
29 |     in decreasing order of scores
30 |     """
31 | 
32 |     if boxes.numel() == 0:
33 |         return torch.empty((0,), dtype=torch.int64, device=boxes.device)
34 | 
35 |     # strategy: in order to perform NMS independently per class,
36 |     # we add an offset to all the boxes. The offset is dependent
37 |     # only on the class idx, and is large enough so that boxes
38 |     # from different classes do not overlap
39 |     max_coordinate = boxes.max()
40 | 
41 |     # to(): Performs Tensor dtype and/or device conversion
42 |     offsets = idxs.to(boxes) * (max_coordinate + 1)
43 |     boxes_for_nms = boxes + offsets[:, None]
44 |     keep = nms(boxes_for_nms, scores, iou_threshold)
45 |     return keep
46 | 
47 | 
48 | def remove_small_boxes(boxes, min_size):
49 |     """
50 |     Remove boxes which contain at least one side smaller than min_size.
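    Example (editor's illustration): boxes = [[0, 0, 10, 10], [0, 0, 2, 8]] with
    min_size=5 keeps only index 0, since the second box is only 2 pixels wide.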
51 | :param boxes: boxes in (x1, y1, x2, y2) format 52 | :param min_size: minimum size 53 | :return: indices of the boxes that have both sides 54 | larger than min_size 55 | """ 56 | 57 | ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1] 58 | keep = (ws >= min_size) & (hs >= min_size) 59 | # nonzero(): Returns a tensor containing the indices of all non-zero elements of input 60 | keep = keep.nonzero().squeeze(1) 61 | return keep 62 | 63 | 64 | def clip_boxes_to_image(boxes, size): 65 | """ 66 | Clip boxes so that they lie inside an image of size `size`. 67 | :param boxes: boxes in (x1, y1, x2, y2) format 68 | :param size: size of the image 69 | :return: clipped_boxes (Tensor[N, 4]) 70 | """ 71 | 72 | dim = boxes.dim() 73 | boxes_x = boxes[..., 0::2] # x1, x2 74 | boxes_y = boxes[..., 1::2] # y1, y2 75 | height, width = size 76 | 77 | boxes_x = boxes_x.clamp(min=0, max=width) 78 | boxes_y = boxes_y.clamp(min=0, max=height) 79 | 80 | clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim) 81 | return clipped_boxes.reshape(boxes.shape) 82 | 83 | 84 | def box_area(boxes): 85 | """ 86 | Computes the area of a set of bounding boxes, which are specified by its 87 | (x1, y1, x2, y2) coordinates. 88 | :param boxes: boxes for which the area will be computed. They 89 | are expected to be in (x1, y1, x2, y2) format 90 | :return: area for each box 91 | """ 92 | 93 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 94 | 95 | 96 | def box_iou(boxes1, boxes2): 97 | """ 98 | Calculate intersection-over-union (Jaccard index) of boxes. 99 | Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 100 | :param boxes1: boxes1 (Tensor[N, 4]) 101 | :param boxes2: boxes2 (Tensor[M, 4]) 102 | :return: iou (Tensor[N, M]): the NxM matrix containing the pairwise 103 | IoU values for every element in boxes1 and boxes2 104 | """ 105 | 106 | area1 = box_area(boxes1) 107 | area2 = box_area(boxes2) 108 | 109 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # left-top [N,M,2] 110 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # right-bottom [N,M,2] 111 | 112 | wh = (rb - lt).clamp(min=0) # [N,M,2] 113 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 114 | 115 | iou = inter / (area1[:, None] + area2 - inter) 116 | return iou 117 | 118 | 119 | def permute_and_flatten(layer, N, A, C, H, W): 120 | """ 121 | adjust tensor order,and reshape 122 | :param layer: classification or bboxes parameters 123 | :param N: batch_size 124 | :param A: anchors_num_per_position 125 | :param C: classes_num or bbox coordinate 126 | :param H: height 127 | :param W: width 128 | :return: Tensor after adjusting order and reshaping 129 | """ 130 | 131 | # [batch_size, anchors_num_per_position * (C or 4), height, width] 132 | layer = layer.view(N, -1, C, H, W) 133 | layer = layer.permute(0, 3, 4, 1, 2) # [N, H, W, -1, C] 134 | layer = layer.reshape(N, -1, C) 135 | return layer 136 | 137 | 138 | def concat_box_prediction_layers(box_cls, box_regression): 139 | """ 140 | Adjust box classification and bbox regression parameters order and reshape 141 | :param box_cls: target prediction score 142 | :param box_regression: bbox regression parameters 143 | :return: [N, -1, C] 144 | """ 145 | 146 | box_cls_flattened = [] 147 | box_regression_flattened = [] 148 | 149 | for box_cls_per_level, box_regression_per_level in zip(box_cls, box_regression): 150 | # [batch_size, anchors_num_per_position * classes_num, height, width], class_num is equal 2 151 | N, AxC, H, W = box_cls_per_level.shape 152 | # [batch_size, 
anchors_num_per_position * 4, height, width] 153 | Ax4 = box_regression_per_level.shape[1] 154 | # anchors_num_per_position 155 | A = Ax4 // 4 156 | # classes_num 157 | C = AxC // A 158 | 159 | # [N, -1, C] 160 | box_cls_per_level = permute_and_flatten(box_cls_per_level, N, A, C, H, W) 161 | box_cls_flattened.append(box_cls_per_level) 162 | 163 | # [N, -1, C] 164 | box_regression_per_level = permute_and_flatten(box_regression_per_level, N, A, 4, H, W) 165 | box_regression_flattened.append(box_regression_per_level) 166 | 167 | box_cls = torch.cat(box_cls_flattened, dim=1).flatten(0, -2) # start_dim, end_dim 168 | box_regression = torch.cat(box_regression_flattened, dim=1).reshape(-1, 4) 169 | return box_cls, box_regression 170 | 171 | -------------------------------------------------------------------------------- /utils/coco_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | import pycocotools.mask as mask_util 7 | import torch 8 | import torch.utils.data 9 | import torchvision 10 | from pycocotools.coco import COCO 11 | from pycocotools.cocoeval import COCOeval 12 | 13 | from utils.train_utils import all_gather 14 | 15 | 16 | def convert_to_coco_api(ds): 17 | coco_ds = COCO() 18 | ann_id = 1 19 | dataset = {'images': [], 'categories': [], 'annotations': []} 20 | categories = set() 21 | for img_idx in range(len(ds)): 22 | # find better way to get target 23 | img, targets = ds[img_idx] 24 | image_id = targets["image_id"].item() 25 | img_dict = {'id': image_id, 'height': img.shape[-2], 'width': img.shape[-1]} 26 | dataset['images'].append(img_dict) 27 | bboxes = targets["boxes"] 28 | bboxes[:, 2:] -= bboxes[:, :2] 29 | bboxes = bboxes.tolist() 30 | labels = targets['labels'].tolist() 31 | areas = targets['area'].tolist() 32 | iscrowd = targets['iscrowd'].tolist() 33 | num_objs = len(bboxes) 34 | for i in range(num_objs): 35 | ann = {'image_id': image_id, 'bbox': bboxes[i], 'category_id': labels[i]} 36 | categories.add(labels[i]) 37 | ann['area'] = areas[i] 38 | ann['iscrowd'] = iscrowd[i] 39 | ann['id'] = ann_id 40 | dataset['annotations'].append(ann) 41 | ann_id += 1 42 | dataset['categories'] = [{'id': i} for i in sorted(categories)] 43 | coco_ds.dataset = dataset 44 | coco_ds.createIndex() 45 | return coco_ds 46 | 47 | 48 | def get_coco_api_from_dataset(dataset): 49 | for _ in range(10): 50 | if isinstance(dataset, torchvision.datasets.CocoDetection): 51 | break 52 | if isinstance(dataset, torch.utils.data.Subset): 53 | dataset = dataset.dataset 54 | if isinstance(dataset, torchvision.datasets.CocoDetection): 55 | return dataset.coco 56 | return convert_to_coco_api(dataset) 57 | 58 | 59 | def prepare_for_coco_detection(predictions): 60 | coco_results = [] 61 | for original_id, prediction in predictions.items(): 62 | if len(prediction) == 0: 63 | continue 64 | 65 | boxes = prediction["boxes"] 66 | boxes = convert_to_xywh(boxes).tolist() 67 | scores = prediction["scores"].tolist() 68 | labels = prediction["labels"].tolist() 69 | 70 | coco_results.extend( 71 | [ 72 | { 73 | "image_id": original_id, 74 | "category_id": labels[k], 75 | "bbox": box, 76 | "score": scores[k], 77 | } 78 | for k, box in enumerate(boxes) 79 | ] 80 | ) 81 | return coco_results 82 | 83 | 84 | def prepare(predictions, iou_type): 85 | return prepare_for_coco_detection(predictions) 86 | 87 | 88 | class CocoEvaluator(object): 89 | def __init__(self, coco_gt, iou_types): 90 | assert 
isinstance(iou_types, (list, tuple)) 91 | coco_gt = copy.deepcopy(coco_gt) 92 | self.coco_gt = coco_gt 93 | 94 | self.iou_types = iou_types 95 | self.coco_eval = {} 96 | for iou_type in iou_types: 97 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) 98 | 99 | self.img_ids = [] 100 | self.eval_imgs = {k: [] for k in iou_types} 101 | 102 | def update(self, predictions): 103 | img_ids = list(np.unique(list(predictions.keys()))) 104 | self.img_ids.extend(img_ids) 105 | 106 | for iou_type in self.iou_types: 107 | results = prepare(predictions, iou_type) 108 | coco_dt = loadRes(self.coco_gt, results) if results else COCO() 109 | coco_eval = self.coco_eval[iou_type] 110 | 111 | coco_eval.cocoDt = coco_dt 112 | coco_eval.params.imgIds = list(img_ids) 113 | img_ids, eval_imgs = evaluate(coco_eval) 114 | 115 | self.eval_imgs[iou_type].append(eval_imgs) 116 | 117 | def synchronize_between_processes(self): 118 | for iou_type in self.iou_types: 119 | self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) 120 | create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) 121 | 122 | def accumulate(self): 123 | for coco_eval in self.coco_eval.values(): 124 | coco_eval.accumulate() 125 | 126 | def summarize(self): 127 | for iou_type, coco_eval in self.coco_eval.items(): 128 | print("IoU metric: {}".format(iou_type)) 129 | coco_eval.summarize() 130 | 131 | 132 | def convert_to_xywh(boxes): 133 | xmin, ymin, xmax, ymax = boxes.unbind(1) 134 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) 135 | 136 | 137 | def merge(img_ids, eval_imgs): 138 | all_img_ids = all_gather(img_ids) 139 | all_eval_imgs = all_gather(eval_imgs) 140 | 141 | merged_img_ids = [] 142 | for p in all_img_ids: 143 | merged_img_ids.extend(p) 144 | 145 | merged_eval_imgs = [] 146 | for p in all_eval_imgs: 147 | merged_eval_imgs.append(p) 148 | 149 | merged_img_ids = np.array(merged_img_ids) 150 | merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) 151 | 152 | # keep only unique (and in sorted order) images 153 | merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) 154 | merged_eval_imgs = merged_eval_imgs[..., idx] 155 | 156 | return merged_img_ids, merged_eval_imgs 157 | 158 | 159 | def create_common_coco_eval(coco_eval, img_ids, eval_imgs): 160 | img_ids, eval_imgs = merge(img_ids, eval_imgs) 161 | img_ids = list(img_ids) 162 | eval_imgs = list(eval_imgs.flatten()) 163 | 164 | coco_eval.evalImgs = eval_imgs 165 | coco_eval.params.imgIds = img_ids 166 | coco_eval._paramsEval = copy.deepcopy(coco_eval.params) 167 | 168 | 169 | def createIndex(self): 170 | anns, cats, imgs = {}, {}, {} 171 | imgToAnns, catToImgs = defaultdict(list), defaultdict(list) 172 | if 'annotations' in self.dataset: 173 | for ann in self.dataset['annotations']: 174 | imgToAnns[ann['image_id']].append(ann) 175 | anns[ann['id']] = ann 176 | 177 | if 'images' in self.dataset: 178 | for img in self.dataset['images']: 179 | imgs[img['id']] = img 180 | 181 | if 'categories' in self.dataset: 182 | for cat in self.dataset['categories']: 183 | cats[cat['id']] = cat 184 | 185 | if 'annotations' in self.dataset and 'categories' in self.dataset: 186 | for ann in self.dataset['annotations']: 187 | catToImgs[ann['category_id']].append(ann['image_id']) 188 | 189 | # create class members 190 | self.anns = anns 191 | self.imgToAnns = imgToAnns 192 | self.catToImgs = catToImgs 193 | self.imgs = imgs 194 | self.cats = cats 195 | 196 | 197 | maskUtils = mask_util 198 | 199 | 200 | def 
loadRes(self, resFile): 201 | """ 202 | Load result file and return a result api object. 203 | :return: res (obj) : result api object 204 | """ 205 | res = COCO() 206 | res.dataset['images'] = [img for img in self.dataset['images']] 207 | 208 | if isinstance(resFile, torch._six.string_classes): 209 | anns = json.load(open(resFile)) 210 | elif type(resFile) == np.ndarray: 211 | anns = self.loadNumpyAnnotations(resFile) 212 | else: 213 | anns = resFile 214 | assert type(anns) == list, 'results in not an array of objects' 215 | annsImgIds = [ann['image_id'] for ann in anns] 216 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 217 | 'Results do not correspond to current coco set' 218 | if 'caption' in anns[0]: 219 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 220 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 221 | for id, ann in enumerate(anns): 222 | ann['id'] = id + 1 223 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 224 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 225 | for id, ann in enumerate(anns): 226 | bb = ann['bbox'] 227 | x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] 228 | if 'segmentation' not in ann: 229 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 230 | ann['area'] = bb[2] * bb[3] 231 | ann['id'] = id + 1 232 | ann['iscrowd'] = 0 233 | elif 'segmentation' in anns[0]: 234 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 235 | for id, ann in enumerate(anns): 236 | # now only support compressed RLE format as segmentation results 237 | ann['area'] = maskUtils.area(ann['segmentation']) 238 | if 'bbox' not in ann: 239 | ann['bbox'] = maskUtils.toBbox(ann['segmentation']) 240 | ann['id'] = id + 1 241 | ann['iscrowd'] = 0 242 | elif 'keypoints' in anns[0]: 243 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 244 | for id, ann in enumerate(anns): 245 | s = ann['keypoints'] 246 | x = s[0::3] 247 | y = s[1::3] 248 | x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y) 249 | ann['area'] = (x2 - x1) * (y2 - y1) 250 | ann['id'] = id + 1 251 | ann['bbox'] = [x1, y1, x2 - x1, y2 - y1] 252 | 253 | res.dataset['annotations'] = anns 254 | createIndex(res) 255 | return res 256 | 257 | 258 | def evaluate(self): 259 | ''' 260 | Run per image evaluation on given images and store results (a list of dict) in self.evalImgs 261 | :return: None 262 | ''' 263 | p = self.params 264 | # add backward compatibility if useSegm is specified in params 265 | if p.useSegm is not None: 266 | p.iouType = 'segm' if p.useSegm == 1 else 'bbox' 267 | print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType)) 268 | p.imgIds = list(np.unique(p.imgIds)) 269 | if p.useCats: 270 | p.catIds = list(np.unique(p.catIds)) 271 | p.maxDets = sorted(p.maxDets) 272 | self.params = p 273 | 274 | self._prepare() 275 | # loop through images, area range, max detection number 276 | catIds = p.catIds if p.useCats else [-1] 277 | 278 | if p.iouType == 'segm' or p.iouType == 'bbox': 279 | computeIoU = self.computeIoU 280 | elif p.iouType == 'keypoints': 281 | computeIoU = self.computeOks 282 | self.ious = { 283 | (imgId, catId): computeIoU(imgId, catId) 284 | for imgId in p.imgIds 285 | for catId in catIds} 286 | 287 | evaluateImg = self.evaluateImg 288 | maxDet = p.maxDets[-1] 289 | evalImgs = [ 290 | evaluateImg(imgId, catId, areaRng, maxDet) 291 | for catId in catIds 292 | for areaRng in p.areaRng 293 | for imgId in p.imgIds 294 | ] 295 | evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) 296 | self._paramsEval = copy.deepcopy(self.params) 297 | return p.imgIds, evalImgs 298 | -------------------------------------------------------------------------------- /utils/det_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | 5 | class BalancedPositiveNegativeSampler(object): 6 | """ 7 | This class samples batches, ensuring that they contain a fixed proportion of positives 8 | :param batch_size_per_image: number of elements to be selected per image 9 | :param positive_fraction: percentage of positive elements per batch 10 | """ 11 | 12 | def __init__(self, batch_size_per_image, positive_fraction): 13 | self.batch_size_per_image = batch_size_per_image 14 | self.positive_fraction = positive_fraction 15 | 16 | def __call__(self, matched_idxs): 17 | """ 18 | Returns two lists of binary masks for each image. 19 | The first list contains the positive elements that were selected, 20 | and the second list the negative example. 21 | :param matched_idxs: list of tensors containing -1, 0 or positive values. 22 | Each tensor corresponds to a specific image. 23 | -1 values are ignored, 0 are considered as negatives and > 0 as 24 | positives. 25 | :return: pos_idx (list[tensor]) 26 | neg_idx (list[tensor]) 27 | """ 28 | 29 | pos_idx = [] 30 | neg_idx = [] 31 | for matched_idxs_per_image in matched_idxs: 32 | # positive sample if index >= 1 33 | positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) 34 | # negative sample if index == 0 35 | negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) 36 | 37 | # number of positive samples 38 | num_pos = int(self.batch_size_per_image * self.positive_fraction) 39 | # protect against not enough positive examples, used all positive samples 40 | num_pos = min(positive.numel(), num_pos) 41 | 42 | # number of negative samples 43 | num_neg = self.batch_size_per_image - num_pos 44 | # protect against not enough negative examples, used all negative samples 45 | num_neg = min(negative.numel(), num_neg) 46 | 47 | # randomly select positive and negative examples 48 | # Returns a random permutation of integers from 0 to n - 1. 
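            # worked example (editor's illustration, using this repo's RPN defaults of
            # batch_size_per_image=256 and positive_fraction=0.5):
            # num_pos = min(positive.numel(), 128), num_neg = min(negative.numel(), 256 - num_pos);
            # randperm then draws that many indices from each pool without replacement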
49 | perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] 50 | perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] 51 | 52 | pos_idx_per_image = positive[perm1] 53 | neg_idx_per_image = negative[perm2] 54 | 55 | # create binary mask from indices 56 | pos_idx_per_image_mask = torch.zeros_like( 57 | matched_idxs_per_image, dtype=torch.uint8 58 | ) 59 | neg_idx_per_image_mask = torch.zeros_like( 60 | matched_idxs_per_image, dtype=torch.uint8 61 | ) 62 | 63 | pos_idx_per_image_mask[pos_idx_per_image] = 1 64 | neg_idx_per_image_mask[neg_idx_per_image] = 1 65 | 66 | pos_idx.append(pos_idx_per_image_mask) 67 | neg_idx.append(neg_idx_per_image_mask) 68 | 69 | return pos_idx, neg_idx 70 | 71 | 72 | def encode_boxes(reference_boxes, proposals, weights): 73 | """ 74 | Encode a set of proposals with respect to some reference boxes 75 | :param reference_boxes: reference boxes(gt) 76 | :param proposals: boxes to be encoded(anchors) 77 | :param weights: 78 | :return: 79 | """ 80 | 81 | wx = weights[0] 82 | wy = weights[1] 83 | ww = weights[2] 84 | wh = weights[3] 85 | 86 | # Returns a new tensor with a dimension of size one inserted at the specified position. 87 | proposals_x1 = proposals[:, 0].unsqueeze(1) 88 | proposals_y1 = proposals[:, 1].unsqueeze(1) 89 | proposals_x2 = proposals[:, 2].unsqueeze(1) 90 | proposals_y2 = proposals[:, 3].unsqueeze(1) 91 | 92 | reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1) 93 | reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1) 94 | reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1) 95 | reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1) 96 | 97 | # implementation starts here 98 | # parse widths and heights 99 | ex_widths = proposals_x2 - proposals_x1 100 | ex_heights = proposals_y2 - proposals_y1 101 | 102 | # center point 103 | ex_ctr_x = proposals_x1 + 0.5 * ex_widths 104 | ex_ctr_y = proposals_y1 + 0.5 * ex_heights 105 | 106 | gt_widths = reference_boxes_x2 - reference_boxes_x1 107 | gt_heights = reference_boxes_y2 - reference_boxes_y1 108 | gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths 109 | gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights 110 | 111 | targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths 112 | targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights 113 | targets_dw = ww * torch.log(gt_widths / ex_widths) 114 | targets_dh = wh * torch.log(gt_heights / ex_heights) 115 | 116 | targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) 117 | return targets 118 | 119 | 120 | class BoxCoder(object): 121 | """ 122 | This class encodes and decodes a set of bounding boxes into 123 | the representation used for training the regressors. 124 | :param weights: 4-element tuple, represented calculation weights of x, y, h, w 125 | :param bbox_xform_clip: float, represented maximum of height and width 126 | """ 127 | 128 | def __init__(self, weights, bbox_xform_clip=math.log(1000. 
/ 16)): 129 | self.weights = weights 130 | self.bbox_xform_clip = bbox_xform_clip 131 | 132 | def encode(self, reference_boxes, proposals): 133 | """ 134 | This class is inserted to calculate parameters of regression 135 | :param reference_boxes: gt bbox 136 | :param proposals: anchors bbox 137 | :return: regression parameters 138 | """ 139 | 140 | boxes_per_image = [len(b) for b in reference_boxes] 141 | reference_boxes = torch.cat(reference_boxes, dim=0) 142 | proposals = torch.cat(proposals, dim=0) 143 | 144 | # targets_dx, targets_dy, targets_dw, targets_dh 145 | targets = self.encode_single(reference_boxes, proposals) 146 | return targets.split(boxes_per_image, 0) 147 | 148 | def encode_single(self, reference_boxes, proposals): 149 | """ 150 | Encode a set of proposals with respect to some reference boxes 151 | :param reference_boxes: reference boxes 152 | :param proposals: boxes to be encoded 153 | :return: 154 | """ 155 | 156 | dtype = reference_boxes.dtype 157 | device = reference_boxes.device 158 | weights = torch.as_tensor(self.weights, dtype=dtype, device=device) 159 | targets = encode_boxes(reference_boxes, proposals, weights) 160 | 161 | return targets 162 | 163 | def decode(self, rel_codes, boxes): 164 | """ 165 | decode regression parameters 166 | :param rel_codes: bbox regression parameters 167 | :param boxes: anchors 168 | :return: 169 | """ 170 | 171 | assert isinstance(boxes, (list, tuple)) 172 | assert isinstance(rel_codes, torch.Tensor) 173 | 174 | boxes_per_image = [b.size(0) for b in boxes] 175 | concat_boxes = torch.cat(boxes, dim=0) 176 | 177 | box_sum = 0 178 | for val in boxes_per_image: 179 | box_sum += val 180 | # map regression parameters into anchors to get coordinate 181 | pred_boxes = self.decode_single( 182 | rel_codes.reshape(box_sum, -1), concat_boxes 183 | ) 184 | return pred_boxes.reshape(box_sum, -1, 4) 185 | 186 | def decode_single(self, rel_codes, boxes): 187 | """ 188 | From a set of original boxes and encoded relative box offsets, get the decoded boxes. 189 | :param rel_codes: encoded boxes (bbox regression parameters) 190 | :param boxes: reference boxes (anchors) 191 | :return: 192 | """ 193 | boxes = boxes.to(rel_codes.dtype) 194 | 195 | # xmin, ymin, xmax, ymax 196 | widths = boxes[:, 2] - boxes[:, 0] # anchor width 197 | heights = boxes[:, 3] - boxes[:, 1] # anchor height 198 | ctr_x = boxes[:, 0] + 0.5 * widths # anchor center x coordinate 199 | ctr_y = boxes[:, 1] + 0.5 * heights # anchor center y coordinate 200 | 201 | wx, wy, ww, wh = self.weights # default is 1 202 | dx = rel_codes[:, 0::4] / wx # predicated anchors center x regression parameters 203 | dy = rel_codes[:, 1::4] / wy # predicated anchors center y regression parameters 204 | dw = rel_codes[:, 2::4] / ww # predicated anchors width regression parameters 205 | dh = rel_codes[:, 3::4] / wh # predicated anchors height regression parameters 206 | 207 | # limit max value, prevent sending too large values into torch.exp() 208 | # self.bbox_xform_clip=math.log(1000. 
/ 16) 209 | dw = torch.clamp(dw, max=self.bbox_xform_clip) 210 | dh = torch.clamp(dh, max=self.bbox_xform_clip) 211 | 212 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] 213 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] 214 | pred_w = torch.exp(dw) * widths[:, None] 215 | pred_h = torch.exp(dh) * heights[:, None] 216 | 217 | # xmin 218 | pred_boxes1 = pred_ctr_x - torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w 219 | # ymin 220 | pred_boxes2 = pred_ctr_y - torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h 221 | # xmax 222 | pred_boxes3 = pred_ctr_x + torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w 223 | # ymax 224 | pred_boxes4 = pred_ctr_y + torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h 225 | pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1) 226 | return pred_boxes 227 | 228 | 229 | def set_low_quality_matches_(matches, all_matches, match_quality_matrix): 230 | """ 231 | Produce additional matches for predictions that have only low-quality matches. 232 | Specifically, for each ground-truth find the set of predictions that have 233 | maximum overlap with it (including ties); for each prediction in that set, if 234 | it is unmatched, then match it to the ground-truth with which it has the highest 235 | quality value. 236 | """ 237 | # For each gt, find the prediction with which it has highest quality 238 | highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) # the dimension to reduce. 239 | 240 | # Find highest quality match available, even if it is low, including ties 241 | gt_pred_pairs_of_highest_quality = torch.nonzero( 242 | match_quality_matrix == highest_quality_foreach_gt[:, None] 243 | ) 244 | # Example gt_pred_pairs_of_highest_quality: 245 | # tensor([[ 0, 39796], 246 | # [ 1, 32055], 247 | # [ 1, 32070], 248 | # [ 2, 39190], 249 | # [ 2, 40255], 250 | # [ 3, 40390], 251 | # [ 3, 41455], 252 | # [ 4, 45470], 253 | # [ 5, 45325], 254 | # [ 5, 46390]]) 255 | # Each row is a (gt index, prediction index) 256 | # Note how gt items 1, 2, 3, and 5 each have two ties 257 | 258 | pre_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1] 259 | matches[pre_inds_to_update] = all_matches[pre_inds_to_update] 260 | 261 | 262 | class Matcher(object): 263 | BELOW_LOW_THRESHOLD = -1 264 | BETWEEN_THRESHOLDS = -2 265 | 266 | def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False): 267 | """ 268 | Args: 269 | high_threshold (float): quality values greater than or equal to 270 | this value are candidate matches. 271 | low_threshold (float): a lower quality threshold used to stratify 272 | matches into three levels: 273 | 1) matches >= high_threshold 274 | 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold) 275 | 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold) 276 | allow_low_quality_matches (bool): if True, produce additional matches 277 | for predictions that have only low-quality match candidates. See 278 | set_low_quality_matches_ for more details. 
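
        Typical values in this repo (editor's note): the RPN matcher uses high/low
        thresholds of 0.7 / 0.3 (rpn_fg_iou_thresh / rpn_bg_iou_thresh in
        faster_rcnn_utils.py), while the RoI-head matcher uses 0.5 / 0.5 with
        allow_low_quality_matches=False.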
279 |         """
280 |         self.BELOW_LOW_THRESHOLD = -1
281 |         self.BETWEEN_THRESHOLDS = -2
282 |         assert low_threshold <= high_threshold
283 |         self.high_threshold = high_threshold  # 0.7
284 |         self.low_threshold = low_threshold  # 0.3
285 |         self.allow_low_quality_matches = allow_low_quality_matches
286 | 
287 |     def __call__(self, match_quality_matrix):
288 |         """
289 |         calculate maximum iou between anchors and gt boxes, save index,
290 |         iou < low_threshold: -1
291 |         iou > high_threshold: 1
292 |         low_threshold <= iou < high_threshold: -2
293 |         :param match_quality_matrix: an M x N tensor, containing the pairwise quality
294 |                                      between M ground-truth boxes and N predicted boxes
295 |         :return: matches, an N tensor where matches[i] is the matched gt index in [0, M - 1],
296 |                  or a negative value if prediction i could not be matched
297 |         """
298 | 
299 |         # empty targets or proposals are not supported during training
300 |         if match_quality_matrix.numel() == 0:
301 |             if match_quality_matrix.shape[0] == 0:
302 |                 raise ValueError("No ground-truth boxes available for one of the images during training")
303 |             else:
304 |                 raise ValueError("No proposal boxes available for one of the images during training")
305 | 
306 |         # match_quality_matrix is M (gt) x N (predicted)
307 |         # find the best matched gt (maximum iou) for every prediction
308 |         matched_vals, matches = match_quality_matrix.max(dim=0)
309 | 
310 |         if self.allow_low_quality_matches:
311 |             # keep a copy of the raw matches for set_low_quality_matches_
312 |             all_matches = matches.clone()
313 |         else:
314 |             all_matches = None
315 | 
316 |         # assign candidate matches with low quality to negative (unassigned) values
317 |         # iou < low_threshold: background (-1)
318 |         below_low_threshold = matched_vals < self.low_threshold
319 | 
320 |         # low_threshold <= iou < high_threshold: ignored (-2)
321 |         between_thresholds = (matched_vals >= self.low_threshold) & (
322 |             matched_vals < self.high_threshold
323 |         )
324 |         matches[below_low_threshold] = self.BELOW_LOW_THRESHOLD  # -1
325 | 
326 |         matches[between_thresholds] = self.BETWEEN_THRESHOLDS  # -2
327 | 
328 |         if self.allow_low_quality_matches:
329 |             assert all_matches is not None
330 |             set_low_quality_matches_(matches, all_matches, match_quality_matrix)
331 | 
332 |         return matches
333 | 
334 | 
335 | def smooth_l1_loss(input, target, beta: float = 1. / 9, size_average: bool = True):
336 |     """
337 |     smooth_l1_loss for bbox regression
338 |     :param input: predicted regression parameters
339 |     :param target: ground-truth regression targets
340 |     :param beta: threshold at which to switch from quadratic to linear loss
341 |     :param size_average: if True return the mean loss, otherwise the sum
342 |     :return: smooth l1 loss
343 |     """
344 | 
345 |     n = torch.abs(input - target)
346 |     cond = n < beta
347 |     loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
348 |     if size_average:
349 |         return loss.mean()
350 |     return loss.sum()
351 | 
--------------------------------------------------------------------------------
/utils/draw_box_utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import PIL.ImageDraw as ImageDraw
3 | import PIL.ImageFont as ImageFont
4 | import numpy as np
5 | 
6 | STANDARD_COLORS = [
7 |     'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
8 |     'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
9 |     'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
10 |     'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
11 |     'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
12 |     'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
13 |     'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
14 |     'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
15 |     'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
16 |     'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
17 |     'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
18 |     'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
19 |     'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
20 |     'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
21 |     'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
22 |     'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
23 |     'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
24 |     'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
25 |     'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
26 |     'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
27 |     'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
28 |     'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
29 |     'WhiteSmoke', 'Yellow', 'YellowGreen'
30 | ]
31 | 
32 | 
33 | def filter_low_thresh(boxes, scores, classes, category_index,
thresh, box_to_display_str_map, box_to_color_map): 34 | for i in range(boxes.shape[0]): 35 | if scores[i] > thresh: 36 | box = tuple(boxes[i].tolist()) 37 | if classes[i] in category_index.keys(): 38 | class_name = category_index[classes[i]] 39 | else: 40 | class_name = 'N/A' 41 | display_str = str(class_name) 42 | display_str = '{}: {}%'.format(display_str, int(100 * scores[i])) 43 | box_to_display_str_map[box].append(display_str) 44 | box_to_color_map[box] = STANDARD_COLORS[ 45 | classes[i] % len(STANDARD_COLORS)] 46 | else: 47 | break 48 | 49 | 50 | def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color): 51 | try: 52 | font = ImageFont.truetype('arial.ttf', 24) 53 | except IOError: 54 | font = ImageFont.load_default() 55 | 56 | display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]] 57 | total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights) 58 | 59 | if top > total_display_str_height: 60 | text_bottom = top 61 | else: 62 | text_bottom = bottom + total_display_str_height 63 | # Reverse list and print from bottom to top. 64 | for display_str in box_to_display_str_map[box][::-1]: 65 | text_width, text_height = font.getsize(display_str) 66 | margin = np.ceil(0.05 * text_height) 67 | draw.rectangle([(left, text_bottom - text_height - 2 * margin), 68 | (left + text_width, text_bottom)], fill=color) 69 | draw.text((left + margin, text_bottom - text_height - margin), 70 | display_str, 71 | fill='black', 72 | font=font) 73 | text_bottom -= text_height - 2 * margin 74 | 75 | 76 | def draw_box(image, boxes, classes, scores, category_index, thresh=0.5, line_thickness=8): 77 | box_to_display_str_map = collections.defaultdict(list) 78 | box_to_color_map = collections.defaultdict(str) 79 | 80 | filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map) 81 | 82 | # Draw all boxes onto image. 
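    # editor's note: the "* 1" factors below are no-op placeholders; boxes are assumed
    # to already be absolute pixel coordinates (x1, y1, x2, y2). If boxes were
    # normalized to [0, 1], these factors would be the image width/height instead.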
83 | draw = ImageDraw.Draw(image) 84 | for box, color in box_to_color_map.items(): 85 | xmin, ymin, xmax, ymax = box 86 | (left, right, top, bottom) = (xmin * 1, xmax * 1, 87 | ymin * 1, ymax * 1) 88 | draw.line([(left, top), (left, bottom), (right, bottom), 89 | (right, top), (left, top)], width=line_thickness, fill=color) 90 | draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color) -------------------------------------------------------------------------------- /utils/evaluate_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | from utils.train_utils import MetricLogger 4 | from utils.coco_utils import get_coco_api_from_dataset, CocoEvaluator 5 | 6 | 7 | @torch.no_grad() 8 | def evaluate(model, data_loader, device, mAP_list=None): 9 | n_threads = torch.get_num_threads() 10 | torch.set_num_threads(1) 11 | cpu_device = torch.device("cpu") 12 | model.eval() 13 | metric_logger = MetricLogger(delimiter=" ") 14 | header = "Test: " 15 | 16 | coco = get_coco_api_from_dataset(data_loader.dataset) 17 | iou_types = ["bbox"] 18 | coco_evaluator = CocoEvaluator(coco, iou_types) 19 | 20 | for image, targets in metric_logger.log_every(data_loader, 100, header): 21 | image = list(img.to(device) for img in image) 22 | 23 | if device != torch.device("cpu"): 24 | torch.cuda.synchronize(device) 25 | 26 | model_time = time.time() 27 | outputs = model(image) 28 | 29 | outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] 30 | model_time = time.time() - model_time 31 | 32 | res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} 33 | 34 | evaluator_time = time.time() 35 | coco_evaluator.update(res) 36 | evaluator_time = time.time() - evaluator_time 37 | metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) 38 | 39 | # gather the stats from all processes 40 | metric_logger.synchronize_between_processes() 41 | print("Averaged stats:", metric_logger) 42 | coco_evaluator.synchronize_between_processes() 43 | 44 | # accumulate predictions from all images 45 | coco_evaluator.accumulate() 46 | coco_evaluator.summarize() 47 | torch.set_num_threads(n_threads) 48 | 49 | print_txt = coco_evaluator.coco_eval[iou_types[0]].stats 50 | coco_mAP = print_txt[0] 51 | voc_mAP = print_txt[1] 52 | if isinstance(mAP_list, list): 53 | mAP_list.append(voc_mAP) 54 | 55 | return coco_evaluator, voc_mAP 56 | 57 | -------------------------------------------------------------------------------- /utils/faster_rcnn_utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from collections import OrderedDict 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import Tensor 7 | from torch import nn 8 | from torch.jit.annotations import Tuple, List, Dict, Optional 9 | from torchvision.ops import MultiScaleRoIAlign 10 | 11 | from utils.anchor_utils import AnchorsGenerator 12 | from utils.roi_header_util import RoIHeads 13 | from utils.rpn_utils import RPNHead, RegionProposalNetwork 14 | from utils.transform_utils import GeneralizedRCNNTransform 15 | 16 | 17 | class FasterRCNNBase(nn.Module): 18 | """ 19 | Main class for Generalized R-CNN. 20 | 21 | Arguments: 22 | backbone (nn.Module): 23 | rpn (nn.Module): 24 | roi_heads (nn.Module): takes the features + the proposals from the RPN and computes 25 | detections / masks from it. 
26 | transform (nn.Module): performs the data transformation from the inputs to feed into 27 | the model 28 | """ 29 | 30 | def __init__(self, backbone, rpn, roi_heads, transform): 31 | super(FasterRCNNBase, self).__init__() 32 | self.transform = transform 33 | self.backbone = backbone 34 | self.rpn = rpn 35 | self.roi_heads = roi_heads 36 | 37 | @torch.jit.unused 38 | def eager_outputs(self, losses, detections): 39 | if self.training: 40 | return losses 41 | 42 | return detections 43 | 44 | def forward(self, images, targets=None): 45 | """ 46 | Arguments: 47 | images (list[Tensor]): images to be processed 48 | targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional) 49 | 50 | Returns: 51 | result (list[BoxList] or dict[Tensor]): the output from the model. 52 | During training, it returns a dict[Tensor] which contains the losses. 53 | During testing, it returns list[BoxList] contains additional fields 54 | like `scores`, `labels` and `mask` (for Mask R-CNN models). 55 | 56 | """ 57 | if self.training and targets is None: 58 | raise ValueError("In training mode, targets should be passed") 59 | 60 | if self.training: 61 | assert targets is not None 62 | for target in targets: 63 | boxes = target["boxes"] 64 | if isinstance(boxes, torch.Tensor): 65 | if len(boxes.shape) != 2 or boxes.shape[-1] != 4: 66 | raise ValueError("Expected target boxes to be a tensor" 67 | "of shape [N, 4], got {:}.".format( 68 | boxes.shape)) 69 | else: 70 | raise ValueError("Expected target boxes to be of type " 71 | "Tensor, got {:}.".format(type(boxes))) 72 | 73 | original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], []) 74 | for img in images: 75 | val = img.shape[-2:] 76 | assert len(val) == 2 77 | original_image_sizes.append((val[0], val[1])) 78 | 79 | images, targets = self.transform(images, targets) 80 | 81 | features = self.backbone(images.tensors) 82 | if isinstance(features, torch.Tensor): 83 | features = OrderedDict([('0', features)]) 84 | 85 | proposals, proposal_losses = self.rpn(images, features, targets) 86 | 87 | detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets) 88 | 89 | detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes) 90 | 91 | losses = {} 92 | losses.update(detector_losses) 93 | losses.update(proposal_losses) 94 | 95 | return self.eager_outputs(losses, detections) 96 | 97 | 98 | class TwoMLPHead(nn.Module): 99 | """ 100 | two fc layers after roi pooling/align 101 | :param in_channels: number of input channels 102 | :param representation_size: size of the intermediate representation 103 | """ 104 | 105 | def __init__(self, in_channels, representation_size): 106 | super(TwoMLPHead, self).__init__() 107 | 108 | self.fc6 = nn.Linear(in_channels, representation_size) 109 | self.fc7 = nn.Linear(representation_size, representation_size) 110 | 111 | def forward(self, x): 112 | x = x.flatten(start_dim=1) 113 | 114 | x = F.relu(self.fc6(x)) 115 | x = F.relu(self.fc7(x)) 116 | 117 | return x 118 | 119 | 120 | class FastRCNNPredictor(nn.Module): 121 | """ 122 | Standard classification + bounding box regression layers for Fast R-CNN. 
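    Shape sketch (editor's illustration, assuming this repo's COCO setup of 80 classes
    + background, num_classes=81, and representation_size=1024): an input x of shape
    [num_proposals, 1024] yields scores of shape [num_proposals, 81] and bbox_deltas
    of shape [num_proposals, 324] (= 81 * 4).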
123 | :param in_channels: number of input channels 124 | :param num_classes: number of output classes (including background) 125 | """ 126 | 127 | def __init__(self, in_channels, num_classes): 128 | super(FastRCNNPredictor, self).__init__() 129 | self.cls_score = nn.Linear(in_channels, num_classes) 130 | self.bbox_pred = nn.Linear(in_channels, num_classes * 4) 131 | 132 | def forward(self, x): 133 | if x.dim() == 4: 134 | assert list(x.shape[2:]) == [1, 1] 135 | x = x.flatten(start_dim=1) 136 | scores = self.cls_score(x) 137 | bbox_deltas = self.bbox_pred(x) 138 | 139 | return scores, bbox_deltas 140 | 141 | 142 | class FasterRCNN(FasterRCNNBase): 143 | """ 144 | Implementation of Faster R-CNN. 145 | 146 | The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each 147 | image, and should be in 0-1 range. Different images can have different sizes. 148 | 149 | The behavior of the model changes depending if it is in training or inference mode. 150 | 151 | During training, the model expects both the input tensors, as well as a targets (list of dictionary), 152 | containing: 153 | - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values 154 | between 0 and H and 0 and W 155 | - labels (Int64Tensor[N]): the class label for each ground-truth box 156 | 157 | The model returns a Dict[Tensor] during training, containing the classification and regression 158 | losses for both the RPN and the R-CNN. 159 | 160 | During inference, the model requires only the input tensors, and returns the post-processed 161 | predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as 162 | follows: 163 | - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between 164 | 0 and H and 0 and W 165 | - labels (Int64Tensor[N]): the predicted labels for each image 166 | - scores (Tensor[N]): the scores or each prediction 167 | 168 | :param backbone: (nn.Module), the network used to compute the features for the model. 169 | It should contain a out_channels attribute, which indicates the number of output 170 | channels that each feature map has (and it should be the same for all feature maps). 171 | The backbone should return a single Tensor or and OrderedDict[Tensor]. 172 | :param num_classes: (int), number of output classes of the model (including the background). 173 | If box_predictor is specified, num_classes should be None. 174 | :param min_size: (int), minimum size of the image to be rescaled before feeding it to the backbone 175 | :param max_size: (int), maximum size of the image to be rescaled before feeding it to the backbone 176 | :param image_mean: (Tuple[float, float, float]):, mean values used for input normalization. 177 | They are generally the mean values of the dataset on which the backbone has been trained 178 | on 179 | :param image_std: (Tuple[float, float, float]), std values used for input normalization. 180 | They are generally the std values of the dataset on which the backbone has been trained on 181 | :param rpn_anchor_generator: (AnchorGenerator), module that generates the anchors for a set of feature maps. 
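        In this repo it is an AnchorsGenerator from utils/anchor_utils.py (enforced by an
        isinstance assert in __init__); note that when rpn_head is None, __init__ calls
        rpn_anchor_generator.num_anchors_per_location(), so a generator must be supplied
        (editor's note).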
182 | :param rpn_head: (nn.Module), module that computes the objectness and regression deltas from the RPN 183 | :param rpn_pre_nms_top_n_train:(int), number of proposals to keep before applying NMS during training 184 | :param rpn_pre_nms_top_n_test: (int), number of proposals to keep before applying NMS during testing 185 | :param rpn_post_nms_top_n_train: (int), number of proposals to keep after applying NMS during training 186 | :param rpn_post_nms_top_n_test: (int), number of proposals to keep after applying NMS during testing 187 | :param rpn_nms_thresh: (float), NMS threshold used for postprocessing the RPN proposals 188 | :param rpn_fg_iou_thresh:(float), minimum IoU between the anchor and the GT box so that they can be 189 | considered as positive during training of the RPN. 190 | :param rpn_bg_iou_thresh:(float), maximum IoU between the anchor and the GT box so that they can be 191 | considered as negative during training of the RPN. 192 | :param rpn_batch_size_per_image: (int), number of anchors that are sampled during training of the RPN 193 | for computing the loss 194 | :param rpn_positive_fraction: (float), proportion of positive anchors in a mini-batch during training 195 | of the RPN 196 | :param box_roi_pool:(MultiScaleRoIAlign), the module which crops and resizes the feature maps in 197 | the locations indicated by the bounding boxes 198 | :param box_head:(nn.Module), module that takes the cropped feature maps as input 199 | :param box_predictor:(nn.Module), module that takes the output of box_head and returns the 200 | classification logits and box regression deltas. 201 | :param box_score_thresh:(float),during inference, only return proposals with a classification score 202 | greater than box_score_thresh 203 | :param box_nms_thresh: (float), NMS threshold for the prediction head. Used during inference 204 | :param box_detections_per_img: (int), maximum number of detections per image, for all classes. 
205 | :param box_fg_iou_thresh:(float): minimum IoU between the proposals and the GT box so that they can be 206 | considered as positive during training of the classification head 207 | :param box_bg_iou_thresh: (float), maximum IoU between the proposals and the GT box so that they can be 208 | considered as negative during training of the classification head 209 | :param box_batch_size_per_image: (int), number of proposals that are sampled during training of the 210 | classification head 211 | :param box_positive_fraction: (float), proportion of positive proposals in a mini-batch during training 212 | of the classification head 213 | :param bbox_reg_weights: (Tuple[float, float, float, float]), weights for the encoding/decoding of the 214 | bounding boxes 215 | """ 216 | 217 | def __init__(self, backbone, num_classes=None, 218 | # transform parameter 219 | min_size=300, max_size=800, # preprocess minimum and maximum size 220 | image_mean=None, image_std=None, # mean and std in preprocess 221 | 222 | # RPN parameters 223 | rpn_anchor_generator=None, rpn_head=None, 224 | rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, # kept proposals before nms 225 | rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, # kept proposals after nms 226 | rpn_nms_thresh=0.7, # iou threshold during nms 227 | rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3, # bg/fg threshold 228 | rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # number of samples and fraction 229 | 230 | # Box parameters 231 | box_roi_pool=None, box_head=None, box_predictor=None, 232 | 233 | # remove low threshold target 234 | box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100, 235 | box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5, 236 | box_batch_size_per_image=512, box_positive_fraction=0.25, 237 | bbox_reg_weights=None 238 | ): 239 | 240 | if not hasattr(backbone, "out_channels"): 241 | raise ValueError( 242 | "backbone should contain an attribute out_channels" 243 | "specifying the number of output channels (assumed to be the" 244 | "same for all the levels" 245 | ) 246 | 247 | assert isinstance(rpn_anchor_generator, (AnchorsGenerator, type(None))) 248 | assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None))) 249 | 250 | if num_classes is not None: 251 | if box_predictor is not None: 252 | raise ValueError("num_classes should be None when box_predictor " 253 | "is specified") 254 | else: 255 | if box_predictor is None: 256 | raise ValueError("num_classes should not be None when box_predictor " 257 | "is not specified") 258 | 259 | # output channels of the backbone 260 | out_channels = backbone.out_channels 261 | 262 | if rpn_head is None: 263 | rpn_head = RPNHead( 264 | out_channels, rpn_anchor_generator.num_anchors_per_location()[0] 265 | ) 266 | 267 | rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test) 268 | rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test) 269 | 270 | rpn = RegionProposalNetwork( 271 | rpn_anchor_generator, rpn_head, 272 | rpn_fg_iou_thresh, rpn_bg_iou_thresh, 273 | rpn_batch_size_per_image, rpn_positive_fraction, 274 | rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh) 275 | 276 | # two fc layer after roi pooling 277 | if box_head is None: 278 | resolution = box_roi_pool.output_size[0] 279 | representation_size = 1024 280 | box_head = TwoMLPHead( 281 | out_channels * resolution ** 2, 282 | representation_size 283 | ) 284 | 285 | # get prediction 286 | if box_predictor is None: 287 | 
representation_size = 1024 288 | box_predictor = FastRCNNPredictor( 289 | representation_size, 290 | num_classes) 291 | 292 | roi_heads = RoIHeads( 293 | # box 294 | box_roi_pool, box_head, box_predictor, 295 | box_fg_iou_thresh, box_bg_iou_thresh, 296 | box_batch_size_per_image, box_positive_fraction, 297 | bbox_reg_weights, 298 | box_score_thresh, box_nms_thresh, box_detections_per_img) 299 | 300 | transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std) 301 | 302 | super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform) 303 | -------------------------------------------------------------------------------- /utils/im_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from torch.jit.annotations import List, Tuple 4 | from torch import Tensor 5 | from torchvision.transforms import functional as F 6 | 7 | 8 | class Compose(object): 9 | 10 | def __init__(self, transforms): 11 | self.transforms = transforms 12 | 13 | def __call__(self, image, target): 14 | for t in self.transforms: 15 | image, target = t(image, target) 16 | return image, target 17 | 18 | 19 | class ToTensor(object): 20 | 21 | def __call__(self, image, target): 22 | image = F.to_tensor(image) 23 | return image, target 24 | 25 | 26 | class RandomHorizontalFlip(object): 27 | 28 | def __init__(self, prob=0.5): 29 | self.prob = prob 30 | 31 | def __call__(self, image, target): 32 | if random.random() < self.prob: 33 | height, width = image.shape[-2:] 34 | image = image.flip(-1) 35 | bbox = target["boxes"] 36 | # bbox: xmin, ymin, xmax, ymax 37 | bbox[:, [0, 2]] = width - bbox[:, [2, 0]] 38 | target["boxes"] = bbox 39 | return image, target 40 | 41 | 42 | @torch.jit.script 43 | class ImageList(object): 44 | """ 45 | Structure that holds a list of images (of possibly 46 | varying sizes) as a single tensor. 47 | This works by padding the images to the same size, 48 | and storing in a field the original sizes of each image 49 | """ 50 | 51 | def __init__(self, tensors, image_sizes): 52 | """ 53 | Arguments: 54 | tensors (tensor) padding后的图像数据 55 | image_sizes (list[tuple[int, int]]) padding前的图像尺寸 56 | """ 57 | self.tensors = tensors 58 | self.image_sizes = image_sizes 59 | 60 | def to(self, device): 61 | cast_tensor = self.tensors.to(device) 62 | return ImageList(cast_tensor, self.image_sizes) 63 | -------------------------------------------------------------------------------- /utils/plot_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_loss_and_lr(train_loss, learning_rate, save_dir): 6 | try: 7 | x = list(range(len(train_loss))) 8 | fig, ax1 = plt.subplots(1, 1) 9 | ax1.plot(x, train_loss, 'r', label='loss') 10 | ax1.set_xlabel("step") 11 | ax1.set_ylabel("loss") 12 | ax1.set_title("Train Loss and lr") 13 | plt.legend(loc='best') 14 | 15 | ax2 = ax1.twinx() 16 | ax2.plot(x, learning_rate, label='lr') 17 | ax2.set_ylabel("learning rate") 18 | ax2.set_xlim(0, len(train_loss)) 19 | plt.legend(loc='best') 20 | 21 | handles1, labels1 = ax1.get_legend_handles_labels() 22 | handles2, labels2 = ax2.get_legend_handles_labels() 23 | plt.legend(handles1 + handles2, labels1 + labels2, loc='upper right') 24 | 25 | fig.subplots_adjust(right=0.8) 26 | fig.savefig(os.path.join(save_dir, 'loss_and_lr.png')) 27 | plt.close() 28 | print("successful save loss curve! 
") 29 | except Exception as e: 30 | print(e) 31 | 32 | 33 | def plot_map(mAP, save_dir): 34 | try: 35 | x = list(range(len(mAP))) 36 | plt.plot(x, mAP, label='mAp') 37 | plt.xlabel('epoch') 38 | plt.ylabel('mAP') 39 | plt.title('Eval mAP') 40 | plt.xlim(0, len(mAP)) 41 | plt.legend(loc='best') 42 | plt.savefig(os.path.join(save_dir, 'mAP.png')) 43 | plt.close() 44 | print("successful save mAP curve!") 45 | except Exception as e: 46 | print(e) 47 | -------------------------------------------------------------------------------- /utils/roi_header_util.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from torch import Tensor 3 | from torch.jit.annotations import List, Dict, Tuple 4 | 5 | import utils.boxes_utils as box_op 6 | from utils.det_utils import * 7 | 8 | 9 | def fastrcnn_loss(class_logits, box_regression, labels, regression_targets): 10 | """ 11 | Computes the loss for Faster R-CNN. 12 | :param class_logits: predicted class, shape=[num_anchors, num_classes] 13 | :param box_regression: predicted bbox regression 14 | :param labels: true label 15 | :param regression_targets: true bbox 16 | :return: classification_loss (Tensor) 17 | box_loss (Tensor) 18 | """ 19 | 20 | labels = torch.cat(labels, dim=0) 21 | regression_targets = torch.cat(regression_targets, dim=0) 22 | 23 | classification_loss = F.cross_entropy(class_logits, labels) 24 | 25 | # get indices that correspond to the regression targets for 26 | # the corresponding ground truth labels, to be used with 27 | # advanced indexing 28 | sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1) 29 | 30 | labels_pos = labels[sampled_pos_inds_subset] 31 | 32 | # shape=[num_proposal, num_classes] 33 | N, num_classes = class_logits.shape 34 | box_regression = box_regression.reshape(N, -1, 4) 35 | 36 | box_loss = smooth_l1_loss(box_regression[sampled_pos_inds_subset, labels_pos], 37 | regression_targets[sampled_pos_inds_subset], 38 | beta=1 / 9, 39 | size_average=False, 40 | ) / labels.numel() 41 | 42 | return classification_loss, box_loss 43 | 44 | 45 | def add_gt_proposals(proposals, gt_boxes): 46 | """ 47 | concate gt_box and proposals 48 | :param proposals: bboxes of predicted by rpn 49 | :param gt_boxes: true bbox 50 | :return: 51 | """ 52 | 53 | proposals = [ 54 | torch.cat((proposal, gt_box)) 55 | for proposal, gt_box in zip(proposals, gt_boxes) 56 | ] 57 | return proposals 58 | 59 | 60 | def check_targets(targets): 61 | assert targets is not None 62 | assert all(["boxes" in t for t in targets]) 63 | assert all(["labels" in t for t in targets]) 64 | 65 | 66 | class RoIHeads(torch.nn.Module): 67 | def __init__(self, 68 | box_roi_pool, 69 | box_head, 70 | box_predictor, 71 | 72 | # Faster R-CNN training 73 | fg_iou_thresh, bg_iou_thresh, 74 | batch_size_per_image, positive_fraction, 75 | bbox_reg_weights, 76 | 77 | # Faster R-CNN inference 78 | score_thresh, 79 | nms_thresh, 80 | detection_per_img): 81 | super(RoIHeads, self).__init__() 82 | 83 | self.box_similarity = box_op.box_iou 84 | 85 | # assign ground-truth boxes for each proposal 86 | self.proposal_matcher = Matcher( 87 | fg_iou_thresh, # 0.5 88 | bg_iou_thresh, # 0.5 89 | allow_low_quality_matches=False) 90 | 91 | self.fg_bg_sampler = BalancedPositiveNegativeSampler( 92 | batch_size_per_image, # 512 93 | positive_fraction) # 0.25 94 | 95 | if bbox_reg_weights is None: 96 | bbox_reg_weights = (10., 10., 5., 5.) 
107 | def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
108 | """
109 | get the matched gt bbox for every proposal, and label positive/negative samples
110 | :param proposals:
111 | :param gt_boxes:
112 | :param gt_labels:
113 | :return:
114 | """
115 |
116 | matched_idxs = []
117 | labels = []
118 | for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):
119 | if gt_boxes_in_image.numel() == 0:
120 | # background image
121 | device = proposals_in_image.device
122 | clamped_matched_idxs_in_image = torch.zeros(
123 | (proposals_in_image.shape[0],), dtype=torch.int64, device=device
124 | )
125 | labels_in_image = torch.zeros(
126 | (proposals_in_image.shape[0],), dtype=torch.int64, device=device
127 | )
128 | else:
129 | # iou between gt bboxes and proposals
130 | match_quality_matrix = box_op.box_iou(gt_boxes_in_image, proposals_in_image)
131 |
132 | matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)
133 |
134 | clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)
135 |
136 | labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
137 | labels_in_image = labels_in_image.to(dtype=torch.int64)
138 |
139 | # label background (below the low threshold)
140 | bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD  # -1
141 | labels_in_image[bg_inds] = 0
142 |
143 | # label ignored proposals (between the low and high thresholds)
144 | ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS  # -2
145 | labels_in_image[ignore_inds] = -1  # -1 is ignored by the sampler
146 |
147 | matched_idxs.append(clamped_matched_idxs_in_image)
148 | labels.append(labels_in_image)
149 | return matched_idxs, labels
150 |
151 | def subsample(self, labels):
152 | sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
153 | sampled_inds = []
154 | for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
155 | img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
156 | sampled_inds.append(img_sampled_inds)
157 | return sampled_inds
158 |
159 | def select_training_samples(self,
160 | proposals,
161 | targets
162 | ):
163 |
164 | check_targets(targets)
165 | assert targets is not None
166 | dtype = proposals[0].dtype
167 | device = proposals[0].device
168 |
169 | gt_boxes = [t["boxes"].to(dtype) for t in targets]
170 | gt_labels = [t["labels"] for t in targets]
171 |
172 | # append ground-truth bboxes to the proposals
173 | proposals = add_gt_proposals(proposals, gt_boxes)
174 |
175 | # get the matching gt indices for each proposal
176 | matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
177 |
178 | # sample a fixed proportion of positive/negative proposals
179 | sampled_inds = self.subsample(labels)
180 | matched_gt_boxes = []
181 | num_images = len(proposals)
182 |
183 | for img_id in range(num_images):
184 | img_sampled_inds = sampled_inds[img_id]
185 | proposals[img_id] = proposals[img_id][img_sampled_inds]
186 | labels[img_id] = labels[img_id][img_sampled_inds]
187 | matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]
188 |
189 | gt_boxes_in_image = gt_boxes[img_id]
190 | if gt_boxes_in_image.numel() == 0:
191 | gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
192 | matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])
193 |
194 | regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
195 | return proposals, matched_idxs, labels, regression_targets
196 |
197 | def postprocess_detections(self,
198 | class_logits,
199 | box_regression,
200 | proposals,
201 | image_shapes
202 | ):
203 | """
204 | Post-process the network predictions, including:
205 | (1) compute the final bbox coordinates from the proposals and the predicted regression parameters
206 | (2) apply softmax to the predicted class logits
207 | (3) clip the predicted boxes, moving out-of-bound coordinates onto the image boundary
208 | (4) remove all background predictions
209 | (5) remove low-scoring predictions
210 | (6) remove small boxes
211 | (7) apply nms and sort the results by score
212 | (8) return the top-k predictions ranked by score
213 | Args:
214 | class_logits: predicted class logits
215 | box_regression: predicted bbox regression parameters
216 | proposals: proposals output by the rpn
217 | image_shapes: width and height of each image before batching
218 |
219 | Returns:
220 |
221 | """
222 | device = class_logits.device
223 | # number of predicted classes
224 | num_classes = class_logits.shape[-1]
225 |
226 | # number of predicted bboxes for each image
227 | boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
228 | # compute the final bbox coordinates from the proposals and the predicted regression parameters
229 | pred_boxes = self.box_coder.decode(box_regression, proposals)
230 |
231 | # apply softmax to the predicted class logits
232 | pred_scores = F.softmax(class_logits, -1)
233 |
234 | # split boxes and scores per image
235 | # according to the number of predicted bboxes in each image
236 | pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
237 | pred_scores_list = pred_scores.split(boxes_per_image, 0)
238 |
239 | all_boxes = []
240 | all_scores = []
241 | all_labels = []
242 | # iterate over the predictions of each image
243 | for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
244 | # clip the predicted boxes, moving out-of-bound coordinates onto the image boundary
245 | boxes = box_op.clip_boxes_to_image(boxes, image_shape)
246 |
247 | # create labels for each prediction
248 | labels = torch.arange(num_classes, device=device)
249 | labels = labels.view(1, -1).expand_as(scores)
250 |
251 | # remove predictions with the background label
252 | # (index 0 is the background class)
253 | boxes = boxes[:, 1:]
254 | scores = scores[:, 1:]
255 | labels = labels[:, 1:]
256 |
257 | # batch everything, by making every class prediction be a separate instance
258 | boxes = boxes.reshape(-1, 4)
259 | scores = scores.reshape(-1)
260 | labels = labels.reshape(-1)
261 |
262 | # remove low scoring boxes
263 | # (self.score_thresh=0.05)
264 | inds = torch.nonzero(scores > self.score_thresh).squeeze(1)
265 | boxes, scores, labels = boxes[inds], scores[inds], labels[inds]
266 |
267 | # remove empty boxes
268 | # (small targets below the minimum size)
269 | keep = box_op.remove_small_boxes(boxes, min_size=1e-2)
270 | boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
271 |
272 | # non-maximum suppression, independently done per class
273 | # (the result is returned sorted by score in decreasing order)
274 | keep = box_op.batched_nms(boxes, scores, labels, self.nms_thresh)
275 |
276 | # keep only topk scoring predictions
277 | # (the top-k targets ranked by score)
278 | keep = keep[:self.detection_per_img]
279 | boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
280 |
281 | all_boxes.append(boxes)
282 | all_scores.append(scores)
283 | all_labels.append(labels)
284 |
285 | return all_boxes, all_scores, all_labels
286 |
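`batched_nms` above runs NMS per class without a Python loop. The usual trick, sketched here under the assumption that utils/boxes_utils.py follows the torchvision-style implementation, is to offset boxes by their label so that boxes of different classes can never overlap:
```python
import torch
from torchvision.ops import nms

def batched_nms_sketch(boxes, scores, idxs, iou_threshold):
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
    # shift each class into its own disjoint coordinate range
    max_coordinate = boxes.max()
    offsets = idxs.to(boxes) * (max_coordinate + 1)
    boxes_for_nms = boxes + offsets[:, None]
    # a single NMS call now suppresses boxes only within the same class;
    # the returned indices are sorted by score in decreasing order
    return nms(boxes_for_nms, scores, iou_threshold)
```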
287 | def forward(self,
288 | features,
289 | proposals,
290 | image_shapes,
291 | targets=None
292 | ):
293 | """
294 | Arguments:
295 | features (List[Tensor])
296 | proposals (List[Tensor[N, 4]])
297 | image_shapes (List[Tuple[H, W]])
298 | targets (List[Dict])
299 | """
300 |
301 | if targets is not None:
302 | for t in targets:
303 | floating_point_types = (torch.float, torch.double, torch.half)
304 | assert t["boxes"].dtype in floating_point_types, "target boxes must be of float type"
305 | # assert t["labels"].dtype == torch.int64, "target labels must be of int64 type"
306 |
307 | if self.training:
308 | # sample positive/negative proposals and gather the matched gt labels and box regression targets
309 | proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
310 | else:
311 | labels = None
312 | regression_targets = None
313 | matched_idxs = None
314 |
315 | # pass the sampled proposals through the roi_pooling layer
316 | box_features = self.box_roi_pool(features, proposals, image_shapes)
317 | # two fully-connected layers after roi_pooling
318 | box_features = self.box_head(box_features)
319 | # then predict the class scores and bbox regression parameters separately
320 | class_logits, box_regression = self.box_predictor(box_features)
321 |
322 | result = torch.jit.annotate(List[Dict[str, torch.Tensor]], [])
323 | losses = {}
324 | if self.training:
325 | assert labels is not None and regression_targets is not None
326 | loss_classifier, loss_box_reg = fastrcnn_loss(
327 | class_logits, box_regression, labels, regression_targets)
328 | losses = {
329 | "loss_classifier": loss_classifier,
330 | "loss_box_reg": loss_box_reg
331 | }
332 | else:
333 | boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
334 | num_images = len(boxes)
335 | for i in range(num_images):
336 | result.append(
337 | {
338 | "boxes": boxes[i],
339 | "labels": labels[i],
340 | "scores": scores[i],
341 | }
342 | )
343 |
344 | return result, losses
--------------------------------------------------------------------------------
/utils/rpn_utils.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.jit.annotations import Dict
3 | from torch.nn import functional as F
4 |
5 | import utils.boxes_utils as box_op
6 | from utils.det_utils import *
7 |
8 |
9 | class RPNHead(nn.Module):
10 | """
11 | RPN head with background/foreground classification and bbox regression
12 | :param in_channels: number of channels of the input feature
13 | :param num_anchors: number of anchors to be predicted per spatial
14 | location
15 | """
16 |
17 |
18 | def __init__(self, in_channels, num_anchors):
19 |
20 | super(RPNHead, self).__init__()
21 | # 3x3 conv
22 | self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
23 |
24 | # background/foreground score
25 | self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
26 |
27 | # bbox regression parameters
28 | self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
29 |
30 | for layer in self.children():
31 | if isinstance(layer, nn.Conv2d):
32 | torch.nn.init.normal_(layer.weight, std=0.01)
33 | torch.nn.init.constant_(layer.bias, 0)
34 |
35 | def forward(self, x):
36 | cls_scores = []
37 | bbox_reg = []
38 | for i, feature in enumerate(x):
39 | t = F.relu(self.conv(feature))
40 | cls_scores.append(self.cls_logits(t))
41 | bbox_reg.append(self.bbox_pred(t))
42 | return cls_scores, bbox_reg
43 |
44 |
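A shape sanity check for RPNHead (standalone sketch): with A anchors per location, each feature map of shape [N, C, H, W] yields objectness logits of shape [N, A, H, W] and box deltas of shape [N, 4A, H, W].
```python
import torch
from utils.rpn_utils import RPNHead

head = RPNHead(in_channels=256, num_anchors=3)
feature = torch.rand(2, 256, 25, 38)       # [N, C, H, W]
cls_scores, bbox_reg = head([feature])     # forward expects a list of feature maps
print(cls_scores[0].shape)                 # torch.Size([2, 3, 25, 38])
print(bbox_reg[0].shape)                   # torch.Size([2, 12, 25, 38])
```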
45 | class RegionProposalNetwork(torch.nn.Module):
46 | """
47 | Implementation of Region Proposal Network (RPN).
48 | :param anchor_generator: module that generates the anchors for each feature map.
49 | :param head: module that computes the objectness and regression deltas
50 | :param fg_iou_thresh: minimum IoU between the anchor and the GT box so that they can be
51 | considered as positive during training of the RPN.
52 | :param bg_iou_thresh: maximum IoU between the anchor and the GT box so that they can be
53 | considered as negative during training of the RPN.
54 | :param batch_size_per_image: number of anchors that are sampled during training of the RPN
55 | for computing the loss
56 | :param positive_fraction: proportion of positive anchors in a mini-batch during training
57 | of the RPN
58 | :param pre_nms_top_n: number of proposals to keep before applying NMS. It should
59 | contain two fields: training and testing, to allow for different values depending
60 | on training or evaluation
61 | :param post_nms_top_n: number of proposals to keep after applying NMS. It should
62 | contain two fields: training and testing, to allow for different values depending
63 | on training or evaluation
64 | :param nms_thresh: NMS threshold used for postprocessing the RPN proposals
65 | """
66 |
67 | def __init__(self, anchor_generator, head, fg_iou_thresh, bg_iou_thresh, batch_size_per_image, positive_fraction,
68 | pre_nms_top_n, post_nms_top_n, nms_thresh):
69 |
70 | super(RegionProposalNetwork, self).__init__()
71 | self.anchor_generator = anchor_generator
72 | self.head = head
73 | self.box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
74 |
75 | # use during training
76 | # function for computing iou between anchor and true bbox
77 | self.box_similarity = box_op.box_iou
78 |
79 | self.proposal_matcher = Matcher(
80 | fg_iou_thresh,  # foreground threshold: if IOU > threshold (0.7), it is a positive sample
81 | bg_iou_thresh,  # background threshold: if IOU < threshold (0.3), it is a negative sample
82 | allow_low_quality_matches=True
83 | )
84 |
85 | self.fg_bg_sampler = BalancedPositiveNegativeSampler(
86 | batch_size_per_image, positive_fraction  # 256, 0.5
87 | )
88 |
89 | # use during testing
90 | self._pre_nms_top_n = pre_nms_top_n
91 | self._post_nms_top_n = post_nms_top_n
92 | self.nms_thresh = nms_thresh
93 | self.min_size = 1e-3
94 |
95 | def pre_nms_top_n(self):
96 | if self.training:
97 | return self._pre_nms_top_n['training']
98 | return self._pre_nms_top_n['testing']
99 |
100 | def post_nms_top_n(self):
101 | if self.training:
102 | return self._post_nms_top_n['training']
103 | return self._post_nms_top_n['testing']
104 |
105 | def assign_targets_to_anchors(self, anchors, targets):
106 | """
107 | get the best-matching gt for each anchor; anchors are divided into bg samples, fg samples and discarded samples
108 | :param anchors: (List[Tensor])
109 | :param targets: (List[Dict[Tensor])
110 | :return: labels: anchor cls, 1 is foreground, 0 is background, -1 is discarded
111 | matched_gt_boxes: best matched gt
112 | """
113 |
114 | labels = []
115 | matched_gt_boxes = []
116 | for anchors_per_image, targets_per_image in zip(anchors, targets):
117 | gt_boxes = targets_per_image["boxes"]
118 | if gt_boxes.numel() == 0:
119 | device = anchors_per_image.device
120 | matched_gt_boxes_per_image = torch.zeros(anchors_per_image.shape, dtype=torch.float32, device=device)
121 | labels_per_image = torch.zeros((anchors_per_image.shape[0],), dtype=torch.float32, device=device)
122 | else:
123 | # compute iou of anchors and real bbox
124 | match_quality_matrix = box_op.box_iou(gt_boxes, anchors_per_image)
125 | # match anchors to gt by iou (iou < 0.3 -> -1, 0.3 <= iou < 0.7 -> -2, otherwise the index of the matched gt)
126 | matched_idxs = self.proposal_matcher(match_quality_matrix)
127 |
128 | # get the matched gt box for every anchor
129 | # (clamp because the placeholder indices -1/-2 would index out of bounds)
130 | matched_gt_boxes_per_image = gt_boxes[matched_idxs.clamp(min=0)]
131 |
132 | # foreground anchors (positive examples)
133 | labels_per_image = matched_idxs >= 0
134 |
135 | labels_per_image = labels_per_image.to(dtype=torch.float32)
136 |
137 | # background (negative examples)
138 | bg_indices = matched_idxs == self.proposal_matcher.BELOW_LOW_THRESHOLD  # -1
139 | labels_per_image[bg_indices] = 0.0
140 |
141 | # discard indices that are between thresholds
142 | inds_to_discard = matched_idxs == self.proposal_matcher.BETWEEN_THRESHOLDS  # -2
143 | labels_per_image[inds_to_discard] = -1.0
144 |
145 | labels.append(labels_per_image)
146 | matched_gt_boxes.append(matched_gt_boxes_per_image)
147 | return labels, matched_gt_boxes
148 |
149 | def _get_top_n_idx(self, objectness, num_anchors_per_level):
150 | """
151 | get the indices of the top pre_nms_top_n anchors in the predicted feature maps, based on their scores
152 | :param objectness: scores
153 | :param num_anchors_per_level: number of anchors per feature level
154 | :return:
155 | """
156 |
157 | result = []
158 | offset = 0
159 | for ob in objectness.split(num_anchors_per_level, 1):
160 | num_anchors = ob.shape[1]
161 | pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
162 |
163 | # returns the k largest elements of the given input tensor along a given dimension
164 | _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
165 | result.append(top_n_idx + offset)
166 | offset += num_anchors
167 | return torch.cat(result, dim=1)
168 |
169 | def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
170 | """
171 | remove small bboxes, apply nms, and keep the post_nms_top_n targets
172 | :param proposals: predicted bbox coordinates
173 | :param objectness: predicted scores
174 | :param image_shapes: image shapes
175 | :param num_anchors_per_level: number of anchors per feature map
176 | :return:
177 | """
178 |
179 | num_images = proposals.shape[0]
180 | device = proposals.device
181 |
182 | # do not backprop through objectness
183 | objectness = objectness.detach()
184 | objectness = objectness.reshape(num_images, -1)
185 |
186 | # torch.full returns a tensor of the given size filled with fill_value
187 | levels = [torch.full((n,), idx, dtype=torch.int64, device=device)
188 | for idx, n in enumerate(num_anchors_per_level)]
189 | levels = torch.cat(levels, 0)
190 |
191 | # expand this tensor to the same size as objectness
192 | levels = levels.reshape(1, -1).expand_as(objectness)
193 |
194 | # select top_n boxes independently per level before applying nms
195 | top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)
196 |
197 | image_range = torch.arange(num_images, device=device)
198 | batch_idx = image_range[:, None]  # [batch_size, 1]
199 |
200 | objectness = objectness[batch_idx, top_n_idx]
201 | levels = levels[batch_idx, top_n_idx]
202 | proposals = proposals[batch_idx, top_n_idx]
203 |
204 | final_boxes = []
205 | final_scores = []
206 | for boxes, scores, lvl, img_shape in zip(proposals, objectness, levels, image_shapes):
207 | # clip the predicted bboxes, moving boxes that fall outside the image back onto the image boundary
208 | boxes = box_op.clip_boxes_to_image(boxes, img_shape)
209 |
210 | # remove boxes where at least one side is smaller than min_size
211 | keep = box_op.remove_small_boxes(boxes, self.min_size)
212 | boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
213 |
214 | # non-maximum suppression, independently done per level
215 | keep = box_op.batched_nms(boxes, scores, lvl, self.nms_thresh)
216 |
217 | # keep only top k scoring predictions
218 | keep = keep[: self.post_nms_top_n()]
219 | boxes, scores = boxes[keep], scores[keep]
220 | final_boxes.append(boxes)
221 | final_scores.append(scores)
222 | return final_boxes, final_scores
223 |
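The per-level selection in `_get_top_n_idx` above can be checked in isolation: scores for all levels live in one concatenated tensor, so each level's top-k indices must be shifted by the number of anchors in the preceding levels (standalone sketch):
```python
import torch

objectness = torch.tensor([[0.1, 0.9, 0.3, 0.8, 0.2]])  # level 0: 3 anchors, level 1: 2 anchors
num_anchors_per_level = [3, 2]

result, offset = [], 0
for ob in objectness.split(num_anchors_per_level, 1):
    _, top_n_idx = ob.topk(min(2, ob.shape[1]), dim=1)  # top-2 per level
    result.append(top_n_idx + offset)                   # shift into the global index space
    offset += ob.shape[1]
print(torch.cat(result, dim=1))  # tensor([[1, 2, 3, 4]])
```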
224 | def compute_loss(self, objectness, pred_bbox_deltas, labels, regression_targets):
225 | """
226 | compute the RPN loss, including the classification loss (foreground/background) and the bbox regression loss
227 | :param objectness: predicted foreground probability
228 | :param pred_bbox_deltas: predicted bbox regression parameters
229 | :param labels: true labels: 1, 0 and -1
230 | :param regression_targets: true bbox regression targets
231 | :return: objectness_loss (Tensor): classification loss
232 | box_loss (Tensor): bbox loss
233 | """
234 |
235 | # sample positive and negative examples
236 | sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
237 |
238 | sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
239 | sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
240 |
241 | sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)
242 | objectness = objectness.flatten()
243 |
244 | labels = torch.cat(labels, dim=0)
245 | regression_targets = torch.cat(regression_targets, dim=0)
246 |
247 | # bbox regression loss
248 | box_loss = smooth_l1_loss(pred_bbox_deltas[sampled_pos_inds], regression_targets[sampled_pos_inds],
249 | beta=1 / 9, size_average=False, ) / (sampled_inds.numel())
250 |
251 | # classification loss
252 | objectness_loss = F.binary_cross_entropy_with_logits(objectness[sampled_inds], labels[sampled_inds])
253 |
254 | return objectness_loss, box_loss
255 |
256 | def forward(self, images, features, targets=None):
257 | """
258 | :param images: (ImageList), images for which we want to compute the predictions
259 | :param features: (Dict[Tensor]), features computed from the images that are
260 | used for computing the predictions. Each tensor in the list
261 | corresponds to a different feature level
262 | :param targets: (List[Dict[Tensor]), ground-truth boxes present in the image (optional).
263 | If provided, each element in the dict should contain a field `boxes`,
264 | with the locations of the ground-truth boxes.
265 | :return:
266 | boxes (List[Tensor]): the predicted boxes from the RPN, one Tensor per image.
267 | losses (Dict[Tensor]): the losses for the model during training. During testing, it is an empty dict.
268 | """
269 |
270 | # RPN uses all feature maps that are available
271 | features = list(features.values())
272 |
273 | # the RPN head (shared conv layers) computes the fg/bg scores and bbox regressions
274 | fg_bg_scores, pred_bbox_deltas = self.head(features)
275 |
276 | # get all anchors of the images based on the features
277 | anchors = self.anchor_generator(images, features)
278 |
279 | # batch_size
280 | num_images = len(anchors)
281 |
282 | # count the anchors on every feature level (A * H * W of each objectness map)
283 | num_anchors_per_level_shape_tensors = [o[0].shape for o in fg_bg_scores]
284 | num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
285 |
286 | # adjust tensor order and reshape
287 | fg_bg_scores, pred_bbox_deltas = box_op.concat_box_prediction_layers(fg_bg_scores, pred_bbox_deltas)
288 |
289 | # apply pred_bbox_deltas to anchors to obtain the decoded proposals
290 | proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
291 | proposals = proposals.view(num_images, -1, 4)
292 |
293 | # remove small bboxes, apply nms, keep the post_nms_top_n targets
294 | boxes, scores = self.filter_proposals(proposals, fg_bg_scores, images.image_sizes, num_anchors_per_level)
295 |
296 | losses = {}
297 | if self.training:
298 | assert targets is not None
299 | labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
300 |
301 | # encode the regression targets from the matched gt boxes and anchors
302 | regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
303 | loss_objectness, loss_rpn_box_reg = self.compute_loss(
304 | fg_bg_scores, pred_bbox_deltas, labels, regression_targets)
305 | losses = {"loss_objectness": loss_objectness, "loss_rpn_box_reg": loss_rpn_box_reg}
306 |
307 | return boxes, losses
--------------------------------------------------------------------------------
/utils/train_utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import pickle
3 | import time
4 | from collections import defaultdict, deque
5 |
6 | import torch.distributed as dist
7 | from torchvision import ops
8 |
9 | from backbone.mobilenet import MobileNetV2
10 | from backbone.resnet50_fpn_model import *
11 | from config.train_config import cfg
12 | from utils.anchor_utils import AnchorsGenerator
13 | from utils.faster_rcnn_utils import FasterRCNN, FastRCNNPredictor
14 |
15 |
16 | def create_model(num_classes):
17 | global backbone, model
18 | backbone_network = cfg.backbone
19 |
20 | anchor_sizes = tuple((f,) for f in cfg.anchor_size)
21 | aspect_ratios = tuple((f,) for f in cfg.anchor_ratio) * len(anchor_sizes)
22 | anchor_generator = AnchorsGenerator(sizes=anchor_sizes,
23 | aspect_ratios=aspect_ratios)
24 |
25 | if backbone_network == 'mobilenet':
26 | backbone = MobileNetV2(weights_path=cfg.backbone_pretrained_weights).features
27 | backbone.out_channels = 1280
28 |
29 | roi_pooler = ops.MultiScaleRoIAlign(featmap_names=['0'],  # feature maps on which roi pooling is applied
30 | output_size=cfg.roi_out_size,  # roi_pooling output feature size
31 | sampling_ratio=cfg.roi_sample_rate)  # sampling ratio
32 |
33 | model = FasterRCNN(backbone=backbone, num_classes=num_classes,
34 | # transform parameters
35 | min_size=cfg.min_size, max_size=cfg.max_size,
36 | image_mean=cfg.image_mean, image_std=cfg.image_std,
37 | # rpn parameters
38 | rpn_anchor_generator=anchor_generator, box_roi_pool=roi_pooler,
39 | rpn_pre_nms_top_n_train=cfg.rpn_pre_nms_top_n_train,
40 | rpn_pre_nms_top_n_test=cfg.rpn_pre_nms_top_n_test,
41 | rpn_post_nms_top_n_train=cfg.rpn_post_nms_top_n_train,
42 | rpn_post_nms_top_n_test=cfg.rpn_post_nms_top_n_test,
43 | rpn_nms_thresh=cfg.rpn_nms_thresh,
44 | rpn_fg_iou_thresh=cfg.rpn_fg_iou_thresh,
45 | rpn_bg_iou_thresh=cfg.rpn_bg_iou_thresh,
46 | rpn_batch_size_per_image=cfg.rpn_batch_size_per_image,
47 | rpn_positive_fraction=cfg.rpn_positive_fraction,
48 | # Box parameters
49 | box_head=None, box_predictor=None,
50 |
51 | # remove low threshold target
52 | box_score_thresh=cfg.box_score_thresh,
53 | box_nms_thresh=cfg.box_nms_thresh,
54 | box_detections_per_img=cfg.box_detections_per_img,
55 | box_fg_iou_thresh=cfg.box_fg_iou_thresh,
56 | box_bg_iou_thresh=cfg.box_bg_iou_thresh,
57 | box_batch_size_per_image=cfg.box_batch_size_per_image,
58 | box_positive_fraction=cfg.box_positive_fraction,
59 | bbox_reg_weights=cfg.bbox_reg_weights
60 | )
61 | elif backbone_network == 'resnet50_fpn':
62 | backbone = resnet50_fpn_backbone()
63 |
64 | roi_pooler = ops.MultiScaleRoIAlign(
65 | featmap_names=['0', '1', '2', '3'],
66 | output_size=cfg.roi_out_size,
67 | sampling_ratio=cfg.roi_sample_rate)
68 | model = FasterRCNN(backbone=backbone, num_classes=num_classes,
69 | # transform parameters
70 | min_size=cfg.min_size, max_size=cfg.max_size,
71 | image_mean=cfg.image_mean, image_std=cfg.image_std,
72 | # rpn parameters
73 | rpn_anchor_generator=anchor_generator, box_roi_pool=roi_pooler,
74 | rpn_pre_nms_top_n_train=cfg.rpn_pre_nms_top_n_train,
75 | rpn_pre_nms_top_n_test=cfg.rpn_pre_nms_top_n_test,
76 | rpn_post_nms_top_n_train=cfg.rpn_post_nms_top_n_train,
77 | rpn_post_nms_top_n_test=cfg.rpn_post_nms_top_n_test,
78 | rpn_nms_thresh=cfg.rpn_nms_thresh,
79 | rpn_fg_iou_thresh=cfg.rpn_fg_iou_thresh,
80 | rpn_bg_iou_thresh=cfg.rpn_bg_iou_thresh,
81 | rpn_batch_size_per_image=cfg.rpn_batch_size_per_image,
82 | rpn_positive_fraction=cfg.rpn_positive_fraction,
83 | # Box parameters
84 | box_head=None, box_predictor=None,
85 |
86 | # remove low threshold target
87 | box_score_thresh=cfg.box_score_thresh,
88 | box_nms_thresh=cfg.box_nms_thresh,
89 | box_detections_per_img=cfg.box_detections_per_img,
90 | box_fg_iou_thresh=cfg.box_fg_iou_thresh,
91 | box_bg_iou_thresh=cfg.box_bg_iou_thresh,
92 | box_batch_size_per_image=cfg.box_batch_size_per_image,
93 | box_positive_fraction=cfg.box_positive_fraction,
94 | bbox_reg_weights=cfg.bbox_reg_weights
95 | )
96 |
97 | # weights_dict = torch.load(cfg.pretrained_weights)
98 | # missing_keys, unexpected_keys = model.load_state_dict(weights_dict, strict=False)
99 | # if len(missing_keys) != 0 or len(unexpected_keys) != 0:
100 | #     print("missing_keys: ", missing_keys)
101 | #     print("unexpected_keys: ", unexpected_keys)
102 |
103 | in_features = model.roi_heads.box_predictor.cls_score.in_features
104 | model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
105 |
106 | return model
107 |
108 |
109 | def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
110 | def f(x):
111 | if x >= warmup_iters:
112 | return 1
113 | alpha = float(x) / warmup_iters
114 | return warmup_factor * (1 - alpha) + alpha
115 |
116 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
117 |
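The warmup scheduler above scales the base lr linearly from warmup_factor up to 1 over warmup_iters steps; a quick standalone check (the dummy Linear module is only there to give the optimizer some parameters):
```python
import torch

net = torch.nn.Linear(2, 2)  # dummy parameters
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
scheduler = warmup_lr_scheduler(optimizer, warmup_iters=4, warmup_factor=0.25)

for step in range(6):
    print(step, optimizer.param_groups[0]["lr"])
    optimizer.step()
    scheduler.step()
# the lr ramps linearly from 0.025 up to 0.1 over the first 4 steps, then stays at 0.1
```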
141 | """ 142 | world_size = get_world_size() 143 | if world_size < 2: 144 | return input_dict 145 | with torch.no_grad(): 146 | names = [] 147 | values = [] 148 | # sort the keys so that they are consistent across processes 149 | for k in sorted(input_dict.keys()): 150 | names.append(k) 151 | values.append(input_dict[k]) 152 | values = torch.stack(values, dim=0) 153 | dist.all_reduce(values) 154 | if average: 155 | values /= world_size 156 | 157 | reduced_dict = {k: v for k, v in zip(names, values)} 158 | return reduced_dict 159 | 160 | 161 | class SmoothedValue(object): 162 | """Track a series of values and provide access to smoothed values over a 163 | window or the global series average. 164 | """ 165 | 166 | def __init__(self, window_size=20, fmt=None): 167 | if fmt is None: 168 | fmt = "{median:.4f} ({global_avg:.4f})" 169 | self.deque = deque(maxlen=window_size) # deque简单理解成加强版list 170 | self.total = 0.0 171 | self.count = 0 172 | self.fmt = fmt 173 | 174 | def update(self, value, n=1): 175 | self.deque.append(value) 176 | self.count += n 177 | self.total += value * n 178 | 179 | def synchronize_between_processes(self): 180 | """ 181 | Warning: does not synchronize the deque! 182 | """ 183 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") 184 | dist.barrier() 185 | dist.all_reduce(t) 186 | t = t.tolist() 187 | self.count = int(t[0]) 188 | self.total = t[1] 189 | 190 | @property 191 | def median(self): 192 | d = torch.tensor(list(self.deque)) 193 | return d.median().item() 194 | 195 | @property 196 | def avg(self): 197 | d = torch.tensor(list(self.deque), dtype=torch.float32) 198 | return d.mean().item() 199 | 200 | @property 201 | def global_avg(self): 202 | return self.total / self.count 203 | 204 | @property 205 | def max(self): 206 | return max(self.deque) 207 | 208 | @property 209 | def value(self): 210 | return self.deque[-1] 211 | 212 | def __str__(self): 213 | return self.fmt.format( 214 | median=self.median, 215 | avg=self.avg, 216 | global_avg=self.global_avg, 217 | max=self.max, 218 | value=self.value) 219 | 220 | 221 | def all_gather(data): 222 | """ 223 | Run all_gather on arbitrary picklable data (not necessarily tensors) 224 | Args: 225 | data: any picklable object 226 | Returns: 227 | list[data]: list of data gathered from each rank 228 | """ 229 | world_size = get_world_size() 230 | if world_size == 1: 231 | return [data] 232 | 233 | # serialized to a Tensor 234 | buffer = pickle.dumps(data) 235 | storage = torch.ByteStorage.from_buffer(buffer) 236 | tensor = torch.ByteTensor(storage).to("cuda") 237 | 238 | # obtain Tensor size of each rank 239 | local_size = torch.tensor([tensor.numel()], device="cuda") 240 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] 241 | dist.all_gather(size_list, local_size) 242 | size_list = [int(size.item()) for size in size_list] 243 | max_size = max(size_list) 244 | 245 | # receiving Tensor from all ranks 246 | # we pad the tensor because torch all_gather does not support 247 | # gathering tensors of different shapes 248 | tensor_list = [] 249 | for _ in size_list: 250 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) 251 | if local_size != max_size: 252 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") 253 | tensor = torch.cat((tensor, padding), dim=0) 254 | dist.all_gather(tensor_list, tensor) 255 | 256 | data_list = [] 257 | for size, tensor in zip(size_list, tensor_list): 258 | buffer = 
259 | data_list.append(pickle.loads(buffer))
260 |
261 | return data_list
262 |
263 |
264 | class MetricLogger(object):
265 | def __init__(self, delimiter="\t"):
266 | self.meters = defaultdict(SmoothedValue)
267 | self.delimiter = delimiter
268 |
269 | def update(self, **kwargs):
270 | for k, v in kwargs.items():
271 | if isinstance(v, torch.Tensor):
272 | v = v.item()
273 | assert isinstance(v, (float, int))
274 | self.meters[k].update(v)
275 |
276 | def __getattr__(self, attr):
277 | if attr in self.meters:
278 | return self.meters[attr]
279 | if attr in self.__dict__:
280 | return self.__dict__[attr]
281 | raise AttributeError("'{}' object has no attribute '{}'".format(
282 | type(self).__name__, attr))
283 |
284 | def __str__(self):
285 | loss_str = []
286 | for name, meter in self.meters.items():
287 | loss_str.append(
288 | "{}: {}".format(name, str(meter))
289 | )
290 | return self.delimiter.join(loss_str)
291 |
292 | def add_meter(self, name, meter):
293 | self.meters[name] = meter
294 |
295 | def synchronize_between_processes(self):
296 | for meter in self.meters.values():
297 | meter.synchronize_between_processes()
298 |
299 | def log_every(self, iterable, print_freq, header=None):
300 | i = 0
301 | if not header:
302 | header = ""
303 | start_time = time.time()
304 | end = time.time()
305 | iter_time = SmoothedValue(fmt='{avg:.4f}')
306 | data_time = SmoothedValue(fmt='{avg:.4f}')
307 | space_fmt = ":" + str(len(str(len(iterable)))) + "d"
308 | if torch.cuda.is_available():
309 | log_msg = self.delimiter.join([header,
310 | '[{0' + space_fmt + '}/{1}]',
311 | 'eta: {eta}',
312 | '{meters}',
313 | 'time: {time}',
314 | 'data: {data}',
315 | 'max mem: {memory:.0f}'])
316 | else:
317 | log_msg = self.delimiter.join([header,
318 | '[{0' + space_fmt + '}/{1}]',
319 | 'eta: {eta}',
320 | '{meters}',
321 | 'time: {time}',
322 | 'data: {data}'])
323 | MB = 1024.0 * 1024.0
324 | for obj in iterable:
325 | data_time.update(time.time() - end)
326 | yield obj
327 | iter_time.update(time.time() - end)
328 | if i % print_freq == 0 or i == len(iterable) - 1:
329 | eta_second = iter_time.global_avg * (len(iterable) - i)
330 | eta_string = str(datetime.timedelta(seconds=eta_second))
331 | if torch.cuda.is_available():
332 | print(log_msg.format(i, len(iterable),
333 | eta=eta_string,
334 | meters=str(self),
335 | time=str(iter_time),
336 | data=str(data_time),
337 | memory=torch.cuda.max_memory_allocated() / MB))
338 | else:
339 | print(log_msg.format(i, len(iterable),
340 | eta=eta_string,
341 | meters=str(self),
342 | time=str(iter_time),
343 | data=str(data_time)))
344 | i += 1
345 | end = time.time()
346 | total_time = time.time() - start_time
347 | total_time_str = str(datetime.timedelta(seconds=int(total_time)))
348 | print('{} Total time: {} ({:.4f} s / it)'.format(header,
349 | total_time_str,
350 | total_time / len(iterable)))
351 |
352 |
353 |
354 | def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
355 | train_loss=None, train_lr=None, warmup=False):
356 | global loss_dict, losses
357 | model.train()
358 | metric_logger = MetricLogger(delimiter="  ")
359 | metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
360 | header = 'Epoch: [{}]'.format(epoch)
361 |
362 | lr_scheduler = None
363 | if epoch == 0 and warmup is True:
364 | warmup_factor = 1.0 / 1000
365 | warmup_iters = min(1000, len(data_loader) - 1)
366 |
367 | lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
368 |
369 | for images, targets in metric_logger.log_every(data_loader, print_freq, header):
370 | images = list(image.to(device) for image in images)
371 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
372 |
373 | loss_dict = model(images, targets)
374 |
375 | losses = sum(loss for loss in loss_dict.values())
376 |
377 | # reduce losses over all GPUs for logging purposes
378 | loss_dict_reduced = reduce_dict(loss_dict)
379 | losses_reduced = sum(loss for loss in loss_dict_reduced.values())
380 |
381 | loss_value = losses_reduced.item()
382 | if isinstance(train_loss, list):
383 | train_loss.append(loss_value)
384 |
385 | optimizer.zero_grad()
386 | losses.backward()
387 | optimizer.step()
388 |
389 | if lr_scheduler is not None:
390 | lr_scheduler.step()
391 |
392 | metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
393 | now_lr = optimizer.param_groups[0]["lr"]
394 | metric_logger.update(lr=now_lr)
395 | if isinstance(train_lr, list):
396 | train_lr.append(now_lr)
397 |
398 | return loss_dict, losses
399 |
400 |
401 | def write_tb(writer, num, info):
402 | for item in info.items():
403 | writer.add_scalar(item[0], item[1], num)
--------------------------------------------------------------------------------
/utils/transform_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch import nn
5 | from torch.jit.annotations import List, Tuple
6 |
7 | from utils.im_utils import ImageList
8 |
9 |
10 | def torch_choice(l):
11 | index = int(torch.empty(1).uniform_(0., float(len(l))).item())
12 | return l[index]
13 |
14 |
15 | def max_by_axis(the_list):
16 | maxes = the_list[0]
17 | for sublist in the_list[1:]:
18 | for index, item in enumerate(sublist):
19 | maxes[index] = max(maxes[index], item)
20 | return maxes
21 |
22 |
23 | def batch_images(images, size_divisible=32):
24 | """
25 | batch a list of images into a single padded tensor
26 | :param images: a set of images
27 | :param size_divisible: stride that the padded height/width must be divisible by
28 | :return: batched tensor image
29 | """
30 |
31 | max_size = max_by_axis([list(img.shape) for img in images])
32 |
33 | stride = float(size_divisible)
34 |
35 | max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
36 | max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)
37 |
38 | # [batch, channel, height, width]
39 | batch_shape = [len(images)] + max_size
40 |
41 | batched_imgs = images[0].new_full(batch_shape, 0)
42 | for img, pad_img in zip(images, batched_imgs):
43 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
44 |
45 | return batched_imgs
46 |
47 |
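The padding in `batch_images` above rounds each spatial dimension up to the next multiple of size_divisible; a standalone check:
```python
import torch
from utils.transform_utils import batch_images

images = [torch.rand(3, 281, 500), torch.rand(3, 264, 400)]
batched = batch_images(images, size_divisible=32)
# each dimension is padded up to the next multiple of 32:
# max height 281 -> ceil(281 / 32) * 32 = 288, max width 500 -> 512
print(batched.shape)  # torch.Size([2, 3, 288, 512])
```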
48 | class GeneralizedRCNNTransform(nn.Module):
49 | """
50 | Performs input / target transformation before feeding the data to a GeneralizedRCNN model.
51 | The transformations it performs are:
52 | - input normalization (mean subtraction and std division)
53 | - input / target resizing to match min_size / max_size
54 |
55 | It returns an ImageList for the inputs, and a List[Dict[Tensor]] for the targets
56 | :param min_size: minimum size of the input image
57 | :param max_size: maximum size of the input image
58 | :param image_mean: image mean
59 | :param image_std: image std
60 | """
61 |
62 | def __init__(self, min_size, max_size, image_mean, image_std):
63 | super(GeneralizedRCNNTransform, self).__init__()
64 | if not isinstance(min_size, (list, tuple)):
65 | min_size = (min_size,)
66 | self.min_size = min_size
67 | self.max_size = max_size
68 | self.image_mean = image_mean
69 | self.image_std = image_std
70 |
71 | def normalize(self, image):
72 | dtype, device = image.dtype, image.device
73 | mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
74 | std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
75 | return (image - mean[:, None, None]) / std[:, None, None]
76 |
77 | def resize(self, image, target):
78 | """
79 | resize the input image to the specified size and transform the target accordingly
80 | :param image: input image
81 | :param target: target related info, like bbox
82 | :return:
83 | image: resized image
84 | target: resized target
85 | """
86 |
87 | # image shape is [channel, height, width]
88 | h, w = image.shape[-2:]
89 | im_shape = torch.tensor(image.shape[-2:])
90 | min_size = float(torch.min(im_shape))
91 | max_size = float(torch.max(im_shape))
92 | if self.training:
93 | size = float(torch_choice(self.min_size))
94 | else:
95 | size = float(self.min_size[-1])
96 | scale_factor = size / min_size
97 |
98 | if max_size * scale_factor > self.max_size:
99 | scale_factor = self.max_size / max_size
100 |
101 | image = torch.nn.functional.interpolate(
102 | image[None], scale_factor=scale_factor, mode='bilinear', align_corners=False)[0]
103 |
104 | if target is None:
105 | return image, target
106 |
107 | bbox = target["boxes"]
108 | bbox = resize_boxes(bbox, (h, w), image.shape[-2:])
109 | target["boxes"] = bbox
110 |
111 | return image, target
112 |
113 | def postprocess(self, result, image_shapes, original_image_sizes):
114 | """
115 | post-process the predictions, mainly mapping bbox coordinates back to the original image
116 | :param result: prediction results
117 | :param image_shapes: image sizes after preprocessing
118 | :param original_image_sizes: original image sizes
119 | :return:
120 | """
121 |
122 | if self.training:
123 | return result
124 | for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
125 | boxes = pred["boxes"]
126 | boxes = resize_boxes(boxes, im_s, o_im_s)
127 | result[i]["boxes"] = boxes
128 | return result
129 |
130 | def forward(self, images, targets=None):
131 | images = [img for img in images]
132 | for i in range(len(images)):
133 | image = images[i]
134 | target_index = targets[i] if targets is not None else None
135 |
136 | if image.dim() != 3:
137 | raise ValueError("images is expected to be a list of 3d tensors "
138 | "of shape [C, H, W], got {}".format(image.shape))
139 | image = self.normalize(image)
140 | image, target_index = self.resize(image, target_index)
141 | images[i] = image
142 | if targets is not None and target_index is not None:
143 | targets[i] = target_index
144 |
145 | # save resized image size
146 | image_sizes = [img.shape[-2:] for img in images]
147 | images = batch_images(images)
148 | image_sizes_list = torch.jit.annotate(List[Tuple[int, int]], [])
149 |
150 | for image_size in image_sizes:
151 | assert len(image_size) == 2
152 | image_sizes_list.append((image_size[0], image_size[1]))
153 |
154 | image_list = ImageList(images, image_sizes_list)
155 | return image_list, targets
156 |
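End to end, the transform above normalizes, resizes, and batches a list of images; a standalone sketch (ImageNet mean/std assumed here, as an illustration only):
```python
import torch
from utils.transform_utils import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(min_size=300, max_size=800,
                                     image_mean=[0.485, 0.456, 0.406],
                                     image_std=[0.229, 0.224, 0.225])
transform.eval()  # use the last min_size instead of a random training-time choice

images = [torch.rand(3, 400, 600), torch.rand(3, 500, 500)]
image_list, _ = transform(images)
print(image_list.tensors.shape)  # padded batch, e.g. torch.Size([2, 3, 320, 480])
print(image_list.image_sizes)    # per-image sizes after resize, before padding
```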
157 |
158 | def resize_boxes(boxes, original_size, new_size):
159 | """
160 | rescale bboxes from original_size to new_size
161 | :param boxes: predicted bboxes
162 | :param original_size: original image size
163 | :param new_size: rescaled image size
164 | :return:
165 | """
166 | ratios = [
167 | torch.tensor(s, dtype=torch.float32, device=boxes.device) /
168 | torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
169 | for s, s_orig in zip(new_size, original_size)
170 | ]
171 | ratios_height, ratios_width = ratios
172 |
173 | xmin, ymin, xmax, ymax = boxes.unbind(1)
174 | xmin = xmin * ratios_width
175 | xmax = xmax * ratios_width
176 | ymin = ymin * ratios_height
177 | ymax = ymax * ratios_height
178 | return torch.stack((xmin, ymin, xmax, ymax), dim=1)
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
--------------------------------------------------------------------------------