├── .gitignore
├── README.md
├── backbone
│   ├── fpn101.py
│   ├── hrnet.py
│   ├── mobilenet.py
│   ├── resnet50_fpn_model.py
│   └── vgg16.py
├── config
│   ├── test_config.py
│   └── train_config.py
├── dataloader
│   └── coco_dataset.py
├── imgs
│   └── demo1.png
├── requirements.txt
├── test.py
├── test
│   └── anchor_utils_test.py
├── train.py
└── utils
    ├── anchor_utils.py
    ├── boxes_utils.py
    ├── coco_utils.py
    ├── det_utils.py
    ├── draw_box_utils.py
    ├── evaluate_utils.py
    ├── faster_rcnn_utils.py
    ├── im_utils.py
    ├── plot_utils.py
    ├── roi_header_util.py
    ├── rpn_utils.py
    ├── train_utils.py
    └── transform_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | .idea/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # pipenv
90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
93 | # install all needed dependencies.
94 | #Pipfile.lock
95 |
96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
97 | __pypackages__/
98 |
99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
19 | # pytorch-faster-rcnn
20 | ## 1. Introduction
21 | A PyTorch-based implementation of the Faster R-CNN detection framework. For details about Faster R-CNN, please refer to the paper [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497) by Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
22 |
23 |
24 | This detection framework has the following features:
25 | * It runs as pure Python code on top of PyTorch; nothing needs to be built or compiled
26 | * It is trained by simply running the train.py script after setting the data root dir
27 | * It supports multiple backbone networks: VGG, ResNet-FPN, MobileNet, and High-Resolution Net (HRNet)
28 | * It works as a general detection framework: change the hyperparameters in the config file to train and compare different models
29 | * It's memory-efficient (about 3GB for vgg16)
30 | ## 2. Installation
31 | ### 2.1 Prerequisites
32 | * Python 3.5 or later
33 | * Pytorch 1.5.1
34 | * torchvision 0.6.1
35 | * numpy 1.15.4
36 | * Pillow 6.1.0
37 | * pycocotools 2.0
38 | * matplotlib 3.0.2
39 | * tensorboardX 2.0
40 | ```Shell
41 | pip install -r requirements.txt
42 | ```
43 | ### 2.2 Code-Preparing
44 | ```Shell
45 | git clone https://github.com/AlphaJia/pytorch-faster-rcnn.git
46 | ```
47 | ## 3. Data Preparation
48 | ### COCO
49 | ##### 3.1 Download the training, validation, test data and annotations
50 | ```Shell
51 | wget http://images.cocodataset.org/zips/train2017.zip
52 | wget http://images.cocodataset.org/zips/val2017.zip
53 | wget http://images.cocodataset.org/zips/test2017.zip
54 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
55 | ```
56 | ##### 3.2 Extract all of these archives into one directory named COCODevKit
57 | ```Shell
58 | unzip train2017.zip -d COCODevKit
59 | unzip val2017.zip -d COCODevKit
60 | unzip test2017.zip -d COCODevKit
61 | unzip annotations_trainval2017.zip -d COCODevKit
62 | ```
63 | ##### 3.3 Data dir should look like this
64 | ```
65 | COCODevKit
66 | |-- train2017
67 | |-- [xxxxxxxxxxxx].jpg
68 | |-- val2017
69 | |-- [xxxxxxxxxxxx].jpg
70 | |-- test2017
71 | |-- [xxxxxxxxxxxx].jpg
72 | |-- annotations
73 | |-- instances_train2017.json
74 | |-- instances_val2017.json
75 | |-- image_info_test2017.json
76 | ```
77 | ##### 3.4 Modify the data_root_dir cfg item in config/train_config.py to /path/COCODevKit/
78 |
79 | ## 4. Train
80 | Modify the model_save_dir cfg item in config/train_config.py with your own save path and device_name with your own device.
81 | * Train with [mobilenet](https://arxiv.org/abs/1801.04381)
82 | Set the backbone cfg item in config/train_config.py to mobilenet, download the pretrained weights [here](https://download.pytorch.org/models/mobilenet_v2-b0353104.pth), and set backbone_pretrained_weights in config/train_config.py to the downloaded path.
83 | ```Shell
84 | python train.py
85 | ```
86 | * Train with [resnet-fpn](https://arxiv.org/abs/1512.03385)
87 | Set the backbone cfg item in config/train_config.py to resnet50_fpn, download the pretrained weights [here](https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth), and set backbone_pretrained_weights in config/train_config.py to the downloaded path
88 | ```Shell
89 | python train.py
90 | ```
91 | * Train with [vgg16](https://arxiv.org/abs/1409.1556)
92 | Set the backbone cfg item in config/train_config.py to vgg16
93 | ```Shell
94 | python train.py
95 | ```
96 | * Train with [HRNet](https://arxiv.org/abs/1908.07919)
97 | Set the backbone cfg item in config/train_config.py to HRNet
98 | ```Shell
99 | python train.py
100 | ```
101 |
102 | Weights and the tensorboard log will be saved in your model_save_dir.
103 | Refer to config/train_config.py for more arguments.
104 | Some key arguments:
105 | `--backbone`: feature extraction backbone network
106 | `--backbone_pretrained_weights`: backbone pretrained weights, None or path
107 | `--train_horizon_flip_prob`: data horizontal flip probability
108 | `--num_class`: number of classes, including background
109 | `--data_root_dir`: COCO dataset root dir
110 | `--model_save_dir`: training weights save path
111 | `--device_name`: training device
112 | `--num_epochs`: training epochs
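
Putting these together, a minimal edit to config/train_config.py for MobileNet training could look like the sketch below (paths, device, and epoch count are placeholders; the full config file with all options ships with the repo):
```Python
# config/train_config.py -- illustrative values only
class Config:
    backbone = 'mobilenet'                      # vgg16 / resnet50_fpn / mobilenet / HRNet
    backbone_pretrained_weights = '/path/to/mobilenet_v2-b0353104.pth'  # or None
    train_horizon_flip_prob = 0.5               # horizontal flip probability
    num_class = 80 + 1                          # 80 COCO classes + background
    data_root_dir = '/path/COCODevKit/'
    model_save_dir = '/path/to/checkpoints/'
    device_name = 'cuda:0'
    num_epochs = 20
    # ... keep the remaining fields from the shipped config unchanged
```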
113 | ## 5. Test
114 | Modify the model_weights cfg item in config/test_config.py with your trained weights path and gpu_id with your own CUDA device ID.
115 | Refer to config/test_config.py for more arguments.
116 | Some key arguments:
117 | `--model_weights`: path to the trained weights
118 | `--image_path`: image to run prediction on
119 | `--gpu_id`: CUDA device ID
120 | `--num_classes`: number of classes, including background
121 | `--data_root_dir`: COCO dataset root dir
122 |
123 | ```Shell
124 | python test.py
125 | ```
126 | ## 6. Demo
127 | 
128 | ## 7. Framework Structure
129 | #### backbone
130 | This module includes the backbone feature extraction networks
131 | * vgg16: VGG-16 network([Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556))
132 | * fpn101: ResNet-101 FPN network([Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)) ([Feature Pyramid Networks for Object Detection](https://arxiv.org/abs/1612.03144))
133 | * hrnet: High-Resolution Net([Deep High-Resolution Representation Learning for Visual Recognition](https://arxiv.org/abs/1908.07919))
134 | * mobilenet: MobileNetV2 network([MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381))
135 | #### config
136 | This module includes config parameters for the training and testing stages
137 | * test_config: specifies config parameters for testing, like model_file, image_path_dir, save_dir, etc.
138 | * train_config: specifies config parameters for training, like backbone network, batch_size, image_path_dir, anchor_size, etc.
139 | #### dataloader
140 | This module implements dataset IO on top of the PyTorch Dataset/DataLoader classes. You can also write your own dataset IO and put it in this module
141 | * coco_dataset: coco([Common Objects in Context](https://cocodataset.org/#home)) dataset dataloader IO
142 | #### test
143 | This module includes unit tests (UT) for the utils functions
144 | * anchor_utils_test: some unit testing for utils/anchor_utils.py
145 | #### utils
146 | This module includes some utilities for image processing, network architecture building, anchor generation, loss functions, etc.
147 | * anchor_utils: basic functions for building anchors
148 | * im_utils: basic functions for image processing
149 |
150 |
--------------------------------------------------------------------------------
/backbone/fpn101.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
6 |
7 | class Bottleneck(nn.Module):
8 | expansion = 4
9 |
10 | def __init__(self, in_planes, planes, stride=1, downsample=None):
11 | super(Bottleneck, self).__init__()
12 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
13 | self.bn1 = nn.BatchNorm2d(planes)
14 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
15 | self.bn2 = nn.BatchNorm2d(planes)
16 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
17 | self.bn3 = nn.BatchNorm2d(self.expansion * planes)
18 | self.relu = nn.ReLU(inplace=True)
19 | self.downsample = downsample
20 | self.stride = stride
21 |
22 | def forward(self, x):
23 | residual = x
24 |
25 | out = self.conv1(x)
26 | out = self.bn1(out)
27 | out = self.relu(out)
28 |
29 | out = self.conv2(out)
30 | out = self.bn2(out)
31 | out = self.relu(out)
32 |
33 | out = self.conv3(out)
34 | out = self.bn3(out)
35 |
36 | if self.downsample is not None:
37 | residual = self.downsample(x)
38 |
39 | out += residual
40 | out = self.relu(out)
41 |
42 | return out
43 |
44 |
45 | class FPN(nn.Module):
46 | def __init__(self, block, layers):
47 | super(FPN, self).__init__()
48 | self.inplanes = 64
49 |
50 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
51 | self.bn1 = nn.BatchNorm2d(64)
52 |
53 | self.relu = nn.ReLU(inplace=True)
54 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
55 | # Bottom-up layers
56 | self.layer1 = self._make_layer(block, 64, layers[0])
57 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
58 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
59 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
60 |
61 | # Top layer
62 | self.toplayer = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0) # Reduce channels
63 |
64 | # Smooth layers
65 | self.smooth1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
66 | self.smooth2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
67 | self.smooth3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
68 |
69 | # Lateral layers
70 | self.latlayer1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
71 | self.latlayer2 = nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0)
72 | self.latlayer3 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
73 |
74 | for m in self.modules():
75 | if isinstance(m, nn.Conv2d):
76 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
77 | m.weight.data.normal_(0, math.sqrt(2. / n))
78 | elif isinstance(m, nn.BatchNorm2d):
79 | m.weight.data.fill_(1)
80 | m.bias.data.zero_()
81 |
82 | def _make_layer(self, block, planes, blocks, stride=1):
83 | downsample = None
84 | if stride != 1 or self.inplanes != block.expansion * planes:
85 | downsample = nn.Sequential(
86 | nn.Conv2d(self.inplanes, block.expansion * planes, kernel_size=1, stride=stride, bias=False),
87 | nn.BatchNorm2d(block.expansion * planes)
88 | )
89 | layers = []
90 | layers.append(block(self.inplanes, planes, stride, downsample))
91 | self.inplanes = planes * block.expansion
92 | for i in range(1, blocks):
93 | layers.append(block(self.inplanes, planes))
94 |
95 | return nn.Sequential(*layers)
96 |
97 | def _upsample_add(self, x, y):
98 | _, _, H, W = y.size()
99 |         return F.interpolate(x, size=(H, W), mode='bilinear', align_corners=False) + y  # F.upsample is deprecated
100 |
101 | def forward(self, x):
102 | # Bottom-up
103 | x = self.conv1(x)
104 | x = self.bn1(x)
105 | x = self.relu(x)
106 | c1 = self.maxpool(x)
107 |
108 | c2 = self.layer1(c1)
109 | c3 = self.layer2(c2)
110 | c4 = self.layer3(c3)
111 | c5 = self.layer4(c4)
112 | # Top-down
113 | p5 = self.toplayer(c5)
114 | p4 = self._upsample_add(p5, self.latlayer1(c4))
115 | p3 = self._upsample_add(p4, self.latlayer2(c3))
116 | p2 = self._upsample_add(p3, self.latlayer3(c2))
117 | # Smooth
118 | p4 = self.smooth1(p4)
119 | p3 = self.smooth2(p3)
120 | p2 = self.smooth3(p2)
121 | return p2, p3, p4, p5
122 |
123 |
124 | def FPN101():
125 |     return FPN(Bottleneck, [3, 4, 23, 3])  # ResNet-101 block counts; the original [2, 2, 2, 2] matched a ResNet-18-style depth
126 |
--------------------------------------------------------------------------------
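
A minimal shape-check sketch for backbone/fpn101.py above (illustrative, assuming the repo root is on PYTHONPATH; not part of the repo):
```Python
import torch

from backbone.fpn101 import FPN101

# the FPN returns four 256-channel maps at strides 4, 8, 16 and 32
fpn = FPN101().eval()
with torch.no_grad():
    p2, p3, p4, p5 = fpn(torch.randn(1, 3, 224, 224))
for name, p in zip(("p2", "p3", "p4", "p5"), (p2, p3, p4, p5)):
    print(name, tuple(p.shape))  # (1, 256, 56, 56) ... (1, 256, 7, 7)
```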
/backbone/hrnet.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | class Bottleneck(nn.Module):
5 | expansion = 4
6 |
7 | def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
8 | super(Bottleneck, self).__init__()
9 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
10 | self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
11 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
12 | self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
13 | self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
14 | self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=bn_momentum)
15 | self.relu = nn.ReLU(inplace=True)
16 | self.downsample = downsample
17 | self.stride = stride
18 |
19 | def forward(self, x):
20 | residual = x
21 |
22 | out = self.conv1(x)
23 | out = self.bn1(out)
24 | out = self.relu(out)
25 |
26 | out = self.conv2(out)
27 | out = self.bn2(out)
28 | out = self.relu(out)
29 |
30 | out = self.conv3(out)
31 | out = self.bn3(out)
32 |
33 | if self.downsample is not None:
34 | residual = self.downsample(x)
35 |
36 | out += residual
37 | out = self.relu(out)
38 |
39 | return out
40 |
41 |
42 | class BasicBlock(nn.Module):
43 | expansion = 1
44 |
45 | def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
46 | super(BasicBlock, self).__init__()
47 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
48 | self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
49 | self.relu = nn.ReLU(inplace=True)
50 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)  # conv2 takes conv1's output (planes channels)
51 | self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
52 | self.downsample = downsample
53 | self.stride = stride
54 |
55 | def forward(self, x):
56 | residual = x
57 |
58 | out = self.conv1(x)
59 | out = self.bn1(out)
60 | out = self.relu(out)
61 |
62 | out = self.conv2(out)
63 | out = self.bn2(out)
64 |
65 | if self.downsample is not None:
66 | residual = self.downsample(x)
67 |
68 | out += residual
69 | out = self.relu(out)
70 |
71 | return out
72 |
73 |
74 | class StageModule(nn.Module):
75 | def __init__(self, stage, output_branches, c, bn_momentum):
76 | super(StageModule, self).__init__()
77 | self.stage = stage
78 | self.output_branches = output_branches
79 |
80 | self.branches = nn.ModuleList()
81 | for i in range(self.stage):
82 | w = c * (2 ** i)
83 | branch = nn.Sequential(
84 | BasicBlock(w, w, bn_momentum=bn_momentum),
85 | BasicBlock(w, w, bn_momentum=bn_momentum),
86 | BasicBlock(w, w, bn_momentum=bn_momentum),
87 | BasicBlock(w, w, bn_momentum=bn_momentum),
88 | )
89 | self.branches.append(branch)
90 |
91 | self.fuse_layers = nn.ModuleList()
92 | # for each output_branches (i.e. each branch in all cases but the very last one)
93 | for i in range(self.output_branches):
94 | self.fuse_layers.append(nn.ModuleList())
95 | for j in range(self.stage): # for each branch
96 | if i == j:
97 | self.fuse_layers[-1].append(nn.Sequential()) # Used in place of "None" because it is callable
98 | elif i < j:
99 | self.fuse_layers[-1].append(nn.Sequential(
100 | nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(1, 1), stride=(1, 1), bias=False),
101 | nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
102 | nn.Upsample(scale_factor=(2.0 ** (j - i)), mode='nearest'),
103 | ))
104 | elif i > j:
105 | ops = []
106 | for k in range(i - j - 1):
107 | ops.append(nn.Sequential(
108 | nn.Conv2d(c * (2 ** j), c * (2 ** j), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),
109 | bias=False),
110 | nn.BatchNorm2d(c * (2 ** j), eps=1e-05, momentum=0.1, affine=True,
111 | track_running_stats=True),
112 | nn.ReLU(inplace=True),
113 | ))
114 | ops.append(nn.Sequential(
115 | nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),
116 | bias=False),
117 | nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
118 | ))
119 | self.fuse_layers[-1].append(nn.Sequential(*ops))
120 |
121 | self.relu = nn.ReLU(inplace=True)
122 |
123 | def forward(self, x):
124 | assert len(self.branches) == len(x)
125 |
126 | x = [branch(b) for branch, b in zip(self.branches, x)]
127 |
128 | x_fused = []
129 | for i in range(len(self.fuse_layers)):
130 | for j in range(0, len(self.branches)):
131 | if j == 0:
132 | x_fused.append(self.fuse_layers[i][0](x[0]))
133 | else:
134 | x_fused[i] = x_fused[i] + self.fuse_layers[i][j](x[j])
135 |
136 | for i in range(len(x_fused)):
137 | x_fused[i] = self.relu(x_fused[i])
138 |
139 | return x_fused
140 |
141 |
142 | class HRNet(nn.Module):
143 | def __init__(self, c=48, nof_joints=17, bn_momentum=0.1):
144 | super(HRNet, self).__init__()
145 |
146 | # Input (stem net)
147 | self.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
148 | self.bn1 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True)
149 | self.conv2 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
150 | self.bn2 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True)
151 | self.relu = nn.ReLU(inplace=True)
152 |
153 | # Stage 1 (layer1) - First group of bottleneck (resnet) modules
154 | downsample = nn.Sequential(
155 | nn.Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False),
156 | nn.BatchNorm2d(256, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
157 | )
158 | self.layer1 = nn.Sequential(
159 | Bottleneck(64, 64, downsample=downsample),
160 | Bottleneck(256, 64),
161 | Bottleneck(256, 64),
162 | Bottleneck(256, 64),
163 | )
164 |
165 | # Fusion layer 1 (transition1) - Creation of the first two branches (one full and one half resolution)
166 | self.transition1 = nn.ModuleList([
167 | nn.Sequential(
168 | nn.Conv2d(256, c, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
169 | nn.BatchNorm2d(c, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
170 | nn.ReLU(inplace=True),
171 | ),
172 | nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights
173 | nn.Conv2d(256, c * (2 ** 1), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
174 | nn.BatchNorm2d(c * (2 ** 1), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
175 | nn.ReLU(inplace=True),
176 | )),
177 | ])
178 |
179 | # Stage 2 (stage2) - Second module with 1 group of bottleneck (resnet) modules. This has 2 branches
180 | self.stage2 = nn.Sequential(
181 | StageModule(stage=2, output_branches=2, c=c, bn_momentum=bn_momentum),
182 | )
183 |
184 | # Fusion layer 2 (transition2) - Creation of the third branch (1/4 resolution)
185 | self.transition2 = nn.ModuleList([
186 | nn.Sequential(), # None, - Used in place of "None" because it is callable
187 | nn.Sequential(), # None, - Used in place of "None" because it is callable
188 | nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights
189 | nn.Conv2d(c * (2 ** 1), c * (2 ** 2), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
190 | nn.BatchNorm2d(c * (2 ** 2), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
191 | nn.ReLU(inplace=True),
192 | )), # ToDo Why the new branch derives from the "upper" branch only?
193 | ])
194 |
195 | # Stage 3 (stage3) - Third module with 4 groups of bottleneck (resnet) modules. This has 3 branches
196 | self.stage3 = nn.Sequential(
197 | StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
198 | StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
199 | StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
200 | StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
201 | )
202 |
203 | # Fusion layer 3 (transition3) - Creation of the fourth branch (1/8 resolution)
204 | self.transition3 = nn.ModuleList([
205 | nn.Sequential(), # None, - Used in place of "None" because it is callable
206 | nn.Sequential(), # None, - Used in place of "None" because it is callable
207 | nn.Sequential(), # None, - Used in place of "None" because it is callable
208 | nn.Sequential(nn.Sequential( # Double Sequential to fit with official pretrained weights
209 | nn.Conv2d(c * (2 ** 2), c * (2 ** 3), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
210 | nn.BatchNorm2d(c * (2 ** 3), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
211 | nn.ReLU(inplace=True),
212 | )),
213 | ])
214 |
215 | # Stage 4 (stage4) - Fourth module with 3 groups of bottleneck (resnet) modules. This has 4 branches
216 | self.stage4 = nn.Sequential(
217 | StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum),
218 | StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum),
219 | StageModule(stage=4, output_branches=1, c=c, bn_momentum=bn_momentum),
220 | )
221 |
222 | # Final layer (final_layer)
223 | self.final_layer = nn.Conv2d(c, nof_joints, kernel_size=(1, 1), stride=(1, 1))
224 |
225 | def forward(self, x):
226 | x = self.conv1(x)
227 | x = self.bn1(x)
228 | x = self.relu(x)
229 | x = self.conv2(x)
230 | x = self.bn2(x)
231 | x = self.relu(x)
232 |
233 | x = self.layer1(x)
234 | x = [trans(x) for trans in self.transition1] # Since now, x is a list (# == nof branches)
235 |
236 | x = self.stage2(x)
237 | # x = [trans(x[-1]) for trans in self.transition2] # New branch derives from the "upper" branch only
238 | x = [
239 | self.transition2[0](x[0]),
240 | self.transition2[1](x[1]),
241 | self.transition2[2](x[-1])
242 | ] # New branch derives from the "upper" branch only
243 |
244 | x = self.stage3(x)
245 | # x = [trans(x) for trans in self.transition3] # New branch derives from the "upper" branch only
246 | x = [
247 | self.transition3[0](x[0]),
248 | self.transition3[1](x[1]),
249 | self.transition3[2](x[2]),
250 | self.transition3[3](x[-1])
251 | ] # New branch derives from the "upper" branch only
252 |
253 | x = self.stage4(x)
254 |
255 | x = self.final_layer(x[0])
256 |
257 | return x
258 |
--------------------------------------------------------------------------------
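
A minimal forward-pass sketch for backbone/hrnet.py above (illustrative; the widths match HRNet-W48, and the input height/width should be divisible by 32 so the four branches align):
```Python
import torch

from backbone.hrnet import HRNet

net = HRNet(c=48, nof_joints=17).eval()
with torch.no_grad():
    out = net(torch.randn(1, 3, 256, 192))
print(tuple(out.shape))  # (1, 17, 64, 48): nof_joints heatmaps at 1/4 of the input resolution
```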
/backbone/mobilenet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | def _make_divisible(ch, divisor=8, min_ch=None):
6 | if min_ch is None:
7 | min_ch = divisor
8 | new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor)
9 | # Make sure that round down does not go down by more than 10%.
10 | if new_ch < 0.9 * ch:
11 | new_ch += divisor
12 | return new_ch
13 |
14 |
15 | class ConvBNReLU(nn.Sequential):
16 | def __init__(self, in_channel, out_channel, kernel_size=3, stride=1, groups=1, norm_layer=None):
17 | padding = (kernel_size - 1) // 2
18 | if norm_layer is None:
19 | norm_layer = nn.BatchNorm2d
20 | super(ConvBNReLU, self).__init__(
21 | nn.Conv2d(in_channel, out_channel, kernel_size, stride, padding, groups=groups, bias=False),
22 | norm_layer(out_channel),
23 | nn.ReLU6(inplace=True)
24 | )
25 |
26 |
27 | class InvertedResidual(nn.Module):
28 | def __init__(self, in_channel, out_channel, stride, expand_ratio, norm_layer=None):
29 | super(InvertedResidual, self).__init__()
30 | hidden_channel = in_channel * expand_ratio
31 | self.use_shortcut = stride == 1 and in_channel == out_channel
32 | if norm_layer is None:
33 | norm_layer = nn.BatchNorm2d
34 |
35 | layers = []
36 | if expand_ratio != 1:
37 | # 1x1 pointwise conv
38 | layers.append(ConvBNReLU(in_channel, hidden_channel, kernel_size=1, norm_layer=norm_layer))
39 | layers.extend([
40 | # 3x3 depthwise conv
41 | ConvBNReLU(hidden_channel, hidden_channel, stride=stride, groups=hidden_channel, norm_layer=norm_layer),
42 | # 1x1 pointwise conv(linear)
43 | nn.Conv2d(hidden_channel, out_channel, kernel_size=1, bias=False),
44 | norm_layer(out_channel),
45 | ])
46 |
47 | self.conv = nn.Sequential(*layers)
48 |
49 | def forward(self, x):
50 | if self.use_shortcut:
51 | return x + self.conv(x)
52 | else:
53 | return self.conv(x)
54 |
55 |
56 | class MobileNetV2(nn.Module):
57 | def __init__(self, num_classes=1000, alpha=1.0, round_nearest=8, weights_path=None, norm_layer=None):
58 | super(MobileNetV2, self).__init__()
59 | block = InvertedResidual
60 | input_channel = _make_divisible(32 * alpha, round_nearest)
61 | last_channel = _make_divisible(1280 * alpha, round_nearest)
62 |
63 | if norm_layer is None:
64 | norm_layer = nn.BatchNorm2d
65 |
66 | inverted_residual_setting = [
67 | # t, c, n, s
68 | [1, 16, 1, 1],
69 | [6, 24, 2, 2],
70 | [6, 32, 3, 2],
71 | [6, 64, 4, 2],
72 | [6, 96, 3, 1],
73 | [6, 160, 3, 2],
74 | [6, 320, 1, 1],
75 | ]
76 |
77 | features = []
78 | # conv1 layer
79 | features.append(ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer))
80 | # building inverted residual residual blockes
81 | for t, c, n, s in inverted_residual_setting:
82 | output_channel = _make_divisible(c * alpha, round_nearest)
83 | for i in range(n):
84 | stride = s if i == 0 else 1
85 | features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer))
86 | input_channel = output_channel
87 | # building last several layers
88 | features.append(ConvBNReLU(input_channel, last_channel, 1, norm_layer=norm_layer))
89 | # combine feature layers
90 | self.features = nn.Sequential(*features)
91 |
92 | # building classifier
93 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
94 | self.classifier = nn.Sequential(
95 | nn.Dropout(0.2),
96 | nn.Linear(last_channel, num_classes)
97 | )
98 |
99 | if weights_path is None:
100 | # weight initialization
101 | for m in self.modules():
102 | if isinstance(m, nn.Conv2d):
103 | nn.init.kaiming_normal_(m.weight, mode='fan_out')
104 | if m.bias is not None:
105 | nn.init.zeros_(m.bias)
106 | elif isinstance(m, nn.BatchNorm2d):
107 | nn.init.ones_(m.weight)
108 | nn.init.zeros_(m.bias)
109 | elif isinstance(m, nn.Linear):
110 | nn.init.normal_(m.weight, 0, 0.01)
111 | nn.init.zeros_(m.bias)
112 | else:
113 | self.load_state_dict(torch.load(weights_path))
114 |
115 | def forward(self, x):
116 | x = self.features(x)
117 | x = self.avgpool(x)
118 | x = torch.flatten(x, 1)
119 | x = self.classifier(x)
120 | return x
121 |
--------------------------------------------------------------------------------
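
One common way to reuse this classifier's feature extractor as a Faster R-CNN backbone is the torchvision recipe sketched below. This is only an illustration of the idea, not necessarily what utils/train_utils.create_model does, and it assumes a recent torchvision where the single feature map is keyed by the string '0':
```Python
import torch
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign

from backbone.mobilenet import MobileNetV2

backbone = MobileNetV2(weights_path=None).features
backbone.out_channels = 1280  # channels of the last ConvBNReLU block

anchor_generator = AnchorGenerator(sizes=((64, 128, 256),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))
roi_pooler = MultiScaleRoIAlign(featmap_names=['0'], output_size=7, sampling_ratio=2)

model = FasterRCNN(backbone, num_classes=81,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)
model.eval()
with torch.no_grad():
    predictions = model([torch.randn(3, 480, 640)])  # list with one dict: boxes, labels, scores
```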
/backbone/resnet50_fpn_model.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch import Tensor
7 | from torch.jit.annotations import Tuple, List, Dict
8 |
9 |
10 | class Bottleneck(nn.Module):
11 | expansion = 4
12 |
13 | def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None):
14 | super(Bottleneck, self).__init__()
15 | if norm_layer is None:
16 | norm_layer = nn.BatchNorm2d
17 |
18 | self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
19 | kernel_size=1, stride=1, bias=False) # squeeze channels
20 | self.bn1 = norm_layer(out_channel)
21 | # -----------------------------------------
22 | self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
23 | kernel_size=3, stride=stride, bias=False, padding=1)
24 | self.bn2 = norm_layer(out_channel)
25 | # -----------------------------------------
26 | self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel * self.expansion,
27 | kernel_size=1, stride=1, bias=False) # unsqueeze channels
28 | self.bn3 = norm_layer(out_channel * self.expansion)
29 | self.relu = nn.ReLU(inplace=True)
30 | self.downsample = downsample
31 |
32 | def forward(self, x):
33 | identity = x
34 | if self.downsample is not None:
35 | identity = self.downsample(x)
36 |
37 | out = self.conv1(x)
38 | out = self.bn1(out)
39 | out = self.relu(out)
40 |
41 | out = self.conv2(out)
42 | out = self.bn2(out)
43 | out = self.relu(out)
44 |
45 | out = self.conv3(out)
46 | out = self.bn3(out)
47 |
48 | out += identity
49 | out = self.relu(out)
50 |
51 | return out
52 |
53 |
54 | class ResNet(nn.Module):
55 |
56 | def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None):
57 | super(ResNet, self).__init__()
58 | if norm_layer is None:
59 | norm_layer = nn.BatchNorm2d
60 | self._norm_layer = norm_layer
61 |
62 | self.include_top = include_top
63 | self.in_channel = 64
64 |
65 | self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
66 | padding=3, bias=False)
67 | self.bn1 = norm_layer(self.in_channel)
68 | self.relu = nn.ReLU(inplace=True)
69 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
70 | self.layer1 = self._make_layer(block, 64, blocks_num[0])
71 | self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
72 | self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
73 | self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
74 | if self.include_top:
75 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) # output size = (1, 1)
76 | self.fc = nn.Linear(512 * block.expansion, num_classes)
77 |
78 | for m in self.modules():
79 | if isinstance(m, nn.Conv2d):
80 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
81 |
82 | def _make_layer(self, block, channel, block_num, stride=1):
83 | norm_layer = self._norm_layer
84 | downsample = None
85 | if stride != 1 or self.in_channel != channel * block.expansion:
86 | downsample = nn.Sequential(
87 | nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
88 | norm_layer(channel * block.expansion))
89 |
90 | layers = []
91 | layers.append(block(self.in_channel, channel, downsample=downsample,
92 | stride=stride, norm_layer=norm_layer))
93 | self.in_channel = channel * block.expansion
94 |
95 | for _ in range(1, block_num):
96 | layers.append(block(self.in_channel, channel, norm_layer=norm_layer))
97 |
98 | return nn.Sequential(*layers)
99 |
100 | def forward(self, x):
101 | x = self.conv1(x)
102 | x = self.bn1(x)
103 | x = self.relu(x)
104 | x = self.maxpool(x)
105 |
106 | x = self.layer1(x)
107 | x = self.layer2(x)
108 | x = self.layer3(x)
109 | x = self.layer4(x)
110 |
111 | if self.include_top:
112 | x = self.avgpool(x)
113 | x = torch.flatten(x, 1)
114 | x = self.fc(x)
115 |
116 | return x
117 |
118 |
119 | class IntermediateLayerGetter(nn.ModuleDict):
120 | """
121 | Module wrapper that returns intermediate layers from a model
122 | It has a strong assumption that the modules have been registered
123 | into the model in the same order as they are used.
124 | This means that one should **not** reuse the same nn.Module
125 | twice in the forward if you want this to work.
126 | Additionally, it is only able to query submodules that are directly
127 | assigned to the model. So if `model` is passed, `model.feature1` can
128 | be returned, but not `model.feature1.layer2`.
129 | Arguments:
130 | model (nn.Module): model on which we will extract the features
131 | return_layers (Dict[name, new_name]): a dict containing the names
132 | of the modules for which the activations will be returned as
133 | the key of the dict, and the value of the dict is the name
134 | of the returned activation (which the user can specify).
135 | """
136 | __annotations__ = {
137 | "return_layers": Dict[str, str],
138 | }
139 |
140 | def __init__(self, model, return_layers):
141 | if not set(return_layers).issubset([name for name, _ in model.named_children()]):
142 | raise ValueError("return_layers are not present in model")
143 |
144 | orig_return_layers = return_layers
145 | return_layers = {k: v for k, v in return_layers.items()}
146 | layers = OrderedDict()
147 |
148 | for name, module in model.named_children():
149 | layers[name] = module
150 | if name in return_layers:
151 | del return_layers[name]
152 | if not return_layers:
153 | break
154 |
155 | super(IntermediateLayerGetter, self).__init__(layers)
156 | self.return_layers = orig_return_layers
157 |
158 | def forward(self, x):
159 | out = OrderedDict()
160 | for name, module in self.named_children():
161 | x = module(x)
162 | if name in self.return_layers:
163 | out_name = self.return_layers[name]
164 | out[out_name] = x
165 | return out
166 |
167 |
168 | class FeaturePyramidNetwork(nn.Module):
169 | """
170 |     Module that adds an FPN on top of a set of feature maps. This is based on
171 |     `"Feature Pyramid Networks for Object Detection" <https://arxiv.org/abs/1612.03144>`_.
172 | The feature maps are currently supposed to be in increasing depth
173 | order.
174 | The input to the model is expected to be an OrderedDict[Tensor], containing
175 | the feature maps on top of which the FPN will be added.
176 | Arguments:
177 | in_channels_list (list[int]): number of channels for each feature map that
178 | is passed to the module
179 | out_channels (int): number of channels of the FPN representation
180 | extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
181 | be performed. It is expected to take the fpn features, the original
182 | features and the names of the original features as input, and returns
183 | a new list of feature maps and their corresponding names
184 | """
185 |
186 | def __init__(self, in_channels_list, out_channels, extra_blocks=None):
187 | super(FeaturePyramidNetwork, self).__init__()
188 | self.inner_blocks = nn.ModuleList()
189 | self.layer_blocks = nn.ModuleList()
190 | for in_channels in in_channels_list:
191 | if in_channels == 0:
192 | continue
193 | inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
194 | layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
195 | self.inner_blocks.append(inner_block_module)
196 | self.layer_blocks.append(layer_block_module)
197 |
198 | # initialize parameters now to avoid modifying the initialization of top_blocks
199 | for m in self.children():
200 | if isinstance(m, nn.Conv2d):
201 | nn.init.kaiming_uniform_(m.weight, a=1)
202 | nn.init.constant_(m.bias, 0)
203 |
204 | self.extra_blocks = extra_blocks
205 |
206 | def get_result_from_inner_blocks(self, x, idx):
207 | # type: (Tensor, int) -> Tensor
208 | """
209 | This is equivalent to self.inner_blocks[idx](x),
210 | but torchscript doesn't support this yet
211 | """
212 | num_blocks = 0
213 | for m in self.inner_blocks:
214 | num_blocks += 1
215 | if idx < 0:
216 | idx += num_blocks
217 | i = 0
218 | out = x
219 | for module in self.inner_blocks:
220 | if i == idx:
221 | out = module(x)
222 | i += 1
223 | return out
224 |
225 | def get_result_from_layer_blocks(self, x, idx):
226 | # type: (Tensor, int) -> Tensor
227 | """
228 | This is equivalent to self.layer_blocks[idx](x),
229 | but torchscript doesn't support this yet
230 | """
231 | num_blocks = 0
232 | for m in self.layer_blocks:
233 | num_blocks += 1
234 | if idx < 0:
235 | idx += num_blocks
236 | i = 0
237 | out = x
238 | for module in self.layer_blocks:
239 | if i == idx:
240 | out = module(x)
241 | i += 1
242 | return out
243 |
244 | def forward(self, x):
245 | # type: (Dict[str, Tensor]) -> Dict[str, Tensor]
246 | """
247 | Computes the FPN for a set of feature maps.
248 | Arguments:
249 | x (OrderedDict[Tensor]): feature maps for each feature level.
250 | Returns:
251 | results (OrderedDict[Tensor]): feature maps after FPN layers.
252 | They are ordered from highest resolution first.
253 | """
254 | names = list(x.keys())
255 | x = list(x.values())
256 |
257 | last_inner = self.get_result_from_inner_blocks(x[-1], -1)
258 |
259 | results = []
260 | results.append(self.get_result_from_layer_blocks(last_inner, -1))
261 |
262 | for idx in range(len(x) - 2, -1, -1):
263 | inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
264 | feat_shape = inner_lateral.shape[-2:]
265 | inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
266 | last_inner = inner_lateral + inner_top_down
267 | results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))
268 |
269 | if self.extra_blocks is not None:
270 | results, names = self.extra_blocks(results, names)
271 |
272 | # make it back an OrderedDict
273 | out = OrderedDict([(k, v) for k, v in zip(names, results)])
274 |
275 | return out
276 |
277 |
278 | class LastLevelMaxPool(torch.nn.Module):
279 | """
280 | Applies a max_pool2d on top of the last feature map
281 | """
282 |
283 | def forward(self, x, names):
284 | names.append("pool")
285 | x.append(F.max_pool2d(x[-1], 1, 2, 0))
286 | return x, names
287 |
288 |
289 | class BackboneWithFPN(nn.Module):
290 | """
291 | Adds a FPN on top of a model.
292 | Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
293 | extract a submodel that returns the feature maps specified in return_layers.
294 |     The same limitations of IntermediateLayerGetter apply here.
295 | Arguments:
296 | backbone (nn.Module)
297 | return_layers (Dict[name, new_name]): a dict containing the names
298 | of the modules for which the activations will be returned as
299 | the key of the dict, and the value of the dict is the name
300 | of the returned activation (which the user can specify).
301 | in_channels_list (List[int]): number of channels for each feature map
302 | that is returned, in the order they are present in the OrderedDict
303 | out_channels (int): number of channels in the FPN.
304 | Attributes:
305 | out_channels (int): the number of channels in the FPN
306 | """
307 |
308 | def __init__(self, backbone, return_layers, in_channels_list, out_channels):
309 | super(BackboneWithFPN, self).__init__()
310 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
311 | self.fpn = FeaturePyramidNetwork(
312 | in_channels_list=in_channels_list,
313 | out_channels=out_channels,
314 | extra_blocks=LastLevelMaxPool(),
315 | )
316 | self.out_channels = out_channels
317 |
318 | def forward(self, x):
319 | x = self.body(x)
320 | x = self.fpn(x)
321 | return x
322 |
323 |
324 | def resnet50_fpn_backbone():
325 |
326 | resnet_backbone = ResNet(Bottleneck, [3, 4, 6, 3],
327 | include_top=False)
328 |
329 | for name, parameter in resnet_backbone.named_parameters():
330 | if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
331 | parameter.requires_grad_(False)
332 |
333 | return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'}
334 |
335 | in_channels_stage2 = resnet_backbone.in_channel // 8
336 | in_channels_list = [
337 | in_channels_stage2, # layer1 out_channel=256
338 | in_channels_stage2 * 2, # layer2 out_channel=512
339 | in_channels_stage2 * 4, # layer3 out_channel=1024
340 | in_channels_stage2 * 8, # layer4 out_channel=2048
341 | ]
342 | out_channels = 256
343 | return BackboneWithFPN(resnet_backbone, return_layers, in_channels_list, out_channels)
344 |
--------------------------------------------------------------------------------
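
A quick sketch of what resnet50_fpn_backbone() above returns (illustrative, assuming the repo root is on PYTHONPATH):
```Python
import torch

from backbone.resnet50_fpn_model import resnet50_fpn_backbone

backbone = resnet50_fpn_backbone().eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 800, 800))
for name, f in feats.items():
    print(name, tuple(f.shape))
# levels '0'-'3' at strides 4/8/16/32, each with 256 channels, plus a coarser 'pool' level
```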
/backbone/vgg16.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | channels_cfgs = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M']
5 |
6 |
7 | def vgg16(weights_path=None):
8 | model = VGG(make_features(channels_cfgs), weights_path=weights_path)
9 | return model
10 |
11 |
12 | class VGG(nn.Module):
13 | def __init__(self, features, class_num=1000, init_weights=False, weights_path=None):
14 | super(VGG, self).__init__()
15 | self.features = features
16 | self.classifier = nn.Sequential(
17 | nn.Dropout(p=0.5),
18 | nn.Linear(512 * 7 * 7, 2048),
19 | nn.ReLU(True),
20 | nn.Dropout(p=0.5),
21 | nn.Linear(2048, 2048),
22 | nn.ReLU(True),
23 | nn.Linear(2048, class_num)
24 | )
25 | if init_weights and weights_path is None:
26 | self._initialize_weights()
27 |
28 | if weights_path is not None:
29 | self.load_state_dict(torch.load(weights_path), strict=False)
30 |
31 | def forward(self, x):
32 | # N x 3 x 224 x 224
33 | x = self.features(x)
34 | # N x 512 x 7 x 7
35 | x = torch.flatten(x, start_dim=1)
36 | # N x 512*7*7
37 | x = self.classifier(x)
38 | return x
39 |
40 | def _initialize_weights(self):
41 | for m in self.modules():
42 | if isinstance(m, nn.Conv2d):
43 | nn.init.xavier_uniform_(m.weight)
44 | if m.bias is not None:
45 | nn.init.constant_(m.bias, 0)
46 | elif isinstance(m, nn.Linear):
47 | nn.init.xavier_uniform_(m.weight)
48 | nn.init.constant_(m.bias, 0)
49 |
50 |
51 | def make_features(ch_cfgs):
52 | layers = []
53 | in_channels = 3
54 | for v in ch_cfgs:
55 | if v == "M":
56 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
57 | else:
58 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
59 | layers += [conv2d, nn.ReLU(True)]
60 | in_channels = v
61 | return nn.Sequential(*layers)
62 |
--------------------------------------------------------------------------------
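
A minimal sketch showing the VGG-16 feature extractor above and the output size a detection head would see (illustrative):
```Python
import torch

from backbone.vgg16 import vgg16

net = vgg16(weights_path=None).eval()
with torch.no_grad():
    feat = net.features(torch.randn(1, 3, 224, 224))
print(tuple(feat.shape))  # (1, 512, 7, 7): five 2x max-pools reduce 224 to 7
```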
/config/test_config.py:
--------------------------------------------------------------------------------
1 | class Config:
2 | model_weights = " "
3 | image_path = " "
4 | gpu_id = '2'
5 | num_classes = 80 + 1
6 | data_root_dir = " "
7 |
8 |
9 | test_cfg = Config()
10 |
--------------------------------------------------------------------------------
/config/train_config.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | class Config:
4 | backbone = 'mobilenet' # [vgg16, resnet-fpn, mobilenet, resnet50_fpn]
5 | backbone_pretrained_weights = None # [path or None]
6 |
7 | # data transform parameter
8 |     train_horizon_flip_prob = 0.0  # horizontal flip probability in the train transform
9 | min_size = 800
10 | max_size = 1000
11 | image_mean = [0.485, 0.456, 0.406]
12 | image_std = [0.229, 0.224, 0.225]
13 |
14 | # anchor parameters
15 | anchor_size = [64, 128, 256]
16 | anchor_ratio = [0.5, 1, 2.0]
17 |
18 | # roi align parameters
19 | roi_out_size = [7, 7]
20 | roi_sample_rate = 2
21 |
22 | # rpn process parameters
23 | rpn_pre_nms_top_n_train = 2000
24 | rpn_post_nms_top_n_train = 2000
25 |
26 | rpn_pre_nms_top_n_test = 1000
27 | rpn_post_nms_top_n_test = 1000
28 |
29 | rpn_nms_thresh = 0.7
30 | rpn_fg_iou_thresh = 0.7
31 | rpn_bg_iou_thresh = 0.3
32 | rpn_batch_size_per_image = 256
33 | rpn_positive_fraction = 0.5
34 |
35 | # remove low threshold target
36 | box_score_thresh = 0.05
37 | box_nms_thresh = 0.5
38 | box_detections_per_img = 100
39 | box_fg_iou_thresh = 0.5
40 | box_bg_iou_thresh = 0.5
41 | box_batch_size_per_image = 512
42 | box_positive_fraction = 0.25
43 | bbox_reg_weights = None
44 |
45 | device_name = 'cuda:7'
46 |
47 | resume = '' # pretrained_weights
48 | start_epoch = 0 # start epoch
49 | num_epochs = 5000 # train epochs
50 |
51 | # learning rate parameters
52 | lr = 5e-3
53 | momentum = 0.9
54 | weight_decay = 0.0005
55 |
56 | # learning rate schedule
57 | lr_gamma = 0.33
58 | lr_dec_step_size = 100
59 |
60 | batch_size = 6
61 |
62 |     num_class = 80 + 1  # 80 foreground classes + 1 background
63 | data_root_dir = " "
64 | model_save_dir = " "
65 |
66 |
67 | cfg = Config()
68 |
--------------------------------------------------------------------------------
/dataloader/coco_dataset.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import json
6 | import os
7 |
8 | import numpy as np
9 | import torch
10 | from PIL import Image
11 | from pycocotools.coco import COCO
12 | from torch.utils.data import Dataset
13 |
14 |
15 | class coco(Dataset):
16 | def __init__(self, root_dir, image_set, year, transforms=None):
17 |
18 | self._root_dir = root_dir
19 | self._year = year
20 | self._image_set = image_set
21 | self._data_name = image_set + year
22 | self._json_path = self._get_ann_file()
23 | self._transforms = transforms
24 |
25 | # load COCO API
26 | self._COCO = COCO(self._json_path)
27 |
28 | with open(self._json_path) as anno_file:
29 | self.anno = json.load(anno_file)
30 |
31 | cats = self._COCO.loadCats(self._COCO.getCatIds())
32 | self._classes = tuple(['__background__'] + [c['name'] for c in cats])
33 |
34 | self.classes = self._classes
35 | self.num_classes = len(self.classes)
36 | self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes)))))
37 | self._class_to_coco_cat_id = dict(list(zip([c['name'] for c in cats],
38 | self._COCO.getCatIds())))
39 |
40 | self.coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls],
41 | self._class_to_ind[cls])
42 | for cls in self._classes[1:]])
43 |
44 | def __len__(self):
45 | return len(self.anno['images'])
46 |
47 | def _get_ann_file(self):
48 | prefix = 'instances' if self._image_set.find('test') == -1 else 'image_info'
49 | return os.path.join(self._root_dir, 'annotations', prefix + '_' + self._image_set + self._year + '.json')
50 |
51 | def _image_path_from_index(self, index):
52 | """
53 | Construct an image path from the image's "index" identifier.
54 | """
55 |         # Example image path for image id 119993 (COCO 2017 naming):
56 |         # <root_dir>/train2017/000000119993.jpg
57 | file_name = (str(index).zfill(12) + '.jpg')
58 | image_path = os.path.join(self._root_dir, self._data_name, file_name)
59 | assert os.path.exists(image_path), 'Path does not exist: {}'.format(image_path)
60 | return image_path
61 |
62 | def __getitem__(self, idx):
63 | a = self.anno['images'][idx]
64 | image_idx = a['id']
65 |         img_path = self._image_path_from_index(image_idx)  # already a full path under root_dir/data_name
66 |         image = Image.open(img_path).convert("RGB")  # some COCO images are grayscale
67 |
68 | width = a['width']
69 | height = a['height']
70 |
71 | annIds = self._COCO.getAnnIds(imgIds=image_idx, iscrowd=None)
72 | objs = self._COCO.loadAnns(annIds)
73 |
74 | # Sanitize bboxes -- some are invalid
75 | valid_objs = []
76 | for obj in objs:
77 | x1 = np.max((0, obj['bbox'][0]))
78 | y1 = np.max((0, obj['bbox'][1]))
79 | x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1))))
80 | y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1))))
81 | if obj['area'] > 0 and x2 > x1 and y2 > y1:
82 | obj['clean_bbox'] = [x1, y1, x2, y2]
83 | valid_objs.append(obj)
84 | objs = valid_objs
85 | num_objs = len(objs)
86 |
87 | boxes = np.zeros((num_objs, 4), dtype=np.float32)
88 | gt_classes = np.zeros((num_objs), dtype=np.int32)
89 |
90 | iscrowd = []
91 | for ix, obj in enumerate(objs):
92 | cls = self.coco_cat_id_to_class_ind[obj['category_id']]
93 | boxes[ix, :] = obj['clean_bbox']
94 | gt_classes[ix] = cls
95 | iscrowd.append(int(obj["iscrowd"]))
96 |
97 | # convert everything into a torch.Tensor
98 | image_id = torch.tensor([image_idx])
99 | boxes = torch.as_tensor(boxes, dtype=torch.float32)
100 | gt_classes = torch.as_tensor(gt_classes, dtype=torch.int32)
101 | iscrowd = torch.as_tensor(iscrowd, dtype=torch.int32)
102 |
103 | area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
104 |
105 | target = {"boxes": boxes, "labels": gt_classes, "image_id": image_id, "area": area, "iscrowd": iscrowd}
106 |
107 | if self._transforms is not None:
108 | image, target = self._transforms(image, target)
109 |
110 | return image, target
111 |
112 | @staticmethod
113 | def collate_fn(batch):
114 | return tuple(zip(*batch))
115 |
116 | @property
117 | def class_to_coco_cat_id(self):
118 | return self._class_to_coco_cat_id
119 |
--------------------------------------------------------------------------------
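
An illustrative way to iterate the dataset above with a DataLoader, mirroring how train.py wires it up (the path is a placeholder):
```Python
import torch

from dataloader.coco_dataset import coco
from utils.im_utils import Compose, ToTensor  # paired (image, target) transforms used by train.py

dataset = coco('/path/COCODevKit/', 'train', '2017', Compose([ToTensor()]))
loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True,
                                     collate_fn=dataset.collate_fn)
images, targets = next(iter(loader))
print(len(images), targets[0]['boxes'].shape, targets[0]['labels'])
```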
/imgs/demo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlphaJia/pytorch-faster-rcnn/943ef668facaacf77a4822fe79331343a6ebca2d/imgs/demo1.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch~=1.5.1
2 | torchvision~=0.6.1
3 | numpy~=1.15.4
4 | Pillow~=6.1.0
5 | pycocotools~=2.0
6 | matplotlib~=3.0.2
7 | tensorboardX~=2.0
8 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import matplotlib.pyplot as plt
4 | import torch
5 | from PIL import Image
6 | from torchvision import transforms
7 |
8 | from config.test_config import test_cfg
9 | from dataloader.coco_dataset import coco
10 | from utils.draw_box_utils import draw_box
11 | from utils.train_utils import create_model
12 |
13 |
14 | def test():
15 | model = create_model(num_classes=test_cfg.num_classes)
16 |
17 | model.cuda()
18 | weights = test_cfg.model_weights
19 |
20 | checkpoint = torch.load(weights, map_location='cpu')
21 | model.load_state_dict(checkpoint['model'])
22 |
23 | # read class_indict
24 | data_transform = transforms.Compose([transforms.ToTensor()])
25 | test_data_set = coco(test_cfg.data_root_dir, 'test', '2017', data_transform)
26 | category_index = test_data_set.class_to_coco_cat_id
27 |
28 | index_category = dict(zip(category_index.values(), category_index.keys()))
29 |
30 | original_img = Image.open(test_cfg.image_path)
31 | img = data_transform(original_img)
32 | img = torch.unsqueeze(img, dim=0)
33 |
34 | model.eval()
35 | with torch.no_grad():
36 | predictions = model(img.cuda())[0]
37 | predict_boxes = predictions["boxes"].to("cpu").numpy()
38 | predict_classes = predictions["labels"].to("cpu").numpy()
39 | predict_scores = predictions["scores"].to("cpu").numpy()
40 |
41 | if len(predict_boxes) == 0:
42 | print("No target detected!")
43 |
44 | draw_box(original_img,
45 | predict_boxes,
46 | predict_classes,
47 | predict_scores,
48 | index_category,
49 | thresh=0.3,
50 | line_thickness=3)
51 | plt.imshow(original_img)
52 | plt.show()
53 |
54 |
55 | if __name__ == "__main__":
56 | version = torch.version.__version__[:5]
57 | print('torch version is {}'.format(version))
58 | os.environ["CUDA_VISIBLE_DEVICES"] = test_cfg.gpu_id
59 | test()
60 |
--------------------------------------------------------------------------------
/test/anchor_utils_test.py:
--------------------------------------------------------------------------------
1 | from utils.anchor_utils import generate_anchors
2 |
3 |
4 | def generate_anchors_test():
5 | scales = [64, 128, 256]
6 | ratios = [0.5, 1.0, 2.0]
7 | generate_anchors(scales, ratios)
8 |
9 |
10 | if __name__ == '__main__':
11 | generate_anchors_test()
12 |
--------------------------------------------------------------------------------
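
For reference, a slightly more assertive version of the check above; the numbers follow directly from the formula in utils/anchor_utils.py (w = scale/sqrt(ratio), h = scale*sqrt(ratio), then rounded):
```Python
import torch

from utils.anchor_utils import generate_anchors

anchors = generate_anchors(scales=[64], aspect_ratios=[0.5, 1.0, 2.0])
assert anchors.shape == (3, 4)  # len(ratios) * len(scales) boxes, each [x1, y1, x2, y2]
print(anchors)
# tensor([[-45., -23.,  45.,  23.],   # ratio 0.5: ~91 x 45 box centered at (0, 0)
#         [-32., -32.,  32.,  32.],   # ratio 1.0: 64 x 64
#         [-23., -45.,  23.,  45.]])  # ratio 2.0: ~45 x 91
```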
/train.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from tensorboardX import SummaryWriter
5 |
6 | from config.train_config import cfg
7 | from dataloader.coco_dataset import coco
8 | from utils.evaluate_utils import evaluate
9 | from utils.im_utils import Compose, ToTensor, RandomHorizontalFlip
10 | from utils.plot_utils import plot_loss_and_lr, plot_map
11 | from utils.train_utils import train_one_epoch, write_tb, create_model
12 |
13 |
14 | def main():
15 | device = torch.device(cfg.device_name)
16 | print("Using {} device training.".format(device.type))
17 |
18 | if not os.path.exists(cfg.model_save_dir):
19 | os.makedirs(cfg.model_save_dir)
20 |
21 | # tensorboard writer
22 | writer = SummaryWriter(os.path.join(cfg.model_save_dir, 'epoch_log'))
23 |
24 | data_transform = {
25 | "train": Compose([ToTensor(), RandomHorizontalFlip(cfg.train_horizon_flip_prob)]),
26 | "val": Compose([ToTensor()])
27 | }
28 |
29 | if not os.path.exists(cfg.data_root_dir):
30 | raise FileNotFoundError("dataset root dir not exist!")
31 |
32 | # load train data set
33 | train_data_set = coco(cfg.data_root_dir, 'train', '2017', data_transform["train"])
34 | batch_size = cfg.batch_size
35 | nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
36 | print('Using {} dataloader workers'.format(nw))
37 | train_data_loader = torch.utils.data.DataLoader(train_data_set,
38 | batch_size=batch_size,
39 | shuffle=True,
40 | num_workers=nw,
41 | collate_fn=train_data_set.collate_fn)
42 |
43 | # load validation data set
44 | val_data_set = coco(cfg.data_root_dir, 'val', '2017', data_transform["val"])
45 | val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
46 | batch_size=batch_size,
47 | shuffle=False,
48 | num_workers=nw,
49 | collate_fn=train_data_set.collate_fn)
50 |
51 | # create model num_classes equal background + 80 classes
52 | model = create_model(num_classes=cfg.num_class)
53 |
54 | model.to(device)
55 |
56 | # define optimizer
57 | params = [p for p in model.parameters() if p.requires_grad]
58 | optimizer = torch.optim.SGD(params, lr=cfg.lr,
59 | momentum=cfg.momentum, weight_decay=cfg.weight_decay)
60 |
61 | # learning rate scheduler
62 | lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
63 | step_size=cfg.lr_dec_step_size,
64 | gamma=cfg.lr_gamma)
65 |
66 | # train from pretrained weights
67 | if cfg.resume != "":
68 | checkpoint = torch.load(cfg.resume)
69 | model.load_state_dict(checkpoint['model'])
70 | optimizer.load_state_dict(checkpoint['optimizer'])
71 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
72 | cfg.start_epoch = checkpoint['epoch'] + 1
73 |         print("Resuming training from epoch {}...".format(cfg.start_epoch))
74 |
75 | train_loss = []
76 | learning_rate = []
77 | train_mAP_list = []
78 | val_mAP = []
79 |
80 | best_mAP = 0
81 | for epoch in range(cfg.start_epoch, cfg.num_epochs):
82 | loss_dict, total_loss = train_one_epoch(model, optimizer, train_data_loader,
83 | device, epoch, train_loss=train_loss, train_lr=learning_rate,
84 | print_freq=50, warmup=False)
85 |
86 | lr_scheduler.step()
87 |
88 | print("------>Starting training data valid")
89 | _, train_mAP = evaluate(model, train_data_loader, device=device, mAP_list=train_mAP_list)
90 |
91 | print("------>Starting validation data valid")
92 | _, mAP = evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP)
93 | print('training mAp is {}'.format(train_mAP))
94 | print('validation mAp is {}'.format(mAP))
95 | print('best mAp is {}'.format(best_mAP))
96 |
97 | board_info = {'lr': optimizer.param_groups[0]['lr'],
98 | 'train_mAP': train_mAP,
99 | 'val_mAP': mAP}
100 |
101 | for k, v in loss_dict.items():
102 | board_info[k] = v.item()
103 | board_info['total loss'] = total_loss.item()
104 | write_tb(writer, epoch, board_info)
105 |
106 | if mAP > best_mAP:
107 | best_mAP = mAP
108 | # save weights
109 | save_files = {
110 | 'model': model.state_dict(),
111 | 'optimizer': optimizer.state_dict(),
112 | 'lr_scheduler': lr_scheduler.state_dict(),
113 | 'epoch': epoch}
114 | model_save_dir = cfg.model_save_dir
115 | if not os.path.exists(model_save_dir):
116 | os.makedirs(model_save_dir)
117 | torch.save(save_files,
118 | os.path.join(model_save_dir, "{}-model-{}-mAp-{}.pth".format(cfg.backbone, epoch, mAP)))
119 | writer.close()
120 | # plot loss and lr curve
121 | if len(train_loss) != 0 and len(learning_rate) != 0:
122 | plot_loss_and_lr(train_loss, learning_rate, cfg.model_save_dir)
123 |
124 | # plot mAP curve
125 | if len(val_mAP) != 0:
126 | plot_map(val_mAP, cfg.model_save_dir)
127 |
128 |
129 | if __name__ == "__main__":
130 | version = torch.version.__version__[:5]
131 | print('torch version is {}'.format(version))
132 | main()
133 |
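# A minimal checkpoint-inspection sketch (the file name below is hypothetical):
# checkpoints written by the save block above contain four entries, and setting
# cfg.resume to such a path makes main() restore all four states and continue
# training from the stored epoch + 1.
#
#     ckpt = torch.load("resnet50-model-9-mAp-0.35.pth", map_location="cpu")
#     sorted(ckpt.keys())  # ['epoch', 'lr_scheduler', 'model', 'optimizer']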
--------------------------------------------------------------------------------
/utils/anchor_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | def generate_anchors(scales, aspect_ratios, dtype=torch.float32, device="cpu"):
6 | """
7 | generate anchor template based on sizes and ratios, generated template is centered at [0, 0]
8 | :param scales: anchor sizes, in tuple[int]
9 | :param aspect_ratios: anchor ratios, in tuple[float]
10 | :param dtype: data type
11 | :param device: data device
12 | :return:
13 | """
14 |
15 | scales = torch.as_tensor(scales, dtype=dtype, device=device)
16 | aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
17 | h_ratios = torch.sqrt(aspect_ratios)
18 | w_ratios = 1.0 / h_ratios
19 |
20 | # [r1, r2, r3]' * [s1, s2, s3]
21 | # number of elements is len(ratios)*len(scales)
22 | ws = (w_ratios[:, None] * scales[None, :]).view(-1)
23 | hs = (h_ratios[:, None] * scales[None, :]).view(-1)
24 |
25 | # left-top, right-bottom coordinate relative to anchor center(0, 0)
26 | # anchor template is centered at [0, 0], shape [len(ratios)*len(scales), 4]
27 | base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
28 |
29 | return base_anchors.round() # anchor will lose some precision here
30 |
31 |
32 | class AnchorsGenerator(nn.Module):
33 | """
34 | anchor generator for feature maps according to anchor sizes and ratios
35 | :param sizes: anchor sizes, in tuple[int]
36 | :param aspect_ratios: anchor ratios, in tuple[float]
37 | :return:
38 | """
39 |
40 | def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
41 | super(AnchorsGenerator, self).__init__()
42 |
43 | # assert len(sizes) == len(aspect_ratios), 'anchor sizes must equal to anchor ratios!'
44 |
45 | self.sizes = sizes
46 | self.aspect_ratios = aspect_ratios
47 | self.cell_anchors = None
48 | self._cache = {}
49 |
50 | def set_cell_anchors(self, dtype, device):
51 | """
52 | generate anchor templates
53 | :param dtype: data type
54 | :param device: data device
55 | :return:
56 | """
57 | if self.cell_anchors is not None:
58 | # anchor templates already generated; regenerate only if the device changed
59 | if self.cell_anchors[0].device == device:
60 | return
61 | # generate anchor template
62 | cell_anchors = [generate_anchors(sizes, aspect_ratios, dtype, device)
63 | for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios)]
64 | self.cell_anchors = cell_anchors
65 |
66 | def num_anchors_per_location(self):
67 | # calculate the number of anchors per feature map, for k in origin paper
68 | return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
69 |
70 | def grid_anchors(self, feature_map_sizes, strides):
71 | """
72 | compute anchor coordinate list in origin image, mapped from feature map
73 | :param feature_map_sizes: feature map sizes
74 | :param strides: strides between the original image and each feature map
75 | :return:
76 | """
77 |
78 | anchors = []
79 | cell_anchors = self.cell_anchors # anchor template
80 | assert cell_anchors is not None
81 |
82 | # for every resolution feature map, like fpn
83 | for size, stride, base_anchors in zip(feature_map_sizes, strides, cell_anchors):
84 | f_p_height, f_p_width = size
85 | stride_height, stride_width = stride
86 | device = base_anchors.device
87 |
88 | # For output anchor, compute [x_center, y_center, x_center, y_center...]
89 | # x_center in origin image
90 | shifts_x = torch.arange(0, f_p_width, dtype=torch.float32, device=device) * stride_width
91 |
92 | # y_center in origin image
93 | shifts_y = torch.arange(0, f_p_height, dtype=torch.float32, device=device) * stride_height
94 |
95 | # torch.meshgrid will output grid
96 | # shape: [grid_height, grid_width]
97 | shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
98 | shift_x = shift_x.reshape(-1)
99 | shift_y = shift_y.reshape(-1)
100 |
101 | shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
102 |
103 | # For every (base anchor, output anchor) pair,
104 | # offset each zero-centered base anchor by the center of the output anchor
105 | shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
106 | anchors.append(shifts_anchor.reshape(-1, 4))
107 |
108 | return anchors # List[Tensor(all_num_anchors, 4)]
109 |
110 | def cached_grid_anchors(self, feature_map_size, strides):
111 | """
112 | cached all anchor information
113 | :param feature_map_size: feature map size after backbone feature extractor
114 | :param strides: strides between origin image size and feature map size
115 | :return:
116 | """
117 |
118 | key = str(feature_map_size) + str(strides)
119 | # self._cache is a dictionary type
120 | if key in self._cache:
121 | return self._cache[key]
122 | anchors = self.grid_anchors(feature_map_size, strides)
123 | self._cache[key] = anchors
124 | return anchors
125 |
126 | def forward(self, image_list, feature_maps):
127 | """
128 | get feature map sizes
129 | :param image_list:
130 | :param feature_maps:
131 | :return:
132 | """
133 |
134 | feature_map_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
135 |
136 | # get input image sizes
137 | image_size = image_list.tensors.shape[-2:]
138 |
139 | # get dtype and device
140 | dtype, device = feature_maps[0].dtype, feature_maps[0].device
141 |
142 | # compute map stride between feature_maps and input images
143 | strides = [[torch.tensor(image_size[0] / g[0], dtype=torch.int64, device=device),
144 | torch.tensor(image_size[1] / g[1], dtype=torch.int64, device=device)] for g in feature_map_sizes]
145 |
146 | # get anchors template according size and aspect_ratios
147 | self.set_cell_anchors(dtype, device)
148 |
149 | # get anchor coordinate list in origin image, according to map
150 | anchors_over_all_feature_maps = self.cached_grid_anchors(feature_map_sizes, strides)
151 |
152 | anchors = []
153 | # for every image and feature map in a batch
154 | for i, (_, _) in enumerate(image_list.image_sizes):
155 | anchors_in_image = []
156 | # for every resolution feature map like fpn
157 | for anchors_per_feature_map in anchors_over_all_feature_maps:
158 | anchors_in_image.append(anchors_per_feature_map)
159 | anchors.append(anchors_in_image)
160 |
161 | # concat every resolution anchors, like fpn
162 | anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
163 |
164 | self._cache.clear()
165 | return anchors
166 |
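# A minimal sanity-check sketch for the anchor template above (values are easy to
# verify by hand): 3 scales x 3 aspect ratios yield k = 9 zero-centered boxes.
if __name__ == "__main__":
    templates = generate_anchors(scales=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0))
    print(templates.shape)  # torch.Size([9, 4])
    # ratio 1.0 with scale 256 gives roughly [-128., -128., 128., 128.]
    print(templates)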
--------------------------------------------------------------------------------
/utils/boxes_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def nms(boxes, scores, iou_threshold):
5 | """
6 | Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union (IoU).
7 |
8 | NMS iteratively removes lower scoring boxes which have an IoU greater than iou_threshold with another (higher scoring)
9 | box.
10 | :param boxes: Tensor[N, 4]), boxes to perform NMS on. They are expected to be in (x1, y1, x2, y2) format
11 | :param scores: Tensor[N], scores for each one of the boxes
12 | :param iou_threshold: float, discards all overlapping boxes with IoU > iou_threshold
13 | :return: int64 tensor with the indices of the elements that have been kept by NMS, sorted in decreasing order of scores
14 | """
15 |
16 | return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
17 |
18 |
19 | def batched_nms(boxes, scores, idxs, iou_threshold):
20 | """
21 | Performs non-maximum suppression in a batched fashion.
22 | Each index value correspond to a category, and NMS
23 | will not be applied between elements of different categories
24 | :param boxes: Tensor[N, 4], boxes where NMS will be performed. They are expected to be in (x1, y1, x2, y2) format
25 | :param scores: Tensor[N], scores for each one of the boxes
26 | :param idxs: Tensor[N], indices of the categories for each one of the boxes.
27 | :param iou_threshold: float, discards all overlapping boxes with IoU > iou_threshold
28 | :return: int64 tensor with the indices of the elements that have been kept by NMS, sorted
29 | in decreasing order of scores
30 | """
31 |
32 | if boxes.numel() == 0:
33 | return torch.empty((0,), dtype=torch.int64, device=boxes.device)
34 |
35 | # strategy: in order to perform NMS independently per class,
36 | # we add an offset to all the boxes. The offset is dependent
37 | # only on the class idx, and is large enough so that boxes
38 | # from different classes do not overlap
39 | max_coordinate = boxes.max()
40 |
41 | # to(): Performs Tensor dtype and/or device conversion
42 | offsets = idxs.to(boxes) * (max_coordinate + 1)
43 | boxes_for_nms = boxes + offsets[:, None]
44 | keep = nms(boxes_for_nms, scores, iou_threshold)
45 | return keep
46 |
47 |
48 | def remove_small_boxes(boxes, min_size):
49 | """
50 | Remove boxes which contain at least one side smaller than min_size.
51 | :param boxes: boxes in (x1, y1, x2, y2) format
52 | :param min_size: minimum size
53 | :return: indices of the boxes that have both sides
54 | larger than min_size
55 | """
56 |
57 | ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]
58 | keep = (ws >= min_size) & (hs >= min_size)
59 | # nonzero(): Returns a tensor containing the indices of all non-zero elements of input
60 | keep = keep.nonzero().squeeze(1)
61 | return keep
62 |
63 |
64 | def clip_boxes_to_image(boxes, size):
65 | """
66 | Clip boxes so that they lie inside an image of size `size`.
67 | :param boxes: boxes in (x1, y1, x2, y2) format
68 | :param size: size of the image
69 | :return: clipped_boxes (Tensor[N, 4])
70 | """
71 |
72 | dim = boxes.dim()
73 | boxes_x = boxes[..., 0::2] # x1, x2
74 | boxes_y = boxes[..., 1::2] # y1, y2
75 | height, width = size
76 |
77 | boxes_x = boxes_x.clamp(min=0, max=width)
78 | boxes_y = boxes_y.clamp(min=0, max=height)
79 |
80 | clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
81 | return clipped_boxes.reshape(boxes.shape)
82 |
83 |
84 | def box_area(boxes):
85 | """
86 | Computes the area of a set of bounding boxes, which are specified by its
87 | (x1, y1, x2, y2) coordinates.
88 | :param boxes: boxes for which the area will be computed. They
89 | are expected to be in (x1, y1, x2, y2) format
90 | :return: area for each box
91 | """
92 |
93 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
94 |
95 |
96 | def box_iou(boxes1, boxes2):
97 | """
98 | Calculate intersection-over-union (Jaccard index) of boxes.
99 | Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
100 | :param boxes1: boxes1 (Tensor[N, 4])
101 | :param boxes2: boxes2 (Tensor[M, 4])
102 | :return: iou (Tensor[N, M]): the NxM matrix containing the pairwise
103 | IoU values for every element in boxes1 and boxes2
104 | """
105 |
106 | area1 = box_area(boxes1)
107 | area2 = box_area(boxes2)
108 |
109 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # left-top [N,M,2]
110 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # right-bottom [N,M,2]
111 |
112 | wh = (rb - lt).clamp(min=0) # [N,M,2]
113 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
114 |
115 | iou = inter / (area1[:, None] + area2 - inter)
116 | return iou
117 |
118 |
119 | def permute_and_flatten(layer, N, A, C, H, W):
120 | """
121 | adjust tensor order and reshape
122 | :param layer: classification or bboxes parameters
123 | :param N: batch_size
124 | :param A: anchors_num_per_position
125 | :param C: classes_num or bbox coordinate
126 | :param H: height
127 | :param W: width
128 | :return: Tensor after adjusting order and reshaping
129 | """
130 |
131 | # [batch_size, anchors_num_per_position * (C or 4), height, width]
132 | layer = layer.view(N, -1, C, H, W)
133 | layer = layer.permute(0, 3, 4, 1, 2) # [N, H, W, -1, C]
134 | layer = layer.reshape(N, -1, C)
135 | return layer
136 |
137 |
138 | def concat_box_prediction_layers(box_cls, box_regression):
139 | """
140 | Adjust box classification and bbox regression parameters order and reshape
141 | :param box_cls: target prediction score
142 | :param box_regression: bbox regression parameters
143 | :return: [N, -1, C]
144 | """
145 |
146 | box_cls_flattened = []
147 | box_regression_flattened = []
148 |
149 | for box_cls_per_level, box_regression_per_level in zip(box_cls, box_regression):
150 | # [batch_size, anchors_num_per_position * classes_num, height, width]; classes_num is equal to 2 here
151 | N, AxC, H, W = box_cls_per_level.shape
152 | # [batch_size, anchors_num_per_position * 4, height, width]
153 | Ax4 = box_regression_per_level.shape[1]
154 | # anchors_num_per_position
155 | A = Ax4 // 4
156 | # classes_num
157 | C = AxC // A
158 |
159 | # [N, -1, C]
160 | box_cls_per_level = permute_and_flatten(box_cls_per_level, N, A, C, H, W)
161 | box_cls_flattened.append(box_cls_per_level)
162 |
163 | # [N, -1, C]
164 | box_regression_per_level = permute_and_flatten(box_regression_per_level, N, A, 4, H, W)
165 | box_regression_flattened.append(box_regression_per_level)
166 |
167 | box_cls = torch.cat(box_cls_flattened, dim=1).flatten(0, -2) # start_dim, end_dim
168 | box_regression = torch.cat(box_regression_flattened, dim=1).reshape(-1, 4)
169 | return box_cls, box_regression
170 |
171 |
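# A minimal sketch of box_iou and batched_nms on toy boxes. torchvision is
# imported here only to make sure the NMS op used by nms() is registered.
if __name__ == "__main__":
    import torchvision  # noqa: F401
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],
                          [0., 0., 10., 10.]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    idxs = torch.tensor([0, 0, 1])  # the third box belongs to a different class
    print(box_iou(boxes[:2], boxes[:2]))          # off-diagonal IoU is about 0.68
    print(batched_nms(boxes, scores, idxs, 0.5))  # tensor([0, 2]): box 1 is suppressed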
--------------------------------------------------------------------------------
/utils/coco_utils.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | from collections import defaultdict
4 |
5 | import numpy as np
6 | import pycocotools.mask as mask_util
7 | import torch
8 | import torch.utils.data
9 | import torchvision
10 | from pycocotools.coco import COCO
11 | from pycocotools.cocoeval import COCOeval
12 |
13 | from utils.train_utils import all_gather
14 |
15 |
16 | def convert_to_coco_api(ds):
17 | coco_ds = COCO()
18 | ann_id = 1
19 | dataset = {'images': [], 'categories': [], 'annotations': []}
20 | categories = set()
21 | for img_idx in range(len(ds)):
22 | # find better way to get target
23 | img, targets = ds[img_idx]
24 | image_id = targets["image_id"].item()
25 | img_dict = {'id': image_id, 'height': img.shape[-2], 'width': img.shape[-1]}
26 | dataset['images'].append(img_dict)
27 | bboxes = targets["boxes"]
28 | bboxes[:, 2:] -= bboxes[:, :2]
29 | bboxes = bboxes.tolist()
30 | labels = targets['labels'].tolist()
31 | areas = targets['area'].tolist()
32 | iscrowd = targets['iscrowd'].tolist()
33 | num_objs = len(bboxes)
34 | for i in range(num_objs):
35 | ann = {'image_id': image_id, 'bbox': bboxes[i], 'category_id': labels[i]}
36 | categories.add(labels[i])
37 | ann['area'] = areas[i]
38 | ann['iscrowd'] = iscrowd[i]
39 | ann['id'] = ann_id
40 | dataset['annotations'].append(ann)
41 | ann_id += 1
42 | dataset['categories'] = [{'id': i} for i in sorted(categories)]
43 | coco_ds.dataset = dataset
44 | coco_ds.createIndex()
45 | return coco_ds
46 |
47 |
48 | def get_coco_api_from_dataset(dataset):
49 | for _ in range(10):
50 | if isinstance(dataset, torchvision.datasets.CocoDetection):
51 | break
52 | if isinstance(dataset, torch.utils.data.Subset):
53 | dataset = dataset.dataset
54 | if isinstance(dataset, torchvision.datasets.CocoDetection):
55 | return dataset.coco
56 | return convert_to_coco_api(dataset)
57 |
58 |
59 | def prepare_for_coco_detection(predictions):
60 | coco_results = []
61 | for original_id, prediction in predictions.items():
62 | if len(prediction) == 0:
63 | continue
64 |
65 | boxes = prediction["boxes"]
66 | boxes = convert_to_xywh(boxes).tolist()
67 | scores = prediction["scores"].tolist()
68 | labels = prediction["labels"].tolist()
69 |
70 | coco_results.extend(
71 | [
72 | {
73 | "image_id": original_id,
74 | "category_id": labels[k],
75 | "bbox": box,
76 | "score": scores[k],
77 | }
78 | for k, box in enumerate(boxes)
79 | ]
80 | )
81 | return coco_results
82 |
83 |
84 | def prepare(predictions, iou_type):
85 | return prepare_for_coco_detection(predictions)
86 |
87 |
88 | class CocoEvaluator(object):
89 | def __init__(self, coco_gt, iou_types):
90 | assert isinstance(iou_types, (list, tuple))
91 | coco_gt = copy.deepcopy(coco_gt)
92 | self.coco_gt = coco_gt
93 |
94 | self.iou_types = iou_types
95 | self.coco_eval = {}
96 | for iou_type in iou_types:
97 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
98 |
99 | self.img_ids = []
100 | self.eval_imgs = {k: [] for k in iou_types}
101 |
102 | def update(self, predictions):
103 | img_ids = list(np.unique(list(predictions.keys())))
104 | self.img_ids.extend(img_ids)
105 |
106 | for iou_type in self.iou_types:
107 | results = prepare(predictions, iou_type)
108 | coco_dt = loadRes(self.coco_gt, results) if results else COCO()
109 | coco_eval = self.coco_eval[iou_type]
110 |
111 | coco_eval.cocoDt = coco_dt
112 | coco_eval.params.imgIds = list(img_ids)
113 | img_ids, eval_imgs = evaluate(coco_eval)
114 |
115 | self.eval_imgs[iou_type].append(eval_imgs)
116 |
117 | def synchronize_between_processes(self):
118 | for iou_type in self.iou_types:
119 | self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
120 | create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
121 |
122 | def accumulate(self):
123 | for coco_eval in self.coco_eval.values():
124 | coco_eval.accumulate()
125 |
126 | def summarize(self):
127 | for iou_type, coco_eval in self.coco_eval.items():
128 | print("IoU metric: {}".format(iou_type))
129 | coco_eval.summarize()
130 |
131 |
132 | def convert_to_xywh(boxes):
133 | xmin, ymin, xmax, ymax = boxes.unbind(1)
134 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
135 |
136 |
137 | def merge(img_ids, eval_imgs):
138 | all_img_ids = all_gather(img_ids)
139 | all_eval_imgs = all_gather(eval_imgs)
140 |
141 | merged_img_ids = []
142 | for p in all_img_ids:
143 | merged_img_ids.extend(p)
144 |
145 | merged_eval_imgs = []
146 | for p in all_eval_imgs:
147 | merged_eval_imgs.append(p)
148 |
149 | merged_img_ids = np.array(merged_img_ids)
150 | merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
151 |
152 | # keep only unique (and in sorted order) images
153 | merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
154 | merged_eval_imgs = merged_eval_imgs[..., idx]
155 |
156 | return merged_img_ids, merged_eval_imgs
157 |
158 |
159 | def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
160 | img_ids, eval_imgs = merge(img_ids, eval_imgs)
161 | img_ids = list(img_ids)
162 | eval_imgs = list(eval_imgs.flatten())
163 |
164 | coco_eval.evalImgs = eval_imgs
165 | coco_eval.params.imgIds = img_ids
166 | coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
167 |
168 |
169 | def createIndex(self):
170 | anns, cats, imgs = {}, {}, {}
171 | imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
172 | if 'annotations' in self.dataset:
173 | for ann in self.dataset['annotations']:
174 | imgToAnns[ann['image_id']].append(ann)
175 | anns[ann['id']] = ann
176 |
177 | if 'images' in self.dataset:
178 | for img in self.dataset['images']:
179 | imgs[img['id']] = img
180 |
181 | if 'categories' in self.dataset:
182 | for cat in self.dataset['categories']:
183 | cats[cat['id']] = cat
184 |
185 | if 'annotations' in self.dataset and 'categories' in self.dataset:
186 | for ann in self.dataset['annotations']:
187 | catToImgs[ann['category_id']].append(ann['image_id'])
188 |
189 | # create class members
190 | self.anns = anns
191 | self.imgToAnns = imgToAnns
192 | self.catToImgs = catToImgs
193 | self.imgs = imgs
194 | self.cats = cats
195 |
196 |
197 | maskUtils = mask_util
198 |
199 |
200 | def loadRes(self, resFile):
201 | """
202 | Load result file and return a result api object.
203 | :return: res (obj) : result api object
204 | """
205 | res = COCO()
206 | res.dataset['images'] = [img for img in self.dataset['images']]
207 |
208 | if isinstance(resFile, torch._six.string_classes):
209 | anns = json.load(open(resFile))
210 | elif type(resFile) == np.ndarray:
211 | anns = self.loadNumpyAnnotations(resFile)
212 | else:
213 | anns = resFile
214 | assert type(anns) == list, 'results is not an array of objects'
215 | annsImgIds = [ann['image_id'] for ann in anns]
216 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
217 | 'Results do not correspond to current coco set'
218 | if 'caption' in anns[0]:
219 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
220 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
221 | for id, ann in enumerate(anns):
222 | ann['id'] = id + 1
223 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
224 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
225 | for id, ann in enumerate(anns):
226 | bb = ann['bbox']
227 | x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
228 | if 'segmentation' not in ann:
229 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
230 | ann['area'] = bb[2] * bb[3]
231 | ann['id'] = id + 1
232 | ann['iscrowd'] = 0
233 | elif 'segmentation' in anns[0]:
234 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
235 | for id, ann in enumerate(anns):
236 | # now only support compressed RLE format as segmentation results
237 | ann['area'] = maskUtils.area(ann['segmentation'])
238 | if 'bbox' not in ann:
239 | ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
240 | ann['id'] = id + 1
241 | ann['iscrowd'] = 0
242 | elif 'keypoints' in anns[0]:
243 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
244 | for id, ann in enumerate(anns):
245 | s = ann['keypoints']
246 | x = s[0::3]
247 | y = s[1::3]
248 | x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y)
249 | ann['area'] = (x2 - x1) * (y2 - y1)
250 | ann['id'] = id + 1
251 | ann['bbox'] = [x1, y1, x2 - x1, y2 - y1]
252 |
253 | res.dataset['annotations'] = anns
254 | createIndex(res)
255 | return res
256 |
257 |
258 | def evaluate(self):
259 | '''
260 | Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
261 | :return: None
262 | '''
263 | p = self.params
264 | # add backward compatibility if useSegm is specified in params
265 | if p.useSegm is not None:
266 | p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
267 | print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
268 | p.imgIds = list(np.unique(p.imgIds))
269 | if p.useCats:
270 | p.catIds = list(np.unique(p.catIds))
271 | p.maxDets = sorted(p.maxDets)
272 | self.params = p
273 |
274 | self._prepare()
275 | # loop through images, area range, max detection number
276 | catIds = p.catIds if p.useCats else [-1]
277 |
278 | if p.iouType == 'segm' or p.iouType == 'bbox':
279 | computeIoU = self.computeIoU
280 | elif p.iouType == 'keypoints':
281 | computeIoU = self.computeOks
282 | self.ious = {
283 | (imgId, catId): computeIoU(imgId, catId)
284 | for imgId in p.imgIds
285 | for catId in catIds}
286 |
287 | evaluateImg = self.evaluateImg
288 | maxDet = p.maxDets[-1]
289 | evalImgs = [
290 | evaluateImg(imgId, catId, areaRng, maxDet)
291 | for catId in catIds
292 | for areaRng in p.areaRng
293 | for imgId in p.imgIds
294 | ]
295 | evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
296 | self._paramsEval = copy.deepcopy(self.params)
297 | return p.imgIds, evalImgs
298 |
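# A minimal evaluation-loop sketch, assuming `data_loader` yields (images, targets)
# in the format produced by dataloader/coco_dataset.py and `model` returns
# torchvision-style detections (both names are placeholders):
#
#     coco_gt = get_coco_api_from_dataset(data_loader.dataset)
#     evaluator = CocoEvaluator(coco_gt, iou_types=["bbox"])
#     for images, targets in data_loader:
#         outputs = model(images)
#         res = {t["image_id"].item(): o for t, o in zip(targets, outputs)}
#         evaluator.update(res)
#     evaluator.synchronize_between_processes()
#     evaluator.accumulate()
#     evaluator.summarize()  # stats[0] is mAP@[.5:.95], stats[1] is AP@0.5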
--------------------------------------------------------------------------------
/utils/det_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import math
3 |
4 |
5 | class BalancedPositiveNegativeSampler(object):
6 | """
7 | This class samples batches, ensuring that they contain a fixed proportion of positives
8 | :param batch_size_per_image: number of elements to be selected per image
9 | :param positive_fraction: percentage of positive elements per batch
10 | """
11 |
12 | def __init__(self, batch_size_per_image, positive_fraction):
13 | self.batch_size_per_image = batch_size_per_image
14 | self.positive_fraction = positive_fraction
15 |
16 | def __call__(self, matched_idxs):
17 | """
18 | Returns two lists of binary masks for each image.
19 | The first list contains the positive elements that were selected,
20 | and the second list contains the negative elements that were selected.
21 | :param matched_idxs: list of tensors containing -1, 0 or positive values.
22 | Each tensor corresponds to a specific image.
23 | -1 values are ignored, 0 are considered as negatives and > 0 as
24 | positives.
25 | :return: pos_idx (list[tensor])
26 | neg_idx (list[tensor])
27 | """
28 |
29 | pos_idx = []
30 | neg_idx = []
31 | for matched_idxs_per_image in matched_idxs:
32 | # positive sample if index >= 1
33 | positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
34 | # negative sample if index == 0
35 | negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
36 |
37 | # number of positive samples
38 | num_pos = int(self.batch_size_per_image * self.positive_fraction)
39 | # protect against not enough positive examples: use all available positive samples
40 | num_pos = min(positive.numel(), num_pos)
41 |
42 | # number of negative samples
43 | num_neg = self.batch_size_per_image - num_pos
44 | # protect against not enough negative examples: use all available negative samples
45 | num_neg = min(negative.numel(), num_neg)
46 |
47 | # randomly select positive and negative examples
48 | # Returns a random permutation of integers from 0 to n - 1.
49 | perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
50 | perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
51 |
52 | pos_idx_per_image = positive[perm1]
53 | neg_idx_per_image = negative[perm2]
54 |
55 | # create binary mask from indices
56 | pos_idx_per_image_mask = torch.zeros_like(
57 | matched_idxs_per_image, dtype=torch.uint8
58 | )
59 | neg_idx_per_image_mask = torch.zeros_like(
60 | matched_idxs_per_image, dtype=torch.uint8
61 | )
62 |
63 | pos_idx_per_image_mask[pos_idx_per_image] = 1
64 | neg_idx_per_image_mask[neg_idx_per_image] = 1
65 |
66 | pos_idx.append(pos_idx_per_image_mask)
67 | neg_idx.append(neg_idx_per_image_mask)
68 |
69 | return pos_idx, neg_idx
70 |
71 |
72 | def encode_boxes(reference_boxes, proposals, weights):
73 | """
74 | Encode a set of proposals with respect to some reference boxes
75 | :param reference_boxes: reference boxes(gt)
76 | :param proposals: boxes to be encoded(anchors)
77 | :param weights:
78 | :return:
79 | """
80 |
81 | wx = weights[0]
82 | wy = weights[1]
83 | ww = weights[2]
84 | wh = weights[3]
85 |
86 | # Returns a new tensor with a dimension of size one inserted at the specified position.
87 | proposals_x1 = proposals[:, 0].unsqueeze(1)
88 | proposals_y1 = proposals[:, 1].unsqueeze(1)
89 | proposals_x2 = proposals[:, 2].unsqueeze(1)
90 | proposals_y2 = proposals[:, 3].unsqueeze(1)
91 |
92 | reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
93 | reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
94 | reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
95 | reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)
96 |
97 | # implementation starts here
98 | # parse widths and heights
99 | ex_widths = proposals_x2 - proposals_x1
100 | ex_heights = proposals_y2 - proposals_y1
101 |
102 | # center point
103 | ex_ctr_x = proposals_x1 + 0.5 * ex_widths
104 | ex_ctr_y = proposals_y1 + 0.5 * ex_heights
105 |
106 | gt_widths = reference_boxes_x2 - reference_boxes_x1
107 | gt_heights = reference_boxes_y2 - reference_boxes_y1
108 | gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
109 | gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights
110 |
111 | targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
112 | targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
113 | targets_dw = ww * torch.log(gt_widths / ex_widths)
114 | targets_dh = wh * torch.log(gt_heights / ex_heights)
115 |
116 | targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
117 | return targets
118 |
119 |
120 | class BoxCoder(object):
121 | """
122 | This class encodes and decodes a set of bounding boxes into
123 | the representation used for training the regressors.
124 | :param weights: 4-element tuple, representing the regression weights for x, y, w, h
125 | :param bbox_xform_clip: float, clamp value for the encoded width/height deltas (prevents exp overflow)
126 | """
127 |
128 | def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)):
129 | self.weights = weights
130 | self.bbox_xform_clip = bbox_xform_clip
131 |
132 | def encode(self, reference_boxes, proposals):
133 | """
134 | Calculate the regression parameters that map the proposals (anchors) onto the reference (gt) boxes
135 | :param reference_boxes: gt bbox
136 | :param proposals: anchors bbox
137 | :return: regression parameters
138 | """
139 |
140 | boxes_per_image = [len(b) for b in reference_boxes]
141 | reference_boxes = torch.cat(reference_boxes, dim=0)
142 | proposals = torch.cat(proposals, dim=0)
143 |
144 | # targets_dx, targets_dy, targets_dw, targets_dh
145 | targets = self.encode_single(reference_boxes, proposals)
146 | return targets.split(boxes_per_image, 0)
147 |
148 | def encode_single(self, reference_boxes, proposals):
149 | """
150 | Encode a set of proposals with respect to some reference boxes
151 | :param reference_boxes: reference boxes
152 | :param proposals: boxes to be encoded
153 | :return:
154 | """
155 |
156 | dtype = reference_boxes.dtype
157 | device = reference_boxes.device
158 | weights = torch.as_tensor(self.weights, dtype=dtype, device=device)
159 | targets = encode_boxes(reference_boxes, proposals, weights)
160 |
161 | return targets
162 |
163 | def decode(self, rel_codes, boxes):
164 | """
165 | decode regression parameters
166 | :param rel_codes: bbox regression parameters
167 | :param boxes: anchors
168 | :return:
169 | """
170 |
171 | assert isinstance(boxes, (list, tuple))
172 | assert isinstance(rel_codes, torch.Tensor)
173 |
174 | boxes_per_image = [b.size(0) for b in boxes]
175 | concat_boxes = torch.cat(boxes, dim=0)
176 |
177 | box_sum = 0
178 | for val in boxes_per_image:
179 | box_sum += val
180 | # map regression parameters into anchors to get coordinate
181 | pred_boxes = self.decode_single(
182 | rel_codes.reshape(box_sum, -1), concat_boxes
183 | )
184 | return pred_boxes.reshape(box_sum, -1, 4)
185 |
186 | def decode_single(self, rel_codes, boxes):
187 | """
188 | From a set of original boxes and encoded relative box offsets, get the decoded boxes.
189 | :param rel_codes: encoded boxes (bbox regression parameters)
190 | :param boxes: reference boxes (anchors)
191 | :return:
192 | """
193 | boxes = boxes.to(rel_codes.dtype)
194 |
195 | # xmin, ymin, xmax, ymax
196 | widths = boxes[:, 2] - boxes[:, 0] # anchor width
197 | heights = boxes[:, 3] - boxes[:, 1] # anchor height
198 | ctr_x = boxes[:, 0] + 0.5 * widths # anchor center x coordinate
199 | ctr_y = boxes[:, 1] + 0.5 * heights # anchor center y coordinate
200 |
201 | wx, wy, ww, wh = self.weights # default is 1
202 | dx = rel_codes[:, 0::4] / wx # predicted anchor center x regression parameters
203 | dy = rel_codes[:, 1::4] / wy # predicted anchor center y regression parameters
204 | dw = rel_codes[:, 2::4] / ww # predicted anchor width regression parameters
205 | dh = rel_codes[:, 3::4] / wh # predicted anchor height regression parameters
206 |
207 | # limit max value, prevent sending too large values into torch.exp()
208 | # self.bbox_xform_clip=math.log(1000. / 16)
209 | dw = torch.clamp(dw, max=self.bbox_xform_clip)
210 | dh = torch.clamp(dh, max=self.bbox_xform_clip)
211 |
212 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
213 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
214 | pred_w = torch.exp(dw) * widths[:, None]
215 | pred_h = torch.exp(dh) * heights[:, None]
216 |
217 | # xmin
218 | pred_boxes1 = pred_ctr_x - torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
219 | # ymin
220 | pred_boxes2 = pred_ctr_y - torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
221 | # xmax
222 | pred_boxes3 = pred_ctr_x + torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
223 | # ymax
224 | pred_boxes4 = pred_ctr_y + torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
225 | pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)
226 | return pred_boxes
227 |
228 |
229 | def set_low_quality_matches_(matches, all_matches, match_quality_matrix):
230 | """
231 | Produce additional matches for predictions that have only low-quality matches.
232 | Specifically, for each ground-truth find the set of predictions that have
233 | maximum overlap with it (including ties); for each prediction in that set, if
234 | it is unmatched, then match it to the ground-truth with which it has the highest
235 | quality value.
236 | """
237 | # For each gt, find the prediction with which it has highest quality
238 | highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) # the dimension to reduce.
239 |
240 | # Find highest quality match available, even if it is low, including ties
241 | gt_pred_pairs_of_highest_quality = torch.nonzero(
242 | match_quality_matrix == highest_quality_foreach_gt[:, None]
243 | )
244 | # Example gt_pred_pairs_of_highest_quality:
245 | # tensor([[ 0, 39796],
246 | # [ 1, 32055],
247 | # [ 1, 32070],
248 | # [ 2, 39190],
249 | # [ 2, 40255],
250 | # [ 3, 40390],
251 | # [ 3, 41455],
252 | # [ 4, 45470],
253 | # [ 5, 45325],
254 | # [ 5, 46390]])
255 | # Each row is a (gt index, prediction index)
256 | # Note how gt items 1, 2, 3, and 5 each have two ties
257 |
258 | pre_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1]
259 | matches[pre_inds_to_update] = all_matches[pre_inds_to_update]
260 |
261 |
262 | class Matcher(object):
263 | BELOW_LOW_THRESHOLD = -1
264 | BETWEEN_THRESHOLDS = -2
265 |
266 | def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
267 | """
268 | Args:
269 | high_threshold (float): quality values greater than or equal to
270 | this value are candidate matches.
271 | low_threshold (float): a lower quality threshold used to stratify
272 | matches into three levels:
273 | 1) matches >= high_threshold
274 | 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold)
275 | 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold)
276 | allow_low_quality_matches (bool): if True, produce additional matches
277 | for predictions that have only low-quality match candidates. See
278 | set_low_quality_matches_ for more details.
279 | """
280 | self.BELOW_LOW_THRESHOLD = -1
281 | self.BETWEEN_THRESHOLDS = -2
282 | assert low_threshold <= high_threshold
283 | self.high_threshold = high_threshold # 0.7
284 | self.low_threshold = low_threshold # 0.3
285 | self.allow_low_quality_matches = allow_low_quality_matches
286 |
287 | def __call__(self, match_quality_matrix):
288 | """
289 | calculate maximum iou between anchors and gt boxes, save index,
290 | iou < low_threshold: -1
291 | iou > high_threshold: 1
292 | low_threshold <= iou < high_threshold: -2
293 | :param match_quality_matrix: M(gt) x N(predicted) tensor, containing the pairwise
294 | quality between M ground-truth boxes and N predicted boxes
295 | :return: matches, a tensor of N indices; element i is the matched gt index in [0, M-1],
296 | or a negative value meaning that prediction i could not be matched
297 | """
298 |
299 | if match_quality_matrix.numel() == 0:
300 | # empty targets or proposals are not supported during training
301 | if match_quality_matrix.shape[0] == 0:
302 | raise ValueError("No ground-truth boxes available for one of the images during training")
303 | else:
304 | raise ValueError("No proposal boxes available for one of the images during training")
305 |
306 | # match_quality_matrix is M (gt) x N (predicted)
307 | # max over gt elements (dim 0) finds, for each prediction, the best-matching gt
308 | matched_vals, matches = match_quality_matrix.max(dim=0)
309 |
310 | if self.allow_low_quality_matches:
311 | # keep a copy of the raw matches before thresholding, used by set_low_quality_matches_
312 | all_matches = matches.clone()
313 | else:
314 | all_matches = None
315 |
316 | # assign candidate matches with low quality to negative (unassigned) values
317 | # predictions whose best iou falls below low_threshold
318 | below_low_threshold = matched_vals < self.low_threshold
319 |
320 | # predictions whose best iou falls between low_threshold and high_threshold
321 | between_thresholds = (matched_vals >= self.low_threshold) & (
322 | matched_vals < self.high_threshold
323 | )
324 | matches[below_low_threshold] = self.BELOW_LOW_THRESHOLD # -1
325 |
326 | matches[between_thresholds] = self.BETWEEN_THRESHOLDS # -2
327 |
328 | if self.allow_low_quality_matches:
329 | assert all_matches is not None
330 | set_low_quality_matches_(matches, all_matches, match_quality_matrix)
331 |
332 | return matches
333 |
334 |
335 | def smooth_l1_loss(input, target, beta: float = 1. / 9, size_average: bool = True):
336 | """
337 | smooth_l1_loss for bbox regression
338 | :param input:
339 | :param target:
340 | :param beta:
341 | :param size_average:
342 | :return:
343 | """
344 |
345 | n = torch.abs(input - target)
346 | cond = n < beta
347 | loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
348 | if size_average:
349 | return loss.mean()
350 | return loss.sum()
351 |
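# A minimal round-trip sketch for BoxCoder: encoding a gt box against an anchor
# and decoding the resulting deltas should recover the gt box (toy values).
if __name__ == "__main__":
    coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
    anchors = torch.tensor([[0., 0., 10., 10.]])
    gt = torch.tensor([[2., 2., 12., 14.]])
    deltas = coder.encode([gt], [anchors])[0]  # (dx, dy, dw, dh) per box
    decoded = coder.decode(deltas, [anchors])  # shape [1, 1, 4]
    print(deltas)             # ~ tensor([[0.2000, 0.3000, 0.0000, 0.1823]])
    print(decoded.squeeze())  # ~ tensor([ 2.,  2., 12., 14.])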
--------------------------------------------------------------------------------
/utils/draw_box_utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import PIL.ImageDraw as ImageDraw
3 | import PIL.ImageFont as ImageFont
4 | import numpy as np
5 |
6 | STANDARD_COLORS = [
7 | 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
8 | 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
9 | 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
10 | 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
11 | 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
12 | 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
13 | 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
14 | 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
15 | 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
16 | 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
17 | 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
18 | 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
19 | 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
20 | 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
21 | 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
22 | 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
23 | 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
24 | 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
25 | 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
26 | 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
27 | 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
28 | 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
29 | 'WhiteSmoke', 'Yellow', 'YellowGreen'
30 | ]
31 |
32 |
33 | def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map):
34 | for i in range(boxes.shape[0]):
35 | if scores[i] > thresh:
36 | box = tuple(boxes[i].tolist())
37 | if classes[i] in category_index.keys():
38 | class_name = category_index[classes[i]]
39 | else:
40 | class_name = 'N/A'
41 | display_str = str(class_name)
42 | display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
43 | box_to_display_str_map[box].append(display_str)
44 | box_to_color_map[box] = STANDARD_COLORS[
45 | classes[i] % len(STANDARD_COLORS)]
46 | else:
47 | break
48 |
49 |
50 | def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
51 | try:
52 | font = ImageFont.truetype('arial.ttf', 24)
53 | except IOError:
54 | font = ImageFont.load_default()
55 |
56 | display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]]
57 | total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
58 |
59 | if top > total_display_str_height:
60 | text_bottom = top
61 | else:
62 | text_bottom = bottom + total_display_str_height
63 | # Reverse list and print from bottom to top.
64 | for display_str in box_to_display_str_map[box][::-1]:
65 | text_width, text_height = font.getsize(display_str)
66 | margin = np.ceil(0.05 * text_height)
67 | draw.rectangle([(left, text_bottom - text_height - 2 * margin),
68 | (left + text_width, text_bottom)], fill=color)
69 | draw.text((left + margin, text_bottom - text_height - margin),
70 | display_str,
71 | fill='black',
72 | font=font)
73 | text_bottom -= text_height - 2 * margin
74 |
75 |
76 | def draw_box(image, boxes, classes, scores, category_index, thresh=0.5, line_thickness=8):
77 | box_to_display_str_map = collections.defaultdict(list)
78 | box_to_color_map = collections.defaultdict(str)
79 |
80 | filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)
81 |
82 | # Draw all boxes onto image.
83 | draw = ImageDraw.Draw(image)
84 | for box, color in box_to_color_map.items():
85 | xmin, ymin, xmax, ymax = box
86 | (left, right, top, bottom) = (xmin * 1, xmax * 1,
87 | ymin * 1, ymax * 1)
88 | draw.line([(left, top), (left, bottom), (right, bottom),
89 | (right, top), (left, top)], width=line_thickness, fill=color)
90 | draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
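# A minimal usage sketch: draw one detection on a blank PIL image. The category
# mapping and file name below are toy placeholders, and it assumes a Pillow
# version that still provides ImageFont.getsize (used by draw_text above).
if __name__ == "__main__":
    import PIL.Image as Image
    img = Image.new('RGB', (200, 200), 'white')
    boxes = np.array([[30., 30., 150., 150.]])  # xmin, ymin, xmax, ymax
    classes = np.array([1])
    scores = np.array([0.87])
    draw_box(img, boxes, classes, scores, category_index={1: 'person'}, thresh=0.5)
    img.save('demo_box.png')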
--------------------------------------------------------------------------------
/utils/evaluate_utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | import torch
3 | from utils.train_utils import MetricLogger
4 | from utils.coco_utils import get_coco_api_from_dataset, CocoEvaluator
5 |
6 |
7 | @torch.no_grad()
8 | def evaluate(model, data_loader, device, mAP_list=None):
9 | n_threads = torch.get_num_threads()
10 | torch.set_num_threads(1)
11 | cpu_device = torch.device("cpu")
12 | model.eval()
13 | metric_logger = MetricLogger(delimiter=" ")
14 | header = "Test: "
15 |
16 | coco = get_coco_api_from_dataset(data_loader.dataset)
17 | iou_types = ["bbox"]
18 | coco_evaluator = CocoEvaluator(coco, iou_types)
19 |
20 | for image, targets in metric_logger.log_every(data_loader, 100, header):
21 | image = list(img.to(device) for img in image)
22 |
23 | if device != torch.device("cpu"):
24 | torch.cuda.synchronize(device)
25 |
26 | model_time = time.time()
27 | outputs = model(image)
28 |
29 | outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
30 | model_time = time.time() - model_time
31 |
32 | res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
33 |
34 | evaluator_time = time.time()
35 | coco_evaluator.update(res)
36 | evaluator_time = time.time() - evaluator_time
37 | metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
38 |
39 | # gather the stats from all processes
40 | metric_logger.synchronize_between_processes()
41 | print("Averaged stats:", metric_logger)
42 | coco_evaluator.synchronize_between_processes()
43 |
44 | # accumulate predictions from all images
45 | coco_evaluator.accumulate()
46 | coco_evaluator.summarize()
47 | torch.set_num_threads(n_threads)
48 |
49 | print_txt = coco_evaluator.coco_eval[iou_types[0]].stats
50 | coco_mAP = print_txt[0]
51 | voc_mAP = print_txt[1]
52 | if isinstance(mAP_list, list):
53 | mAP_list.append(voc_mAP)
54 |
55 | return coco_evaluator, voc_mAP
56 |
57 |
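# A minimal usage sketch, assuming `model` is a trained detector and `val_loader`
# is a DataLoader built like the one in train.py (both names are placeholders):
#
#     coco_evaluator, voc_map = evaluate(model, val_loader, device=torch.device("cuda"))
#     print(voc_map)  # AP at IoU=0.50 (stats[1] of the COCO bbox evaluation)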
--------------------------------------------------------------------------------
/utils/faster_rcnn_utils.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from collections import OrderedDict
3 |
4 | import torch
5 | import torch.nn.functional as F
6 | from torch import Tensor
7 | from torch import nn
8 | from torch.jit.annotations import Tuple, List, Dict, Optional
9 | from torchvision.ops import MultiScaleRoIAlign
10 |
11 | from utils.anchor_utils import AnchorsGenerator
12 | from utils.roi_header_util import RoIHeads
13 | from utils.rpn_utils import RPNHead, RegionProposalNetwork
14 | from utils.transform_utils import GeneralizedRCNNTransform
15 |
16 |
17 | class FasterRCNNBase(nn.Module):
18 | """
19 | Main class for Generalized R-CNN.
20 |
21 | Arguments:
22 | backbone (nn.Module):
23 | rpn (nn.Module):
24 | roi_heads (nn.Module): takes the features + the proposals from the RPN and computes
25 | detections / masks from it.
26 | transform (nn.Module): performs the data transformation from the inputs to feed into
27 | the model
28 | """
29 |
30 | def __init__(self, backbone, rpn, roi_heads, transform):
31 | super(FasterRCNNBase, self).__init__()
32 | self.transform = transform
33 | self.backbone = backbone
34 | self.rpn = rpn
35 | self.roi_heads = roi_heads
36 |
37 | @torch.jit.unused
38 | def eager_outputs(self, losses, detections):
39 | if self.training:
40 | return losses
41 |
42 | return detections
43 |
44 | def forward(self, images, targets=None):
45 | """
46 | Arguments:
47 | images (list[Tensor]): images to be processed
48 | targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
49 |
50 | Returns:
51 | result (list[BoxList] or dict[Tensor]): the output from the model.
52 | During training, it returns a dict[Tensor] which contains the losses.
53 | During testing, it returns list[BoxList] contains additional fields
54 | like `scores`, `labels` and `mask` (for Mask R-CNN models).
55 |
56 | """
57 | if self.training and targets is None:
58 | raise ValueError("In training mode, targets should be passed")
59 |
60 | if self.training:
61 | assert targets is not None
62 | for target in targets:
63 | boxes = target["boxes"]
64 | if isinstance(boxes, torch.Tensor):
65 | if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
66 | raise ValueError("Expected target boxes to be a tensor"
67 | "of shape [N, 4], got {:}.".format(
68 | boxes.shape))
69 | else:
70 | raise ValueError("Expected target boxes to be of type "
71 | "Tensor, got {:}.".format(type(boxes)))
72 |
73 | original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], [])
74 | for img in images:
75 | val = img.shape[-2:]
76 | assert len(val) == 2
77 | original_image_sizes.append((val[0], val[1]))
78 |
79 | images, targets = self.transform(images, targets)
80 |
81 | features = self.backbone(images.tensors)
82 | if isinstance(features, torch.Tensor):
83 | features = OrderedDict([('0', features)])
84 |
85 | proposals, proposal_losses = self.rpn(images, features, targets)
86 |
87 | detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
88 |
89 | detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
90 |
91 | losses = {}
92 | losses.update(detector_losses)
93 | losses.update(proposal_losses)
94 |
95 | return self.eager_outputs(losses, detections)
96 |
97 |
98 | class TwoMLPHead(nn.Module):
99 | """
100 | two fc layers after roi pooling/align
101 | :param in_channels: number of input channels
102 | :param representation_size: size of the intermediate representation
103 | """
104 |
105 | def __init__(self, in_channels, representation_size):
106 | super(TwoMLPHead, self).__init__()
107 |
108 | self.fc6 = nn.Linear(in_channels, representation_size)
109 | self.fc7 = nn.Linear(representation_size, representation_size)
110 |
111 | def forward(self, x):
112 | x = x.flatten(start_dim=1)
113 |
114 | x = F.relu(self.fc6(x))
115 | x = F.relu(self.fc7(x))
116 |
117 | return x
118 |
119 |
120 | class FastRCNNPredictor(nn.Module):
121 | """
122 | Standard classification + bounding box regression layers for Fast R-CNN.
123 | :param in_channels: number of input channels
124 | :param num_classes: number of output classes (including background)
125 | """
126 |
127 | def __init__(self, in_channels, num_classes):
128 | super(FastRCNNPredictor, self).__init__()
129 | self.cls_score = nn.Linear(in_channels, num_classes)
130 | self.bbox_pred = nn.Linear(in_channels, num_classes * 4)
131 |
132 | def forward(self, x):
133 | if x.dim() == 4:
134 | assert list(x.shape[2:]) == [1, 1]
135 | x = x.flatten(start_dim=1)
136 | scores = self.cls_score(x)
137 | bbox_deltas = self.bbox_pred(x)
138 |
139 | return scores, bbox_deltas
140 |
141 |
142 | class FasterRCNN(FasterRCNNBase):
143 | """
144 | Implementation of Faster R-CNN.
145 |
146 | The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
147 | image, and should be in 0-1 range. Different images can have different sizes.
148 |
149 | The behavior of the model changes depending if it is in training or inference mode.
150 |
151 | During training, the model expects both the input tensors, as well as a targets (list of dictionary),
152 | containing:
153 | - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values
154 | between 0 and H and 0 and W
155 | - labels (Int64Tensor[N]): the class label for each ground-truth box
156 |
157 | The model returns a Dict[Tensor] during training, containing the classification and regression
158 | losses for both the RPN and the R-CNN.
159 |
160 | During inference, the model requires only the input tensors, and returns the post-processed
161 | predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
162 | follows:
163 | - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between
164 | 0 and H and 0 and W
165 | - labels (Int64Tensor[N]): the predicted labels for each image
166 | - scores (Tensor[N]): the scores of each prediction
167 |
168 | :param backbone: (nn.Module), the network used to compute the features for the model.
169 | It should contain a out_channels attribute, which indicates the number of output
170 | channels that each feature map has (and it should be the same for all feature maps).
171 | The backbone should return a single Tensor or an OrderedDict[Tensor].
172 | :param num_classes: (int), number of output classes of the model (including the background).
173 | If box_predictor is specified, num_classes should be None.
174 | :param min_size: (int), minimum size of the image to be rescaled before feeding it to the backbone
175 | :param max_size: (int), maximum size of the image to be rescaled before feeding it to the backbone
176 | :param image_mean: (Tuple[float, float, float]):, mean values used for input normalization.
177 | They are generally the mean values of the dataset on which the backbone has been trained
178 | on
179 | :param image_std: (Tuple[float, float, float]), std values used for input normalization.
180 | They are generally the std values of the dataset on which the backbone has been trained on
181 | :param rpn_anchor_generator: (AnchorGenerator), module that generates the anchors for a set of feature maps.
182 | :param rpn_head: (nn.Module), module that computes the objectness and regression deltas from the RPN
183 | :param rpn_pre_nms_top_n_train:(int), number of proposals to keep before applying NMS during training
184 | :param rpn_pre_nms_top_n_test: (int), number of proposals to keep before applying NMS during testing
185 | :param rpn_post_nms_top_n_train: (int), number of proposals to keep after applying NMS during training
186 | :param rpn_post_nms_top_n_test: (int), number of proposals to keep after applying NMS during testing
187 | :param rpn_nms_thresh: (float), NMS threshold used for postprocessing the RPN proposals
188 | :param rpn_fg_iou_thresh:(float), minimum IoU between the anchor and the GT box so that they can be
189 | considered as positive during training of the RPN.
190 | :param rpn_bg_iou_thresh:(float), maximum IoU between the anchor and the GT box so that they can be
191 | considered as negative during training of the RPN.
192 | :param rpn_batch_size_per_image: (int), number of anchors that are sampled during training of the RPN
193 | for computing the loss
194 | :param rpn_positive_fraction: (float), proportion of positive anchors in a mini-batch during training
195 | of the RPN
196 | :param box_roi_pool:(MultiScaleRoIAlign), the module which crops and resizes the feature maps in
197 | the locations indicated by the bounding boxes
198 | :param box_head:(nn.Module), module that takes the cropped feature maps as input
199 | :param box_predictor:(nn.Module), module that takes the output of box_head and returns the
200 | classification logits and box regression deltas.
201 | :param box_score_thresh:(float),during inference, only return proposals with a classification score
202 | greater than box_score_thresh
203 | :param box_nms_thresh: (float), NMS threshold for the prediction head. Used during inference
204 | :param box_detections_per_img: (int), maximum number of detections per image, for all classes.
205 | :param box_fg_iou_thresh:(float): minimum IoU between the proposals and the GT box so that they can be
206 | considered as positive during training of the classification head
207 | :param box_bg_iou_thresh: (float), maximum IoU between the proposals and the GT box so that they can be
208 | considered as negative during training of the classification head
209 | :param box_batch_size_per_image: (int), number of proposals that are sampled during training of the
210 | classification head
211 | :param box_positive_fraction: (float), proportion of positive proposals in a mini-batch during training
212 | of the classification head
213 | :param bbox_reg_weights: (Tuple[float, float, float, float]), weights for the encoding/decoding of the
214 | bounding boxes
215 | """
216 |
217 | def __init__(self, backbone, num_classes=None,
218 | # transform parameter
219 | min_size=300, max_size=800, # preprocess minimum and maximum size
220 | image_mean=None, image_std=None, # mean and std in preprocess
221 |
222 | # RPN parameters
223 | rpn_anchor_generator=None, rpn_head=None,
224 | rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, # kept proposals before nms
225 | rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, # kept proposals after nms
226 | rpn_nms_thresh=0.7, # iou threshold during nms
227 | rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3, # bg/fg threshold
228 | rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # number of samples and fraction
229 |
230 | # Box parameters
231 | box_roi_pool=None, box_head=None, box_predictor=None,
232 |
233 | # remove low threshold target
234 | box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
235 | box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
236 | box_batch_size_per_image=512, box_positive_fraction=0.25,
237 | bbox_reg_weights=None
238 | ):
239 |
240 | if not hasattr(backbone, "out_channels"):
241 | raise ValueError(
242 | "backbone should contain an attribute out_channels"
243 | "specifying the number of output channels (assumed to be the"
244 | "same for all the levels"
245 | )
246 |
247 | assert isinstance(rpn_anchor_generator, (AnchorsGenerator, type(None)))
248 | assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))
249 |
250 | if num_classes is not None:
251 | if box_predictor is not None:
252 | raise ValueError("num_classes should be None when box_predictor "
253 | "is specified")
254 | else:
255 | if box_predictor is None:
256 | raise ValueError("num_classes should not be None when box_predictor "
257 | "is not specified")
258 |
259 | # output channels of the backbone
260 | out_channels = backbone.out_channels
261 |
262 | if rpn_head is None:
263 | rpn_head = RPNHead(
264 | out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
265 | )
266 |
267 | rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
268 | rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)
269 |
270 | rpn = RegionProposalNetwork(
271 | rpn_anchor_generator, rpn_head,
272 | rpn_fg_iou_thresh, rpn_bg_iou_thresh,
273 | rpn_batch_size_per_image, rpn_positive_fraction,
274 | rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)
275 |
276 | # two fc layers after roi pooling
277 | if box_head is None:
278 | resolution = box_roi_pool.output_size[0]
279 | representation_size = 1024
280 | box_head = TwoMLPHead(
281 | out_channels * resolution ** 2,
282 | representation_size
283 | )
284 |
285 | # get prediction
286 | if box_predictor is None:
287 | representation_size = 1024
288 | box_predictor = FastRCNNPredictor(
289 | representation_size,
290 | num_classes)
291 |
292 | roi_heads = RoIHeads(
293 | # box
294 | box_roi_pool, box_head, box_predictor,
295 | box_fg_iou_thresh, box_bg_iou_thresh,
296 | box_batch_size_per_image, box_positive_fraction,
297 | bbox_reg_weights,
298 | box_score_thresh, box_nms_thresh, box_detections_per_img)
299 |
300 | transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
301 |
302 | super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
303 |
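# A minimal construction sketch with a torchvision MobileNetV2 backbone. The
# numbers below are illustrative, and it assumes the rpn/roi utilities imported
# above behave like their torchvision counterparts.
if __name__ == "__main__":
    import torchvision
    backbone = torchvision.models.mobilenet_v2(pretrained=False).features
    backbone.out_channels = 1280  # FasterRCNN requires this attribute
    anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
                                        aspect_ratios=((0.5, 1.0, 2.0),))
    roi_pooler = MultiScaleRoIAlign(featmap_names=['0'], output_size=7, sampling_ratio=2)
    model = FasterRCNN(backbone, num_classes=91,
                       image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225],
                       rpn_anchor_generator=anchor_generator,
                       box_roi_pool=roi_pooler)
    model.eval()
    # predictions = model([torch.rand(3, 300, 400)])  # list of dicts: boxes, labels, scores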
--------------------------------------------------------------------------------
/utils/im_utils.py:
--------------------------------------------------------------------------------
1 | import random
2 | import torch
3 | from torch.jit.annotations import List, Tuple
4 | from torch import Tensor
5 | from torchvision.transforms import functional as F
6 |
7 |
8 | class Compose(object):
9 |
10 | def __init__(self, transforms):
11 | self.transforms = transforms
12 |
13 | def __call__(self, image, target):
14 | for t in self.transforms:
15 | image, target = t(image, target)
16 | return image, target
17 |
18 |
19 | class ToTensor(object):
20 |
21 | def __call__(self, image, target):
22 | image = F.to_tensor(image)
23 | return image, target
24 |
25 |
26 | class RandomHorizontalFlip(object):
27 |
28 | def __init__(self, prob=0.5):
29 | self.prob = prob
30 |
31 | def __call__(self, image, target):
32 | if random.random() < self.prob:
33 | height, width = image.shape[-2:]
34 | image = image.flip(-1)
35 | bbox = target["boxes"]
36 | # bbox: xmin, ymin, xmax, ymax
37 | bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
38 | target["boxes"] = bbox
39 | return image, target
40 |
41 |
42 | @torch.jit.script
43 | class ImageList(object):
44 | """
45 | Structure that holds a list of images (of possibly
46 | varying sizes) as a single tensor.
47 | This works by padding the images to the same size,
48 | and storing in a field the original sizes of each image
49 | """
50 |
51 | def __init__(self, tensors, image_sizes):
52 | """
53 | Arguments:
54 |             tensors (tensor): the batched image data after padding
55 |             image_sizes (list[tuple[int, int]]): the image sizes before padding
56 | """
57 | self.tensors = tensors
58 | self.image_sizes = image_sizes
59 |
60 | def to(self, device):
61 | cast_tensor = self.tensors.to(device)
62 | return ImageList(cast_tensor, self.image_sizes)
63 |
--------------------------------------------------------------------------------
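
`RandomHorizontalFlip` above mirrors the box x-coordinates around the image width (xmin' = W - xmax, xmax' = W - xmin) while leaving the y-coordinates untouched. A quick sketch that forces the flip with `prob=1.0`; the toy image and box values are made up:

```python
import torch
from PIL import Image

from utils.im_utils import Compose, ToTensor, RandomHorizontalFlip

# toy 100x200 (H x W) image with a single box; PIL sizes are given as (W, H)
image = Image.new("RGB", (200, 100))
target = {"boxes": torch.tensor([[10., 20., 50., 80.]]),
          "labels": torch.tensor([1])}

transforms = Compose([ToTensor(), RandomHorizontalFlip(prob=1.0)])
image, target = transforms(image, target)

# xmin/xmax are mirrored around the width:
# [10, 20, 50, 80] -> [200 - 50, 20, 200 - 10, 80] = [150, 20, 190, 80]
print(target["boxes"])
```
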
/utils/plot_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import matplotlib.pyplot as plt
3 |
4 |
5 | def plot_loss_and_lr(train_loss, learning_rate, save_dir):
6 | try:
7 | x = list(range(len(train_loss)))
8 | fig, ax1 = plt.subplots(1, 1)
9 | ax1.plot(x, train_loss, 'r', label='loss')
10 | ax1.set_xlabel("step")
11 | ax1.set_ylabel("loss")
12 | ax1.set_title("Train Loss and lr")
13 | plt.legend(loc='best')
14 |
15 | ax2 = ax1.twinx()
16 | ax2.plot(x, learning_rate, label='lr')
17 | ax2.set_ylabel("learning rate")
18 | ax2.set_xlim(0, len(train_loss))
19 | plt.legend(loc='best')
20 |
21 | handles1, labels1 = ax1.get_legend_handles_labels()
22 | handles2, labels2 = ax2.get_legend_handles_labels()
23 | plt.legend(handles1 + handles2, labels1 + labels2, loc='upper right')
24 |
25 | fig.subplots_adjust(right=0.8)
26 | fig.savefig(os.path.join(save_dir, 'loss_and_lr.png'))
27 | plt.close()
28 |         print("successfully saved loss curve!")
29 | except Exception as e:
30 | print(e)
31 |
32 |
33 | def plot_map(mAP, save_dir):
34 | try:
35 | x = list(range(len(mAP)))
36 |         plt.plot(x, mAP, label='mAP')
37 | plt.xlabel('epoch')
38 | plt.ylabel('mAP')
39 | plt.title('Eval mAP')
40 | plt.xlim(0, len(mAP))
41 | plt.legend(loc='best')
42 | plt.savefig(os.path.join(save_dir, 'mAP.png'))
43 | plt.close()
44 |         print("successfully saved mAP curve!")
45 | except Exception as e:
46 | print(e)
47 |
--------------------------------------------------------------------------------
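
Both plotting helpers take plain Python lists plus a directory to write the PNGs into, and swallow any plotting error instead of raising. A short sketch with made-up values (`./results` is a hypothetical output directory):

```python
import os

from utils.plot_utils import plot_loss_and_lr, plot_map

# hypothetical values collected during training / evaluation
train_loss = [2.5, 1.8, 1.2, 0.9, 0.7]
learning_rate = [0.001, 0.001, 0.0005, 0.0005, 0.00025]
val_map = [0.21, 0.35, 0.42]

save_dir = "./results"
os.makedirs(save_dir, exist_ok=True)

plot_loss_and_lr(train_loss, learning_rate, save_dir)  # writes loss_and_lr.png
plot_map(val_map, save_dir)                            # writes mAP.png
```
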
/utils/roi_header_util.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | from torch import Tensor
3 | from torch.jit.annotations import List, Dict, Tuple
4 |
5 | import utils.boxes_utils as box_op
6 | from utils.det_utils import *
7 |
8 |
9 | def fastrcnn_loss(class_logits, box_regression, labels, regression_targets):
10 | """
11 | Computes the loss for Faster R-CNN.
12 |     :param class_logits: predicted class logits, shape=[num_proposals, num_classes]
13 |     :param box_regression: predicted bbox regression parameters
14 |     :param labels: true labels, one tensor per image
15 |     :param regression_targets: true bbox regression targets, one tensor per image
16 | :return: classification_loss (Tensor)
17 | box_loss (Tensor)
18 | """
19 |
20 | labels = torch.cat(labels, dim=0)
21 | regression_targets = torch.cat(regression_targets, dim=0)
22 |
23 | classification_loss = F.cross_entropy(class_logits, labels)
24 |
25 | # get indices that correspond to the regression targets for
26 | # the corresponding ground truth labels, to be used with
27 | # advanced indexing
28 | sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1)
29 |
30 | labels_pos = labels[sampled_pos_inds_subset]
31 |
32 | # shape=[num_proposal, num_classes]
33 | N, num_classes = class_logits.shape
34 | box_regression = box_regression.reshape(N, -1, 4)
35 |
36 | box_loss = smooth_l1_loss(box_regression[sampled_pos_inds_subset, labels_pos],
37 | regression_targets[sampled_pos_inds_subset],
38 | beta=1 / 9,
39 | size_average=False,
40 | ) / labels.numel()
41 |
42 | return classification_loss, box_loss
43 |
44 |
45 | def add_gt_proposals(proposals, gt_boxes):
46 | """
47 |     concatenate gt boxes and proposals
48 |     :param proposals: bboxes predicted by the rpn
49 |     :param gt_boxes: ground-truth bboxes
50 | :return:
51 | """
52 |
53 | proposals = [
54 | torch.cat((proposal, gt_box))
55 | for proposal, gt_box in zip(proposals, gt_boxes)
56 | ]
57 | return proposals
58 |
59 |
60 | def check_targets(targets):
61 | assert targets is not None
62 | assert all(["boxes" in t for t in targets])
63 | assert all(["labels" in t for t in targets])
64 |
65 |
66 | class RoIHeads(torch.nn.Module):
67 | def __init__(self,
68 | box_roi_pool,
69 | box_head,
70 | box_predictor,
71 |
72 | # Faster R-CNN training
73 | fg_iou_thresh, bg_iou_thresh,
74 | batch_size_per_image, positive_fraction,
75 | bbox_reg_weights,
76 |
77 | # Faster R-CNN inference
78 | score_thresh,
79 | nms_thresh,
80 | detection_per_img):
81 | super(RoIHeads, self).__init__()
82 |
83 | self.box_similarity = box_op.box_iou
84 |
85 | # assign ground-truth boxes for each proposal
86 | self.proposal_matcher = Matcher(
87 | fg_iou_thresh, # 0.5
88 | bg_iou_thresh, # 0.5
89 | allow_low_quality_matches=False)
90 |
91 | self.fg_bg_sampler = BalancedPositiveNegativeSampler(
92 | batch_size_per_image, # 512
93 | positive_fraction) # 0.25
94 |
95 | if bbox_reg_weights is None:
96 | bbox_reg_weights = (10., 10., 5., 5.)
97 | self.box_coder = BoxCoder(bbox_reg_weights)
98 |
99 | self.box_roi_pool = box_roi_pool
100 | self.box_head = box_head
101 | self.box_predictor = box_predictor
102 |
103 | self.score_thresh = score_thresh
104 | self.nms_thresh = nms_thresh
105 | self.detection_per_img = detection_per_img
106 |
107 | def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
108 | """
109 |         get the matched gt bbox for every proposal, and divide proposals into positive/negative samples
110 | :param proposals:
111 | :param gt_boxes:
112 | :param gt_labels:
113 | :return:
114 | """
115 |
116 | matched_idxs = []
117 | labels = []
118 | for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):
119 | if gt_boxes_in_image.numel() == 0:
120 | # background image
121 | device = proposals_in_image.device
122 | clamped_matched_idxs_in_image = torch.zeros(
123 | (proposals_in_image.shape[0],), dtype=torch.int64, device=device
124 | )
125 | labels_in_image = torch.zeros(
126 | (proposals_in_image.shape[0],), dtype=torch.int64, device=device
127 | )
128 | else:
129 | # iou of bbox and anchors
130 | match_quality_matrix = box_op.box_iou(gt_boxes_in_image, proposals_in_image)
131 |
132 | matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)
133 |
134 | clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)
135 |
136 | labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
137 | labels_in_image = labels_in_image.to(dtype=torch.int64)
138 |
139 | # label background (below the low threshold)
140 | bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD # -1
141 | labels_in_image[bg_inds] = 0
142 |
143 | # label ignore proposals (between low and high threshold)
144 | ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS # -2
145 | labels_in_image[ignore_inds] = -1 # -1 is ignored by sampler
146 |
147 | matched_idxs.append(clamped_matched_idxs_in_image)
148 | labels.append(labels_in_image)
149 | return matched_idxs, labels
150 |
151 | def subsample(self, labels):
152 | sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
153 | sampled_inds = []
154 | for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
155 | img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
156 | sampled_inds.append(img_sampled_inds)
157 | return sampled_inds
158 |
159 | def select_training_samples(self,
160 | proposals,
161 | targets
162 | ):
163 |
164 | check_targets(targets)
165 | assert targets is not None
166 | dtype = proposals[0].dtype
167 | device = proposals[0].device
168 |
169 | gt_boxes = [t["boxes"].to(dtype) for t in targets]
170 | gt_labels = [t["labels"] for t in targets]
171 |
172 | # append ground-truth bboxes to proposal
173 | proposals = add_gt_proposals(proposals, gt_boxes)
174 |
175 | # get matching gt indices for each proposal
176 | matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
177 |
178 | # sample a fixed proportion of positive-negative proposals
179 | sampled_inds = self.subsample(labels)
180 | matched_gt_boxes = []
181 | num_images = len(proposals)
182 |
183 | for img_id in range(num_images):
184 | img_sampled_inds = sampled_inds[img_id]
185 | proposals[img_id] = proposals[img_id][img_sampled_inds]
186 | labels[img_id] = labels[img_id][img_sampled_inds]
187 | matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]
188 |
189 | gt_boxes_in_image = gt_boxes[img_id]
190 | if gt_boxes_in_image.numel() == 0:
191 | gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
192 | matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])
193 |
194 | regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
195 | return proposals, matched_idxs, labels, regression_targets
196 |
197 | def postprocess_detections(self,
198 | class_logits,
199 | box_regression,
200 | proposals,
201 | image_shapes
202 | ):
203 | """
204 |         Post-process the network predictions:
205 |         (1) decode the final bbox coordinates from the proposals and the predicted regression parameters
206 |         (2) apply softmax to the predicted class logits
207 |         (3) clip the predicted boxes so that out-of-image coordinates lie on the image boundary
208 |         (4) remove all background predictions
209 |         (5) remove low-scoring predictions
210 |         (6) remove small boxes
211 |         (7) apply nms and sort the results by score
212 |         (8) return the top-k predictions ranked by score
213 |         Args:
214 |             class_logits: predicted class logits
215 |             box_regression: predicted bbox regression parameters
216 |             proposals: proposals output by the rpn
217 |             image_shapes: width and height of each image before batching
218 |
219 | Returns:
220 |
221 | """
222 | device = class_logits.device
223 |         # number of predicted classes
224 | num_classes = class_logits.shape[-1]
225 |
226 |         # number of predicted bboxes per image
227 | boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
228 |         # decode the final bbox coordinates from the proposals and the predicted regression parameters
229 | pred_boxes = self.box_coder.decode(box_regression, proposals)
230 |
231 |         # apply softmax to the predicted class logits
232 | pred_scores = F.softmax(class_logits, -1)
233 |
234 | # split boxes and scores per image
235 |         # split the results according to the number of predicted bboxes per image
236 | pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
237 | pred_scores_list = pred_scores.split(boxes_per_image, 0)
238 |
239 | all_boxes = []
240 | all_scores = []
241 | all_labels = []
242 |         # iterate over the predictions of each image
243 | for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
244 |             # clip the predicted boxes so that out-of-image coordinates lie on the image boundary
245 | boxes = box_op.clip_boxes_to_image(boxes, image_shape)
246 |
247 | # create labels for each prediction
248 | labels = torch.arange(num_classes, device=device)
249 | labels = labels.view(1, -1).expand_as(scores)
250 |
251 | # remove prediction with the background label
252 |             # drop column 0 (index 0 is the background class)
253 | boxes = boxes[:, 1:]
254 | scores = scores[:, 1:]
255 | labels = labels[:, 1:]
256 |
257 | # batch everything, by making every class prediction be a separate instance
258 | boxes = boxes.reshape(-1, 4)
259 | scores = scores.reshape(-1)
260 | labels = labels.reshape(-1)
261 |
262 | # remove low scoring boxes
263 |             # remove low-scoring predictions, self.score_thresh=0.05
264 | inds = torch.nonzero(scores > self.score_thresh).squeeze(1)
265 | boxes, scores, labels = boxes[inds], scores[inds], labels[inds]
266 |
267 | # remove empty boxes
268 |             # remove small boxes
269 | keep = box_op.remove_small_boxes(boxes, min_size=1e-2)
270 | boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
271 |
272 |             # non-maximum suppression, independently done per class
273 |             # nms returns the kept indices sorted by score in descending order
274 | keep = box_op.batched_nms(boxes, scores, labels, self.nms_thresh)
275 |
276 | # keep only topk scoring predictions
277 |             # keep the top-k highest scoring predictions
278 | keep = keep[:self.detection_per_img]
279 | boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
280 |
281 | all_boxes.append(boxes)
282 | all_scores.append(scores)
283 | all_labels.append(labels)
284 |
285 | return all_boxes, all_scores, all_labels
286 |
287 | def forward(self,
288 | features,
289 | proposals,
290 | image_shapes,
291 | targets=None
292 | ):
293 | """
294 | Arguments:
295 | features (List[Tensor])
296 | proposals (List[Tensor[N, 4]])
297 | image_shapes (List[Tuple[H, W]])
298 | targets (List[Dict])
299 | """
300 |
301 | if targets is not None:
302 | for t in targets:
303 | floating_point_types = (torch.float, torch.double, torch.half)
304 |                 assert t["boxes"].dtype in floating_point_types, "target boxes must be of float type"
305 |                 # assert t["labels"].dtype == torch.int64, "target labels must be of int64 type"
306 |
307 | if self.training:
308 |             # sample positive/negative proposals and collect the matched gt labels and bbox regression targets
309 | proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
310 | else:
311 | labels = None
312 | regression_targets = None
313 | matched_idxs = None
314 |
315 |         # pass the sampled proposals through the roi_pooling layer
316 |         box_features = self.box_roi_pool(features, proposals, image_shapes)
317 |         # two fully connected layers after roi_pooling
318 |         box_features = self.box_head(box_features)
319 |         # predict the class scores and the bbox regression parameters
320 | class_logits, box_regression = self.box_predictor(box_features)
321 |
322 | result = torch.jit.annotate(List[Dict[str, torch.Tensor]], [])
323 | losses = {}
324 | if self.training:
325 | assert labels is not None and regression_targets is not None
326 | loss_classifier, loss_box_reg = fastrcnn_loss(
327 | class_logits, box_regression, labels, regression_targets)
328 | losses = {
329 | "loss_classifier": loss_classifier,
330 | "loss_box_reg": loss_box_reg
331 | }
332 | else:
333 | boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
334 | num_images = len(boxes)
335 | for i in range(num_images):
336 | result.append(
337 | {
338 | "boxes": boxes[i],
339 | "labels": labels[i],
340 | "scores": scores[i],
341 | }
342 | )
343 |
344 | return result, losses
345 |
--------------------------------------------------------------------------------
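
`fastrcnn_loss` above takes the flattened per-proposal class logits and box deltas together with per-image lists of labels and regression targets; only the positive proposals (label > 0) contribute to the box loss. A shape sketch with random tensors, where every value is a dummy and only the shapes matter:

```python
import torch

from utils.roi_header_util import fastrcnn_loss

num_classes = 3  # background + 2 object classes

# two images with 4 sampled proposals each -> 8 proposals in total
class_logits = torch.randn(8, num_classes)
box_regression = torch.randn(8, num_classes * 4)

# per-image labels (0 = background) and per-image regression targets
labels = [torch.tensor([0, 1, 2, 0]), torch.tensor([1, 0, 0, 2])]
regression_targets = [torch.randn(4, 4), torch.randn(4, 4)]

cls_loss, box_loss = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
print(cls_loss.item(), box_loss.item())
```
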
/utils/rpn_utils.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.jit.annotations import Dict
3 | from torch.nn import functional as F
4 |
5 | import utils.boxes_utils as box_op
6 | from utils.det_utils import *
7 |
8 |
9 | class RPNHead(nn.Module):
10 | """
11 | RPN head with background/foreground classification and bbox regression
12 | :param self:
13 | :param in_channels: number of channels of the input feature
14 | :param num_anchors: number of anchors to be predicted
15 | :return:
16 | """
17 |
18 | def __init__(self, in_channels, num_anchors):
19 |
20 | super(RPNHead, self).__init__()
21 | # 3x3 conv
22 | self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
23 |
24 | # background/foreground score
25 | self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
26 |
27 | # bbox regression parameters
28 | self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
29 |
30 | for layer in self.children():
31 | if isinstance(layer, nn.Conv2d):
32 | torch.nn.init.normal_(layer.weight, std=0.01)
33 | torch.nn.init.constant_(layer.bias, 0)
34 |
35 | def forward(self, x):
36 | cls_scores = []
37 | bbox_reg = []
38 | for i, feature in enumerate(x):
39 | t = F.relu(self.conv(feature))
40 | cls_scores.append(self.cls_logits(t))
41 | bbox_reg.append(self.bbox_pred(t))
42 | return cls_scores, bbox_reg
43 |
44 |
45 | class RegionProposalNetwork(torch.nn.Module):
46 | """
47 | Implementation of Region Proposal Network (RPN).
48 | :param anchor_generator: module that generates the anchors for feature map.
49 | :param head: module that computes the objectness and regression deltas
50 | :param fg_iou_thresh: minimum IoU between the anchor and the GT box so that they can be
51 | considered as positive during training of the RPN.
52 | :param bg_iou_thresh: maximum IoU between the anchor and the GT box so that they can be
53 | considered as negative during training of the RPN.
54 | :param batch_size_per_image: number of anchors that are sampled during training of the RPN
55 | for computing the loss
56 | :param positive_fraction: proportion of positive anchors in a mini-batch during training
57 | of the RPN
58 | :param pre_nms_top_n: number of proposals to keep before applying NMS. It should
59 | contain two fields: training and testing, to allow for different values depending
60 | on training or evaluation
61 | :param post_nms_top_n: number of proposals to keep after applying NMS. It should
62 | contain two fields: training and testing, to allow for different values depending
63 | on training or evaluation
64 | :param nms_thresh: NMS threshold used for postprocessing the RPN proposals
65 | """
66 |
67 | def __init__(self, anchor_generator, head, fg_iou_thresh, bg_iou_thresh, batch_size_per_image, positive_fraction,
68 | pre_nms_top_n, post_nms_top_n, nms_thresh):
69 |
70 | super(RegionProposalNetwork, self).__init__()
71 | self.anchor_generator = anchor_generator
72 | self.head = head
73 | self.box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
74 |
75 | # use during training
76 | # function for computing iou between anchor and true bbox
77 | self.box_similarity = box_op.box_iou
78 |
79 | self.proposal_matcher = Matcher(
80 | fg_iou_thresh, # foreground threshold, if IOU > threshold(0.7), is positive samples
81 | bg_iou_thresh, # background threshold, if IOU < threshold(0.3), is negative samples
82 | allow_low_quality_matches=True
83 | )
84 |
85 | self.fg_bg_sampler = BalancedPositiveNegativeSampler(
86 | batch_size_per_image, positive_fraction # 256, 0.5
87 | )
88 |
89 | # use during testing
90 | self._pre_nms_top_n = pre_nms_top_n
91 | self._post_nms_top_n = post_nms_top_n
92 | self.nms_thresh = nms_thresh
93 | self.min_size = 1e-3
94 |
95 | def pre_nms_top_n(self):
96 | if self.training:
97 | return self._pre_nms_top_n['training']
98 | return self._pre_nms_top_n['testing']
99 |
100 | def post_nms_top_n(self):
101 | if self.training:
102 | return self._post_nms_top_n['training']
103 | return self._post_nms_top_n['testing']
104 |
105 | def assign_targets_to_anchors(self, anchors, targets):
106 | """
107 |         get the best matching gt box for each anchor, and divide anchors into fg, bg and discarded samples
108 |         :param anchors: (List[Tensor])
109 |         :param targets: (List[Dict[str, Tensor]])
110 |         :return: labels: anchor classes, 1 is foreground, 0 is background, -1 is discarded
111 |                  matched_gt_boxes: best matched gt box for each anchor
112 | """
113 |
114 | labels = []
115 | matched_gt_boxes = []
116 | for anchors_per_image, targets_per_image in zip(anchors, targets):
117 | gt_boxes = targets_per_image["boxes"]
118 | if gt_boxes.numel() == 0:
119 | device = anchors_per_image.device
120 | matched_gt_boxes_per_image = torch.zeros(anchors_per_image.shape, dtype=torch.float32, device=device)
121 | labels_per_image = torch.zeros((anchors_per_image.shape[0],), dtype=torch.float32, device=device)
122 | else:
123 | # compute iou of anchors and real bbox
124 | match_quality_matrix = box_op.box_iou(gt_boxes, anchors_per_image)
125 |                 # calculate the matched gt index for each anchor (iou < 0.3 -> -1, 0.3 <= iou < 0.7 -> -2)
126 |                 matched_idxs = self.proposal_matcher(match_quality_matrix)
127 |
128 |                 # get the matched gt box for each anchor; indices are clamped to 0,
129 |                 # so background anchors temporarily point to gt box 0
130 |                 matched_gt_boxes_per_image = gt_boxes[matched_idxs.clamp(min=0)]
131 |
132 |                 # foreground anchors (positive samples) are those matched to a gt box
133 |                 labels_per_image = matched_idxs >= 0
134 |
135 | labels_per_image = labels_per_image.to(dtype=torch.float32)
136 |
137 | # background (negative examples)
138 | bg_indices = matched_idxs == self.proposal_matcher.BELOW_LOW_THRESHOLD # -1
139 | labels_per_image[bg_indices] = 0.0
140 |
141 | # discard indices that are between thresholds
142 | inds_to_discard = matched_idxs == self.proposal_matcher.BETWEEN_THRESHOLDS # -2
143 | labels_per_image[inds_to_discard] = -1.0
144 |
145 | labels.append(labels_per_image)
146 | matched_gt_boxes.append(matched_gt_boxes_per_image)
147 | return labels, matched_gt_boxes
148 |
149 | def _get_top_n_idx(self, objectness, num_anchors_per_level):
150 | """
151 |         get the indices of the top pre_nms_top_n anchors in the predicted feature maps, based on their scores
152 | :param objectness: scores
153 | :param num_anchors_per_level: number of anchors
154 | :return:
155 | """
156 |
157 | result = []
158 | offset = 0
159 | for ob in objectness.split(num_anchors_per_level, 1):
160 | num_anchors = ob.shape[1]
161 | pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
162 |
163 | # Returns the k largest elements of the given input tensor along a given dimension
164 | _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
165 | result.append(top_n_idx + offset)
166 | offset += num_anchors
167 | return torch.cat(result, dim=1)
168 |
169 | def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
170 | """
171 | remove small bboxes, nms process, get post_nms_top_n target
172 | :param proposals: predicted bbox coordinates
173 | :param objectness: predicted scores
174 | :param image_shapes: image shape
175 |         :param num_anchors_per_level: number of anchors per feature map
176 | :return:
177 | """
178 |
179 | num_images = proposals.shape[0]
180 | device = proposals.device
181 |
182 |         # do not backprop through objectness
183 | objectness = objectness.detach()
184 | objectness = objectness.reshape(num_images, -1)
185 |
186 |         # torch.full((n,), idx) returns a tensor of size n filled with the level index
187 | levels = [torch.full((n,), idx, dtype=torch.int64, device=device)
188 | for idx, n in enumerate(num_anchors_per_level)]
189 | levels = torch.cat(levels, 0)
190 |
191 | # Expand this tensor to the same size as objectness
192 | levels = levels.reshape(1, -1).expand_as(objectness)
193 |
194 | # select top_n boxes independently per level before applying nms
195 | top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)
196 |
197 | image_range = torch.arange(num_images, device=device)
198 | batch_idx = image_range[:, None] # [batch_size, 1]
199 |
200 | objectness = objectness[batch_idx, top_n_idx]
201 | levels = levels[batch_idx, top_n_idx]
202 | proposals = proposals[batch_idx, top_n_idx]
203 |
204 | final_boxes = []
205 | final_scores = []
206 | for boxes, scores, lvl, img_shape in zip(proposals, objectness, levels, image_shapes):
207 |             # clip the predicted boxes so that they lie inside the image
208 | boxes = box_op.clip_boxes_to_image(boxes, img_shape)
209 |
210 |             # Remove boxes which contain at least one side smaller than min_size.
211 | keep = box_op.remove_small_boxes(boxes, self.min_size)
212 | boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
213 |
214 | # non-maximum suppression, independently done per level
215 | keep = box_op.batched_nms(boxes, scores, lvl, self.nms_thresh)
216 |
217 | # keep only top k scoring predictions
218 | keep = keep[: self.post_nms_top_n()]
219 | boxes, scores = boxes[keep], scores[keep]
220 | final_boxes.append(boxes)
221 | final_scores.append(scores)
222 | return final_boxes, final_scores
223 |
224 | def compute_loss(self, objectness, pred_bbox_deltas, labels, regression_targets):
225 | """
226 | compute RPN loss, include classification loss(foreground and background), bbox regression loss
227 | :param objectness: predicted foreground probability
228 | :param pred_bbox_deltas: predicted bbox regression parameters
229 |         :param labels: true labels (1, 0 and -1)
230 | :param regression_targets: true bbox regression
231 | :return: objectness_loss (Tensor) : classification loss
232 |                  box_loss (Tensor): bbox regression loss
233 | """
234 |
235 |         # sample positive and negative anchors
236 | sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
237 |
238 | sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
239 | sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
240 |
241 | sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)
242 | objectness = objectness.flatten()
243 |
244 | labels = torch.cat(labels, dim=0)
245 | regression_targets = torch.cat(regression_targets, dim=0)
246 |
247 | # bbox regression loss
248 | box_loss = smooth_l1_loss(pred_bbox_deltas[sampled_pos_inds], regression_targets[sampled_pos_inds],
249 | beta=1 / 9, size_average=False, ) / (sampled_inds.numel())
250 |
251 | # classification loss
252 | objectness_loss = F.binary_cross_entropy_with_logits(objectness[sampled_inds], labels[sampled_inds])
253 |
254 | return objectness_loss, box_loss
255 |
256 | def forward(self, images, features, targets=None):
257 | """
258 | :param images: (ImageList), images for which we want to compute the predictions
259 | :param features: (Dict[Tensor]), features computed from the images that are
260 | used for computing the predictions. Each tensor in the list
261 | correspond to different feature levels
262 |         :param targets: (List[Dict[str, Tensor]]), ground-truth boxes present in the image (optional).
263 | If provided, each element in the dict should contain a field `boxes`,
264 | with the locations of the ground-truth boxes.
265 | :return:
266 |             boxes (List[Tensor]): the predicted boxes from the RPN, one Tensor per image.
267 | losses (Dict[Tensor]): the losses for the model during training. During testing, it is an empty dict.
268 | """
269 |
270 | # RPN uses all feature maps that are available
271 | features = list(features.values())
272 |
273 |         # RPN head: compute the fg/bg scores and bbox regression deltas for each feature map
274 | fg_bg_scores, pred_bbox_deltas = self.head(features)
275 |
276 | # get all anchors of images based on features
277 | anchors = self.anchor_generator(images, features)
278 |
279 | # batch_size
280 | num_images = len(anchors)
281 |
282 |         # number of anchors per feature level (anchors_per_location * height * width)
283 | num_anchors_per_level_shape_tensors = [o[0].shape for o in fg_bg_scores]
284 | num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
285 |
286 | # adjust tensor order and reshape
287 | fg_bg_scores, pred_bbox_deltas = box_op.concat_box_prediction_layers(fg_bg_scores, pred_bbox_deltas)
288 |
289 | # apply pred_bbox_deltas to anchors to obtain the decoded proposals
290 | proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
291 | proposals = proposals.view(num_images, -1, 4)
292 |
293 | # remove small bboxes, nms process, get post_nms_top_n target
294 | boxes, scores = self.filter_proposals(proposals, fg_bg_scores, images.image_sizes, num_anchors_per_level)
295 |
296 | losses = {}
297 | if self.training:
298 | assert targets is not None
299 | labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
300 |
301 | # encode parameters based on the bboxes and anchors
302 | regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
303 | loss_objectness, loss_rpn_box_reg = self.compute_loss(
304 | fg_bg_scores, pred_bbox_deltas, labels, regression_targets)
305 | losses = {"loss_objectness": loss_objectness, "loss_rpn_box_reg": loss_rpn_box_reg}
306 |
307 | return boxes, losses
308 |
--------------------------------------------------------------------------------
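
`RPNHead` applies a shared 3x3 convolution to every feature level, then predicts one objectness score and four box deltas per anchor per spatial location, so the output channels are `num_anchors` and `num_anchors * 4` respectively. A quick shape check on two made-up feature levels:

```python
import torch

from utils.rpn_utils import RPNHead

in_channels = 256  # channels of the backbone / FPN feature maps
num_anchors = 3    # anchors per spatial location

head = RPNHead(in_channels, num_anchors)

# two hypothetical feature levels with different spatial sizes
features = [torch.randn(1, in_channels, 50, 50),
            torch.randn(1, in_channels, 25, 25)]

cls_scores, bbox_reg = head(features)
for scores, deltas in zip(cls_scores, bbox_reg):
    # e.g. torch.Size([1, 3, 50, 50]) and torch.Size([1, 12, 50, 50])
    print(scores.shape, deltas.shape)
```
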
/utils/train_utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import pickle
3 | import time
4 | from collections import defaultdict, deque
5 |
6 | import torch.distributed as dist
7 | from torchvision import ops
8 |
9 | from backbone.mobilenet import MobileNetV2
10 | from backbone.resnet50_fpn_model import *
11 | from config.train_config import cfg
12 | from utils.anchor_utils import AnchorsGenerator
13 | from utils.faster_rcnn_utils import FasterRCNN, FastRCNNPredictor
14 |
15 |
16 | def create_model(num_classes):
17 | global backbone, model
18 | backbone_network = cfg.backbone
19 |
20 | anchor_sizes = tuple((f,) for f in cfg.anchor_size)
21 | aspect_ratios = tuple((f,) for f in cfg.anchor_ratio) * len(anchor_sizes)
22 | anchor_generator = AnchorsGenerator(sizes=anchor_sizes,
23 | aspect_ratios=aspect_ratios)
24 |
25 | if backbone_network == 'mobilenet':
26 | backbone = MobileNetV2(weights_path=cfg.backbone_pretrained_weights).features
27 | backbone.out_channels = 1280
28 |
29 | roi_pooler = ops.MultiScaleRoIAlign(featmap_names=['0'], # roi pooling in which resolution feature
30 | output_size=cfg.roi_out_size, # roi_pooling output feature size
31 | sampling_ratio=cfg.roi_sample_rate) # sampling_ratio
32 |
33 | model = FasterRCNN(backbone=backbone, num_classes=num_classes,
34 | # transform parameters
35 | min_size=cfg.min_size, max_size=cfg.max_size,
36 | image_mean=cfg.image_mean, image_std=cfg.image_std,
37 | # rpn parameters
38 | rpn_anchor_generator=anchor_generator, box_roi_pool=roi_pooler,
39 | rpn_pre_nms_top_n_train=cfg.rpn_pre_nms_top_n_train,
40 | rpn_pre_nms_top_n_test=cfg.rpn_pre_nms_top_n_test,
41 | rpn_post_nms_top_n_train=cfg.rpn_post_nms_top_n_train,
42 | rpn_post_nms_top_n_test=cfg.rpn_post_nms_top_n_test,
43 | rpn_nms_thresh=cfg.rpn_nms_thresh,
44 | rpn_fg_iou_thresh=cfg.rpn_fg_iou_thresh,
45 | rpn_bg_iou_thresh=cfg.rpn_bg_iou_thresh,
46 | rpn_batch_size_per_image=cfg.rpn_batch_size_per_image,
47 | rpn_positive_fraction=cfg.rpn_positive_fraction,
48 | # Box parameters
49 | box_head=None, box_predictor=None,
50 |
51 | # remove low threshold target
52 | box_score_thresh=cfg.box_score_thresh,
53 | box_nms_thresh=cfg.box_nms_thresh,
54 | box_detections_per_img=cfg.box_detections_per_img,
55 | box_fg_iou_thresh=cfg.box_fg_iou_thresh,
56 | box_bg_iou_thresh=cfg.box_bg_iou_thresh,
57 | box_batch_size_per_image=cfg.box_batch_size_per_image,
58 | box_positive_fraction=cfg.box_positive_fraction,
59 | bbox_reg_weights=cfg.bbox_reg_weights
60 | )
61 | elif backbone_network == 'resnet50_fpn':
62 | backbone = resnet50_fpn_backbone()
63 |
64 | roi_pooler = ops.MultiScaleRoIAlign(
65 | featmap_names=['0', '1', '2', '3'],
66 | output_size=cfg.roi_out_size,
67 | sampling_ratio=cfg.roi_sample_rate)
68 | model = FasterRCNN(backbone=backbone, num_classes=num_classes,
69 | # transform parameters
70 | min_size=cfg.min_size, max_size=cfg.max_size,
71 | image_mean=cfg.image_mean, image_std=cfg.image_std,
72 | # rpn parameters
73 | rpn_anchor_generator=anchor_generator, box_roi_pool=roi_pooler,
74 | rpn_pre_nms_top_n_train=cfg.rpn_pre_nms_top_n_train,
75 | rpn_pre_nms_top_n_test=cfg.rpn_pre_nms_top_n_test,
76 | rpn_post_nms_top_n_train=cfg.rpn_post_nms_top_n_train,
77 | rpn_post_nms_top_n_test=cfg.rpn_post_nms_top_n_test,
78 | rpn_nms_thresh=cfg.rpn_nms_thresh,
79 | rpn_fg_iou_thresh=cfg.rpn_fg_iou_thresh,
80 | rpn_bg_iou_thresh=cfg.rpn_bg_iou_thresh,
81 | rpn_batch_size_per_image=cfg.rpn_batch_size_per_image,
82 | rpn_positive_fraction=cfg.rpn_positive_fraction,
83 | # Box parameters
84 | box_head=None, box_predictor=None,
85 |
86 | # remove low threshold target
87 | box_score_thresh=cfg.box_score_thresh,
88 | box_nms_thresh=cfg.box_nms_thresh,
89 | box_detections_per_img=cfg.box_detections_per_img,
90 | box_fg_iou_thresh=cfg.box_fg_iou_thresh,
91 | box_bg_iou_thresh=cfg.box_bg_iou_thresh,
92 | box_batch_size_per_image=cfg.box_batch_size_per_image,
93 | box_positive_fraction=cfg.box_positive_fraction,
94 | bbox_reg_weights=cfg.bbox_reg_weights
95 | )
96 |
97 | # weights_dict = torch.load(cfg.pretrained_weights)
98 | # missing_keys, unexpected_keys = model.load_state_dict(weights_dict, strict=False)
99 | # if len(missing_keys) != 0 or len(unexpected_keys) != 0:
100 | # print("missing_keys: ", missing_keys)
101 | # print("unexpected_keys: ", unexpected_keys)
102 |
103 | in_features = model.roi_heads.box_predictor.cls_score.in_features
104 | model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
105 |
106 | return model
107 |
108 |
109 | def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
110 | def f(x):
111 | if x >= warmup_iters:
112 | return 1
113 | alpha = float(x) / warmup_iters
114 | return warmup_factor * (1 - alpha) + alpha
115 |
116 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
117 |
118 |
119 | def is_dist_avail_and_initialized():
120 | if not dist.is_available():
121 | return False
122 | if not dist.is_initialized():
123 | return False
124 | return True
125 |
126 |
127 | def get_world_size():
128 | if not is_dist_avail_and_initialized():
129 | return 1
130 | return dist.get_world_size()
131 |
132 |
133 | def reduce_dict(input_dict, average=True):
134 | """
135 | Args:
136 | input_dict (dict): all the values will be reduced
137 | average (bool): whether to do average or sum
138 | Reduce the values in the dictionary from all processes so that all processes
139 | have the averaged results. Returns a dict with the same fields as
140 | input_dict, after reduction.
141 | """
142 | world_size = get_world_size()
143 | if world_size < 2:
144 | return input_dict
145 | with torch.no_grad():
146 | names = []
147 | values = []
148 | # sort the keys so that they are consistent across processes
149 | for k in sorted(input_dict.keys()):
150 | names.append(k)
151 | values.append(input_dict[k])
152 | values = torch.stack(values, dim=0)
153 | dist.all_reduce(values)
154 | if average:
155 | values /= world_size
156 |
157 | reduced_dict = {k: v for k, v in zip(names, values)}
158 | return reduced_dict
159 |
160 |
161 | class SmoothedValue(object):
162 | """Track a series of values and provide access to smoothed values over a
163 | window or the global series average.
164 | """
165 |
166 | def __init__(self, window_size=20, fmt=None):
167 | if fmt is None:
168 | fmt = "{median:.4f} ({global_avg:.4f})"
169 |         self.deque = deque(maxlen=window_size)  # deque: a list-like container with a fixed maximum length
170 | self.total = 0.0
171 | self.count = 0
172 | self.fmt = fmt
173 |
174 | def update(self, value, n=1):
175 | self.deque.append(value)
176 | self.count += n
177 | self.total += value * n
178 |
179 | def synchronize_between_processes(self):
180 | """
181 | Warning: does not synchronize the deque!
182 | """
183 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
184 | dist.barrier()
185 | dist.all_reduce(t)
186 | t = t.tolist()
187 | self.count = int(t[0])
188 | self.total = t[1]
189 |
190 | @property
191 | def median(self):
192 | d = torch.tensor(list(self.deque))
193 | return d.median().item()
194 |
195 | @property
196 | def avg(self):
197 | d = torch.tensor(list(self.deque), dtype=torch.float32)
198 | return d.mean().item()
199 |
200 | @property
201 | def global_avg(self):
202 | return self.total / self.count
203 |
204 | @property
205 | def max(self):
206 | return max(self.deque)
207 |
208 | @property
209 | def value(self):
210 | return self.deque[-1]
211 |
212 | def __str__(self):
213 | return self.fmt.format(
214 | median=self.median,
215 | avg=self.avg,
216 | global_avg=self.global_avg,
217 | max=self.max,
218 | value=self.value)
219 |
220 |
221 | def all_gather(data):
222 | """
223 | Run all_gather on arbitrary picklable data (not necessarily tensors)
224 | Args:
225 | data: any picklable object
226 | Returns:
227 | list[data]: list of data gathered from each rank
228 | """
229 | world_size = get_world_size()
230 | if world_size == 1:
231 | return [data]
232 |
233 | # serialized to a Tensor
234 | buffer = pickle.dumps(data)
235 | storage = torch.ByteStorage.from_buffer(buffer)
236 | tensor = torch.ByteTensor(storage).to("cuda")
237 |
238 | # obtain Tensor size of each rank
239 | local_size = torch.tensor([tensor.numel()], device="cuda")
240 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
241 | dist.all_gather(size_list, local_size)
242 | size_list = [int(size.item()) for size in size_list]
243 | max_size = max(size_list)
244 |
245 | # receiving Tensor from all ranks
246 | # we pad the tensor because torch all_gather does not support
247 | # gathering tensors of different shapes
248 | tensor_list = []
249 | for _ in size_list:
250 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
251 | if local_size != max_size:
252 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
253 | tensor = torch.cat((tensor, padding), dim=0)
254 | dist.all_gather(tensor_list, tensor)
255 |
256 | data_list = []
257 | for size, tensor in zip(size_list, tensor_list):
258 | buffer = tensor.cpu().numpy().tobytes()[:size]
259 | data_list.append(pickle.loads(buffer))
260 |
261 | return data_list
262 |
263 |
264 | class MetricLogger(object):
265 | def __init__(self, delimiter="\t"):
266 | self.meters = defaultdict(SmoothedValue)
267 | self.delimiter = delimiter
268 |
269 | def update(self, **kwargs):
270 | for k, v in kwargs.items():
271 | if isinstance(v, torch.Tensor):
272 | v = v.item()
273 | assert isinstance(v, (float, int))
274 | self.meters[k].update(v)
275 |
276 | def __getattr__(self, attr):
277 | if attr in self.meters:
278 | return self.meters[attr]
279 | if attr in self.__dict__:
280 | return self.__dict__[attr]
281 | raise AttributeError("'{}' object has no attribute '{}'".format(
282 | type(self).__name__, attr))
283 |
284 | def __str__(self):
285 | loss_str = []
286 | for name, meter in self.meters.items():
287 | loss_str.append(
288 | "{}: {}".format(name, str(meter))
289 | )
290 | return self.delimiter.join(loss_str)
291 |
292 | def add_meter(self, name, meter):
293 | self.meters[name] = meter
294 |
295 | def synchronize_between_processes(self):
296 | for meter in self.meters.values():
297 | meter.synchronize_between_processes()
298 |
299 | def log_every(self, iterable, print_freq, header=None):
300 | i = 0
301 | if not header:
302 | header = ""
303 | start_time = time.time()
304 | end = time.time()
305 | iter_time = SmoothedValue(fmt='{avg:.4f}')
306 | data_time = SmoothedValue(fmt='{avg:.4f}')
307 | space_fmt = ":" + str(len(str(len(iterable)))) + "d"
308 | if torch.cuda.is_available():
309 | log_msg = self.delimiter.join([header,
310 | '[{0' + space_fmt + '}/{1}]',
311 | 'eta: {eta}',
312 | '{meters}',
313 | 'time: {time}',
314 | 'data: {data}',
315 | 'max mem: {memory:.0f}'])
316 | else:
317 | log_msg = self.delimiter.join([header,
318 | '[{0' + space_fmt + '}/{1}]',
319 | 'eta: {eta}',
320 | '{meters}',
321 | 'time: {time}',
322 | 'data: {data}'])
323 | MB = 1024.0 * 1024.0
324 | for obj in iterable:
325 | data_time.update(time.time() - end)
326 | yield obj
327 | iter_time.update(time.time() - end)
328 | if i % print_freq == 0 or i == len(iterable) - 1:
329 | eta_second = iter_time.global_avg * (len(iterable) - i)
330 | eta_string = str(datetime.timedelta(seconds=eta_second))
331 | if torch.cuda.is_available():
332 | print(log_msg.format(i, len(iterable),
333 | eta=eta_string,
334 | meters=str(self),
335 | time=str(iter_time),
336 | data=str(data_time),
337 | memory=torch.cuda.max_memory_allocated() / MB))
338 | else:
339 | print(log_msg.format(i, len(iterable),
340 | eta=eta_string,
341 | meters=str(self),
342 | time=str(iter_time),
343 | data=str(data_time)))
344 | i += 1
345 | end = time.time()
346 | total_time = time.time() - start_time
347 | total_time_str = str(datetime.timedelta(seconds=int(total_time)))
348 |         print('{} Total time: {} ({:.4f} s / it)'.format(header,
349 |                                                          total_time_str,
350 |                                                          total_time / len(iterable)))
351 |
352 |
353 |
354 | def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
355 | train_loss=None, train_lr=None, warmup=False):
356 | global loss_dict, losses
357 | model.train()
358 | metric_logger = MetricLogger(delimiter=" ")
359 | metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
360 | header = 'Epoch: [{}]'.format(epoch)
361 |
362 | lr_scheduler = None
363 | if epoch == 0 and warmup is True:
364 | warmup_factor = 1.0 / 1000
365 | warmup_iters = min(1000, len(data_loader) - 1)
366 |
367 | lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
368 |
369 | for images, targets in metric_logger.log_every(data_loader, print_freq, header):
370 | images = list(image.to(device) for image in images)
371 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
372 |
373 | loss_dict = model(images, targets)
374 |
375 | losses = sum(loss for loss in loss_dict.values())
376 |
377 |         # reduce losses over all GPUs for logging purposes
378 | loss_dict_reduced = reduce_dict(loss_dict)
379 | losses_reduced = sum(loss for loss in loss_dict_reduced.values())
380 |
381 | loss_value = losses_reduced.item()
382 | if isinstance(train_loss, list):
383 | train_loss.append(loss_value)
384 |
385 | optimizer.zero_grad()
386 | losses.backward()
387 | optimizer.step()
388 |
389 | if lr_scheduler is not None:
390 | lr_scheduler.step()
391 |
392 | metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
393 | now_lr = optimizer.param_groups[0]["lr"]
394 | metric_logger.update(lr=now_lr)
395 | if isinstance(train_lr, list):
396 | train_lr.append(now_lr)
397 |
398 | return loss_dict, losses
399 |
400 |
401 | def write_tb(writer, num, info):
402 | for item in info.items():
403 | writer.add_scalar(item[0], item[1], num)
404 |
--------------------------------------------------------------------------------
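
`warmup_lr_scheduler` returns a `LambdaLR` that scales the base learning rate linearly from `lr * warmup_factor` up to `lr` over `warmup_iters` steps, which is what `train_one_epoch` enables for epoch 0 when `warmup=True`. A sketch with a dummy parameter and optimizer:

```python
import torch

from utils.train_utils import warmup_lr_scheduler

# dummy parameter and optimizer, just to drive the scheduler
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.01)

warmup_iters = 5
warmup_factor = 1.0 / 1000
scheduler = warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

for step in range(8):
    optimizer.step()
    scheduler.step()
    # the lr ramps from 0.01 * 0.001 up to 0.01, then stays at 0.01
    print(step, optimizer.param_groups[0]["lr"])
```
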
/utils/transform_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch import nn
5 | from torch.jit.annotations import List, Tuple
6 |
7 | from utils.im_utils import ImageList
8 |
9 |
10 | def torch_choice(l):
11 | index = int(torch.empty(1).uniform_(0., float(len(l))).item())
12 | return l[index]
13 |
14 |
15 | def max_by_axis(the_list):
16 | maxes = the_list[0]
17 | for sublist in the_list[1:]:
18 | for index, item in enumerate(sublist):
19 | maxes[index] = max(maxes[index], item)
20 | return maxes
21 |
22 |
23 | def batch_images(images, size_divisible=32):
24 | """
25 |     pad a list of images into a single batched tensor
26 |     :param images: a list of image tensors
27 |     :param size_divisible: the padded height/width are rounded up to a multiple of this value
28 |     :return: batched tensor image of shape [batch, channel, height, width]
29 | """
30 |
31 | max_size = max_by_axis([list(img.shape) for img in images])
32 |
33 | stride = float(size_divisible)
34 |
35 | max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
36 | max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)
37 |
38 | # [batch, channel, height, width]
39 | batch_shape = [len(images)] + max_size
40 |
41 | batched_imgs = images[0].new_full(batch_shape, 0)
42 | for img, pad_img in zip(images, batched_imgs):
43 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
44 |
45 | return batched_imgs
46 |
47 |
48 | class GeneralizedRCNNTransform(nn.Module):
49 | """
50 | Performs input / target transformation before feeding the data to a GeneralizedRCNN model.
51 |     The transformations it performs are:
52 | - input normalization (mean subtraction and std division)
53 | - input / target resizing to match min_size / max_size
54 |
55 |     It returns an ImageList for the inputs, and a List[Dict[Tensor]] for the targets
56 | :param min_size: minimum size of input image
57 | :param max_size: maximum size of input image
58 | :param image_mean: image mean
59 | :param image_std: image std
60 | """
61 |
62 | def __init__(self, min_size, max_size, image_mean, image_std):
63 | super(GeneralizedRCNNTransform, self).__init__()
64 | if not isinstance(min_size, (list, tuple)):
65 | min_size = (min_size,)
66 | self.min_size = min_size
67 | self.max_size = max_size
68 | self.image_mean = image_mean
69 | self.image_std = image_std
70 |
71 | def normalize(self, image):
72 | dtype, device = image.dtype, image.device
73 | mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
74 | std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
75 | return (image - mean[:, None, None]) / std[:, None, None]
76 |
77 | def resize(self, image, target):
78 | """
79 | resize input image to specified size and transform for target
80 | :param image: input image
81 | :param target: target related info, like bbox
82 | :return:
83 | image: resized image
84 | target: resized target
85 | """
86 |
87 | # image shape is [channel, height, width]
88 | h, w = image.shape[-2:]
89 | im_shape = torch.tensor(image.shape[-2:])
90 | min_size = float(torch.min(im_shape))
91 | max_size = float(torch.max(im_shape))
92 | if self.training:
93 | size = float(torch_choice(self.min_size))
94 | else:
95 | size = float(self.min_size[-1])
96 | scale_factor = size / min_size
97 |
98 | if max_size * scale_factor > self.max_size:
99 | scale_factor = self.max_size / max_size
100 |
101 | image = torch.nn.functional.interpolate(
102 | image[None], scale_factor=scale_factor, mode='bilinear', align_corners=False)[0]
103 |
104 | if target is None:
105 | return image, target
106 |
107 | bbox = target["boxes"]
108 | bbox = resize_boxes(bbox, (h, w), image.shape[-2:])
109 | target["boxes"] = bbox
110 |
111 | return image, target
112 |
113 | def postprocess(self, result, image_shapes, original_image_sizes):
114 | """
115 |         post-process the predictions, mainly map the bbox coordinates back to the original image
116 |         :param result: prediction results
117 |         :param image_shapes: image sizes after preprocessing
118 | :param original_image_sizes: original image size
119 | :return:
120 | """
121 |
122 | if self.training:
123 | return result
124 | for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
125 | boxes = pred["boxes"]
126 | boxes = resize_boxes(boxes, im_s, o_im_s)
127 | result[i]["boxes"] = boxes
128 | return result
129 |
130 | def forward(self, images, targets=None):
131 | images = [img for img in images]
132 | for i in range(len(images)):
133 | image = images[i]
134 | target_index = targets[i] if targets is not None else None
135 |
136 | if image.dim() != 3:
137 | raise ValueError("images is expected to be a list of 3d tensors "
138 | "of shape [C, H, W], got {}".format(image.shape))
139 | image = self.normalize(image)
140 | image, target_index = self.resize(image, target_index)
141 | images[i] = image
142 | if targets is not None and target_index is not None:
143 | targets[i] = target_index
144 |
145 | # save resized image size
146 | image_sizes = [img.shape[-2:] for img in images]
147 | images = batch_images(images)
148 | image_sizes_list = torch.jit.annotate(List[Tuple[int, int]], [])
149 |
150 | for image_size in image_sizes:
151 | assert len(image_size) == 2
152 | image_sizes_list.append((image_size[0], image_size[1]))
153 |
154 | image_list = ImageList(images, image_sizes_list)
155 | return image_list, targets
156 |
157 |
158 | def resize_boxes(boxes, original_size, new_size):
159 | """
160 |     rescale bboxes from original_size to new_size
161 |     :param boxes: bboxes to rescale
162 |     :param original_size: image size the boxes currently refer to
163 |     :param new_size: image size the boxes should be mapped to
164 | :return:
165 | """
166 | ratios = [
167 | torch.tensor(s, dtype=torch.float32, device=boxes.device) /
168 | torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
169 | for s, s_orig in zip(new_size, original_size)
170 | ]
171 | ratios_height, ratios_width = ratios
172 |
173 | xmin, ymin, xmax, ymax = boxes.unbind(1)
174 | xmin = xmin * ratios_width
175 | xmax = xmax * ratios_width
176 | ymin = ymin * ratios_height
177 | ymax = ymax * ratios_height
178 | return torch.stack((xmin, ymin, xmax, ymax), dim=1)
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
--------------------------------------------------------------------------------
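
`GeneralizedRCNNTransform` normalizes each image, resizes it so the shorter side matches `min_size` (capped so the longer side never exceeds `max_size`), rescales the target boxes by the same factor, and pads the whole batch to a common size divisible by 32. A sketch with two random images; sizes, statistics and labels are illustrative:

```python
import torch

from utils.transform_utils import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(min_size=300, max_size=800,
                                     image_mean=[0.485, 0.456, 0.406],
                                     image_std=[0.229, 0.224, 0.225])
transform.eval()  # eval mode uses the last min_size instead of a random choice

# two images of different sizes plus their targets
images = [torch.rand(3, 480, 640), torch.rand(3, 375, 500)]
targets = [{"boxes": torch.tensor([[10., 10., 100., 120.]]), "labels": torch.tensor([1])},
           {"boxes": torch.tensor([[50., 40., 200., 180.]]), "labels": torch.tensor([2])}]

image_list, targets = transform(images, targets)
print(image_list.tensors.shape)  # padded batch, H and W rounded up to multiples of 32
print(image_list.image_sizes)    # per-image sizes after resizing, before padding
print(targets[0]["boxes"])       # boxes rescaled to the resized image
```
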