├── README.md
├── fpn.py
└── retina_fpn.py

/README.md:
--------------------------------------------------------------------------------
# PyTorch-FPN
_Feature Pyramid Networks_ in PyTorch.

References:
[1] [Feature Pyramid Networks for Object Detection](https://arxiv.org/abs/1612.03144)
[2] [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002)
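## Usage

A minimal usage sketch (assuming PyTorch is installed and you run it from the
repo root); `FPN101()` builds the lightweight `[2,2,2,2]` test backbone and
returns the `(p2, p3, p4, p5)` pyramid:

```python
import torch
from fpn import FPN101

net = FPN101()
feature_maps = net(torch.randn(1, 3, 600, 900))  # dummy image batch
for fm in feature_maps:
    print(fm.size())
```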
--------------------------------------------------------------------------------
/fpn.py:
--------------------------------------------------------------------------------
'''FPN in PyTorch.

See the paper "Feature Pyramid Networks for Object Detection" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class FPN(nn.Module):
    def __init__(self, block, num_blocks):
        super(FPN, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # Bottom-up layers
        self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)

        # Top layer
        self.toplayer = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0)  # Reduce channels

        # Smooth layers
        self.smooth1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        # Lateral layers
        self.latlayer1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
        self.latlayer2 = nn.Conv2d( 512, 256, kernel_size=1, stride=1, padding=0)
        self.latlayer3 = nn.Conv2d( 256, 256, kernel_size=1, stride=1, padding=0)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def _upsample_add(self, x, y):
        '''Upsample and add two feature maps.

        Args:
          x: (Tensor) top feature map to be upsampled.
          y: (Tensor) lateral feature map.

        Returns:
          (Tensor) added feature map.

        Note: in PyTorch, when the input size is odd, the feature map upsampled
        with `F.interpolate(..., scale_factor=2, mode='nearest')`
        may not match the size of the lateral feature map.

        e.g.
        original input size: [N,_,15,15] ->
        conv2d feature map size: [N,_,8,8] ->
        upsampled feature map size: [N,_,16,16]

        So we use bilinear upsampling with an explicit output size, which
        supports arbitrary sizes.
        '''
        _,_,H,W = y.size()
        return F.interpolate(x, size=(H,W), mode='bilinear', align_corners=False) + y

    def forward(self, x):
        # Bottom-up
        c1 = F.relu(self.bn1(self.conv1(x)))
        c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1)
        c2 = self.layer1(c1)
        c3 = self.layer2(c2)
        c4 = self.layer3(c3)
        c5 = self.layer4(c4)
        # Top-down
        p5 = self.toplayer(c5)
        p4 = self._upsample_add(p5, self.latlayer1(c4))
        p3 = self._upsample_add(p4, self.latlayer2(c3))
        p2 = self._upsample_add(p3, self.latlayer3(c2))
        # Smooth
        p4 = self.smooth1(p4)
        p3 = self.smooth2(p3)
        p2 = self.smooth3(p2)
        return p2, p3, p4, p5


def FPN101():
    # A true ResNet-101 backbone would use [3,4,23,3] blocks:
    # return FPN(Bottleneck, [3,4,23,3])
    # We use a lightweight [2,2,2,2] configuration for quick testing.
    return FPN(Bottleneck, [2,2,2,2])


def test():
    net = FPN101()
    fms = net(torch.randn(1,3,600,900))
    for fm in fms:
        print(fm.size())


if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
/retina_fpn.py:
--------------------------------------------------------------------------------
'''RetinaFPN in PyTorch.

See the paper "Focal Loss for Dense Object Detection" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class RetinaFPN(nn.Module):
    def __init__(self, block, num_blocks):
        super(RetinaFPN, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # Bottom-up layers
        self.layer2 = self._make_layer(block,  64, num_blocks[0], stride=1)
        self.layer3 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer4 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer5 = self._make_layer(block, 512, num_blocks[3], stride=2)

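        # Extra pyramid levels for RetinaNet: p6 is computed from c5 with a
        # stride-2 3x3 conv, and p7 from p6 (after a ReLU) with another
        # stride-2 3x3 conv, as described in "Focal Loss for Dense Object
        # Detection".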
        self.conv6 = nn.Conv2d(2048, 256, kernel_size=3, stride=2, padding=1)
        self.conv7 = nn.Conv2d( 256, 256, kernel_size=3, stride=2, padding=1)

        # Top layer
        self.toplayer = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0)  # Reduce channels

        # Smooth layers
        self.smooth1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        # Lateral layers
        self.latlayer1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
        self.latlayer2 = nn.Conv2d( 512, 256, kernel_size=1, stride=1, padding=0)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def _upsample_add(self, x, y):
        '''Upsample and add two feature maps.

        Args:
          x: (Tensor) top feature map to be upsampled.
          y: (Tensor) lateral feature map.

        Returns:
          (Tensor) added feature map.

        Note: in PyTorch, when the input size is odd, the feature map upsampled
        with `F.interpolate(..., scale_factor=2, mode='nearest')`
        may not match the size of the lateral feature map.

        e.g.
        original input size: [N,_,15,15] ->
        conv2d feature map size: [N,_,8,8] ->
        upsampled feature map size: [N,_,16,16]

        So we use bilinear upsampling with an explicit output size, which
        supports arbitrary sizes.
        '''
        _,_,H,W = y.size()
        return F.interpolate(x, size=(H,W), mode='bilinear', align_corners=False) + y

    def forward(self, x):
        # Bottom-up
        c1 = F.relu(self.bn1(self.conv1(x)))
        c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1)
        c2 = self.layer2(c1)
        c3 = self.layer3(c2)
        c4 = self.layer4(c3)
        c5 = self.layer5(c4)
        p6 = self.conv6(c5)
        p7 = self.conv7(F.relu(p6))
        # Top-down
        p5 = self.toplayer(c5)
        p4 = self._upsample_add(p5, self.latlayer1(c4))
        p3 = self._upsample_add(p4, self.latlayer2(c3))
        # Smooth
        p4 = self.smooth1(p4)
        p3 = self.smooth2(p3)
        return p3, p4, p5, p6, p7


def RetinaFPN101():
    # A true ResNet-101 backbone would use [3,4,23,3] blocks:
    # return RetinaFPN(Bottleneck, [3,4,23,3])
    # We use a lightweight [2,2,2,2] configuration for quick testing.
    return RetinaFPN(Bottleneck, [2,2,2,2])


def test():
    net = RetinaFPN101()
    fms = net(torch.randn(1,3,600,900))
    for fm in fms:
        print(fm.size())


if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
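A quick standalone sketch (not part of the repo) of the odd-size mismatch that
`_upsample_add` works around; the shapes mirror the docstring example:

import torch
import torch.nn.functional as F

y = torch.randn(1, 1, 15, 15)                          # lateral map, odd size
w = torch.randn(1, 1, 3, 3)                            # dummy 3x3 kernel
x = F.conv2d(y, w, stride=2, padding=1)                # -> [1,1,8,8]
up = F.interpolate(x, scale_factor=2, mode='nearest')  # -> [1,1,16,16]: mismatch
fixed = F.interpolate(x, size=y.shape[2:], mode='bilinear',
                      align_corners=False)             # -> [1,1,15,15]: matches y
print(up.shape, fixed.shape)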