├── Figures
├── gradcam.jpg
├── scaling.png
├── Top-1_ACC.jpg
├── gradcam2.jpg
├── EMCA_Algorithm.PNG
├── EMCA_and_integration_CVPR.png
├── EMCA_archeticture_only_CVPR.png
├── EMCA_integration_only_CVPR.png
└── Revisit_Channel_Attention_dense_connection.png
├── Code
├── benchmarking
│   ├── __pycache__
│   │   ├── cbam.cpython-37.pyc
│   │   ├── resnet.cpython-37.pyc
│   │   ├── senet.cpython-37.pyc
│   │   ├── densenet.cpython-37.pyc
│   │   ├── self_att.cpython-37.pyc
│   │   └── resnet_cbam.cpython-37.pyc
│   ├── vgg.py
│   ├── squeezenet.py
│   ├── mobilenetv2.py
│   ├── cbam.py
│   ├── preactresnet.py
│   ├── resnext.py
│   ├── googlenet.py
│   ├── shufflenetv2.py
│   ├── resnet.py
│   ├── mobilenet.py
│   ├── xception.py
│   ├── resnet_cbam.py
│   ├── rir.py
│   ├── shufflenet.py
│   ├── densenet.py
│   ├── nasnet.py
│   ├── inceptionv3.py
│   ├── attention.py
│   ├── inceptionv4.py
│   └── senet.py
├── conf
│   ├── __init__.py
│   └── global_settings.py
├── dataset.py
├── train.py
└── train_imagenet.py
└── README.md
/Figures/gradcam.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/gradcam.jpg
--------------------------------------------------------------------------------
/Figures/scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/scaling.png
--------------------------------------------------------------------------------
/Figures/Top-1_ACC.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/Top-1_ACC.jpg
--------------------------------------------------------------------------------
/Figures/gradcam2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/gradcam2.jpg
--------------------------------------------------------------------------------
/Figures/EMCA_Algorithm.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/EMCA_Algorithm.PNG
--------------------------------------------------------------------------------
/Figures/EMCA_and_integration_CVPR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/EMCA_and_integration_CVPR.png
--------------------------------------------------------------------------------
/Figures/EMCA_archeticture_only_CVPR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/EMCA_archeticture_only_CVPR.png
--------------------------------------------------------------------------------
/Figures/EMCA_integration_only_CVPR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/EMCA_integration_only_CVPR.png
--------------------------------------------------------------------------------
/Code/benchmarking/__pycache__/cbam.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/cbam.cpython-37.pyc
--------------------------------------------------------------------------------
/Code/benchmarking/__pycache__/resnet.cpython-37.pyc:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/resnet.cpython-37.pyc -------------------------------------------------------------------------------- /Code/benchmarking/__pycache__/senet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/senet.cpython-37.pyc -------------------------------------------------------------------------------- /Code/benchmarking/__pycache__/densenet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/densenet.cpython-37.pyc -------------------------------------------------------------------------------- /Code/benchmarking/__pycache__/self_att.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/self_att.cpython-37.pyc -------------------------------------------------------------------------------- /Figures/Revisit_Channel_Attention_dense_connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/Revisit_Channel_Attention_dense_connection.png -------------------------------------------------------------------------------- /Code/benchmarking/__pycache__/resnet_cbam.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/resnet_cbam.cpython-37.pyc -------------------------------------------------------------------------------- /Code/conf/__init__.py: -------------------------------------------------------------------------------- 1 | """ dynamically load settings 2 | 3 | author baiyu 4 | """ 5 | import conf.global_settings as settings 6 | 7 | class Settings: 8 | def __init__(self, settings): 9 | 10 | for attr in dir(settings): 11 | if attr.isupper(): 12 | setattr(self, attr, getattr(settings, attr)) 13 | 14 | settings = Settings(settings) -------------------------------------------------------------------------------- /Code/conf/global_settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | #CIFAR100 dataset path (python version) 5 | #CIFAR100_PATH = '/nfs/private/cifar100/cifar-100-python' 6 | 7 | #mean and std of cifar100 dataset 8 | CIFAR100_TRAIN_MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343) 9 | CIFAR100_TRAIN_STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404) 10 | 11 | #CIFAR100_TEST_MEAN = (0.5088964127604166, 0.48739301317401956, 0.44194221124387256) 12 | #CIFAR100_TEST_STD = (0.2682515741720801, 0.2573637364478126, 0.2770957707973042) 13 | 14 | # data_set type 15 | data_type = "tiny-imagenet" 16 | if data_type == "cifar100": 17 | IMG_SIZE = 32 18 | elif data_type == "tiny-imagenet": 19 | IMG_SIZE = 64 20 | elif data_type == "dogs": 21 | IMG_SIZE = 128 22 | elif data_type == "imagenet": 23 | IMG_SIZE = 224 24 | 25 | #directory to save weights file 26 | CHECKPOINT_PATH = 'checkpoint' 27 | 28 | #total training epoches 29 | EPOCH = 230 30 | MILESTONES = [60, 120, 160, 200] 31 | 32 | #initial learning rate 33 | 
#INIT_LR = 0.1 34 | 35 | #time of we run the script 36 | TIME_NOW = 'tiny_imagenet_self_local_ch_att_simple_3att' 37 | 38 | #tensorboard log dir 39 | LOG_DIR = 'runs' 40 | 41 | #save weights file per SAVE_EPOCH epoch 42 | SAVE_EPOCH = 100 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Code/benchmarking/vgg.py: -------------------------------------------------------------------------------- 1 | """vgg in pytorch 2 | 3 | 4 | [1] Karen Simonyan, Andrew Zisserman 5 | 6 | Very Deep Convolutional Networks for Large-Scale Image Recognition. 7 | https://arxiv.org/abs/1409.1556v6 8 | """ 9 | '''VGG11/13/16/19 in Pytorch.''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | cfg = { 15 | 'A' : [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 16 | 'B' : [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 17 | 'D' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 18 | 'E' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'] 19 | } 20 | 21 | class VGG(nn.Module): 22 | 23 | def __init__(self, features, num_class=100): 24 | super().__init__() 25 | self.features = features 26 | 27 | self.classifier = nn.Sequential( 28 | nn.Linear(512, 4096), 29 | nn.ReLU(inplace=True), 30 | nn.Dropout(), 31 | nn.Linear(4096, 4096), 32 | nn.ReLU(inplace=True), 33 | nn.Dropout(), 34 | nn.Linear(4096, num_class) 35 | ) 36 | 37 | def forward(self, x): 38 | output = self.features(x) 39 | output = output.view(output.size()[0], -1) 40 | output = self.classifier(output) 41 | 42 | return output 43 | 44 | def make_layers(cfg, batch_norm=False): 45 | layers = [] 46 | 47 | input_channel = 3 48 | for l in cfg: 49 | if l == 'M': 50 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 51 | continue 52 | 53 | layers += [nn.Conv2d(input_channel, l, kernel_size=3, padding=1)] 54 | 55 | if batch_norm: 56 | layers += [nn.BatchNorm2d(l)] 57 | 58 | layers += [nn.ReLU(inplace=True)] 59 | input_channel = l 60 | 61 | return nn.Sequential(*layers) 62 | 63 | def vgg11_bn(): 64 | return VGG(make_layers(cfg['A'], batch_norm=True)) 65 | 66 | def vgg13_bn(): 67 | return VGG(make_layers(cfg['B'], batch_norm=True)) 68 | 69 | def vgg16_bn(): 70 | return VGG(make_layers(cfg['D'], batch_norm=True)) 71 | 72 | def vgg19_bn(): 73 | return VGG(make_layers(cfg['E'], batch_norm=True)) 74 | 75 | 76 | -------------------------------------------------------------------------------- /Code/benchmarking/squeezenet.py: -------------------------------------------------------------------------------- 1 | """squeezenet in pytorch 2 | 3 | 4 | 5 | [1] Song Han, Jeff Pool, John Tran, William J. 
Dally 6 | 7 | squeezenet: Learning both Weights and Connections for Efficient Neural Networks 8 | https://arxiv.org/abs/1506.02626 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | 15 | class Fire(nn.Module): 16 | 17 | def __init__(self, in_channel, out_channel, squzee_channel): 18 | 19 | super().__init__() 20 | self.squeeze = nn.Sequential( 21 | nn.Conv2d(in_channel, squzee_channel, 1), 22 | nn.BatchNorm2d(squzee_channel), 23 | nn.ReLU(inplace=True) 24 | ) 25 | 26 | self.expand_1x1 = nn.Sequential( 27 | nn.Conv2d(squzee_channel, int(out_channel / 2), 1), 28 | nn.BatchNorm2d(int(out_channel / 2)), 29 | nn.ReLU(inplace=True) 30 | ) 31 | 32 | self.expand_3x3 = nn.Sequential( 33 | nn.Conv2d(squzee_channel, int(out_channel / 2), 3, padding=1), 34 | nn.BatchNorm2d(int(out_channel / 2)), 35 | nn.ReLU(inplace=True) 36 | ) 37 | 38 | def forward(self, x): 39 | 40 | x = self.squeeze(x) 41 | x = torch.cat([ 42 | self.expand_1x1(x), 43 | self.expand_3x3(x) 44 | ], 1) 45 | 46 | return x 47 | 48 | class SqueezeNet(nn.Module): 49 | 50 | """mobile net with simple bypass""" 51 | def __init__(self, class_num=100): 52 | 53 | super().__init__() 54 | self.stem = nn.Sequential( 55 | nn.Conv2d(3, 96, 3, padding=1), 56 | nn.BatchNorm2d(96), 57 | nn.ReLU(inplace=True), 58 | nn.MaxPool2d(2, 2) 59 | ) 60 | 61 | self.fire2 = Fire(96, 128, 16) 62 | self.fire3 = Fire(128, 128, 16) 63 | self.fire4 = Fire(128, 256, 32) 64 | self.fire5 = Fire(256, 256, 32) 65 | self.fire6 = Fire(256, 384, 48) 66 | self.fire7 = Fire(384, 384, 48) 67 | self.fire8 = Fire(384, 512, 64) 68 | self.fire9 = Fire(512, 512, 64) 69 | 70 | self.conv10 = nn.Conv2d(512, class_num, 1) 71 | self.avg = nn.AdaptiveAvgPool2d(1) 72 | self.maxpool = nn.MaxPool2d(2, 2) 73 | 74 | def forward(self, x): 75 | x = self.stem(x) 76 | 77 | f2 = self.fire2(x) 78 | f3 = self.fire3(f2) + f2 79 | f4 = self.fire4(f3) 80 | f4 = self.maxpool(f4) 81 | 82 | f5 = self.fire5(f4) + f4 83 | f6 = self.fire6(f5) 84 | f7 = self.fire7(f6) + f6 85 | f8 = self.fire8(f7) 86 | f8 = self.maxpool(f8) 87 | 88 | f9 = self.fire9(f8) 89 | c10 = self.conv10(f9) 90 | 91 | x = self.avg(c10) 92 | x = x.view(x.size(0), -1) 93 | 94 | return x 95 | 96 | def squeezenet(class_num=100): 97 | return SqueezeNet(class_num=class_num) 98 | -------------------------------------------------------------------------------- /Code/benchmarking/mobilenetv2.py: -------------------------------------------------------------------------------- 1 | """mobilenetv2 in pytorch 2 | 3 | 4 | 5 | [1] Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 6 | 7 | MobileNetV2: Inverted Residuals and Linear Bottlenecks 8 | https://arxiv.org/abs/1801.04381 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | class LinearBottleNeck(nn.Module): 17 | 18 | def __init__(self, in_channels, out_channels, stride, t=6, class_num=100): 19 | super().__init__() 20 | 21 | self.residual = nn.Sequential( 22 | nn.Conv2d(in_channels, in_channels * t, 1), 23 | nn.BatchNorm2d(in_channels * t), 24 | nn.ReLU6(inplace=True), 25 | 26 | nn.Conv2d(in_channels * t, in_channels * t, 3, stride=stride, padding=1, groups=in_channels * t), 27 | nn.BatchNorm2d(in_channels * t), 28 | nn.ReLU6(inplace=True), 29 | 30 | nn.Conv2d(in_channels * t, out_channels, 1), 31 | nn.BatchNorm2d(out_channels) 32 | ) 33 | 34 | self.stride = stride 35 | self.in_channels = in_channels 36 | self.out_channels = out_channels 37 | 38 | def forward(self, x): 39 | 40 | residual = 
self.residual(x) 41 | 42 | if self.stride == 1 and self.in_channels == self.out_channels: 43 | residual += x 44 | 45 | return residual 46 | 47 | class MobileNetV2(nn.Module): 48 | 49 | def __init__(self, class_num=100): 50 | super().__init__() 51 | 52 | self.pre = nn.Sequential( 53 | nn.Conv2d(3, 32, 1, padding=1), 54 | nn.BatchNorm2d(32), 55 | nn.ReLU6(inplace=True) 56 | ) 57 | 58 | self.stage1 = LinearBottleNeck(32, 16, 1, 1) 59 | self.stage2 = self._make_stage(2, 16, 24, 2, 6) 60 | self.stage3 = self._make_stage(3, 24, 32, 2, 6) 61 | self.stage4 = self._make_stage(4, 32, 64, 2, 6) 62 | self.stage5 = self._make_stage(3, 64, 96, 1, 6) 63 | self.stage6 = self._make_stage(3, 96, 160, 1, 6) 64 | self.stage7 = LinearBottleNeck(160, 320, 1, 6) 65 | 66 | self.conv1 = nn.Sequential( 67 | nn.Conv2d(320, 1280, 1), 68 | nn.BatchNorm2d(1280), 69 | nn.ReLU6(inplace=True) 70 | ) 71 | 72 | self.conv2 = nn.Conv2d(1280, class_num, 1) 73 | 74 | def forward(self, x): 75 | x = self.pre(x) 76 | x = self.stage1(x) 77 | x = self.stage2(x) 78 | x = self.stage3(x) 79 | x = self.stage4(x) 80 | x = self.stage5(x) 81 | x = self.stage6(x) 82 | x = self.stage7(x) 83 | x = self.conv1(x) 84 | x = F.adaptive_avg_pool2d(x, 1) 85 | x = self.conv2(x) 86 | x = x.view(x.size(0), -1) 87 | 88 | return x 89 | 90 | def _make_stage(self, repeat, in_channels, out_channels, stride, t): 91 | 92 | layers = [] 93 | layers.append(LinearBottleNeck(in_channels, out_channels, stride, t)) 94 | 95 | while repeat - 1: 96 | layers.append(LinearBottleNeck(out_channels, out_channels, 1, t)) 97 | repeat -= 1 98 | 99 | return nn.Sequential(*layers) 100 | 101 | def mobilenetv2(): 102 | return MobileNetV2() -------------------------------------------------------------------------------- /Code/benchmarking/cbam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class BasicConv(nn.Module): 7 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 8 | super(BasicConv, self).__init__() 9 | self.out_channels = out_planes 10 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 12 | self.relu = nn.ReLU() if relu else None 13 | 14 | def forward(self, x): 15 | x = self.conv(x) 16 | if self.bn is not None: 17 | x = self.bn(x) 18 | if self.relu is not None: 19 | x = self.relu(x) 20 | return x 21 | 22 | class Flatten(nn.Module): 23 | def forward(self, x): 24 | return x.view(x.size(0), -1) 25 | 26 | class ChannelGate(nn.Module): 27 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): 28 | super(ChannelGate, self).__init__() 29 | self.gate_channels = gate_channels 30 | self.mlp = nn.Sequential( 31 | Flatten(), 32 | nn.Linear(gate_channels, gate_channels // reduction_ratio), 33 | nn.ReLU(), 34 | nn.Linear(gate_channels // reduction_ratio, gate_channels) 35 | ) 36 | self.pool_types = pool_types 37 | def forward(self, x): 38 | channel_att_sum = None 39 | for pool_type in self.pool_types: 40 | if pool_type=='avg': 41 | avg_pool = F.avg_pool2d( x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 42 | channel_att_raw = self.mlp( avg_pool ) 43 | elif pool_type=='max': 44 | max_pool = F.max_pool2d( x, (x.size(2), x.size(3)), 
stride=(x.size(2), x.size(3))) 45 | channel_att_raw = self.mlp( max_pool ) 46 | elif pool_type=='lp': 47 | lp_pool = F.lp_pool2d( x, 2, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 48 | channel_att_raw = self.mlp( lp_pool ) 49 | elif pool_type=='lse': 50 | # LSE pool only 51 | lse_pool = logsumexp_2d(x) 52 | channel_att_raw = self.mlp( lse_pool ) 53 | 54 | if channel_att_sum is None: 55 | channel_att_sum = channel_att_raw 56 | else: 57 | channel_att_sum = channel_att_sum + channel_att_raw 58 | 59 | scale = F.sigmoid( channel_att_sum ).unsqueeze(2).unsqueeze(3).expand_as(x) 60 | return x * scale 61 | 62 | def logsumexp_2d(tensor): 63 | tensor_flatten = tensor.view(tensor.size(0), tensor.size(1), -1) 64 | s, _ = torch.max(tensor_flatten, dim=2, keepdim=True) 65 | outputs = s + (tensor_flatten - s).exp().sum(dim=2, keepdim=True).log() 66 | return outputs 67 | 68 | class ChannelPool(nn.Module): 69 | def forward(self, x): 70 | return torch.cat( (torch.max(x,1)[0].unsqueeze(1), torch.mean(x,1).unsqueeze(1)), dim=1 ) 71 | 72 | class SpatialGate(nn.Module): 73 | def __init__(self): 74 | super(SpatialGate, self).__init__() 75 | kernel_size = 7 76 | self.compress = ChannelPool() 77 | self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size-1) // 2, relu=False) 78 | def forward(self, x): 79 | x_compress = self.compress(x) 80 | x_out = self.spatial(x_compress) 81 | scale = F.sigmoid(x_out) # broadcasting 82 | return x * scale 83 | 84 | class CBAM(nn.Module): 85 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max'], no_spatial=False, no_channel=False): 86 | super(CBAM, self).__init__() 87 | self.no_channel=no_channel 88 | if not no_channel: 89 | self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types) 90 | self.no_spatial=no_spatial 91 | if not no_spatial: 92 | self.SpatialGate = SpatialGate() 93 | def forward(self, x): 94 | if not self.no_channel: 95 | x_out = self.ChannelGate(x) 96 | else: 97 | x_out = x 98 | if not self.no_spatial: 99 | x_out = self.SpatialGate(x_out) 100 | return x_out 101 | -------------------------------------------------------------------------------- /Code/benchmarking/preactresnet.py: -------------------------------------------------------------------------------- 1 | """preactresnet in pytorch 2 | 3 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 4 | 5 | Identity Mappings in Deep Residual Networks 6 | https://arxiv.org/abs/1603.05027 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | class PreActBasic(nn.Module): 14 | 15 | expansion = 1 16 | def __init__(self, in_channels, out_channels, stride): 17 | super().__init__() 18 | self.residual = nn.Sequential( 19 | nn.BatchNorm2d(in_channels), 20 | nn.ReLU(inplace=True), 21 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1), 22 | nn.BatchNorm2d(out_channels), 23 | nn.ReLU(inplace=True), 24 | nn.Conv2d(out_channels, out_channels * PreActBasic.expansion, kernel_size=3, padding=1) 25 | ) 26 | 27 | self.shortcut = nn.Sequential() 28 | if stride != 1 or in_channels != out_channels * PreActBasic.expansion: 29 | self.shortcut = nn.Conv2d(in_channels, out_channels * PreActBasic.expansion, 1, stride=stride) 30 | 31 | def forward(self, x): 32 | 33 | res = self.residual(x) 34 | shortcut = self.shortcut(x) 35 | 36 | return res + shortcut 37 | 38 | 39 | class PreActBottleNeck(nn.Module): 40 | 41 | expansion = 4 42 | def __init__(self, in_channels, out_channels, stride): 43 | 
super().__init__() 44 | 45 | self.residual = nn.Sequential( 46 | nn.BatchNorm2d(in_channels), 47 | nn.ReLU(inplace=True), 48 | nn.Conv2d(in_channels, out_channels, 1, stride=stride), 49 | 50 | nn.BatchNorm2d(out_channels), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(out_channels, out_channels, 3, padding=1), 53 | 54 | nn.BatchNorm2d(out_channels), 55 | nn.ReLU(inplace=True), 56 | nn.Conv2d(out_channels, out_channels * PreActBottleNeck.expansion, 1) 57 | ) 58 | 59 | self.shortcut = nn.Sequential() 60 | 61 | if stride != 1 or in_channels != out_channels * PreActBottleNeck.expansion: 62 | self.shortcut = nn.Conv2d(in_channels, out_channels * PreActBottleNeck.expansion, 1, stride=stride) 63 | 64 | def forward(self, x): 65 | 66 | res = self.residual(x) 67 | shortcut = self.shortcut(x) 68 | 69 | return res + shortcut 70 | 71 | class PreActResNet(nn.Module): 72 | 73 | def __init__(self, block, num_block, class_num=100): 74 | super().__init__() 75 | self.input_channels = 64 76 | 77 | self.pre = nn.Sequential( 78 | nn.Conv2d(3, 64, 3, padding=1), 79 | nn.BatchNorm2d(64), 80 | nn.ReLU(inplace=True) 81 | ) 82 | 83 | self.stage1 = self._make_layers(block, num_block[0], 64, 1) 84 | self.stage2 = self._make_layers(block, num_block[1], 128, 2) 85 | self.stage3 = self._make_layers(block, num_block[2], 256, 2) 86 | self.stage4 = self._make_layers(block, num_block[3], 512, 2) 87 | 88 | self.linear = nn.Linear(self.input_channels, class_num) 89 | 90 | def _make_layers(self, block, block_num, out_channels, stride): 91 | layers = [] 92 | 93 | layers.append(block(self.input_channels, out_channels, stride)) 94 | self.input_channels = out_channels * block.expansion 95 | 96 | while block_num - 1: 97 | layers.append(block(self.input_channels, out_channels, 1)) 98 | self.input_channels = out_channels * block.expansion 99 | block_num -= 1 100 | 101 | return nn.Sequential(*layers) 102 | 103 | def forward(self, x): 104 | x = self.pre(x) 105 | 106 | x = self.stage1(x) 107 | x = self.stage2(x) 108 | x = self.stage3(x) 109 | x = self.stage4(x) 110 | 111 | x = F.adaptive_avg_pool2d(x, 1) 112 | x = x.view(x.size(0), -1) 113 | x = self.linear(x) 114 | 115 | return x 116 | 117 | def preactresnet18(): 118 | return PreActResNet(PreActBasic, [2, 2, 2, 2]) 119 | 120 | def preactresnet34(): 121 | return PreActResNet(PreActBasic, [3, 4, 6, 3]) 122 | 123 | def preactresnet50(): 124 | return PreActResNet(PreActBottleNeck, [3, 4, 6, 3]) 125 | 126 | def preactresnet101(): 127 | return PreActResNet(PreActBottleNeck, [3, 4, 23, 3]) 128 | 129 | def preactresnet152(): 130 | return PreActResNet(PreActBottleNeck, [3, 8, 36, 3]) 131 | 132 | -------------------------------------------------------------------------------- /Code/benchmarking/resnext.py: -------------------------------------------------------------------------------- 1 | """resnext in pytorch 2 | 3 | 4 | 5 | [1] Saining Xie, Ross Girshick, Piotr Dollár, Zhuowen Tu, Kaiming He. 6 | 7 | Aggregated Residual Transformations for Deep Neural Networks 8 | https://arxiv.org/abs/1611.05431 9 | """ 10 | 11 | import math 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | #only implements ResNext bottleneck c 17 | 18 | 19 | #"""This strategy exposes a new dimension, which we call “cardinality” 20 | #(the size of the set of transformations), as an essential factor 21 | #in addition to the dimensions of depth and width.""" 22 | CARDINALITY = 32 23 | DEPTH = 4 24 | BASEWIDTH = 64 25 | 26 | #"""The grouped convolutional layer in Fig. 
3(c) performs 32 groups 27 | #of convolutions whose input and output channels are 4-dimensional. 28 | #The grouped convolutional layer concatenates them as the outputs 29 | #of the layer.""" 30 | 31 | class ResNextBottleNeckC(nn.Module): 32 | 33 | def __init__(self, in_channels, out_channels, stride): 34 | super().__init__() 35 | 36 | C = CARDINALITY #How many groups a feature map was splitted into 37 | 38 | #"""We note that the input/output width of the template is fixed as 39 | #256-d (Fig. 3), We note that the input/output width of the template 40 | #is fixed as 256-d (Fig. 3), and all widths are dou- bled each time 41 | #when the feature map is subsampled (see Table 1).""" 42 | D = int(DEPTH * out_channels / BASEWIDTH) #number of channels per group 43 | self.split_transforms = nn.Sequential( 44 | nn.Conv2d(in_channels, C * D, kernel_size=1, groups=C, bias=False), 45 | nn.BatchNorm2d(C * D), 46 | nn.ReLU(inplace=True), 47 | nn.Conv2d(C * D, C * D, kernel_size=3, stride=stride, groups=C, padding=1, bias=False), 48 | nn.BatchNorm2d(C * D), 49 | nn.ReLU(inplace=True), 50 | nn.Conv2d(C * D, out_channels * 4, kernel_size=1, bias=False), 51 | nn.BatchNorm2d(out_channels * 4), 52 | ) 53 | 54 | self.shortcut = nn.Sequential() 55 | 56 | if stride != 1 or in_channels != out_channels * 4: 57 | self.shortcut = nn.Sequential( 58 | nn.Conv2d(in_channels, out_channels * 4, stride=stride, kernel_size=1, bias=False), 59 | nn.BatchNorm2d(out_channels * 4) 60 | ) 61 | 62 | def forward(self, x): 63 | return F.relu(self.split_transforms(x) + self.shortcut(x)) 64 | 65 | class ResNext(nn.Module): 66 | 67 | def __init__(self, block, num_blocks, class_names=100): 68 | super().__init__() 69 | self.in_channels = 64 70 | 71 | self.conv1 = nn.Sequential( 72 | nn.Conv2d(3, 64, 3, stride=1, padding=1, bias=False), 73 | nn.BatchNorm2d(64), 74 | nn.ReLU(inplace=True) 75 | ) 76 | 77 | self.conv2 = self._make_layer(block, num_blocks[0], 64, 1) 78 | self.conv3 = self._make_layer(block, num_blocks[1], 128, 2) 79 | self.conv4 = self._make_layer(block, num_blocks[2], 256, 2) 80 | self.conv5 = self._make_layer(block, num_blocks[3], 512, 2) 81 | self.avg = nn.AdaptiveAvgPool2d((1, 1)) 82 | self.fc = nn.Linear(512 * 4, 100) 83 | 84 | def forward(self, x): 85 | x = self.conv1(x) 86 | x = self.conv2(x) 87 | x = self.conv3(x) 88 | x = self.conv4(x) 89 | x = self.conv5(x) 90 | x = self.avg(x) 91 | x = x.view(x.size(0), -1) 92 | x = self.fc(x) 93 | return x 94 | 95 | def _make_layer(self, block, num_block, out_channels, stride): 96 | """Building resnext block 97 | Args: 98 | block: block type(default resnext bottleneck c) 99 | num_block: number of blocks per layer 100 | out_channels: output channels per block 101 | stride: block stride 102 | 103 | Returns: 104 | a resnext layer 105 | """ 106 | strides = [stride] + [1] * (num_block - 1) 107 | layers = [] 108 | for stride in strides: 109 | layers.append(block(self.in_channels, out_channels, stride)) 110 | self.in_channels = out_channels * 4 111 | 112 | return nn.Sequential(*layers) 113 | 114 | def resnext50(): 115 | """ return a resnext50(c32x4d) network 116 | """ 117 | return ResNext(ResNextBottleNeckC, [3, 4, 6, 3]) 118 | 119 | def resnext101(): 120 | """ return a resnext101(c32x4d) network 121 | """ 122 | return ResNext(ResNextBottleNeckC, [3, 4, 23, 3]) 123 | 124 | def resnext152(): 125 | """ return a resnext101(c32x4d) network 126 | """ 127 | return ResNext(ResNextBottleNeckC, [3, 4, 36, 3]) 128 | 129 | 130 | 131 | 
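
Usage note (added for illustration, not part of the original repository): each of the benchmarking models above is a plain torch.nn.Module factory, so a minimal smoke test on a CIFAR-100-sized input might look like the sketch below. It assumes the interpreter is started inside Code/benchmarking/ so the import resolves; the batch size of 2 is arbitrary, and the 32x32 resolution matches the cifar100 branch of Code/conf/global_settings.py.

import torch
from resnext import resnext50   # any other factory here (e.g. vgg16_bn, squeezenet) works the same way

model = resnext50()                    # ResNeXt-50 (32x4d) with the hard-coded 100-class head defined above
model.eval()
dummy = torch.randn(2, 3, 32, 32)      # CIFAR-100-sized batch; IMG_SIZE = 32 in global_settings.py
with torch.no_grad():
    logits = model(dummy)
print(logits.shape)                    # expected: torch.Size([2, 100])
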
-------------------------------------------------------------------------------- /Code/benchmarking/googlenet.py: -------------------------------------------------------------------------------- 1 | """google net in pytorch 2 | 3 | 4 | 5 | [1] Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 6 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 7 | 8 | Going Deeper with Convolutions 9 | https://arxiv.org/abs/1409.4842v1 10 | """ 11 | 12 | import torch 13 | import torch.nn as nn 14 | 15 | class Inception(nn.Module): 16 | def __init__(self, input_channels, n1x1, n3x3_reduce, n3x3, n5x5_reduce, n5x5, pool_proj): 17 | super().__init__() 18 | 19 | #1x1conv branch 20 | self.b1 = nn.Sequential( 21 | nn.Conv2d(input_channels, n1x1, kernel_size=1), 22 | nn.BatchNorm2d(n1x1), 23 | nn.ReLU(inplace=True) 24 | ) 25 | 26 | #1x1conv -> 3x3conv branch 27 | self.b2 = nn.Sequential( 28 | nn.Conv2d(input_channels, n3x3_reduce, kernel_size=1), 29 | nn.BatchNorm2d(n3x3_reduce), 30 | nn.ReLU(inplace=True), 31 | nn.Conv2d(n3x3_reduce, n3x3, kernel_size=3, padding=1), 32 | nn.BatchNorm2d(n3x3), 33 | nn.ReLU(inplace=True) 34 | ) 35 | 36 | #1x1conv -> 5x5conv branch 37 | #we use 2 3x3 conv filters stacked instead 38 | #of 1 5x5 filters to obtain the same receptive 39 | #field with fewer parameters 40 | self.b3 = nn.Sequential( 41 | nn.Conv2d(input_channels, n5x5_reduce, kernel_size=1), 42 | nn.BatchNorm2d(n5x5_reduce), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(n5x5_reduce, n5x5, kernel_size=3, padding=1), 45 | nn.BatchNorm2d(n5x5, n5x5), 46 | nn.ReLU(inplace=True), 47 | nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1), 48 | nn.BatchNorm2d(n5x5), 49 | nn.ReLU(inplace=True) 50 | ) 51 | 52 | #3x3pooling -> 1x1conv 53 | #same conv 54 | self.b4 = nn.Sequential( 55 | nn.MaxPool2d(3, stride=1, padding=1), 56 | nn.Conv2d(input_channels, pool_proj, kernel_size=1), 57 | nn.BatchNorm2d(pool_proj), 58 | nn.ReLU(inplace=True) 59 | ) 60 | 61 | def forward(self, x): 62 | return torch.cat([self.b1(x), self.b2(x), self.b3(x), self.b4(x)], dim=1) 63 | 64 | 65 | class GoogleNet(nn.Module): 66 | 67 | def __init__(self, num_class=100): 68 | super().__init__() 69 | self.prelayer = nn.Sequential( 70 | nn.Conv2d(3, 192, kernel_size=3, padding=1), 71 | nn.BatchNorm2d(192), 72 | nn.ReLU(inplace=True) 73 | ) 74 | 75 | #although we only use 1 conv layer as prelayer, 76 | #we still use name a3, b3....... 
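#(comment added for clarity; not in the original source) each Inception block
#concatenates its four branches, so its output width is n1x1 + n3x3 + n5x5 + pool_proj:
#a3 below emits 64 + 128 + 32 + 32 = 256 channels, exactly the input width of b3,
#and b3 emits 128 + 192 + 96 + 64 = 480 channels, matching a4's input.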
77 | self.a3 = Inception(192, 64, 96, 128, 16, 32, 32) 78 | self.b3 = Inception(256, 128, 128, 192, 32, 96, 64) 79 | 80 | #"""In general, an Inception network is a network consisting of 81 | #modules of the above type stacked upon each other, with occasional 82 | #max-pooling layers with stride 2 to halve the resolution of the 83 | #grid""" 84 | self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) 85 | 86 | self.a4 = Inception(480, 192, 96, 208, 16, 48, 64) 87 | self.b4 = Inception(512, 160, 112, 224, 24, 64, 64) 88 | self.c4 = Inception(512, 128, 128, 256, 24, 64, 64) 89 | self.d4 = Inception(512, 112, 144, 288, 32, 64, 64) 90 | self.e4 = Inception(528, 256, 160, 320, 32, 128, 128) 91 | 92 | self.a5 = Inception(832, 256, 160, 320, 32, 128, 128) 93 | self.b5 = Inception(832, 384, 192, 384, 48, 128, 128) 94 | 95 | #input feature size: 8*8*1024 96 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 97 | self.dropout = nn.Dropout2d(p=0.4) 98 | self.linear = nn.Linear(1024, num_class) 99 | 100 | def forward(self, x): 101 | output = self.prelayer(x) 102 | output = self.a3(output) 103 | output = self.b3(output) 104 | 105 | output = self.maxpool(output) 106 | 107 | output = self.a4(output) 108 | output = self.b4(output) 109 | output = self.c4(output) 110 | output = self.d4(output) 111 | output = self.e4(output) 112 | 113 | output = self.maxpool(output) 114 | 115 | output = self.a5(output) 116 | output = self.b5(output) 117 | 118 | #"""It was found that a move from fully connected layers to 119 | #average pooling improved the top-1 accuracy by about 0.6%, 120 | #however the use of dropout remained essential even after 121 | #removing the fully connected layers.""" 122 | output = self.avgpool(output) 123 | output = self.dropout(output) 124 | output = output.view(output.size()[0], -1) 125 | output = self.linear(output) 126 | 127 | return output 128 | 129 | def googlenet(): 130 | return GoogleNet() 131 | 132 | 133 | -------------------------------------------------------------------------------- /Code/benchmarking/shufflenetv2.py: -------------------------------------------------------------------------------- 1 | """shufflenetv2 in pytorch 2 | 3 | 4 | 5 | [1] Ningning Ma, Xiangyu Zhang, Hai-Tao Zheng, Jian Sun 6 | 7 | ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design 8 | https://arxiv.org/abs/1807.11164 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | def channel_split(x, split): 17 | """split a tensor into two pieces along channel dimension 18 | Args: 19 | x: input tensor 20 | split:(int) channel size for each pieces 21 | """ 22 | assert x.size(1) == split * 2 23 | return torch.split(x, split, dim=1) 24 | 25 | def channel_shuffle(x, groups): 26 | """channel shuffle operation 27 | Args: 28 | x: input tensor 29 | groups: input branch number 30 | """ 31 | 32 | batch_size, channels, height, width = x.size() 33 | channels_per_group = int(channels / groups) 34 | 35 | x = x.view(batch_size, groups, channels_per_group, height, width) 36 | x = x.transpose(1, 2).contiguous() 37 | x = x.view(batch_size, -1, height, width) 38 | 39 | return x 40 | 41 | class ShuffleUnit(nn.Module): 42 | 43 | def __init__(self, in_channels, out_channels, stride): 44 | super().__init__() 45 | 46 | self.stride = stride 47 | self.in_channels = in_channels 48 | self.out_channels = out_channels 49 | 50 | if stride != 1 or in_channels != out_channels: 51 | self.residual = nn.Sequential( 52 | nn.Conv2d(in_channels, in_channels, 1), 53 | 
nn.BatchNorm2d(in_channels), 54 | nn.ReLU(inplace=True), 55 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 56 | nn.BatchNorm2d(in_channels), 57 | nn.Conv2d(in_channels, int(out_channels / 2), 1), 58 | nn.BatchNorm2d(int(out_channels / 2)), 59 | nn.ReLU(inplace=True) 60 | ) 61 | 62 | self.shortcut = nn.Sequential( 63 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 64 | nn.BatchNorm2d(in_channels), 65 | nn.Conv2d(in_channels, int(out_channels / 2), 1), 66 | nn.BatchNorm2d(int(out_channels / 2)), 67 | nn.ReLU(inplace=True) 68 | ) 69 | else: 70 | self.shortcut = nn.Sequential() 71 | 72 | in_channels = int(in_channels / 2) 73 | self.residual = nn.Sequential( 74 | nn.Conv2d(in_channels, in_channels, 1), 75 | nn.BatchNorm2d(in_channels), 76 | nn.ReLU(inplace=True), 77 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 78 | nn.BatchNorm2d(in_channels), 79 | nn.Conv2d(in_channels, in_channels, 1), 80 | nn.BatchNorm2d(in_channels), 81 | nn.ReLU(inplace=True) 82 | ) 83 | 84 | 85 | def forward(self, x): 86 | 87 | if self.stride == 1 and self.out_channels == self.in_channels: 88 | shortcut, residual = channel_split(x, int(self.in_channels / 2)) 89 | else: 90 | shortcut = x 91 | residual = x 92 | 93 | shortcut = self.shortcut(shortcut) 94 | residual = self.residual(residual) 95 | x = torch.cat([shortcut, residual], dim=1) 96 | x = channel_shuffle(x, 2) 97 | 98 | return x 99 | 100 | class ShuffleNetV2(nn.Module): 101 | 102 | def __init__(self, ratio=1, class_num=100): 103 | super().__init__() 104 | if ratio == 0.5: 105 | out_channels = [48, 96, 192, 1024] 106 | elif ratio == 1: 107 | out_channels = [116, 232, 464, 1024] 108 | elif ratio == 1.5: 109 | out_channels = [176, 352, 704, 1024] 110 | elif ratio == 2: 111 | out_channels = [244, 488, 976, 2048] 112 | else: 113 | ValueError('unsupported ratio number') 114 | 115 | self.pre = nn.Sequential( 116 | nn.Conv2d(3, 24, 3, padding=1), 117 | nn.BatchNorm2d(24) 118 | ) 119 | 120 | self.stage2 = self._make_stage(24, out_channels[0], 3) 121 | self.stage3 = self._make_stage(out_channels[0], out_channels[1], 7) 122 | self.stage4 = self._make_stage(out_channels[1], out_channels[2], 3) 123 | self.conv5 = nn.Sequential( 124 | nn.Conv2d(out_channels[2], out_channels[3], 1), 125 | nn.BatchNorm2d(out_channels[3]), 126 | nn.ReLU(inplace=True) 127 | ) 128 | 129 | self.fc = nn.Linear(out_channels[3], class_num) 130 | 131 | def forward(self, x): 132 | x = self.pre(x) 133 | x = self.stage2(x) 134 | x = self.stage3(x) 135 | x = self.stage4(x) 136 | x = self.conv5(x) 137 | x = F.adaptive_avg_pool2d(x, 1) 138 | x = x.view(x.size(0), -1) 139 | x = self.fc(x) 140 | 141 | return x 142 | 143 | def _make_stage(self, in_channels, out_channels, repeat): 144 | layers = [] 145 | layers.append(ShuffleUnit(in_channels, out_channels, 2)) 146 | 147 | while repeat: 148 | layers.append(ShuffleUnit(out_channels, out_channels, 1)) 149 | repeat -= 1 150 | 151 | return nn.Sequential(*layers) 152 | 153 | def shufflenetv2(): 154 | return ShuffleNetV2() 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /Code/benchmarking/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class BasicBlock(nn.Module): 5 | """Basic Block for resnet 18 and resnet 34 6 | 7 | """ 8 | 9 | #BasicBlock and BottleNeck block 10 | #have different output 
size 11 | #we use class attribute expansion 12 | #to distinct 13 | expansion = 1 14 | 15 | def __init__(self, in_channels, out_channels, stride=1): 16 | super().__init__() 17 | 18 | #residual function 19 | self.residual_function = nn.Sequential( 20 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), 21 | nn.BatchNorm2d(out_channels), 22 | nn.ReLU(inplace=True), 23 | nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False), 24 | nn.BatchNorm2d(out_channels * BasicBlock.expansion) 25 | ) 26 | 27 | #shortcut 28 | self.shortcut = nn.Sequential() 29 | 30 | #the shortcut output dimension is not the same with residual function 31 | #use 1*1 convolution to match the dimension 32 | if stride != 1 or in_channels != BasicBlock.expansion * out_channels: 33 | self.shortcut = nn.Sequential( 34 | nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False), 35 | nn.BatchNorm2d(out_channels * BasicBlock.expansion) 36 | ) 37 | 38 | def forward(self, x): 39 | return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x)) 40 | 41 | class BottleNeck(nn.Module): 42 | """Residual block for resnet over 50 layers 43 | 44 | """ 45 | expansion = 4 46 | def __init__(self, in_channels, out_channels, stride=1): 47 | super().__init__() 48 | self.residual_function = nn.Sequential( 49 | nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False), 50 | nn.BatchNorm2d(out_channels), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False), 53 | nn.BatchNorm2d(out_channels), 54 | nn.ReLU(inplace=True), 55 | nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, bias=False), 56 | nn.BatchNorm2d(out_channels * BottleNeck.expansion), 57 | ) 58 | 59 | self.shortcut = nn.Sequential() 60 | 61 | if stride != 1 or in_channels != out_channels * BottleNeck.expansion: 62 | self.shortcut = nn.Sequential( 63 | nn.Conv2d(in_channels, out_channels * BottleNeck.expansion, stride=stride, kernel_size=1, bias=False), 64 | nn.BatchNorm2d(out_channels * BottleNeck.expansion) 65 | ) 66 | 67 | def forward(self, x): 68 | return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x)) 69 | 70 | class ResNet(nn.Module): 71 | 72 | def __init__(self, block, num_block, num_classes=120): 73 | super().__init__() 74 | 75 | self.in_channels = 64 76 | 77 | self.conv1 = nn.Sequential( 78 | nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False), 79 | nn.BatchNorm2d(64), 80 | nn.ReLU(inplace=True)) 81 | #we use a different inputsize than the original paper 82 | #so conv2_x's stride is 1 83 | self.conv2_x = self._make_layer(block, 64, num_block[0], 1) 84 | self.conv3_x = self._make_layer(block, 128, num_block[1], 2) 85 | self.conv4_x = self._make_layer(block, 256, num_block[2], 2) 86 | self.conv5_x = self._make_layer(block, 512, num_block[3], 2) 87 | self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 88 | self.fc = nn.Linear(512 * block.expansion, num_classes) 89 | 90 | def _make_layer(self, block, out_channels, num_blocks, stride): 91 | """make resnet layers(by layer i didnt mean this 'layer' was the 92 | same as a neuron netowork layer, ex. 
conv layer), one layer may 93 | contain more than one residual block 94 | 95 | Args: 96 | block: block type, basic block or bottle neck block 97 | out_channels: output depth channel number of this layer 98 | num_blocks: how many blocks per layer 99 | stride: the stride of the first block of this layer 100 | 101 | Return: 102 | return a resnet layer 103 | """ 104 | 105 | # we have num_block blocks per layer, the first block 106 | # could be 1 or 2, other blocks would always be 1 107 | strides = [stride] + [1] * (num_blocks - 1) 108 | layers = [] 109 | for stride in strides: 110 | layers.append(block(self.in_channels, out_channels, stride)) 111 | self.in_channels = out_channels * block.expansion 112 | 113 | return nn.Sequential(*layers) 114 | 115 | def forward(self, x): 116 | output = self.conv1(x) 117 | output = self.conv2_x(output) 118 | output = self.conv3_x(output) 119 | output = self.conv4_x(output) 120 | output = self.conv5_x(output) 121 | output = self.avg_pool(output) 122 | output = output.view(output.size(0), -1) 123 | output = self.fc(output) 124 | 125 | return output 126 | 127 | def resnet18(num_classes): 128 | """ return a ResNet 18 object 129 | """ 130 | return ResNet(BasicBlock, [2, 2, 2, 2], num_classes = num_classes) 131 | 132 | def resnet34(num_classes): 133 | """ return a ResNet 34 object 134 | """ 135 | return ResNet(BasicBlock, [3, 4, 6, 3], num_classes = num_classes) 136 | 137 | def resnet50(num_classes): 138 | """ return a ResNet 50 object 139 | """ 140 | return ResNet(BottleNeck, [3, 4, 6, 3], num_classes = num_classes) 141 | 142 | def resnet101(num_classes): 143 | """ return a ResNet 101 object 144 | """ 145 | return ResNet(BottleNeck, [3, 4, 23, 3], num_classes = num_classes) 146 | 147 | def resnet152(num_classes): 148 | """ return a ResNet 152 object 149 | """ 150 | return ResNet(BottleNeck, [3, 8, 36, 3], num_classes = num_classes) 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /Code/benchmarking/mobilenet.py: -------------------------------------------------------------------------------- 1 | """mobilenet in pytorch 2 | 3 | 4 | 5 | [1] Andrew G. 
Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 6 | 7 | MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications 8 | https://arxiv.org/abs/1704.04861 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | 15 | class DepthSeperabelConv2d(nn.Module): 16 | 17 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 18 | super().__init__() 19 | self.depthwise = nn.Sequential( 20 | nn.Conv2d( 21 | input_channels, 22 | input_channels, 23 | kernel_size, 24 | groups=input_channels, 25 | **kwargs), 26 | nn.BatchNorm2d(input_channels), 27 | nn.ReLU(inplace=True) 28 | ) 29 | 30 | self.pointwise = nn.Sequential( 31 | nn.Conv2d(input_channels, output_channels, 1), 32 | nn.BatchNorm2d(output_channels), 33 | nn.ReLU(inplace=True) 34 | ) 35 | 36 | def forward(self, x): 37 | x = self.depthwise(x) 38 | x = self.pointwise(x) 39 | 40 | return x 41 | 42 | 43 | class BasicConv2d(nn.Module): 44 | 45 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 46 | 47 | super().__init__() 48 | self.conv = nn.Conv2d( 49 | input_channels, output_channels, kernel_size, **kwargs) 50 | self.bn = nn.BatchNorm2d(output_channels) 51 | self.relu = nn.ReLU(inplace=True) 52 | 53 | def forward(self, x): 54 | x = self.conv(x) 55 | x = self.bn(x) 56 | x = self.relu(x) 57 | 58 | return x 59 | 60 | 61 | class MobileNet(nn.Module): 62 | 63 | """ 64 | Args: 65 | width multipler: The role of the width multiplier α is to thin 66 | a network uniformly at each layer. For a given 67 | layer and width multiplier α, the number of 68 | input channels M becomes αM and the number of 69 | output channels N becomes αN. 70 | """ 71 | 72 | def __init__(self, width_multiplier=1, class_num=100): 73 | super().__init__() 74 | 75 | alpha = width_multiplier 76 | self.stem = nn.Sequential( 77 | BasicConv2d(3, int(32 * alpha), 3, padding=1, bias=False), 78 | DepthSeperabelConv2d( 79 | int(32 * alpha), 80 | int(64 * alpha), 81 | 3, 82 | padding=1, 83 | bias=False 84 | ) 85 | ) 86 | 87 | #downsample 88 | self.conv1 = nn.Sequential( 89 | DepthSeperabelConv2d( 90 | int(64 * alpha), 91 | int(128 * alpha), 92 | 3, 93 | stride=2, 94 | padding=1, 95 | bias=False 96 | ), 97 | DepthSeperabelConv2d( 98 | int(128 * alpha), 99 | int(128 * alpha), 100 | 3, 101 | padding=1, 102 | bias=False 103 | ) 104 | ) 105 | 106 | #downsample 107 | self.conv2 = nn.Sequential( 108 | DepthSeperabelConv2d( 109 | int(128 * alpha), 110 | int(256 * alpha), 111 | 3, 112 | stride=2, 113 | padding=1, 114 | bias=False 115 | ), 116 | DepthSeperabelConv2d( 117 | int(256 * alpha), 118 | int(256 * alpha), 119 | 3, 120 | padding=1, 121 | bias=False 122 | ) 123 | ) 124 | 125 | #downsample 126 | self.conv3 = nn.Sequential( 127 | DepthSeperabelConv2d( 128 | int(256 * alpha), 129 | int(512 * alpha), 130 | 3, 131 | stride=2, 132 | padding=1, 133 | bias=False 134 | ), 135 | 136 | DepthSeperabelConv2d( 137 | int(512 * alpha), 138 | int(512 * alpha), 139 | 3, 140 | padding=1, 141 | bias=False 142 | ), 143 | DepthSeperabelConv2d( 144 | int(512 * alpha), 145 | int(512 * alpha), 146 | 3, 147 | padding=1, 148 | bias=False 149 | ), 150 | DepthSeperabelConv2d( 151 | int(512 * alpha), 152 | int(512 * alpha), 153 | 3, 154 | padding=1, 155 | bias=False 156 | ), 157 | DepthSeperabelConv2d( 158 | int(512 * alpha), 159 | int(512 * alpha), 160 | 3, 161 | padding=1, 162 | bias=False 163 | ), 164 | DepthSeperabelConv2d( 165 | int(512 * alpha), 166 | int(512 * alpha), 167 | 3, 168 
| padding=1, 169 | bias=False 170 | ) 171 | ) 172 | 173 | #downsample 174 | self.conv4 = nn.Sequential( 175 | DepthSeperabelConv2d( 176 | int(512 * alpha), 177 | int(1024 * alpha), 178 | 3, 179 | stride=2, 180 | padding=1, 181 | bias=False 182 | ), 183 | DepthSeperabelConv2d( 184 | int(1024 * alpha), 185 | int(1024 * alpha), 186 | 3, 187 | padding=1, 188 | bias=False 189 | ) 190 | ) 191 | 192 | self.fc = nn.Linear(int(1024 * alpha), class_num) 193 | self.avg = nn.AdaptiveAvgPool2d(1) 194 | 195 | def forward(self, x): 196 | x = self.stem(x) 197 | 198 | x = self.conv1(x) 199 | x = self.conv2(x) 200 | x = self.conv3(x) 201 | x = self.conv4(x) 202 | 203 | x = self.avg(x) 204 | x = x.view(x.size(0), -1) 205 | x = self.fc(x) 206 | return x 207 | 208 | 209 | def mobilenet(alpha=1, class_num=100): 210 | return MobileNet(alpha, class_num) 211 | 212 | -------------------------------------------------------------------------------- /Code/benchmarking/xception.py: -------------------------------------------------------------------------------- 1 | """xception in pytorch 2 | 3 | 4 | [1] François Chollet 5 | 6 | Xception: Deep Learning with Depthwise Separable Convolutions 7 | https://arxiv.org/abs/1610.02357 8 | """ 9 | 10 | import torch 11 | import torch.nn as nn 12 | 13 | class SeperableConv2d(nn.Module): 14 | 15 | #***Figure 4. An “extreme” version of our Inception module, 16 | #with one spatial convolution per output channel of the 1x1 17 | #convolution.""" 18 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 19 | 20 | super().__init__() 21 | self.depthwise = nn.Conv2d( 22 | input_channels, 23 | input_channels, 24 | kernel_size, 25 | groups=input_channels, 26 | bias=False, 27 | **kwargs 28 | ) 29 | 30 | self.pointwise = nn.Conv2d(input_channels, output_channels, 1, bias=False) 31 | 32 | def forward(self, x): 33 | x = self.depthwise(x) 34 | x = self.pointwise(x) 35 | 36 | return x 37 | 38 | class EntryFlow(nn.Module): 39 | 40 | def __init__(self): 41 | 42 | super().__init__() 43 | self.conv1 = nn.Sequential( 44 | nn.Conv2d(3, 32, 3, padding=1, bias=False), 45 | nn.BatchNorm2d(32), 46 | nn.ReLU(inplace=True) 47 | ) 48 | 49 | self.conv2 = nn.Sequential( 50 | nn.Conv2d(32, 64, 3, padding=1, bias=False), 51 | nn.BatchNorm2d(64), 52 | nn.ReLU(inplace=True) 53 | ) 54 | 55 | self.conv3_residual = nn.Sequential( 56 | SeperableConv2d(64, 128, 3, padding=1), 57 | nn.BatchNorm2d(128), 58 | nn.ReLU(inplace=True), 59 | SeperableConv2d(128, 128, 3, padding=1), 60 | nn.BatchNorm2d(128), 61 | nn.MaxPool2d(3, stride=2, padding=1), 62 | ) 63 | 64 | self.conv3_shortcut = nn.Sequential( 65 | nn.Conv2d(64, 128, 1, stride=2), 66 | nn.BatchNorm2d(128), 67 | ) 68 | 69 | self.conv4_residual = nn.Sequential( 70 | nn.ReLU(inplace=True), 71 | SeperableConv2d(128, 256, 3, padding=1), 72 | nn.BatchNorm2d(256), 73 | nn.ReLU(inplace=True), 74 | SeperableConv2d(256, 256, 3, padding=1), 75 | nn.BatchNorm2d(256), 76 | nn.MaxPool2d(3, stride=2, padding=1) 77 | ) 78 | 79 | self.conv4_shortcut = nn.Sequential( 80 | nn.Conv2d(128, 256, 1, stride=2), 81 | nn.BatchNorm2d(256), 82 | ) 83 | 84 | #no downsampling 85 | self.conv5_residual = nn.Sequential( 86 | nn.ReLU(inplace=True), 87 | SeperableConv2d(256, 728, 3, padding=1), 88 | nn.BatchNorm2d(728), 89 | nn.ReLU(inplace=True), 90 | SeperableConv2d(728, 728, 3, padding=1), 91 | nn.BatchNorm2d(728), 92 | nn.MaxPool2d(3, 1, padding=1) 93 | ) 94 | 95 | #no downsampling 96 | self.conv5_shortcut = nn.Sequential( 97 | nn.Conv2d(256, 728, 1), 98 | nn.BatchNorm2d(728) 99 
| ) 100 | 101 | def forward(self, x): 102 | x = self.conv1(x) 103 | x = self.conv2(x) 104 | residual = self.conv3_residual(x) 105 | shortcut = self.conv3_shortcut(x) 106 | x = residual + shortcut 107 | residual = self.conv4_residual(x) 108 | shortcut = self.conv4_shortcut(x) 109 | x = residual + shortcut 110 | residual = self.conv5_residual(x) 111 | shortcut = self.conv5_shortcut(x) 112 | x = residual + shortcut 113 | 114 | return x 115 | 116 | class MiddleFLowBlock(nn.Module): 117 | 118 | def __init__(self): 119 | super().__init__() 120 | 121 | self.shortcut = nn.Sequential() 122 | self.conv1 = nn.Sequential( 123 | nn.ReLU(inplace=True), 124 | SeperableConv2d(728, 728, 3, padding=1), 125 | nn.BatchNorm2d(728) 126 | ) 127 | self.conv2 = nn.Sequential( 128 | nn.ReLU(inplace=True), 129 | SeperableConv2d(728, 728, 3, padding=1), 130 | nn.BatchNorm2d(728) 131 | ) 132 | self.conv3 = nn.Sequential( 133 | nn.ReLU(inplace=True), 134 | SeperableConv2d(728, 728, 3, padding=1), 135 | nn.BatchNorm2d(728) 136 | ) 137 | 138 | def forward(self, x): 139 | residual = self.conv1(x) 140 | residual = self.conv2(residual) 141 | residual = self.conv3(residual) 142 | 143 | shortcut = self.shortcut(x) 144 | 145 | return shortcut + residual 146 | 147 | class MiddleFlow(nn.Module): 148 | def __init__(self, block): 149 | super().__init__() 150 | 151 | #"""then through the middle flow which is repeated eight times""" 152 | self.middel_block = self._make_flow(block, 8) 153 | 154 | def forward(self, x): 155 | x = self.middel_block(x) 156 | return x 157 | 158 | def _make_flow(self, block, times): 159 | flows = [] 160 | for i in range(times): 161 | flows.append(block()) 162 | 163 | return nn.Sequential(*flows) 164 | 165 | 166 | class ExitFLow(nn.Module): 167 | 168 | def __init__(self): 169 | super().__init__() 170 | self.residual = nn.Sequential( 171 | nn.ReLU(), 172 | SeperableConv2d(728, 728, 3, padding=1), 173 | nn.BatchNorm2d(728), 174 | nn.ReLU(), 175 | SeperableConv2d(728, 1024, 3, padding=1), 176 | nn.BatchNorm2d(1024), 177 | nn.MaxPool2d(3, stride=2, padding=1) 178 | ) 179 | 180 | self.shortcut = nn.Sequential( 181 | nn.Conv2d(728, 1024, 1, stride=2), 182 | nn.BatchNorm2d(1024) 183 | ) 184 | 185 | self.conv = nn.Sequential( 186 | SeperableConv2d(1024, 1536, 3, padding=1), 187 | nn.BatchNorm2d(1536), 188 | nn.ReLU(inplace=True), 189 | SeperableConv2d(1536, 2048, 3, padding=1), 190 | nn.BatchNorm2d(2048), 191 | nn.ReLU(inplace=True) 192 | ) 193 | 194 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 195 | 196 | def forward(self, x): 197 | shortcut = self.shortcut(x) 198 | residual = self.residual(x) 199 | output = shortcut + residual 200 | output = self.conv(output) 201 | output = self.avgpool(output) 202 | 203 | return output 204 | 205 | class Xception(nn.Module): 206 | 207 | def __init__(self, block, num_class=100): 208 | super().__init__() 209 | self.entry_flow = EntryFlow() 210 | self.middel_flow = MiddleFlow(block) 211 | self.exit_flow = ExitFLow() 212 | 213 | self.fc = nn.Linear(2048, num_class) 214 | 215 | def forward(self, x): 216 | x = self.entry_flow(x) 217 | x = self.middel_flow(x) 218 | x = self.exit_flow(x) 219 | x = x.view(x.size(0), -1) 220 | x = self.fc(x) 221 | 222 | return x 223 | 224 | def xception(): 225 | return Xception(MiddleFLowBlock) 226 | 227 | 228 | -------------------------------------------------------------------------------- /Code/benchmarking/resnet_cbam.py: -------------------------------------------------------------------------------- 1 | """senet in pytorch 2 | 3 | 4 | 5 | [1] Jie 
Hu, Li Shen, Samuel Albanie, Gang Sun, Enhua Wu 6 | 7 | Squeeze-and-Excitation Networks 8 | https://arxiv.org/abs/1709.01507 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from models.cbam import CBAM 15 | 16 | 17 | def single_list(x): 18 | """ If an Element is a single instead of a list, when a list is expected it created a single element list""" 19 | if x.__class__.__name__ is 'Tensor': 20 | return [x] 21 | else: 22 | return x 23 | 24 | class BasicResidualSEBlock(nn.Module): 25 | expansion = 1 26 | # [global_local_attention_addition, global_attention_addition, global_local_attention_concat, global_attention_concat] 27 | # [global_local_attention_concat_learnable, global_local_attention_addition_learnable] 28 | # [standard_local_attention, identity_local_attention, pre_local_attention] 29 | exp_name = 'standard_cbam' 30 | def __init__(self, in_channels, out_channels, stride, block_num, r=16): 31 | super().__init__() 32 | if not 'concat' in self.exp_name: 33 | block_num = 1 34 | self.residual = nn.Sequential( 35 | nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias = False), 36 | nn.BatchNorm2d(out_channels), 37 | nn.ReLU(), 38 | 39 | nn.Conv2d(out_channels, out_channels * self.expansion, 3, padding=1, bias = False), 40 | nn.BatchNorm2d(out_channels * self.expansion) 41 | ) 42 | 43 | self.shortcut = nn.Sequential() 44 | if stride != 1 or in_channels != out_channels * self.expansion: 45 | self.shortcut = nn.Sequential( 46 | nn.Conv2d(in_channels, out_channels * self.expansion, 1, stride=stride, bias = False), 47 | nn.BatchNorm2d(out_channels * self.expansion) 48 | ) 49 | 50 | self.cbam = CBAM(out_channels * self.expansion * block_num, no_spatial=False, no_channel=False) 51 | 52 | def forward(self, x): 53 | if self.exp_name is 'standard_cbam': 54 | if x.__class__.__name__ is 'Tensor': 55 | current_input = x 56 | else: 57 | current_input = x[0] 58 | 59 | shortcut = self.shortcut(current_input) 60 | residual = self.residual(current_input) 61 | 62 | residual = self.cbam(residual) 63 | 64 | output = residual + shortcut 65 | 66 | return (F.relu(output), []) 67 | 68 | 69 | class BottleneckResidualSEBlock(nn.Module): 70 | expansion = 4 71 | 72 | def __init__(self, in_channels, out_channels, stride, r=16): 73 | super().__init__() 74 | 75 | self.residual = nn.Sequential( 76 | nn.Conv2d(in_channels, out_channels, 1), 77 | nn.BatchNorm2d(out_channels), 78 | nn.ReLU(), 79 | 80 | nn.Conv2d(out_channels, out_channels, 3, stride=stride, padding=1), 81 | nn.BatchNorm2d(out_channels), 82 | nn.ReLU(), 83 | 84 | nn.Conv2d(out_channels, out_channels * self.expansion, 1), 85 | nn.BatchNorm2d(out_channels * self.expansion), 86 | nn.ReLU() 87 | ) 88 | 89 | self.squeeze = nn.AdaptiveAvgPool2d(1) 90 | self.excitation1 = nn.Sequential( 91 | nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r), 92 | nn.ReLU(), 93 | nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion), 94 | nn.Sigmoid() 95 | ) 96 | self.excitation2 = nn.Sequential( 97 | nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r), 98 | nn.ReLU(), 99 | nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion), 100 | nn.Sigmoid() 101 | ) 102 | 103 | self.shortcut = nn.Sequential() 104 | if stride != 1 or in_channels != out_channels * self.expansion: 105 | self.shortcut = nn.Sequential( 106 | nn.Conv2d(in_channels, out_channels * self.expansion, 1, stride=stride), 107 | nn.BatchNorm2d(out_channels * 
self.expansion) 108 | ) 109 | 110 | def forward(self, x): 111 | x = single_list(x) 112 | current_input = x[-1] 113 | shortcut = self.shortcut(current_input) 114 | 115 | residual = self.residual(current_input) 116 | new_connection = residual 117 | print(len(x)) 118 | for input_ in x[: -1]: 119 | new_connection += input_ 120 | squeeze1 = self.squeeze(new_connection) 121 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 122 | excitation1 = self.excitation1(squeeze1) 123 | excitation1 = excitation1.view(new_connection.size(0), new_connection.size(1), 1, 1) 124 | squeeze2 = self.squeeze(residual) 125 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 126 | excitation2 = self.excitation2(squeeze2) 127 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 128 | 129 | 130 | output = residual * excitation1.expand_as(residual) * excitation2.expand_as(residual) + shortcut 131 | x.append(F.relu(output)) 132 | return x 133 | 134 | 135 | class SEResNet(nn.Module): 136 | 137 | def __init__(self, block, block_num, class_num=120): 138 | super().__init__() 139 | 140 | self.in_channels = 64 141 | 142 | self.pre = nn.Sequential( 143 | nn.Conv2d(3, 64, 3, padding=1), 144 | nn.BatchNorm2d(64), 145 | nn.ReLU() 146 | ) 147 | 148 | self.stage1 = self._make_stage(block, block_num[0], 64, 1) 149 | self.stage2 = self._make_stage(block, block_num[1], 128, 2) 150 | self.stage3 = self._make_stage(block, block_num[2], 256, 2) 151 | self.stage4 = self._make_stage(block, block_num[3], 512, 2) 152 | 153 | self.linear = nn.Linear(self.in_channels, class_num) 154 | 155 | def forward(self, x): 156 | x = self.pre(x) 157 | 158 | x = self.stage1(x) 159 | 160 | x = self.stage2(x[0]) 161 | x = self.stage3(x[0]) 162 | x = self.stage4(x[0]) 163 | x = F.adaptive_avg_pool2d(x[0], 1) 164 | x = x.view(x.size(0), -1) 165 | x = self.linear(x) 166 | 167 | return x 168 | 169 | def _make_stage(self, block, num, out_channels, stride): 170 | layers = [] 171 | layers.append(block(self.in_channels, out_channels, stride, 1)) 172 | self.in_channels = out_channels * block.expansion 173 | 174 | for i in range(1, num): 175 | layers.append(block(self.in_channels, out_channels, 1, i + 1)) 176 | 177 | return nn.Sequential(*layers) 178 | 179 | 180 | def seresnet18(num_classes): 181 | return SEResNet(BasicResidualSEBlock, [2, 2, 2, 2], class_num = num_classes) 182 | 183 | 184 | def seresnet34(num_classes): 185 | return SEResNet(BasicResidualSEBlock, [3, 4, 6, 3], class_num = num_classes) 186 | 187 | 188 | def seresnet50(num_classes): 189 | return SEResNet(BottleneckResidualSEBlock, [3, 4, 6, 3], class_num = num_classes) 190 | 191 | 192 | def seresnet101(num_classes): 193 | return SEResNet(BottleneckResidualSEBlock, [3, 4, 23, 3], class_num = num_classes) 194 | 195 | 196 | def seresnet152(num_classes): 197 | return SEResNet(BottleneckResidualSEBlock, [3, 8, 36, 3], class_num = num_classes) 198 | -------------------------------------------------------------------------------- /Code/benchmarking/rir.py: -------------------------------------------------------------------------------- 1 | """resnet in resnet in pytorch 2 | 3 | 4 | 5 | [1] Sasha Targ, Diogo Almeida, Kevin Lyman. 
6 | 7 | Resnet in Resnet: Generalizing Residual Architectures 8 | https://arxiv.org/abs/1603.08029v1 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | #geralized 15 | class ResnetInit(nn.Module): 16 | def __init__(self, in_channel, out_channel, stride): 17 | super().__init__() 18 | 19 | #"""The modular unit of the generalized residual network architecture is a 20 | #generalized residual block consisting of parallel states for a residual stream, 21 | #r, which contains identity shortcut connections and is similar to the structure 22 | #of a residual block from the original ResNet with a single convolutional layer 23 | #(parameters W l,r→r ) 24 | self.residual_stream_conv = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 25 | 26 | #"""and a transient stream, t, which is a standard convolutional layer 27 | #(W l,t→t ).""" 28 | self.transient_stream_conv = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 29 | 30 | #"""Two additional sets of convolutional filters in each block (W l,r→t , W l,t→r ) 31 | #also transfer information across streams.""" 32 | self.residual_stream_conv_across = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 33 | 34 | #"""We use equal numbers of filters for the residual and transient streams of the 35 | #generalized residual network, but optimizing this hyperparameter could lead to 36 | #further potential improvements.""" 37 | self.transient_stream_conv_across = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 38 | 39 | self.residual_bn_relu = nn.Sequential( 40 | nn.BatchNorm2d(out_channel), 41 | nn.ReLU(inplace=True) 42 | ) 43 | 44 | self.transient_bn_relu = nn.Sequential( 45 | nn.BatchNorm2d(out_channel), 46 | nn.ReLU(inplace=True) 47 | ) 48 | 49 | #"""The form of the shortcut connection can be an identity function with 50 | #the appropriate padding or a projection as in He et al. 
(2015b).""" 51 | self.short_cut = nn.Sequential() 52 | if in_channel != out_channel or stride != 1: 53 | self.short_cut = nn.Sequential( 54 | nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride) 55 | ) 56 | 57 | 58 | def forward(self, x): 59 | x_residual, x_transient = x 60 | residual_r_r = self.residual_stream_conv(x_residual) 61 | residual_r_t = self.residual_stream_conv_across(x_residual) 62 | residual_shortcut = self.short_cut(x_residual) 63 | 64 | transient_t_t = self.transient_stream_conv(x_transient) 65 | transient_t_r = self.transient_stream_conv_across(x_transient) 66 | 67 | #transient_t_t = self.transient_stream_conv(x_residual) 68 | #transient_t_r = self.transient_stream_conv_across(x_residual) 69 | #"""Same-stream and cross-stream activations are summed (along with the 70 | #shortcut connection for the residual stream) before applying batch 71 | #normalization and ReLU nonlinearities (together σ) to get the output 72 | #states of the block (Equation 1) (Ioffe & Szegedy, 2015).""" 73 | x_residual = self.residual_bn_relu(residual_r_r + transient_t_r + residual_shortcut) 74 | x_transient = self.transient_bn_relu(residual_r_t + transient_t_t) 75 | 76 | return x_residual, x_transient 77 | 78 | 79 | 80 | class RiRBlock(nn.Module): 81 | def __init__(self, in_channel, out_channel, layer_num, stride, layer=ResnetInit): 82 | super().__init__() 83 | self.resnetinit = self._make_layers(in_channel, out_channel, layer_num, stride) 84 | 85 | #self.short_cut = nn.Sequential() 86 | #if stride != 1 or in_channel != out_channel: 87 | # self.short_cut = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride) 88 | 89 | def forward(self, x): 90 | x_residual, x_transient = self.resnetinit(x) 91 | #x_residual = x_residual + self.short_cut(x[0]) 92 | #x_transient = x_transient + self.short_cut(x[1]) 93 | 94 | return (x_residual, x_transient) 95 | 96 | #"""Replacing each of the convolutional layers within a residual 97 | #block from the original ResNet (Figure 1a) with a generalized residual block 98 | #(Figure 1b) leads us to a new architecture we call ResNet in ResNet (RiR) 99 | #(Figure 1d).""" 100 | def _make_layers(self, in_channel, out_channel, layer_num, stride, layer=ResnetInit): 101 | strides = [stride] + [1] * (layer_num - 1) 102 | layers = nn.Sequential() 103 | for index, s in enumerate(strides): 104 | layers.add_module("generalized layers{}".format(index), layer(in_channel, out_channel, s)) 105 | in_channel = out_channel 106 | 107 | return layers 108 | 109 | class ResnetInResneet(nn.Module): 110 | def __init__(self, num_classes=100): 111 | super().__init__() 112 | base = int(96 / 2) 113 | self.residual_pre_conv = nn.Sequential( 114 | nn.Conv2d(3, base, 3, padding=1), 115 | nn.BatchNorm2d(base), 116 | nn.ReLU(inplace=True) 117 | ) 118 | self.transient_pre_conv = nn.Sequential( 119 | nn.Conv2d(3, base, 3, padding=1), 120 | nn.BatchNorm2d(base), 121 | nn.ReLU(inplace=True) 122 | ) 123 | 124 | self.rir1 = RiRBlock(base, base, 2, 1) 125 | self.rir2 = RiRBlock(base, base, 2, 1) 126 | self.rir3 = RiRBlock(base, base * 2, 2, 2) 127 | self.rir4 = RiRBlock(base * 2, base * 2, 2, 1) 128 | self.rir5 = RiRBlock(base * 2, base * 2, 2, 1) 129 | self.rir6 = RiRBlock(base * 2, base * 4, 2, 2) 130 | self.rir7 = RiRBlock(base * 4, base * 4, 2, 1) 131 | self.rir8 = RiRBlock(base * 4, base * 4, 2, 1) 132 | 133 | self.conv1 = nn.Sequential( 134 | nn.Conv2d(384, num_classes, kernel_size=3, stride=2), #without this convolution, loss will soon be nan 135 | nn.BatchNorm2d(num_classes), 136 | 
nn.ReLU(inplace=True), 137 | ) 138 | 139 | self.classifier = nn.Sequential( 140 | nn.Linear(900, 450), 141 | nn.ReLU(), 142 | nn.Dropout(), 143 | nn.Linear(450, 100), 144 | ) 145 | 146 | self._weight_init() 147 | 148 | def forward(self, x): 149 | x_residual = self.residual_pre_conv(x) 150 | x_transient = self.transient_pre_conv(x) 151 | 152 | x_residual, x_transient = self.rir1((x_residual, x_transient)) 153 | x_residual, x_transient = self.rir2((x_residual, x_transient)) 154 | x_residual, x_transient = self.rir3((x_residual, x_transient)) 155 | x_residual, x_transient = self.rir4((x_residual, x_transient)) 156 | x_residual, x_transient = self.rir5((x_residual, x_transient)) 157 | x_residual, x_transient = self.rir6((x_residual, x_transient)) 158 | x_residual, x_transient = self.rir7((x_residual, x_transient)) 159 | x_residual, x_transient = self.rir8((x_residual, x_transient)) 160 | h = torch.cat([x_residual, x_transient], 1) 161 | h = self.conv1(h) 162 | h = h.view(h.size()[0], -1) 163 | h = self.classifier(h) 164 | 165 | return h 166 | 167 | def _weight_init(self): 168 | for m in self.modules(): 169 | if isinstance(m, nn.Conv2d): 170 | torch.nn.init.kaiming_normal(m.weight) 171 | m.bias.data.fill_(0.01) 172 | 173 | 174 | def resnet_in_resnet(): 175 | return ResnetInResneet() 176 | 177 | #from torch.autograd import Variable 178 | # 179 | #net = resnet_in_resnet() 180 | #print(net(Variable(torch.randn(3, 3, 32, 32))).shape) 181 | -------------------------------------------------------------------------------- /Code/benchmarking/shufflenet.py: -------------------------------------------------------------------------------- 1 | """shufflenet in pytorch 2 | 3 | 4 | 5 | [1] Xiangyu Zhang, Xinyu Zhou, Mengxiao Lin, Jian Sun. 6 | 7 | ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices 8 | https://arxiv.org/abs/1707.01083v2 9 | """ 10 | 11 | from functools import partial 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | 17 | class BasicConv2d(nn.Module): 18 | 19 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 20 | super().__init__() 21 | self.conv = nn.Conv2d(input_channels, output_channels, kernel_size, **kwargs) 22 | self.bn = nn.BatchNorm2d(output_channels) 23 | self.relu = nn.ReLU(inplace=True) 24 | 25 | def forward(self, x): 26 | x = self.conv(x) 27 | x = self.bn(x) 28 | x = self.relu(x) 29 | return x 30 | 31 | class ChannelShuffle(nn.Module): 32 | 33 | def __init__(self, groups): 34 | super().__init__() 35 | self.groups = groups 36 | 37 | def forward(self, x): 38 | batchsize, channels, height, width = x.data.size() 39 | channels_per_group = int(channels / self.groups) 40 | 41 | #"""suppose a convolutional layer with g groups whose output has 42 | #g x n channels; we first reshape the output channel dimension 43 | #into (g, n)""" 44 | x = x.view(batchsize, self.groups, channels_per_group, height, width) 45 | 46 | #"""transposing and then flattening it back as the input of next layer.""" 47 | x = x.transpose(1, 2).contiguous() 48 | x = x.view(batchsize, -1, height, width) 49 | 50 | return x 51 | 52 | class DepthwiseConv2d(nn.Module): 53 | 54 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 55 | super().__init__() 56 | self.depthwise = nn.Sequential( 57 | nn.Conv2d(input_channels, output_channels, kernel_size, **kwargs), 58 | nn.BatchNorm2d(output_channels) 59 | ) 60 | 61 | def forward(self, x): 62 | return self.depthwise(x) 63 | 64 | class PointwiseConv2d(nn.Module): 65 | def __init__(self, 
input_channels, output_channels, **kwargs): 66 | super().__init__() 67 | self.pointwise = nn.Sequential( 68 | nn.Conv2d(input_channels, output_channels, 1, **kwargs), 69 | nn.BatchNorm2d(output_channels) 70 | ) 71 | 72 | def forward(self, x): 73 | return self.pointwise(x) 74 | 75 | class ShuffleNetUnit(nn.Module): 76 | 77 | def __init__(self, input_channels, output_channels, stage, stride, groups): 78 | super().__init__() 79 | 80 | #"""Similar to [9], we set the number of bottleneck channels to 1/4 81 | #of the output channels for each ShuffleNet unit.""" 82 | self.bottlneck = nn.Sequential( 83 | PointwiseConv2d( 84 | input_channels, 85 | int(output_channels / 4), 86 | groups=groups 87 | ), 88 | nn.ReLU(inplace=True) 89 | ) 90 | 91 | #"""Note that for Stage 2, we do not apply group convolution on the first pointwise 92 | #layer because the number of input channels is relatively small.""" 93 | if stage == 2: 94 | self.bottlneck = nn.Sequential( 95 | PointwiseConv2d( 96 | input_channels, 97 | int(output_channels / 4), 98 | groups=groups 99 | ), 100 | nn.ReLU(inplace=True) 101 | ) 102 | 103 | self.channel_shuffle = ChannelShuffle(groups) 104 | 105 | self.depthwise = DepthwiseConv2d( 106 | int(output_channels / 4), 107 | int(output_channels / 4), 108 | 3, 109 | groups=int(output_channels / 4), 110 | stride=stride, 111 | padding=1 112 | ) 113 | 114 | self.expand = PointwiseConv2d( 115 | int(output_channels / 4), 116 | output_channels, 117 | groups=groups 118 | ) 119 | 120 | self.relu = nn.ReLU(inplace=True) 121 | self.fusion = self._add 122 | self.shortcut = nn.Sequential() 123 | 124 | #"""As for the case where ShuffleNet is applied with stride, 125 | #we simply make two modifications (see Fig 2 (c)): 126 | #(i) add a 3 × 3 average pooling on the shortcut path; 127 | #(ii) replace the element-wise addition with channel concatenation, 128 | #which makes it easy to enlarge channel dimension with little extra 129 | #computation cost. 
130 | if stride != 1 or input_channels != output_channels: 131 | self.shortcut = nn.AvgPool2d(3, stride=2, padding=1) 132 | 133 | self.expand = PointwiseConv2d( 134 | int(output_channels / 4), 135 | output_channels - input_channels, 136 | groups=groups 137 | ) 138 | 139 | self.fusion = self._cat 140 | 141 | def _add(self, x, y): 142 | return torch.add(x, y) 143 | 144 | def _cat(self, x, y): 145 | return torch.cat([x, y], dim=1) 146 | 147 | def forward(self, x): 148 | shortcut = self.shortcut(x) 149 | 150 | shuffled = self.bottlneck(x) 151 | shuffled = self.channel_shuffle(shuffled) 152 | shuffled = self.depthwise(shuffled) 153 | shuffled = self.expand(shuffled) 154 | 155 | output = self.fusion(shortcut, shuffled) 156 | output = self.relu(output) 157 | 158 | return output 159 | 160 | class ShuffleNet(nn.Module): 161 | 162 | def __init__(self, num_blocks, num_classes=100, groups=3): 163 | super().__init__() 164 | 165 | if groups == 1: 166 | out_channels = [24, 144, 288, 567] 167 | elif groups == 2: 168 | out_channels = [24, 200, 400, 800] 169 | elif groups == 3: 170 | out_channels = [24, 240, 480, 960] 171 | elif groups == 4: 172 | out_channels = [24, 272, 544, 1088] 173 | elif groups == 8: 174 | out_channels = [24, 384, 768, 1536] 175 | 176 | self.conv1 = BasicConv2d(3, out_channels[0], 3, padding=1, stride=1) 177 | self.input_channels = out_channels[0] 178 | 179 | self.stage2 = self._make_stage( 180 | ShuffleNetUnit, 181 | num_blocks[0], 182 | out_channels[1], 183 | stride=2, 184 | stage=2, 185 | groups=groups 186 | ) 187 | 188 | self.stage3 = self._make_stage( 189 | ShuffleNetUnit, 190 | num_blocks[1], 191 | out_channels[2], 192 | stride=2, 193 | stage=3, 194 | groups=groups 195 | ) 196 | 197 | self.stage4 = self._make_stage( 198 | ShuffleNetUnit, 199 | num_blocks[2], 200 | out_channels[3], 201 | stride=2, 202 | stage=4, 203 | groups=groups 204 | ) 205 | 206 | self.avg = nn.AdaptiveAvgPool2d((1, 1)) 207 | self.fc = nn.Linear(out_channels[3], num_classes) 208 | 209 | def forward(self, x): 210 | x = self.conv1(x) 211 | x = self.stage2(x) 212 | x = self.stage3(x) 213 | x = self.stage4(x) 214 | x = self.avg(x) 215 | x = x.view(x.size(0), -1) 216 | x = self.fc(x) 217 | 218 | return x 219 | 220 | def _make_stage(self, block, num_blocks, output_channels, stride, stage, groups): 221 | """make shufflenet stage 222 | 223 | Args: 224 | block: block type, shuffle unit 225 | out_channels: output depth channel number of this stage 226 | num_blocks: how many blocks per stage 227 | stride: the stride of the first block of this stage 228 | stage: stage index 229 | groups: group number of group convolution 230 | Return: 231 | return a shuffle net stage 232 | """ 233 | strides = [stride] + [1] * (num_blocks - 1) 234 | 235 | stage = [] 236 | 237 | for stride in strides: 238 | stage.append( 239 | block( 240 | self.input_channels, 241 | output_channels, 242 | stride=stride, 243 | stage=stage, 244 | groups=groups 245 | ) 246 | ) 247 | self.input_channels = output_channels 248 | 249 | return nn.Sequential(*stage) 250 | 251 | def shufflenet(): 252 | return ShuffleNet([4, 8, 4]) 253 | 254 | 255 | 256 | 257 | -------------------------------------------------------------------------------- /Code/dataset.py: -------------------------------------------------------------------------------- 1 | """ train and test dataset 2 | 3 | author baiyu 4 | """ 5 | import os 6 | import sys 7 | import pickle 8 | import matplotlib.pyplot as plt 9 | import numpy 10 | import torch 11 | from torch.utils.data import Dataset 12 | from 
torchvision import datasets, transforms 13 | import torchvision 14 | 15 | def get_dataloaders(batch_size, dataset): 16 | print(dataset) 17 | if dataset == 'dogs': 18 | image_transforms = { 19 | # Train uses data augmentation 20 | 'train': 21 | transforms.Compose([ 22 | transforms.RandomResizedCrop(size=135, scale=(0.95, 1.0)), 23 | transforms.RandomRotation(degrees=15), 24 | transforms.ColorJitter(), 25 | transforms.RandomHorizontalFlip(), 26 | transforms.CenterCrop(size=128), # Image net standards 27 | transforms.ToTensor(), 28 | transforms.Normalize([0.485, 0.456, 0.406], 29 | [0.229, 0.224, 0.225]) # Imagenet standards 30 | ]), 31 | 'test': 32 | transforms.Compose([ 33 | transforms.Resize(size=128), 34 | transforms.CenterCrop(size=128), 35 | transforms.ToTensor(), 36 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 37 | ]) 38 | } 39 | all_data = datasets.ImageFolder(root='data/archive/images/Images/') 40 | torch.manual_seed(42) 41 | train_data_len = int(len(all_data)*0.8) 42 | valid_data_len = int((len(all_data) - train_data_len)) 43 | train_data, val_data = torch.utils.data.random_split(all_data, [train_data_len, valid_data_len]) 44 | train_data.dataset.transform = image_transforms['train'] 45 | val_data.dataset.transform = image_transforms['test'] 46 | print(len(train_data), len(val_data)) 47 | 48 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True) 49 | val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True) 50 | 51 | return {'train': train_loader, 'val' :val_loader} 52 | 53 | elif dataset == 'imagenet': 54 | data_path = "/mnt/4T_2/imagenet_dataset" 55 | traindir = os.path.join(data_path, 'train') 56 | valdir = os.path.join(data_path, 'val') 57 | image_transforms = { 58 | 'train': 59 | transforms.Compose([transforms.RandomRotation(degrees=15), transforms.ColorJitter(), 60 | transforms.RandomHorizontalFlip(), transforms.RandomResizedCrop(224), 61 | transforms.ToTensor(), 62 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]), 63 | 'test': 64 | transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), 65 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 66 | } 67 | train_dataset = datasets.ImageFolder(traindir, image_transforms["train"]) 68 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 69 | num_workers=30, pin_memory=True) 70 | val_dataset = datasets.ImageFolder(valdir, image_transforms["test"]) 71 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 72 | num_workers=30, pin_memory=True) 73 | print(len(train_dataset), len(val_dataset)) 74 | return {'train': train_loader, 'val': val_loader} 75 | 76 | elif dataset == 'tiny-imagenet': 77 | image_transforms = { 78 | # Train uses data augmentation 79 | 'train': 80 | transforms.Compose([ 81 | transforms.RandomRotation(degrees=15), 82 | transforms.ColorJitter(), 83 | transforms.RandomHorizontalFlip(), 84 | transforms.CenterCrop(size=64), # Image net standards 85 | transforms.ToTensor(), 86 | transforms.Normalize([0.485, 0.456, 0.406], 87 | [0.229, 0.224, 0.225]) # Imagenet standards 88 | ]), 89 | 'test': 90 | transforms.Compose([ 91 | transforms.Resize(size=64), 92 | transforms.CenterCrop(size=64), 93 | transforms.ToTensor(), 94 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 95 | ]) 96 | } 97 | all_data = 
datasets.ImageFolder(root='data/tiny-imagenet/tiny-imagenet-200/train') 98 | torch.manual_seed(42) 99 | train_data_len = int(len(all_data)*0.8) 100 | valid_data_len = int((len(all_data) - train_data_len)) 101 | train_data, val_data = torch.utils.data.random_split(all_data, [train_data_len, valid_data_len]) 102 | train_data.dataset.transform = image_transforms['train'] 103 | val_data.dataset.transform = image_transforms['test'] 104 | print(len(train_data), len(val_data)) 105 | 106 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True) 107 | val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True) 108 | 109 | return {'train': train_loader, 'val' :val_loader} 110 | 111 | elif dataset == 'cifar100': 112 | mean = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343) 113 | std = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404) 114 | image_transforms = { 115 | 'train': 116 | transforms.Compose([ 117 | #transforms.ToPILImage(), 118 | transforms.RandomCrop(32, padding=4), 119 | transforms.RandomHorizontalFlip(), 120 | transforms.RandomRotation(15), 121 | transforms.ToTensor(), 122 | transforms.Normalize(mean, std) 123 | ]), 124 | 'test': 125 | transforms.Compose([ 126 | transforms.ToTensor(), 127 | transforms.Normalize(mean, std) 128 | ]) 129 | } 130 | cifar100_training = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=image_transforms['train']) 131 | train_loader = torch.utils.data.DataLoader( 132 | cifar100_training, shuffle=True, batch_size=batch_size) 133 | 134 | cifar100_test = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=image_transforms['test']) 135 | val_loader = torch.utils.data.DataLoader( 136 | cifar100_test, shuffle=True, batch_size=batch_size) 137 | 138 | return {'train': train_loader, 'val' :val_loader} 139 | 140 | elif dataset == 'cifar10': 141 | mean = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343) 142 | std = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404) 143 | image_transforms = { 144 | 'train': 145 | transforms.Compose([ 146 | #transforms.ToPILImage(), 147 | transforms.RandomCrop(32, padding=4), 148 | transforms.RandomHorizontalFlip(), 149 | transforms.RandomRotation(15), 150 | transforms.ToTensor(), 151 | transforms.Normalize(mean, std) 152 | ]), 153 | 'test': 154 | transforms.Compose([ 155 | transforms.ToTensor(), 156 | transforms.Normalize(mean, std) 157 | ]) 158 | } 159 | cifar10_training = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=image_transforms['train']) 160 | train_loader = torch.utils.data.DataLoader( 161 | cifar10_training, shuffle=True, batch_size=batch_size) 162 | 163 | cifar10_test = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=image_transforms['test']) 164 | val_loader = torch.utils.data.DataLoader( 165 | cifar10_test, shuffle=True, batch_size=batch_size) 166 | 167 | return {'train': train_loader, 'val' :val_loader} 168 | 169 | 170 | elif dataset == 'caltech': 171 | image_transforms = { 172 | # Train uses data augmentation 173 | 'train': 174 | transforms.Compose([ 175 | transforms.Resize((128,128)), 176 | transforms.ColorJitter(), 177 | transforms.RandomHorizontalFlip(), 178 | transforms.ToTensor(), 179 | transforms.Normalize([0.485, 0.456, 0.406], 180 | [0.229, 0.224, 0.225]) # Imagenet standards 181 | ]), 182 | 'test': 183 | transforms.Compose([ 184 | transforms.Resize((128, 128)), 185 | 
transforms.ToTensor(), 186 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 187 | ]) 188 | } 189 | all_data = datasets.ImageFolder(root='data/caltech/256_ObjectCategories') 190 | torch.manual_seed(42) 191 | train_data_len = int(len(all_data)*0.8) 192 | valid_data_len = int((len(all_data) - train_data_len)) 193 | train_data, val_data = torch.utils.data.random_split(all_data, [train_data_len, valid_data_len]) 194 | train_data.dataset.transform = image_transforms['train'] 195 | val_data.dataset.transform = image_transforms['test'] 196 | print(len(train_data), len(val_data)) 197 | 198 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True) 199 | val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True) 200 | 201 | return {'train': train_loader, 'val' :val_loader} 202 | 203 | 204 | 205 | else: 206 | print('This dataset isn\'t supported yet') 207 | -------------------------------------------------------------------------------- /Code/benchmarking/densenet.py: -------------------------------------------------------------------------------- 1 | """dense net in pytorch 2 | 3 | 4 | 5 | [1] Gao Huang, Zhuang Liu, Laurens van der Maaten, Kilian Q. Weinberger. 6 | 7 | Densely Connected Convolutional Networks 8 | https://arxiv.org/abs/1608.06993v5 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | 15 | 16 | #"""Bottleneck layers. Although each layer only produces k 17 | #output feature-maps, it typically has many more inputs. It 18 | #has been noted in [37, 11] that a 1×1 convolution can be in- 19 | #troduced as bottleneck layer before each 3×3 convolution 20 | #to reduce the number of input feature-maps, and thus to 21 | #improve computational efficiency.""" 22 | class Bottleneck(nn.Module): 23 | def __init__(self, in_channels, growth_rate, inner_channels_list): 24 | super().__init__() 25 | # """In our experiments, we let each 1×1 convolution 26 | # produce 4k feature-maps.""" 27 | inner_channel = 4 * growth_rate 28 | 29 | # """We find this design especially effective for DenseNet and 30 | # we refer to our network with such a bottleneck layer, i.e., 31 | # to the BN-ReLU-Conv(1×1)-BN-ReLU-Conv(3×3) version of H ` , 32 | # as DenseNet-B.""" 33 | self.bottle_neck = nn.Sequential( 34 | nn.BatchNorm2d(in_channels), 35 | nn.ReLU(inplace=True), 36 | nn.Conv2d(in_channels, inner_channel, kernel_size=1, bias=False), 37 | nn.BatchNorm2d(inner_channel), 38 | nn.ReLU(inplace=True), 39 | nn.Conv2d(inner_channel, growth_rate, kernel_size=3, padding=1, bias=False) 40 | ) 41 | 42 | r = 16 43 | # out_channels = in_channels 44 | self.expansion = 1 45 | # self.squeeze1 = nn.AdaptiveAvgPool2d(1) 46 | # self.excitation1 = nn.Sequential( 47 | # nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias=False), 48 | # nn.ReLU(inplace=True), 49 | # nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias=False), 50 | # nn.Sigmoid() 51 | # ) 52 | out_channels = growth_rate 53 | self.squeeze2 = nn.AdaptiveAvgPool2d(1) 54 | self.excitation2 = nn.Sequential( 55 | nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias=False), 56 | nn.ReLU(inplace=True), 57 | nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias=False), 58 | nn.Sigmoid() 59 | ) 60 | 61 | def forward(self, x): 62 | # accuracy 74% with bias 63 | se_input = self.bottle_neck(x) 64 | squeeze = self.squeeze2(se_input) 65 | squeeze = squeeze.view(squeeze.size(0), 
-1) 66 | excitation = self.excitation2(squeeze) 67 | excitation = excitation.view(se_input.size(0), se_input.size(1), 1, 1) 68 | x2 = se_input * excitation.expand_as(se_input) 69 | x = torch.cat([x, x2], 1) 70 | return x 71 | 72 | #se_block before concat 73 | # class Bottleneck(nn.Module): 74 | # def __init__(self, in_channels, growth_rate, inner_channels_list): 75 | # super().__init__() 76 | # #"""In our experiments, we let each 1×1 convolution 77 | # #produce 4k feature-maps.""" 78 | # inner_channel = 4 * growth_rate 79 | 80 | # #"""We find this design especially effective for DenseNet and 81 | # #we refer to our network with such a bottleneck layer, i.e., 82 | # #to the BN-ReLU-Conv(1×1)-BN-ReLU-Conv(3×3) version of H ` , 83 | # #as DenseNet-B.""" 84 | # self.bottle_neck = nn.Sequential( 85 | # nn.BatchNorm2d(in_channels), 86 | # nn.ReLU(inplace=True), 87 | # nn.Conv2d(in_channels, inner_channel, kernel_size=1, bias=False), 88 | # nn.BatchNorm2d(inner_channel), 89 | # nn.ReLU(inplace=True), 90 | # nn.Conv2d(inner_channel, growth_rate, kernel_size=3, padding=1, bias=False) 91 | # ) 92 | # self.inner_channels_list = inner_channels_list.copy() 93 | # r = 16 94 | # self.expansion = 1 95 | # self.squeeze = [] 96 | # self.excitation = [] 97 | # for i in range(len(self.inner_channels_list)): 98 | # if i == 0: 99 | # out_channels = self.inner_channels_list[0] 100 | # else: 101 | # out_channels = growth_rate 102 | # self.squeeze = nn.AdaptiveAvgPool2d(1) 103 | # self.excitation.append(nn.Sequential( 104 | # nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias = False), 105 | # nn.ReLU(inplace=True), 106 | # nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias = False), 107 | # nn.Sigmoid() 108 | # )) 109 | # self.excitation = nn.ModuleList(self.excitation) 110 | 111 | # def forward(self, x): 112 | # x_scaled = [] 113 | # #sequential 114 | # for i in range(len(self.inner_channels_list)): 115 | # if i == 0: 116 | # se_input = x[:, :self.inner_channels_list[0], :, :] 117 | # else: 118 | # se_input = x[:, self.inner_channels_list[i-1]: self.inner_channels_list[i], :, :] 119 | # squeeze = self.squeeze(se_input) 120 | # squeeze = squeeze.view(squeeze.size(0), -1) 121 | # excitation = self.excitation[i](squeeze) 122 | # excitation = excitation.view(se_input.size(0), se_input.size(1), 1, 1) 123 | # x1 = se_input * excitation.expand_as(se_input) 124 | # x_scaled.append(x1) 125 | # x_scaled = torch.cat(x_scaled, 1) 126 | # return torch.cat([x_scaled, self.bottle_neck(x_scaled)], 1) 127 | 128 | #"""We refer to layers between blocks as transition 129 | #layers, which do convolution and pooling.""" 130 | class Transition(nn.Module): 131 | def __init__(self, in_channels, out_channels): 132 | super().__init__() 133 | #"""The transition layers used in our experiments 134 | #consist of a batch normalization layer and an 1×1 135 | #convolutional layer followed by a 2×2 average pooling 136 | #layer""". 
137 | self.down_sample = nn.Sequential( 138 | nn.BatchNorm2d(in_channels), 139 | nn.Conv2d(in_channels, out_channels, 1, bias=False), 140 | nn.AvgPool2d(2, stride=2) 141 | ) 142 | 143 | def forward(self, x): 144 | return self.down_sample(x) 145 | 146 | #DesneNet-BC 147 | #B stands for bottleneck layer(BN-RELU-CONV(1x1)-BN-RELU-CONV(3x3)) 148 | #C stands for compression factor(0<=theta<=1) 149 | class DenseNet(nn.Module): 150 | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_class=100): 151 | super().__init__() 152 | self.growth_rate = growth_rate 153 | 154 | #"""Before entering the first dense block, a convolution 155 | #with 16 (or twice the growth rate for DenseNet-BC) 156 | #output channels is performed on the input images.""" 157 | inner_channels = 2 * growth_rate 158 | 159 | #For convolutional layers with kernel size 3×3, each 160 | #side of the inputs is zero-padded by one pixel to keep 161 | #the feature-map size fixed. 162 | self.conv1 = nn.Conv2d(3, inner_channels, kernel_size=3, padding=1, bias=False) 163 | 164 | self.features = nn.Sequential() 165 | inner_channels_list = [inner_channels] 166 | for index in range(len(nblocks) - 1): 167 | self.features.add_module("dense_block_layer_{}".format(index), self._make_dense_layers(block, inner_channels, nblocks[index], inner_channels_list)) 168 | inner_channels += growth_rate * nblocks[index] 169 | 170 | #"""If a dense block contains m feature-maps, we let the 171 | #following transition layer generate θm output feature- 172 | #maps, where 0 < θ ≤ 1 is referred to as the compression 173 | #fac-tor. 174 | out_channels = int(reduction * inner_channels) # int() will automatic floor the value 175 | self.features.add_module("transition_layer_{}".format(index), Transition(inner_channels, out_channels)) 176 | inner_channels = out_channels 177 | inner_channels_list = [inner_channels] 178 | 179 | self.features.add_module("dense_block{}".format(len(nblocks) - 1), self._make_dense_layers(block, inner_channels, nblocks[len(nblocks)-1], inner_channels_list)) 180 | inner_channels += growth_rate * nblocks[len(nblocks) - 1] 181 | self.features.add_module('bn', nn.BatchNorm2d(inner_channels)) 182 | self.features.add_module('relu', nn.ReLU(inplace=True)) 183 | 184 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 185 | 186 | self.linear = nn.Linear(inner_channels, num_class) 187 | 188 | def forward(self, x): 189 | output = self.conv1(x) 190 | output = self.features(output) 191 | output = self.avgpool(output) 192 | output = output.view(output.size()[0], -1) 193 | output = self.linear(output) 194 | return output 195 | 196 | def _make_dense_layers(self, block, in_channels, nblocks, inner_channels_list): 197 | dense_block = nn.Sequential() 198 | for index in range(nblocks): 199 | dense_block.add_module('bottle_neck_layer_{}'.format(index), block(in_channels, self.growth_rate, inner_channels_list)) 200 | in_channels += self.growth_rate 201 | inner_channels_list.append(in_channels) 202 | return dense_block 203 | 204 | def densenet121(): 205 | return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32) 206 | 207 | def densenet169(): 208 | return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32) 209 | 210 | def densenet201(): 211 | return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32) 212 | 213 | def densenet161(): 214 | return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48) 215 | 216 | -------------------------------------------------------------------------------- /Code/benchmarking/nasnet.py: 
-------------------------------------------------------------------------------- 1 | """nasnet in pytorch 2 | 3 | 4 | 5 | [1] Barret Zoph, Vijay Vasudevan, Jonathon Shlens, Quoc V. Le 6 | 7 | Learning Transferable Architectures for Scalable Image Recognition 8 | https://arxiv.org/abs/1707.07012 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | class SeperableConv2d(nn.Module): 15 | 16 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 17 | 18 | super().__init__() 19 | self.depthwise = nn.Conv2d( 20 | input_channels, 21 | input_channels, 22 | kernel_size, 23 | groups=input_channels, 24 | **kwargs 25 | ) 26 | 27 | self.pointwise = nn.Conv2d( 28 | input_channels, 29 | output_channels, 30 | 1 31 | ) 32 | def forward(self, x): 33 | x = self.depthwise(x) 34 | x = self.pointwise(x) 35 | 36 | return x 37 | 38 | class SeperableBranch(nn.Module): 39 | 40 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 41 | """Adds 2 blocks of [relu-separable conv-batchnorm].""" 42 | super().__init__() 43 | self.block1 = nn.Sequential( 44 | nn.ReLU(), 45 | SeperableConv2d(input_channels, output_channels, kernel_size, **kwargs), 46 | nn.BatchNorm2d(output_channels) 47 | ) 48 | 49 | self.block2 = nn.Sequential( 50 | nn.ReLU(), 51 | SeperableConv2d(output_channels, output_channels, kernel_size, stride=1, padding=int(kernel_size / 2)), 52 | nn.BatchNorm2d(output_channels) 53 | ) 54 | 55 | def forward(self, x): 56 | x = self.block1(x) 57 | x = self.block2(x) 58 | 59 | return x 60 | 61 | class Fit(nn.Module): 62 | """Make the cell outputs compatible 63 | 64 | Args: 65 | prev_filters: filter number of tensor prev, needs to be modified 66 | filters: filter number of normal cell branch output filters 67 | """ 68 | 69 | def __init__(self, prev_filters, filters): 70 | super().__init__() 71 | self.relu = nn.ReLU() 72 | 73 | self.p1 = nn.Sequential( 74 | nn.AvgPool2d(1, stride=2), 75 | nn.Conv2d(prev_filters, int(filters / 2), 1) 76 | ) 77 | 78 | #make sure there is no information loss 79 | self.p2 = nn.Sequential( 80 | nn.ConstantPad2d((0, 1, 0, 1), 0), 81 | nn.ConstantPad2d((-1, 0, -1, 0), 0), #cropping 82 | nn.AvgPool2d(1, stride=2), 83 | nn.Conv2d(prev_filters, int(filters / 2), 1) 84 | ) 85 | 86 | self.bn = nn.BatchNorm2d(filters) 87 | 88 | self.dim_reduce = nn.Sequential( 89 | nn.ReLU(), 90 | nn.Conv2d(prev_filters, filters, 1), 91 | nn.BatchNorm2d(filters) 92 | ) 93 | 94 | self.filters = filters 95 | 96 | def forward(self, inputs): 97 | x, prev = inputs 98 | if prev is None: 99 | return x 100 | 101 | #image size does not match 102 | elif x.size(2) != prev.size(2): 103 | prev = self.relu(prev) 104 | p1 = self.p1(prev) 105 | p2 = self.p2(prev) 106 | prev = torch.cat([p1, p2], 1) 107 | prev = self.bn(prev) 108 | 109 | elif prev.size(1) != self.filters: 110 | prev = self.dim_reduce(prev) 111 | 112 | return prev 113 | 114 | 115 | class NormalCell(nn.Module): 116 | 117 | def __init__(self, x_in, prev_in, output_channels): 118 | super().__init__() 119 | 120 | self.dem_reduce = nn.Sequential( 121 | nn.ReLU(), 122 | nn.Conv2d(x_in, output_channels, 1, bias=False), 123 | nn.BatchNorm2d(output_channels) 124 | ) 125 | 126 | self.block1_left = SeperableBranch( 127 | output_channels, 128 | output_channels, 129 | kernel_size=3, 130 | padding=1, 131 | bias=False 132 | ) 133 | self.block1_right = nn.Sequential() 134 | 135 | self.block2_left = SeperableBranch( 136 | output_channels, 137 | output_channels, 138 | kernel_size=3, 139 | padding=1, 140 | bias=False 141 | ) 
142 | self.block2_right = SeperableBranch( 143 | output_channels, 144 | output_channels, 145 | kernel_size=5, 146 | padding=2, 147 | bias=False 148 | ) 149 | 150 | self.block3_left = nn.AvgPool2d(3, stride=1, padding=1) 151 | self.block3_right = nn.Sequential() 152 | 153 | self.block4_left = nn.AvgPool2d(3, stride=1, padding=1) 154 | self.block4_right = nn.AvgPool2d(3, stride=1, padding=1) 155 | 156 | self.block5_left = SeperableBranch( 157 | output_channels, 158 | output_channels, 159 | kernel_size=5, 160 | padding=2, 161 | bias=False 162 | ) 163 | self.block5_right = SeperableBranch( 164 | output_channels, 165 | output_channels, 166 | kernel_size=3, 167 | padding=1, 168 | bias=False 169 | ) 170 | 171 | self.fit = Fit(prev_in, output_channels) 172 | 173 | def forward(self, x): 174 | x, prev = x 175 | 176 | #return transformed x as new x, and original x as prev 177 | #only prev tensor needs to be modified 178 | prev = self.fit((x, prev)) 179 | 180 | h = self.dem_reduce(x) 181 | 182 | x1 = self.block1_left(h) + self.block1_right(h) 183 | x2 = self.block2_left(prev) + self.block2_right(h) 184 | x3 = self.block3_left(h) + self.block3_right(h) 185 | x4 = self.block4_left(prev) + self.block4_right(prev) 186 | x5 = self.block5_left(prev) + self.block5_right(prev) 187 | 188 | return torch.cat([prev, x1, x2, x3, x4, x5], 1), x 189 | 190 | class ReductionCell(nn.Module): 191 | 192 | def __init__(self, x_in, prev_in, output_channels): 193 | super().__init__() 194 | 195 | self.dim_reduce = nn.Sequential( 196 | nn.ReLU(), 197 | nn.Conv2d(x_in, output_channels, 1), 198 | nn.BatchNorm2d(output_channels) 199 | ) 200 | 201 | #block1 202 | self.layer1block1_left = SeperableBranch(output_channels, output_channels, 7, stride=2, padding=3) 203 | self.layer1block1_right = SeperableBranch(output_channels, output_channels, 5, stride=2, padding=2) 204 | 205 | #block2 206 | self.layer1block2_left = nn.MaxPool2d(3, stride=2, padding=1) 207 | self.layer1block2_right = SeperableBranch(output_channels, output_channels, 7, stride=2, padding=3) 208 | 209 | #block3 210 | self.layer1block3_left = nn.AvgPool2d(3, 2, 1) 211 | self.layer1block3_right = SeperableBranch(output_channels, output_channels, 5, stride=2, padding=2) 212 | 213 | #block5 214 | self.layer2block1_left = nn.MaxPool2d(3, 2, 1) 215 | self.layer2block1_right = SeperableBranch(output_channels, output_channels, 3, stride=1, padding=1) 216 | 217 | #block4 218 | self.layer2block2_left = nn.AvgPool2d(3, 1, 1) 219 | self.layer2block2_right = nn.Sequential() 220 | 221 | self.fit = Fit(prev_in, output_channels) 222 | 223 | def forward(self, x): 224 | x, prev = x 225 | prev = self.fit((x, prev)) 226 | 227 | h = self.dim_reduce(x) 228 | 229 | layer1block1 = self.layer1block1_left(prev) + self.layer1block1_right(h) 230 | layer1block2 = self.layer1block2_left(h) + self.layer1block2_right(prev) 231 | layer1block3 = self.layer1block3_left(h) + self.layer1block3_right(prev) 232 | layer2block1 = self.layer2block1_left(h) + self.layer2block1_right(layer1block1) 233 | layer2block2 = self.layer2block2_left(layer1block1) + self.layer2block2_right(layer1block2) 234 | 235 | return torch.cat([ 236 | layer1block2, #https://github.com/keras-team/keras-applications/blob/master/keras_applications/nasnet.py line 739 237 | layer1block3, 238 | layer2block1, 239 | layer2block2 240 | ], 1), x 241 | 242 | 243 | class NasNetA(nn.Module): 244 | 245 | def __init__(self, repeat_cell_num, reduction_num, filters, stemfilter, class_num=100): 246 | super().__init__() 247 | 248 | self.stem = 
nn.Sequential( 249 | nn.Conv2d(3, stemfilter, 3, padding=1, bias=False), 250 | nn.BatchNorm2d(stemfilter) 251 | ) 252 | 253 | self.prev_filters = stemfilter 254 | self.x_filters = stemfilter 255 | self.filters = filters 256 | 257 | self.cell_layers = self._make_layers(repeat_cell_num, reduction_num) 258 | 259 | self.relu = nn.ReLU() 260 | self.avg = nn.AdaptiveAvgPool2d(1) 261 | self.fc = nn.Linear(self.filters * 6, class_num) 262 | 263 | 264 | def _make_normal(self, block, repeat, output): 265 | """make normal cell 266 | Args: 267 | block: cell type 268 | repeat: number of repeated normal cell 269 | output: output filters for each branch in normal cell 270 | Returns: 271 | stacked normal cells 272 | """ 273 | 274 | layers = [] 275 | for r in range(repeat): 276 | layers.append(block(self.x_filters, self.prev_filters, output)) 277 | self.prev_filters = self.x_filters 278 | self.x_filters = output * 6 #concatenate 6 branches 279 | 280 | return layers 281 | 282 | def _make_reduction(self, block, output): 283 | """make normal cell 284 | Args: 285 | block: cell type 286 | output: output filters for each branch in reduction cell 287 | Returns: 288 | reduction cell 289 | """ 290 | 291 | reduction = block(self.x_filters, self.prev_filters, output) 292 | self.prev_filters = self.x_filters 293 | self.x_filters = output * 4 #stack for 4 branches 294 | 295 | return reduction 296 | 297 | def _make_layers(self, repeat_cell_num, reduction_num): 298 | 299 | layers = [] 300 | for i in range(reduction_num): 301 | 302 | layers.extend(self._make_normal(NormalCell, repeat_cell_num, self.filters)) 303 | self.filters *= 2 304 | layers.append(self._make_reduction(ReductionCell, self.filters)) 305 | 306 | layers.extend(self._make_normal(NormalCell, repeat_cell_num, self.filters)) 307 | 308 | return nn.Sequential(*layers) 309 | 310 | 311 | def forward(self, x): 312 | 313 | x = self.stem(x) 314 | prev = None 315 | x, prev = self.cell_layers((x, prev)) 316 | x = self.relu(x) 317 | x = self.avg(x) 318 | x = x.view(x.size(0), -1) 319 | x = self.fc(x) 320 | 321 | return x 322 | 323 | 324 | def nasnet(): 325 | 326 | #stem filters must be 44, it's a pytorch workaround, cant change to other number 327 | return NasNetA(4, 2, 44, 44) 328 | 329 | -------------------------------------------------------------------------------- /Code/benchmarking/inceptionv3.py: -------------------------------------------------------------------------------- 1 | """ inceptionv3 in pytorch 2 | 3 | 4 | [1] Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, Zbigniew Wojna 5 | 6 | Rethinking the Inception Architecture for Computer Vision 7 | https://arxiv.org/abs/1512.00567v3 8 | """ 9 | 10 | import torch 11 | import torch.nn as nn 12 | 13 | 14 | class BasicConv2d(nn.Module): 15 | 16 | def __init__(self, input_channels, output_channels, **kwargs): 17 | super().__init__() 18 | self.conv = nn.Conv2d(input_channels, output_channels, bias=False, **kwargs) 19 | self.bn = nn.BatchNorm2d(output_channels) 20 | self.relu = nn.ReLU(inplace=True) 21 | 22 | def forward(self, x): 23 | x = self.conv(x) 24 | x = self.bn(x) 25 | x = self.relu(x) 26 | 27 | return x 28 | 29 | #same naive inception module 30 | class InceptionA(nn.Module): 31 | 32 | def __init__(self, input_channels, pool_features): 33 | super().__init__() 34 | self.branch1x1 = BasicConv2d(input_channels, 64, kernel_size=1) 35 | 36 | self.branch5x5 = nn.Sequential( 37 | BasicConv2d(input_channels, 48, kernel_size=1), 38 | BasicConv2d(48, 64, kernel_size=5, padding=2) 39 | ) 40 | 41 | 
self.branch3x3 = nn.Sequential( 42 | BasicConv2d(input_channels, 64, kernel_size=1), 43 | BasicConv2d(64, 96, kernel_size=3, padding=1), 44 | BasicConv2d(96, 96, kernel_size=3, padding=1) 45 | ) 46 | 47 | self.branchpool = nn.Sequential( 48 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 49 | BasicConv2d(input_channels, pool_features, kernel_size=3, padding=1) 50 | ) 51 | 52 | def forward(self, x): 53 | 54 | #x -> 1x1(same) 55 | branch1x1 = self.branch1x1(x) 56 | 57 | #x -> 1x1 -> 5x5(same) 58 | branch5x5 = self.branch5x5(x) 59 | #branch5x5 = self.branch5x5_2(branch5x5) 60 | 61 | #x -> 1x1 -> 3x3 -> 3x3(same) 62 | branch3x3 = self.branch3x3(x) 63 | 64 | #x -> pool -> 1x1(same) 65 | branchpool = self.branchpool(x) 66 | 67 | outputs = [branch1x1, branch5x5, branch3x3, branchpool] 68 | 69 | return torch.cat(outputs, 1) 70 | 71 | #downsample 72 | #Factorization into smaller convolutions 73 | class InceptionB(nn.Module): 74 | 75 | def __init__(self, input_channels): 76 | super().__init__() 77 | 78 | self.branch3x3 = BasicConv2d(input_channels, 384, kernel_size=3, stride=2) 79 | 80 | self.branch3x3stack = nn.Sequential( 81 | BasicConv2d(input_channels, 64, kernel_size=1), 82 | BasicConv2d(64, 96, kernel_size=3, padding=1), 83 | BasicConv2d(96, 96, kernel_size=3, stride=2) 84 | ) 85 | 86 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2) 87 | 88 | def forward(self, x): 89 | 90 | #x - > 3x3(downsample) 91 | branch3x3 = self.branch3x3(x) 92 | 93 | #x -> 3x3 -> 3x3(downsample) 94 | branch3x3stack = self.branch3x3stack(x) 95 | 96 | #x -> avgpool(downsample) 97 | branchpool = self.branchpool(x) 98 | 99 | #"""We can use two parallel stride 2 blocks: P and C. P is a pooling 100 | #layer (either average or maximum pooling) the activation, both of 101 | #them are stride 2 the filter banks of which are concatenated as in 102 | #figure 10.""" 103 | outputs = [branch3x3, branch3x3stack, branchpool] 104 | 105 | return torch.cat(outputs, 1) 106 | 107 | #Factorizing Convolutions with Large Filter Size 108 | class InceptionC(nn.Module): 109 | def __init__(self, input_channels, channels_7x7): 110 | super().__init__() 111 | self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1) 112 | 113 | c7 = channels_7x7 114 | 115 | #In theory, we could go even further and argue that one can replace any n × n 116 | #convolution by a 1 × n convolution followed by a n × 1 convolution and the 117 | #computational cost saving increases dramatically as n grows (see figure 6). 
118 | self.branch7x7 = nn.Sequential( 119 | BasicConv2d(input_channels, c7, kernel_size=1), 120 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 121 | BasicConv2d(c7, 192, kernel_size=(1, 7), padding=(0, 3)) 122 | ) 123 | 124 | self.branch7x7stack = nn.Sequential( 125 | BasicConv2d(input_channels, c7, kernel_size=1), 126 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 127 | BasicConv2d(c7, c7, kernel_size=(1, 7), padding=(0, 3)), 128 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 129 | BasicConv2d(c7, 192, kernel_size=(1, 7), padding=(0, 3)) 130 | ) 131 | 132 | self.branch_pool = nn.Sequential( 133 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 134 | BasicConv2d(input_channels, 192, kernel_size=1), 135 | ) 136 | 137 | def forward(self, x): 138 | 139 | #x -> 1x1(same) 140 | branch1x1 = self.branch1x1(x) 141 | 142 | #x -> 1layer 1*7 and 7*1 (same) 143 | branch7x7 = self.branch7x7(x) 144 | 145 | #x-> 2layer 1*7 and 7*1(same) 146 | branch7x7stack = self.branch7x7stack(x) 147 | 148 | #x-> avgpool (same) 149 | branchpool = self.branch_pool(x) 150 | 151 | outputs = [branch1x1, branch7x7, branch7x7stack, branchpool] 152 | 153 | return torch.cat(outputs, 1) 154 | 155 | class InceptionD(nn.Module): 156 | 157 | def __init__(self, input_channels): 158 | super().__init__() 159 | 160 | self.branch3x3 = nn.Sequential( 161 | BasicConv2d(input_channels, 192, kernel_size=1), 162 | BasicConv2d(192, 320, kernel_size=3, stride=2) 163 | ) 164 | 165 | self.branch7x7 = nn.Sequential( 166 | BasicConv2d(input_channels, 192, kernel_size=1), 167 | BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3)), 168 | BasicConv2d(192, 192, kernel_size=(7, 1), padding=(3, 0)), 169 | BasicConv2d(192, 192, kernel_size=3, stride=2) 170 | ) 171 | 172 | self.branchpool = nn.AvgPool2d(kernel_size=3, stride=2) 173 | 174 | def forward(self, x): 175 | 176 | #x -> 1x1 -> 3x3(downsample) 177 | branch3x3 = self.branch3x3(x) 178 | 179 | #x -> 1x1 -> 1x7 -> 7x1 -> 3x3 (downsample) 180 | branch7x7 = self.branch7x7(x) 181 | 182 | #x -> avgpool (downsample) 183 | branchpool = self.branchpool(x) 184 | 185 | outputs = [branch3x3, branch7x7, branchpool] 186 | 187 | return torch.cat(outputs, 1) 188 | 189 | 190 | #same 191 | class InceptionE(nn.Module): 192 | def __init__(self, input_channels): 193 | super().__init__() 194 | self.branch1x1 = BasicConv2d(input_channels, 320, kernel_size=1) 195 | 196 | self.branch3x3_1 = BasicConv2d(input_channels, 384, kernel_size=1) 197 | self.branch3x3_2a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1)) 198 | self.branch3x3_2b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0)) 199 | 200 | self.branch3x3stack_1 = BasicConv2d(input_channels, 448, kernel_size=1) 201 | self.branch3x3stack_2 = BasicConv2d(448, 384, kernel_size=3, padding=1) 202 | self.branch3x3stack_3a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1)) 203 | self.branch3x3stack_3b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0)) 204 | 205 | self.branch_pool = nn.Sequential( 206 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 207 | BasicConv2d(input_channels, 192, kernel_size=1) 208 | ) 209 | 210 | def forward(self, x): 211 | 212 | #x -> 1x1 (same) 213 | branch1x1 = self.branch1x1(x) 214 | 215 | # x -> 1x1 -> 3x1 216 | # x -> 1x1 -> 1x3 217 | # concatenate(3x1, 1x3) 218 | #"""7. Inception modules with expanded the filter bank outputs. 
219 | #This architecture is used on the coarsest (8 × 8) grids to promote 220 | #high dimensional representations, as suggested by principle 221 | #2 of Section 2.""" 222 | branch3x3 = self.branch3x3_1(x) 223 | branch3x3 = [ 224 | self.branch3x3_2a(branch3x3), 225 | self.branch3x3_2b(branch3x3) 226 | ] 227 | branch3x3 = torch.cat(branch3x3, 1) 228 | 229 | # x -> 1x1 -> 3x3 -> 1x3 230 | # x -> 1x1 -> 3x3 -> 3x1 231 | #concatenate(1x3, 3x1) 232 | branch3x3stack = self.branch3x3stack_1(x) 233 | branch3x3stack = self.branch3x3stack_2(branch3x3stack) 234 | branch3x3stack = [ 235 | self.branch3x3stack_3a(branch3x3stack), 236 | self.branch3x3stack_3b(branch3x3stack) 237 | ] 238 | branch3x3stack = torch.cat(branch3x3stack, 1) 239 | 240 | branchpool = self.branch_pool(x) 241 | 242 | outputs = [branch1x1, branch3x3, branch3x3stack, branchpool] 243 | 244 | return torch.cat(outputs, 1) 245 | 246 | class InceptionV3(nn.Module): 247 | 248 | def __init__(self, num_classes=100): 249 | super().__init__() 250 | self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, padding=1) 251 | self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3, padding=1) 252 | self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1) 253 | self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1) 254 | self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3) 255 | 256 | #naive inception module 257 | self.Mixed_5b = InceptionA(192, pool_features=32) 258 | self.Mixed_5c = InceptionA(256, pool_features=64) 259 | self.Mixed_5d = InceptionA(288, pool_features=64) 260 | 261 | #downsample 262 | self.Mixed_6a = InceptionB(288) 263 | 264 | self.Mixed_6b = InceptionC(768, channels_7x7=128) 265 | self.Mixed_6c = InceptionC(768, channels_7x7=160) 266 | self.Mixed_6d = InceptionC(768, channels_7x7=160) 267 | self.Mixed_6e = InceptionC(768, channels_7x7=192) 268 | 269 | #downsample 270 | self.Mixed_7a = InceptionD(768) 271 | 272 | self.Mixed_7b = InceptionE(1280) 273 | self.Mixed_7c = InceptionE(2048) 274 | 275 | #6*6 feature size 276 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 277 | self.dropout = nn.Dropout2d() 278 | self.linear = nn.Linear(2048, num_classes) 279 | 280 | def forward(self, x): 281 | 282 | #32 -> 30 283 | x = self.Conv2d_1a_3x3(x) 284 | x = self.Conv2d_2a_3x3(x) 285 | x = self.Conv2d_2b_3x3(x) 286 | x = self.Conv2d_3b_1x1(x) 287 | x = self.Conv2d_4a_3x3(x) 288 | 289 | #30 -> 30 290 | x = self.Mixed_5b(x) 291 | x = self.Mixed_5c(x) 292 | x = self.Mixed_5d(x) 293 | 294 | #30 -> 14 295 | #Efficient Grid Size Reduction to avoid representation 296 | #bottleneck 297 | x = self.Mixed_6a(x) 298 | 299 | #14 -> 14 300 | #"""In practice, we have found that employing this factorization does not 301 | #work well on early layers, but it gives very good results on medium 302 | #grid-sizes (On m × m feature maps, where m ranges between 12 and 20). 
303 | #On that level, very good results can be achieved by using 1 × 7 convolutions 304 | #followed by 7 × 1 convolutions.""" 305 | x = self.Mixed_6b(x) 306 | x = self.Mixed_6c(x) 307 | x = self.Mixed_6d(x) 308 | x = self.Mixed_6e(x) 309 | 310 | #14 -> 6 311 | #Efficient Grid Size Reduction 312 | x = self.Mixed_7a(x) 313 | 314 | #6 -> 6 315 | #We are using this solution only on the coarsest grid, 316 | #since that is the place where producing high dimensional 317 | #sparse representation is the most critical as the ratio of 318 | #local processing (by 1 × 1 convolutions) is increased compared 319 | #to the spatial aggregation.""" 320 | x = self.Mixed_7b(x) 321 | x = self.Mixed_7c(x) 322 | 323 | #6 -> 1 324 | x = self.avgpool(x) 325 | x = self.dropout(x) 326 | x = x.view(x.size(0), -1) 327 | x = self.linear(x) 328 | return x 329 | 330 | 331 | def inceptionv3(): 332 | return InceptionV3() 333 | 334 | 335 | 336 | -------------------------------------------------------------------------------- /Code/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torchvision 11 | import torchvision.transforms as transforms 12 | 13 | from torch.utils.data import DataLoader 14 | from dataset import get_dataloaders 15 | #from dataset import * 16 | from torch.autograd import Variable 17 | 18 | from tensorboardX import SummaryWriter 19 | 20 | from conf import settings 21 | from utils import get_network, get_training_dataloader, get_test_dataloader, WarmUpLR 22 | from nni.compression.torch import AGP_Pruner, Pruner 23 | from nni.compression.torch.pruning.weight_masker import WeightMasker 24 | from nni.compression.torch.pruning.structured_pruning import ActivationFilterPrunerMasker, StructuredWeightMasker 25 | 26 | import matplotlib.pyplot as plt 27 | import timeit 28 | from tqdm import tqdm 29 | 30 | 31 | def draw_weights(weights, index): 32 | print(weights.shape) 33 | weights = weights.cpu().numpy() 34 | for i in range(len(weights)): 35 | fig = plt.figure() 36 | plt.bar(np.arange(len(weights[i])), weights[i]) 37 | #plt.show() 38 | fig.savefig("scaling/scaling" + str(index)) 39 | print("min = ", min(weights[i])) 40 | print("# of zeros = ", np.count_nonzero(weights[i] == 0)) 41 | 42 | class MyMasker(StructuredWeightMasker): 43 | def calc_mask(self, sparsity, wrapper, wrapper_idx=None): 44 | weight = wrapper.module.weight.data 45 | bias = None 46 | if hasattr(wrapper.module, 'bias') and wrapper.module.bias is not None: 47 | bias = wrapper.module.bias.data 48 | 49 | if wrapper.weight_mask is None: 50 | mask_weight = torch.ones(weight.size()).type_as(weight).detach() 51 | else: 52 | mask_weight = wrapper.weight_mask.clone() 53 | if bias is not None: 54 | if wrapper.bias_mask is None: 55 | mask_bias = torch.ones(bias.size()).type_as(bias).detach() 56 | else: 57 | mask_bias = wrapper.bias_mask.clone() 58 | else: 59 | mask_bias = None 60 | mask = {'weight_mask': mask_weight, 'bias_mask': mask_bias} 61 | 62 | filters = weight.size(0) 63 | num_prune = int(filters * sparsity) 64 | if filters < 2 or num_prune < 1: 65 | return mask 66 | # weight*mask_weight: apply base mask for iterative pruning 67 | return self.get_mask(mask, weight*mask_weight, num_prune, wrapper, wrapper_idx) 68 | 69 | def get_mask(self, base_mask, weight, num_prune, wrapper, wrapper_idx): 70 | 
print(activation[list(activation.keys())[wrapper_idx]]) 71 | print(activation[list(activation.keys())[wrapper_idx]].shape) 72 | mask = torch.mean(activation[list(activation.keys())[wrapper_idx]], dim = 0, keepdims = True) 73 | draw_weights(mask, wrapper_idx) 74 | threshold = torch.topk(mask[0], k = num_prune, dim = 0, largest=False)[0].max() 75 | mask_weight = torch.gt(mask[0], threshold)[:, None, None, None].expand_as(weight).type_as(weight) 76 | mask_bias = torch.gt(mask[0], threshold).type_as(weight).detach() if base_mask['bias_mask'] is not None else None 77 | return {'weight_mask': mask_weight.detach(), 'bias_mask': mask_bias} 78 | 79 | 80 | 81 | class MyPruner(Pruner): 82 | def __init__(self, model, config_list, optimizer): 83 | super().__init__(model, config_list, optimizer) 84 | self.set_wrappers_attribute("if_calculated", False) 85 | # construct a weight masker instance 86 | self.masker = MyMasker(model, self) 87 | 88 | def calc_mask(self, wrapper, wrapper_idx=None): 89 | sparsity = wrapper.config['sparsity'] 90 | if wrapper.if_calculated: 91 | # Already pruned, do not prune again as a one-shot pruner 92 | return None 93 | else: 94 | # call your masker to actually calcuate the mask for this layer 95 | masks = self.masker.calc_mask(sparsity=sparsity, wrapper=wrapper, wrapper_idx=wrapper_idx) 96 | wrapper.if_calculated = True 97 | return masks 98 | 99 | def train(epoch): 100 | 101 | net.train() 102 | for batch_index, (images, labels) in enumerate(training_loader): 103 | if epoch <= args.warm: 104 | warmup_scheduler.step() 105 | 106 | images = Variable(images) 107 | labels = Variable(labels) 108 | 109 | labels = labels.cuda() 110 | images = images.cuda() 111 | 112 | optimizer.zero_grad() 113 | outputs = net(images) 114 | loss = loss_function(outputs, labels) 115 | loss.backward() 116 | optimizer.step() 117 | 118 | n_iter = (epoch - 1) * len(training_loader) + batch_index + 1 119 | 120 | last_layer = list(net.children())[-1] 121 | for name, para in last_layer.named_parameters(): 122 | if 'weight' in name: 123 | writer.add_scalar('LastLayerGradients/grad_norm2_weights', para.grad.norm(), n_iter) 124 | if 'bias' in name: 125 | writer.add_scalar('LastLayerGradients/grad_norm2_bias', para.grad.norm(), n_iter) 126 | 127 | print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format( 128 | loss.item(), 129 | optimizer.param_groups[0]['lr'], 130 | epoch=epoch, 131 | trained_samples=batch_index * args.b + len(images), 132 | total_samples=len(training_loader.dataset) 133 | )) 134 | 135 | #update training loss for each iteration 136 | writer.add_scalar('Train/loss', loss.item(), n_iter) 137 | 138 | for name, param in net.named_parameters(): 139 | layer, attr = os.path.splitext(name) 140 | attr = attr[1:] 141 | writer.add_histogram("{}/{}".format(layer, attr), param, epoch) 142 | 143 | 144 | def Average(lst): 145 | return sum(lst) / len(lst) 146 | 147 | 148 | def eval_training(epoch): 149 | net.eval() 150 | 151 | test_loss = 0.0 # cost function error 152 | correct = 0.0 153 | inference_time=[] 154 | 155 | # calculate FLOPS: 156 | from thop import profile 157 | 158 | macs, params = profile(net, inputs=(torch.randn(1, 3, settings.IMG_SIZE, settings.IMG_SIZE).cuda(), )) 159 | print("macs = ", macs) 160 | print("params = ", params) 161 | 162 | for (images, labels) in tqdm(test_loader): 163 | with torch.no_grad(): 164 | start = timeit.default_timer() 165 | images = Variable(images) 166 | labels = Variable(labels) 167 | 168 | images = images.cuda() 169 | 
labels = labels.cuda() 170 | 171 | outputs = net(images) 172 | stop = timeit.default_timer() 173 | inference_time.append(stop-start) 174 | loss = loss_function(outputs, labels) 175 | test_loss += loss.item() 176 | _, preds = outputs.max(1) 177 | correct += preds.eq(labels).sum() 178 | 179 | print("FPS = ", 1/Average(inference_time)) 180 | 181 | 182 | print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format( 183 | test_loss / len(test_loader.dataset), 184 | correct.float() / len(test_loader.dataset) 185 | )) 186 | print() 187 | 188 | #add informations to tensorboard 189 | writer.add_scalar('Test/Average loss', test_loss / len(test_loader.dataset), epoch) 190 | writer.add_scalar('Test/Accuracy', correct.float() / len(test_loader.dataset), epoch) 191 | 192 | return correct.float() / len(test_loader.dataset) 193 | 194 | num_classes = {'dogs': 120, 'tiny-imagenet': 200, 'cifar100': 100, 'cifar10': 10, 'caltech': 257, 'imagenet': 1000} 195 | if __name__ == '__main__': 196 | # #config for pruner 197 | # config_list = [{ 198 | # 'initial_sparsity': 0.0, 199 | # 'final_sparsity': 0.8, 200 | # 'start_epoch': 0, 201 | # 'end_epoch': 200, 202 | # 'frequency': 1, 203 | # 'op_types': ['Conv2d'] 204 | # }] 205 | parser = argparse.ArgumentParser() 206 | parser.add_argument('-net', type=str, required=True, help='net type') 207 | parser.add_argument('-gpu', type=bool, default=True, help='use gpu or not') 208 | parser.add_argument('-w', type=int, default=8, help='number of workers for dataloader') 209 | parser.add_argument('-b', type=int, default=32, help='batch size for dataloader') 210 | parser.add_argument('-s', type=bool, default=True, help='whether shuffle the dataset') 211 | parser.add_argument('-warm', type=int, default=1, help='warm up training phase') 212 | parser.add_argument('-lr', type=float, default=0.1, help='initial learning rate') 213 | parser.add_argument('-weights', type=str, default='', help='the weights file you want to load') 214 | parser.add_argument('-data', type=str, default='dogs', help='the weights file you want to load') 215 | args = parser.parse_args() 216 | 217 | net = get_network(args, use_gpu=args.gpu, num_classes = num_classes[args.data]) 218 | # print number of paramters 219 | pytorch_total_params = sum(p.numel() for p in net.parameters()) 220 | print("number of network paramters are ", pytorch_total_params) 221 | pytorch_total_params = sum(p.numel() for p in net.parameters() if p.requires_grad) 222 | print("number of network Trainable paramters are ", pytorch_total_params) 223 | 224 | 225 | if args.weights != '': 226 | net.load_state_dict(torch.load(args.weights), args.gpu) 227 | print('loaded checkpoint') 228 | 229 | dataloaders = get_dataloaders(args.b, args.data) 230 | #data preprocessing:aset) 231 | #data preprocessing: 232 | training_loader = dataloaders['train'] 233 | 234 | test_loader = dataloaders['val'] 235 | 236 | loss_function = nn.CrossEntropyLoss() 237 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) 238 | #""" 239 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.2, patience=9, 240 | verbose=True, threshold=0.001, threshold_mode='rel', 241 | cooldown=0, min_lr=1e-6, eps=1e-08) 242 | #""" 243 | #train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2) #learning rate decay 244 | iter_per_epoch = len(training_loader) 245 | warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm) 246 | checkpoint_path = 
os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW) 247 | # #pruner 248 | # pruner = AGP_Pruner(net, config_list, optimizer, pruning_algorithm='l1') 249 | # pruner.compress() 250 | # activation = {} 251 | # conv_layers = [] 252 | # def get_activation(name): 253 | # def hook(model, input, output): 254 | # activation[name] = output.detach() 255 | # return hook 256 | # for name, module in net.named_modules(): 257 | # # print(name) 258 | # if len(name) > 1 and name[-1] == '3' and 'excitation' in name: 259 | # module.register_forward_hook(get_activation(name)) 260 | # elif len(name) > 1 and (name[-13 : ] == "bottle_neck.5" or name[-10:] == 'residual.6'): 261 | # conv_layers.append(name) 262 | 263 | # print(conv_layers) 264 | # # print(net) 265 | # config_list = [{ 266 | # 'sparsity': 0.5, 267 | # 'op_types': ['Conv2d'], 268 | # 'op_names': conv_layers}] 269 | 270 | # print(net) 271 | 272 | # use tensorboard 273 | if not os.path.exists(settings.LOG_DIR): 274 | os.mkdir(settings.LOG_DIR) 275 | writer = SummaryWriter(log_dir=os.path.join( 276 | settings.LOG_DIR, args.net, settings.TIME_NOW)) 277 | input_tensor = torch.Tensor(1, 3, settings.IMG_SIZE, settings.IMG_SIZE).cuda() 278 | writer.add_graph(net, Variable(input_tensor, requires_grad=True)) 279 | # pruner = MyPruner(net, config_list, optimizer) 280 | 281 | #acc = eval_training(0) 282 | # pruner.compress() 283 | # pruner.export_model(model_path='test.pth', mask_path='test.pth') 284 | # exit() 285 | #create checkpoint folder to save model 286 | if not os.path.exists(checkpoint_path): 287 | os.makedirs(checkpoint_path) 288 | checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth') 289 | 290 | best_acc = 0.0 291 | for epoch in range(1, settings.EPOCH): 292 | #update pruner 293 | # pruner.update_epoch(epoch) 294 | #if epoch > args.warm: 295 | # train_scheduler.step(epoch) 296 | 297 | 298 | train(epoch) 299 | acc = eval_training(epoch) 300 | 301 | if epoch > args.warm: 302 | scheduler.step(acc) 303 | 304 | #start to save best performance model after learning rate decay to 0.01 305 | if best_acc < acc: 306 | torch.save(net.state_dict(), checkpoint_path.format(net=args.net, epoch=epoch, type='best')) 307 | best_acc = acc 308 | continue 309 | 310 | if not epoch % settings.SAVE_EPOCH: 311 | torch.save(net.state_dict(), checkpoint_path.format(net=args.net, epoch=epoch, type='regular')) 312 | # pruner.export_model(model_path='model_l1_freq1.pth', mask_path='mask_l1_freq1.pth') 313 | writer.close() 314 | -------------------------------------------------------------------------------- /Code/benchmarking/attention.py: -------------------------------------------------------------------------------- 1 | """residual attention network in pytorch 2 | 3 | 4 | 5 | [1] Fei Wang, Mengqing Jiang, Chen Qian, Shuo Yang, Cheng Li, Honggang Zhang, Xiaogang Wang, Xiaoou Tang 6 | 7 | Residual Attention Network for Image Classification 8 | https://arxiv.org/abs/1704.06904 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | #"""The Attention Module is built by pre-activation Residual Unit [11] with the 16 | #number of channels in each stage is the same as ResNet [10].""" 17 | 18 | class PreActResidualUnit(nn.Module): 19 | """PreAct Residual Unit 20 | Args: 21 | in_channels: residual unit input channel number 22 | out_channels: residual unit output channel numebr 23 | stride: stride of residual unit when stride = 2, downsample the featuremap 24 | """ 25 | 26 | def __init__(self, 
in_channels, out_channels, stride): 27 | super().__init__() 28 | 29 | bottleneck_channels = int(out_channels / 4) 30 | self.residual_function = nn.Sequential( 31 | #1x1 conv 32 | nn.BatchNorm2d(in_channels), 33 | nn.ReLU(inplace=True), 34 | nn.Conv2d(in_channels, bottleneck_channels, 1, stride), 35 | 36 | #3x3 conv 37 | nn.BatchNorm2d(bottleneck_channels), 38 | nn.ReLU(inplace=True), 39 | nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1), 40 | 41 | #1x1 conv 42 | nn.BatchNorm2d(bottleneck_channels), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(bottleneck_channels, out_channels, 1) 45 | ) 46 | 47 | self.shortcut = nn.Sequential() 48 | if stride != 2 or (in_channels != out_channels): 49 | self.shortcut = nn.Conv2d(in_channels, out_channels, 1, stride=stride) 50 | 51 | def forward(self, x): 52 | 53 | res = self.residual_function(x) 54 | shortcut = self.shortcut(x) 55 | 56 | return res + shortcut 57 | 58 | class AttentionModule1(nn.Module): 59 | 60 | def __init__(self, in_channels, out_channels, p=1, t=2, r=1): 61 | super().__init__() 62 | #"""The hyperparameter p denotes the number of preprocessing Residual 63 | #Units before splitting into trunk branch and mask branch. t denotes 64 | #the number of Residual Units in trunk branch. r denotes the number of 65 | #Residual Units between adjacent pooling layer in the mask branch.""" 66 | assert in_channels == out_channels 67 | 68 | self.pre = self._make_residual(in_channels, out_channels, p) 69 | self.trunk = self._make_residual(in_channels, out_channels, t) 70 | self.soft_resdown1 = self._make_residual(in_channels, out_channels, r) 71 | self.soft_resdown2 = self._make_residual(in_channels, out_channels, r) 72 | self.soft_resdown3 = self._make_residual(in_channels, out_channels, r) 73 | self.soft_resdown4 = self._make_residual(in_channels, out_channels, r) 74 | 75 | self.soft_resup1 = self._make_residual(in_channels, out_channels, r) 76 | self.soft_resup2 = self._make_residual(in_channels, out_channels, r) 77 | self.soft_resup3 = self._make_residual(in_channels, out_channels, r) 78 | self.soft_resup4 = self._make_residual(in_channels, out_channels, r) 79 | 80 | self.shortcut_short = PreActResidualUnit(in_channels, out_channels, 1) 81 | self.shortcut_long = PreActResidualUnit(in_channels, out_channels, 1) 82 | 83 | self.sigmoid = nn.Sequential( 84 | nn.BatchNorm2d(out_channels), 85 | nn.ReLU(inplace=True), 86 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 87 | nn.BatchNorm2d(out_channels), 88 | nn.ReLU(inplace=True), 89 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 90 | nn.Sigmoid() 91 | ) 92 | 93 | self.last = self._make_residual(in_channels, out_channels, p) 94 | 95 | def forward(self, x): 96 | ###We make the size of the smallest output map in each mask branch 7*7 to be consistent 97 | #with the smallest trunk output map size. 98 | ###Thus 3,2,1 max-pooling layers are used in mask branch with input size 56 * 56, 28 * 28, 14 * 14 respectively. 
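# Forward pass: the trunk branch (x_t) and the soft mask branch (x_s) are computed in parallel.
# The mask branch applies three max-pool + residual-unit stages (keeping two intermediate
# shortcuts), then mirrors them with soft_resup units, F.interpolate upsampling and skip
# additions back to the input resolution, and ends in the 1x1-conv/sigmoid head.
# The two branches are fused with attention residual learning: out = (1 + mask) * trunk.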
99 | x = self.pre(x) 100 | input_size = (x.size(2), x.size(3)) 101 | 102 | x_t = self.trunk(x) 103 | 104 | #first downsample out 28 105 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 106 | x_s = self.soft_resdown1(x_s) 107 | 108 | #28 shortcut 109 | shape1 = (x_s.size(2), x_s.size(3)) 110 | shortcut_long = self.shortcut_long(x_s) 111 | 112 | #seccond downsample out 14 113 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 114 | x_s = self.soft_resdown2(x_s) 115 | 116 | #14 shortcut 117 | shape2 = (x_s.size(2), x_s.size(3)) 118 | shortcut_short = self.soft_resdown3(x_s) 119 | 120 | #third downsample out 7 121 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 122 | x_s = self.soft_resdown3(x_s) 123 | 124 | #mid 125 | x_s = self.soft_resdown4(x_s) 126 | x_s = self.soft_resup1(x_s) 127 | 128 | #first upsample out 14 129 | x_s = self.soft_resup2(x_s) 130 | x_s = F.interpolate(x_s, size=shape2) 131 | x_s += shortcut_short 132 | 133 | #second upsample out 28 134 | x_s = self.soft_resup3(x_s) 135 | x_s = F.interpolate(x_s, size=shape1) 136 | x_s += shortcut_long 137 | 138 | #thrid upsample out 54 139 | x_s = self.soft_resup4(x_s) 140 | x_s = F.interpolate(x_s, size=input_size) 141 | 142 | x_s = self.sigmoid(x_s) 143 | x = (1 + x_s) * x_t 144 | x = self.last(x) 145 | 146 | return x 147 | 148 | def _make_residual(self, in_channels, out_channels, p): 149 | 150 | layers = [] 151 | for _ in range(p): 152 | layers.append(PreActResidualUnit(in_channels, out_channels, 1)) 153 | 154 | return nn.Sequential(*layers) 155 | 156 | class AttentionModule2(nn.Module): 157 | 158 | def __init__(self, in_channels, out_channels, p=1, t=2, r=1): 159 | super().__init__() 160 | #"""The hyperparameter p denotes the number of preprocessing Residual 161 | #Units before splitting into trunk branch and mask branch. t denotes 162 | #the number of Residual Units in trunk branch. 
r denotes the number of 163 | #Residual Units between adjacent pooling layer in the mask branch.""" 164 | assert in_channels == out_channels 165 | 166 | self.pre = self._make_residual(in_channels, out_channels, p) 167 | self.trunk = self._make_residual(in_channels, out_channels, t) 168 | self.soft_resdown1 = self._make_residual(in_channels, out_channels, r) 169 | self.soft_resdown2 = self._make_residual(in_channels, out_channels, r) 170 | self.soft_resdown3 = self._make_residual(in_channels, out_channels, r) 171 | 172 | self.soft_resup1 = self._make_residual(in_channels, out_channels, r) 173 | self.soft_resup2 = self._make_residual(in_channels, out_channels, r) 174 | self.soft_resup3 = self._make_residual(in_channels, out_channels, r) 175 | 176 | self.shortcut = PreActResidualUnit(in_channels, out_channels, 1) 177 | 178 | self.sigmoid = nn.Sequential( 179 | nn.BatchNorm2d(out_channels), 180 | nn.ReLU(inplace=True), 181 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 182 | nn.BatchNorm2d(out_channels), 183 | nn.ReLU(inplace=True), 184 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 185 | nn.Sigmoid() 186 | ) 187 | 188 | self.last = self._make_residual(in_channels, out_channels, p) 189 | 190 | def forward(self, x): 191 | x = self.pre(x) 192 | input_size = (x.size(2), x.size(3)) 193 | 194 | x_t = self.trunk(x) 195 | 196 | #first downsample out 14 197 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 198 | x_s = self.soft_resdown1(x_s) 199 | 200 | #14 shortcut 201 | shape1 = (x_s.size(2), x_s.size(3)) 202 | shortcut = self.shortcut(x_s) 203 | 204 | #seccond downsample out 7 205 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 206 | x_s = self.soft_resdown2(x_s) 207 | 208 | #mid 209 | x_s = self.soft_resdown3(x_s) 210 | x_s = self.soft_resup1(x_s) 211 | 212 | #first upsample out 14 213 | x_s = self.soft_resup2(x_s) 214 | x_s = F.interpolate(x_s, size=shape1) 215 | x_s += shortcut 216 | 217 | #second upsample out 28 218 | x_s = self.soft_resup3(x_s) 219 | x_s = F.interpolate(x_s, size=input_size) 220 | 221 | x_s = self.sigmoid(x_s) 222 | x = (1 + x_s) * x_t 223 | x = self.last(x) 224 | 225 | return x 226 | 227 | def _make_residual(self, in_channels, out_channels, p): 228 | 229 | layers = [] 230 | for _ in range(p): 231 | layers.append(PreActResidualUnit(in_channels, out_channels, 1)) 232 | 233 | return nn.Sequential(*layers) 234 | 235 | class AttentionModule3(nn.Module): 236 | 237 | def __init__(self, in_channels, out_channels, p=1, t=2, r=1): 238 | super().__init__() 239 | 240 | assert in_channels == out_channels 241 | 242 | self.pre = self._make_residual(in_channels, out_channels, p) 243 | self.trunk = self._make_residual(in_channels, out_channels, t) 244 | self.soft_resdown1 = self._make_residual(in_channels, out_channels, r) 245 | self.soft_resdown2 = self._make_residual(in_channels, out_channels, r) 246 | 247 | self.soft_resup1 = self._make_residual(in_channels, out_channels, r) 248 | self.soft_resup2 = self._make_residual(in_channels, out_channels, r) 249 | 250 | self.shortcut = PreActResidualUnit(in_channels, out_channels, 1) 251 | 252 | self.sigmoid = nn.Sequential( 253 | nn.BatchNorm2d(out_channels), 254 | nn.ReLU(inplace=True), 255 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 256 | nn.BatchNorm2d(out_channels), 257 | nn.ReLU(inplace=True), 258 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 259 | nn.Sigmoid() 260 | ) 261 | 262 | self.last = self._make_residual(in_channels, out_channels, p) 263 | 264 | def forward(self, x): 265 | x 
= self.pre(x) 266 | input_size = (x.size(2), x.size(3)) 267 | 268 | x_t = self.trunk(x) 269 | 270 | #first downsample out 14 271 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 272 | x_s = self.soft_resdown1(x_s) 273 | 274 | #mid 275 | x_s = self.soft_resdown2(x_s) 276 | x_s = self.soft_resup1(x_s) 277 | 278 | #first upsample out 14 279 | x_s = self.soft_resup2(x_s) 280 | x_s = F.interpolate(x_s, size=input_size) 281 | 282 | x_s = self.sigmoid(x_s) 283 | x = (1 + x_s) * x_t 284 | x = self.last(x) 285 | 286 | return x 287 | 288 | def _make_residual(self, in_channels, out_channels, p): 289 | 290 | layers = [] 291 | for _ in range(p): 292 | layers.append(PreActResidualUnit(in_channels, out_channels, 1)) 293 | 294 | return nn.Sequential(*layers) 295 | 296 | class Attention(nn.Module): 297 | """residual attention network 298 | Args: 299 | block_num: attention module number for each stage 300 | """ 301 | 302 | def __init__(self, block_num, class_num=100): 303 | 304 | super().__init__() 305 | self.pre_conv = nn.Sequential( 306 | nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1), 307 | nn.BatchNorm2d(64), 308 | nn.ReLU(inplace=True) 309 | ) 310 | 311 | self.stage1 = self._make_stage(64, 256, block_num[0], AttentionModule1) 312 | self.stage2 = self._make_stage(256, 512, block_num[1], AttentionModule2) 313 | self.stage3 = self._make_stage(512, 1024, block_num[2], AttentionModule3) 314 | self.stage4 = nn.Sequential( 315 | PreActResidualUnit(1024, 2048, 2), 316 | PreActResidualUnit(2048, 2048, 1), 317 | PreActResidualUnit(2048, 2048, 1) 318 | ) 319 | self.avg = nn.AdaptiveAvgPool2d(1) 320 | self.linear = nn.Linear(2048, class_num) 321 | 322 | def forward(self, x): 323 | x = self.pre_conv(x) 324 | x = self.stage1(x) 325 | x = self.stage2(x) 326 | x = self.stage3(x) 327 | x = self.stage4(x) 328 | x = self.avg(x) 329 | x = x.view(x.size(0), -1) 330 | x = self.linear(x) 331 | 332 | return x 333 | 334 | def _make_stage(self, in_channels, out_channels, num, block): 335 | 336 | layers = [] 337 | layers.append(PreActResidualUnit(in_channels, out_channels, 2)) 338 | 339 | for _ in range(num): 340 | layers.append(block(out_channels, out_channels)) 341 | 342 | return nn.Sequential(*layers) 343 | 344 | def attention56(): 345 | return Attention([1, 1, 1]) 346 | 347 | def attention92(): 348 | return Attention([1, 2, 3]) 349 | 350 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EMCA 2 | This is the original PyTorch implementation of our paper "EMCA: Efficient Multi-Scale Channel Attention Module" 3 | ## 1- Abstract: 4 | Attention mechanisms have been explored with CNNs, both across the spatial and channel dimensions. However, all the existing methods devote the attention modules to capturing local interactions from a uni-scale. This paper tackles the following question: Can one consolidate multi-scale aggregation while learning channel attention more efficiently? To this end, we avail channel-wise attention over multiple feature scales, which empirically shows its aptitude to replace the limited local and uni-scale attention modules. EMCA is lightweight and can efficiently model the global context; further, it is easily integrated into any feed-forward CNN architecture and trained in an end-to-end fashion. We validate our novel architecture through comprehensive experiments on image classification, object detection and instance segmentation with different backbones. 
Our experiments show consistent gains in performance against their counterparts, where our proposed module, named EMCA, outperforms other channel attention techniques in the accuracy and latency trade-off. We also conduct experiments that probe the robustness of the learned representations. 5 | 6 | ## 2- Motivation: 7 | ### 2.1- Avoid Dense Integration Intuition: 8 | ![Revisit Architecture](Figures/Revisit_Channel_Attention_dense_connection.png) 9 | ### 2.2- Avoid Dense Integration Results: 10 | |Method|Model|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights| 11 | |:----:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:| 12 | | | | *SE* ||||| *ECA* ||||| *SRM* ||||| 13 | | ALL | |187| 11.231 | 70.59 | 89.78 | xx| 192 | 11.148 | 70.75 | 89.74 | xx| 154 | 11.152 | 70.96 | 89.81|xx| 14 | | First | R-18 |204| 11.189| 70.91 | 89.96 | xx| 212 | 11.148 | 70.63 | 89.85 | xx| 165 | 11.150 | 71.31 | 90.07|xx| 15 | | Last | |204| 11.189| 70.92 | 89.83 | xx| 212 | 11.148 | 70.81 | 89.84 | xx| 165 | 11.150 | 71.04 | 90.00|xx| 16 | | All | |101| 20.938| 73.87 | 91.65 | xx| 107 | 20.788 | 74.13 | 91.68 | xx| 82 | 20.795 | 73.98 | 91.68 |xx| 17 | | First | R-34 |122| 20.829| 73.84 | 91.64 | xx| 122 | 20.788 | 74.20 | 91.84 | xx| 96 | 20.790 | 74.51 | 91.91 |xx| 18 | | Last | |122| 20.829| 73.64 | 91.49 | xx| 122 | 20.788 | 73.75 | 91.47 | xx| 96 | 20.790 | 73.63 | 91.44 |xx| 19 | | All | |90| 26.772 | 76.80 | 93.39 | xx| 87 | 24.373 | 77.12 | 93.68 | xx| 71 | 24.402 | 77.13 | 93.51 |xx| 20 | | First | R-50 |97| 25.037| 76.56 | 93.28 | xx| 98 | 24.373 | 77.02 | 93.49 | xx| 81 | 24.380 | 76.98 | 93.41 |xx| 21 | | Last | |97| 25.037| 75.71 | 92.60 | xx| 98 | 24.373 | 76.37 | 93.18 | xx| 81 | 24.380 | 76.73 | 93.26 |xx| 22 | 23 | 24 | 25 | ## 3- EMCA Architecture: 26 | ### 3.1- Multi-Scale Incorporation: 27 | ![EMCA Architecture](Figures/EMCA_archeticture_only_CVPR.png) 28 | 29 | ### 3.2- Integrating EMCA Module: 30 | ![Integrating EMCA Module](Figures/EMCA_integration_only_CVPR.png) 31 | 32 | ### 3.3- EMCA Algorithm: 33 | ![Pseudo Code](Figures/EMCA_Algorithm.PNG) 34 | 35 | ## 4- HeatMap Visualization: 36 | ![HeatMap Visualization](Figures/gradcam.jpg) 37 | ![HeatMap Visualization](Figures/gradcam2.jpg) 38 | 39 | ## 5- Scales Visualization: 40 | ![Scales Visualization](Figures/scaling.png) 41 | 42 | ## 6- Top-1 Accuracy Visualization: 43 | ![Top-1 Accuracy Visualization](Figures/Top-1_ACC.jpg) 44 | 45 | ## 7- Results: 46 | |S|N'_i-j|Model|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights| 47 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:| 48 | | | | | *SE* ||||| *ECA* ||||| *SRM* ||||| 49 | N/A | N/A | R-18 | 187 | 11.231 |70.59 | 89.78 |xx | 192| 11.148 | 70.75 | 89.74 |xx| 154 | 11.152 | 70.96 | 89.81 |xx | 50 | 0| 0 | | 204 | 11.189 | 70.91 | 89.96 |xx| 212 | 11.148 | 70.63 | 89.85 |xx| 165 | 11.150 | 71.31 | 90.07 |xx| 51 | 1| 1 | | 156 | 11.189 | 71.02 | 89.98 |xx | 174 | 11.148 | 70.83| 89.96 |xx | 123 | 11.150| 71.20| 90.00 |xx | 52 | 1| N_i-j | | 160 | 11.190 | 71.00 | 90.00 |xx| 170 | 11.148 | 71.04 | 89.99|xx| 113 | 11.150 | 71.02| 90.00 |xx| 53 | i-1| 1 | | 153| 11.190| 71.02 |90.12 |xx | 169 | 11.148 | 70.59| 89.78|xx | 113 | 11.150 | 71.00 | 89.81|xx | 54 | N/A | N/A | R-34 | 101 | 20.938 | 73.87 | 91.65 |xx | 107 | 20.788 | 
74.13 | 91.68 |xx | 82 | 20.795 | 73.98 | 91.68|xx | 55 | 0 | 0 | | 122 | 20.829 | 73.84 | 91.64 |xx | 122 | 20.788 | 74.20 | 91.84 |xx| 96 | 20.790 | 74.51 | 91.91 |xx | 56 | 1 | 1 | | 109 | 20.829 | 74.33 | 91.89 |xx | 109 | 20.788 | 74.39 | 91.81 |xx | 82 | 20.790 | 74.39 | 91.77|xx | 57 | 1 | N_i-j | | 107 | 20.829 | 74.40 | 91.89|xx | 107 | 20.788 | 74.46 | 91.70 |xx | 81 | 20.790 | 74.38 | 91.87|xx | 58 | i-1 | 1 | | 103 | 20.829 | 74.02 | 91.74 |xx| 108 | 20.788 | 74.14 | 91.81 |xx| 80 | 20.790 | 74.57 | 91.90 |xx | 59 | N/A | N/A | R-50 | 90 | 26.772 | 76.80 | 93.39 |xx| 87 | 24.373 | 77.12 |xx| 93.68 | 71| 24.402 | 77.13 | 93.51|xx| 60 | 0|0 | | 97 | 25.037 | 76.56 | 93.28|xx | 98 | 24.373 | 77.02 | 93.49 |xx| 81 | 24.380 | 76.98 | 93.41 |xx| 61 | 1 | 1 | | 88 | 25.037 |77.10 |93.49 |xx| 94 | 24.373 | 76.98 | 93.55 |xx| 70 | 24.380 | 77.00 | 93.72 |xx| 62 | 1 | N_i-j | | 90 | 25.037 | 77.33 | 93.52 |xx | 92 | 24.373 | 77.13 | 93.49 |xx | 70 | 24.380 | 77.20 | 93.54|xx | 63 | i-1 | 1 | |89 | 25.037 | 76.85 |93.42 |xx | 91 | 24.373 | 76.82 | 93.41 |xx | 71 | 24.380 |77.05 | 93.50 |xx| 64 | 65 | 66 | 67 | 68 | |S | N'_i-j |Model | FPS | \#.P (M) | Top-1 | Top-5 | FPS | \#.P (M) | Top-1 | Top-5 | FPS |\#.P (M) | Top-1 | Top-5| 69 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:| 70 | | | | | *SE* |||| *ECA* |||| *SRM* |||| 71 | N/A | N/A | R-18 | 187 | 11.231 | 70.59 | 89.78 | 192 | 11.148 | 70.75 | 89.74 | 154 | 11.152 | 70.96 | 89.81 72 | 0 | 0 | | 204 | 11.189 | 70.91 | 89.96 | 212 | 11.148 | 70.63 | 89.85 |165 | 11.150 | 71.31 | 90.07 73 | 1, | 1 | | 156 | 11.189 | 71.02 | 89.98 | 174 | 11.148 | 70.83 | 89.96 | 123 | 11.150 | 71.20 | 90.00 74 | 1| N_i-j | | 160 | 11.190 | 71.00 | 90.00 |170 | 11.148 | 71.04 | 89.99 | 113 | 11.150 | 71.02 | 90.00 75 | i-1 | 1 | | 153 | 11.190 | 71.02 | 90.12 | 169 | 11.148 | 70.59 | 89.78 |113 | 11.150 | 71.00 | 89.81 76 | N/A, | N/A | R-34 | 101 | 20.938 | 73.87 | 91.65 |107 | 20.788 | 74.13 | 91.68 | 82 | 20.795 | 73.98 | 91.68 77 | 0, | 0 | | 122 | 20.829 | 73.84 | 91.64 | 122 | 20.788 | 74.20 | 91.84 | 96 | 20.790 | 74.51 | 91.91 78 | 1, | 1 | | 109 | 20.829 | 74.33 | 91.89 | 109 | 20.788 | 74.39 | 91.81 | 82 | 20.790 | 74.39 | 91.77 79 | 1, | N_i-j | | 107 | 20.829 | 74.40 | 91.89 | 107 | 20.788 | 74.46 | 91.70 | 81 | 20.790 | 74.38 | 91.87 80 | i-1, | 1 | | 103 | 20.829 | 74.02 | 91.74 | 108 | 20.788 | 74.14 | 91.81| 80 | 20.790 | 74.57 | 91.90 81 | N/A, | N/A | R-50 | 90 | 26.772 | 76.80 | 93.39 | 87 | 24.373 | 77.12 | 93.68 | 71 | 24.402 | 77.13 | 93.51 82 | 0, | 0 | | 97 | 25.037 | 76.56 | 93.28 | 98 | 24.373 | 77.02 | 93.49 | 81 | 24.380 | 76.98 | 93.41 83 | 1, | 1 | | 88 | 25.037 | 77.10 | 93.49| 94 | 24.373 | 76.98 | 93.55 | 70 | 24.380 | 77.00 | 93.72 84 | 1, | N_i-j | | 90 | 25.037 | 77.33 | 93.52 | 92 | 24.373 | 77.13 | 93.49 | 70 | 24.380 | 77.20 | 93.54 85 | i-1 | 1 | | 89 | 25.037 | 76.85 | 93.42 | 91 | 24.373 | 76.82 | 93.41 | 71 | 24.380 | 77.05 | 93.50 86 | 87 | 88 | Methods |Model | \#.P (M) | GFLOPs | Top-1(RI) | Top-5 | FPS | FPS* | FPS** 89 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:| 90 | ResNet | R-18 | 11.148 | 1.694 | 70.40 | 89.45 | 270 | 23552 | 859 91 | +SENet | | 11.231 | 1.695 | 70.59 | 89.78 | 187 | 21760 | 839 92 | +EMCA-SE | |11.190 |1.695 |71.00(215) |90.00 | 160 | 17313 | 813 93 | +ECANet | | 11.148 | 1.695 | 70.78 | 89.92 | 192 | 22287 | 848 94 | +ECANet* | | 11.148 | 1.695 | 70.75 | 89.74 | 192 | 22287 | 848 95 | +EMCA-ECA | |11.148 
|1.695 | 71.04(83) |89.99 | 170 | 19023 | 833 96 | +SRM* | | 11.152 | 1.695 | 70.96 | 89.81 | 154 | 18794 | 823 97 | +EMCA-SRM | | 11.150 | 1.694 |71.02(10) |90.00 | 113 | 15190 | 803 98 | ResNet | R-34 | 20.788 | 3.419 | 73.31 | 91.40 | 168 | 19712 | 840 99 | +SENet | | 20.938 | 3.421 | 73.87 | 91.65 | 101 | 14279 | 805 100 | +EMCA-SE | |20.829 |3.421 | 74.41 (96) | 91.90 | 107 |14372 |812 101 | +ECANet | | 20.788 | 3.420 | 74.21 | 91.83 | 107 | 14067 | 825 102 | +ECANet* | | 20.788 | 3.420 | 74.13 | 91.68 | 107 | 14067 | 825 103 | +EMCA-ECA | |20.788 | 3.421 |74.46 (40) |91.70 | 107 |14080 | 822 104 | +SRM* | | 20.795 | 3.419 | 73.98 | 91.68 | 82 | 12655 | 803 105 | +EMCA-SRM | |20.790 |3.419 |74.38 (59) |91.87 | 81 | 12579 | 795 106 | ResNet | R-50 | 24.373 | 3.829 | 75.89 | 92.85 | 124 | 10032 | 668 107 | +SENet | | 26.772 | 3.837 | 76.80 | 93.39 | 90 | 8156 | 597 108 | +EMCA-SE | |25.037 |3.835 |77.33 (58) |93.52 | 90 | 8099 | 589 109 | +ECANet| | 24.373 | 3.834 | 77.48 | 93.68 | 87 | 8517 | 591 110 | +ECANet * | | 24.373 | 3.834 | 77.12 | 93.68 | 87 | 8517 | 591 111 | +EMCA-ECA | |24.373 |3.834 | 77.13 (1) | 93.49 | 92 |8615 |600 112 | +SRM *| | 24.402 | 3.829 | 77.13 | 93.51 | 71 | 6745 | 536 113 | +EMCA-SRM | | 24.380 |3.829 |77.20 (6) |93.54 | 70 | 6698 | 532 114 | 115 | 116 | 117 | Methods |Model | \#.P (M) | GFLOPs | Top-1 | Top-5 | FPS | FPS* | FPS** 118 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:| 119 | ResNet | R-18 | 11.148 | 1.694 | 70.40 | 89.45 | 270 | 23552 | 859 120 | SENet | | 11.231 | 1.695 | 70.59 | 89.78 | 187 | 21760 | 839 121 | ECANet* | | 11.148 | 1.695 | 70.75 | 89.74 | 192 |22287 |839 122 | SRM* | | 11.152 | 1.694 | 70.96 | 89.81 | 154 | 18794 | 823 123 | FCANet* | | 11.231 | 1.694 | 70.98 | 90.00 | 119 | 17680 | 808 124 | BAM | | 11.712 | 1.821 | 75.98 | 92.82 | 91 | 7159 | 527 125 | CBAM | | 11.234 | 1.695 | 70.73 | 89.91 | 104 | 8734 | 789 126 | EMCA-ECA | |11.148 | 1.695 | 71.04 | 89.99 | 170 | 19023 | 833 127 | EMCA-SRM | | 11.150 | 1.694 |71.02 | 90.00 | 113 | 15190 | 803 128 | EMCA-SE | | 11.190 | 1.695 |71.00 | 90.00 | 160 | 17313 | 813 129 | ResNet |R-34 | 20.788 | 3.419 | 73.31 | 91.4 | 168 | 19712 | 840 130 | SENet | | 20.938 | 3.421 | 73.87 | 91.65 | 101 | 14279 | 805 131 | ECANet* | | 20.788 | 3.420 | 74.13 | 91.68 | 107 | 14067 | 825 132 | SRM* | | 20.795 | 3.419 | 73.98 | 91.68 | 82 | 12655 | 803 133 | FCANet* | | 20.938 | 3.419 | 74.18 | 91.75 | 87 | 13094 | 812 134 | CBAM | | 20.943 | 3.420 | 74.01 | 91.76 | 59 | 12001 | 760 135 | EMCA-ECA | |20.788 | 3.421 |74.46 | 91.70 | 107 | 14080 | 822 136 | EMCA-SRM | | 20.790 |3.419 |74.38 |91.87 | 81 | 12579 | 795 137 | EMCA-SE | | 20.829 | 3.421 | 74.41 | 91.90 | 107 |14372 | 812 138 | ResNet| R-50 | 24.373 | 3.829 | 75.89 | 92.85 | 124 | 10032 | 668 139 | SENet | | 26.772 | 3.837 | 76.80 | 93.39 | 90 | 8156 | 597 140 | ECANet* | | 24.373 | 3.834 | 77.12 | 93.68 | 87 | 8517 | 591 141 | SRM* | | 24.402 | 3.829 | 77.13 | 93.51 | 71 | 6745 | 536 142 | FCANet* | | 26.772 | 3.831 | 77.27 | 93.70 | 74 | 7984 | 549 143 | EPSANet* | | 21.517 | 3.373 | 77.31 |93.72 | 28 | 802 | 388 144 | SANet* | | 24.373 | 3.832 | 77.25 | 93.66 | 68 | 6670 | 406 145 | A^2Nets | | 33.006 | 6.502 | 77.00 | 93.50 | N/A | N/A | N/A 146 | BAM | | 25.92 | 3.946 | 75.98 | 92.82 | 91 | 7159 | 527 147 | CBAM | | 26.775 | 3.837 |77.34 | 93.69 | 55 | 2460 | 208 148 | EMCA-ECA | |24.373 | 3.834 | 77.13 | 93.49 | 92 |8615 |600 149 | EMCA-SRM | | 24.380 |3.829 | 77.20 | 93.54 | 71 | 6698 | 532 150 | EMCA-SE | | 25.037 | 
3.835 |77.33 | 93.52 | 90 | 8099 | 589 151 | 152 | 153 | 154 | |Methods | Detectors | \#.P (M) | GFLOPs | AP | AP_50 | AP_75 | AP_S | AP_M | AP_L 155 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-:| 156 | ResNet-50 | | 41.53 | 207.07 | 36.4 | 58.2 | 39.2 | 21.8 | 40.0 | 46.2 157 | +SE | | 44.02 | 207.18 | 37.7 | 60.1 | 40.9 | 22.9 | 41.9 | 48.2 158 | EMCA+SE | | 42.56 | 207.18 | 38.1 |60.6 | 50.2 | 23.6 |42.2 | 48.4 159 | +ECA | | 41.53 | 207.18 | 38.0 | 60.6 | 40.9 | 23.4 | 42.1 | 48.0 160 | +EMCA+ECA |Faster R-CNN | 41.53 | 207.18 | 38.2 |60.9 | 50.0 | 23.7 | 42.2 | 48.2 161 | ResNet-50 | | 44.18 | 275.58 | 37.2 | 58.9 | 40.3 | 22.2 | 40.7 | 48.0 162 | +1 NL | |46.50 | 288.70 | 38.0 | 59.8 | 41.0 | N/A | N/A | N/A 163 | +GC | | 46.90 | 279.60 | 39.4 | 61.6 | 42.4 | N/A | N/A | N/A 164 | +SE | | 46.67 | 275.69 | 38.7 | 60.9 | 42.1 | 23.4 | 42.7 | 50.0 165 | +EMCA+SE | | 45.13 | 275.69 | 39.0 |61.4 | 42.3 | 23.7 |42.9 | 50.1 166 | +ECA | | 44.18 | 275.69 | 39.0 | 61.3 | 42.1 | 24.2 | 42.8 | 49.9 167 | +EMCA+ECA |Mask R-CNN | 44.18 | 275.69 | 39.1 |61.5 | 42.1 | 24.4 |42.9 | 49.9 168 | ResNet-50 | | 37.74 | 239.32 | 35.6 | 55.5 | 38.2 | 20.0 | 39.6 | 46.8 169 | +SE | | 40.23 | 239.43 | 37.1 | 57.2 | 39.9 | 21.2 | 40.7 | 49.3 170 | +EMCA+SE | | 38.88 | 239.43 | 37.2 |57.4 | 39.9 | 21.2 | 40.7 | 49.3 171 | +ECA| | 37.74 | 239.43 | 37.3 | 57.7 | 39.6 | 21.9 | 41.3 | 48.9 172 | +EMCA+ECA | RetinaNet | 37.74 | 239.43 | 37.3 |57.8 | 39.6 | 21.9 | 41.3 | 48.9 173 | 174 | 175 | Methods | \#.P (M) | GFLOPs | AP | AP_50 | AP_75 | AP_S | AP_M | AP_L| 176 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:| 177 | ResNet-50 | 44.18 | 275.58 | 34.1 | 55.5 | 36.2 | 16.1 | 36.7 | 50.0 178 | +SE | 46.67 | 275.69 | 35.4 | 57.4 | 37.8 | 17.1 | 38.6 | 51.8 179 | +EMCA+SE | 45.13 | 275.69 | 35.7 | 58.1 | 38.0 |17.8 | 39.0 | 51.9 180 | +ECA | 44.18 | 275.69 | 35.6 | 58.1 | 37.7 | 17.6 | 39.0 | 51.8 181 | +EMCA+ECA | 44.18 | 275.69 | 35.7 | 58.4 | 37.7 | 17.9 | 39.1 | 51.9 182 | 183 | 184 | 185 | 186 | # Citation 187 | -------------------------------------------------------------------------------- /Code/train_imagenet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import shutil 5 | import time 6 | import warnings 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.parallel 11 | import torch.backends.cudnn as cudnn 12 | import torch.distributed as dist 13 | import torch.optim 14 | import torch.multiprocessing as mp 15 | import torch.utils.data 16 | import torch.utils.data.distributed 17 | import torchvision.transforms as transforms 18 | import torchvision.datasets as datasets 19 | import torchvision.models as models 20 | from tqdm import tqdm 21 | 22 | os.environ["NCCL_DEBUG"] = "INFO" 23 | model_names = sorted(name for name in models.__dict__ 24 | if name.islower() and not name.startswith("__") 25 | and callable(models.__dict__[name])) 26 | 27 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 28 | parser.add_argument('data', metavar='DIR', 29 | help='path to dataset') 30 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 31 | choices=model_names, 32 | help='model architecture: ' + 33 | ' | '.join(model_names) + 34 | ' (default: resnet18)') 35 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 36 | help='number of data loading workers (default: 4)') 37 | parser.add_argument('--epochs', default=200, type=int, 
metavar='N', 38 | help='number of total epochs to run') 39 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 40 | help='manual epoch number (useful on restarts)') 41 | parser.add_argument('-b', '--batch-size', default=256, type=int, 42 | metavar='N', 43 | help='mini-batch size (default: 256), this is the total ' 44 | 'batch size of all GPUs on the current node when ' 45 | 'using Data Parallel or Distributed Data Parallel') 46 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 47 | metavar='LR', help='initial learning rate', dest='lr') 48 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 49 | help='momentum') 50 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 51 | metavar='W', help='weight decay (default: 1e-4)', 52 | dest='weight_decay') 53 | parser.add_argument('-p', '--print-freq', default=10, type=int, 54 | metavar='N', help='print frequency (default: 10)') 55 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 56 | help='path to latest checkpoint (default: none)') 57 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 58 | help='evaluate model on validation set') 59 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 60 | help='use pre-trained model') 61 | parser.add_argument('--world-size', default=-1, type=int, 62 | help='number of nodes for distributed training') 63 | parser.add_argument('--rank', default=-1, type=int, 64 | help='node rank for distributed training') 65 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 66 | help='url used to set up distributed training') 67 | parser.add_argument('--dist-backend', default='nccl', type=str, 68 | help='distributed backend') 69 | parser.add_argument('--seed', default=None, type=int, 70 | help='seed for initializing training. ') 71 | parser.add_argument('--gpu', default=None, type=int, 72 | help='GPU id to use.') 73 | parser.add_argument('--multiprocessing-distributed', action='store_true', 74 | help='Use multi-processing distributed training to launch ' 75 | 'N processes per node, which has N GPUs. This is the ' 76 | 'fastest way to use PyTorch for either single node or ' 77 | 'multi node data parallel training') 78 | 79 | best_acc1 = 0 80 | 81 | 82 | def main(): 83 | args = parser.parse_args() 84 | 85 | if args.seed is not None: 86 | random.seed(args.seed) 87 | torch.manual_seed(args.seed) 88 | cudnn.deterministic = True 89 | warnings.warn('You have chosen to seed training. ' 90 | 'This will turn on the CUDNN deterministic setting, ' 91 | 'which can slow down your training considerably! ' 92 | 'You may see unexpected behavior when restarting ' 93 | 'from checkpoints.') 94 | 95 | if args.gpu is not None: 96 | warnings.warn('You have chosen a specific GPU. 
This will completely ' 97 | 'disable data parallelism.') 98 | 99 | if args.dist_url == "env://" and args.world_size == -1: 100 | args.world_size = int(os.environ["WORLD_SIZE"]) 101 | 102 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 103 | 104 | ngpus_per_node = torch.cuda.device_count() 105 | if args.multiprocessing_distributed: 106 | # Since we have ngpus_per_node processes per node, the total world_size 107 | # needs to be adjusted accordingly 108 | args.world_size = ngpus_per_node * args.world_size 109 | # Use torch.multiprocessing.spawn to launch distributed processes: the 110 | # main_worker process function 111 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 112 | else: 113 | # Simply call main_worker function 114 | main_worker(args.gpu, ngpus_per_node, args) 115 | 116 | 117 | def main_worker(gpu, ngpus_per_node, args): 118 | global best_acc1 119 | args.gpu = gpu 120 | 121 | if args.gpu is not None: 122 | print("Use GPU: {} for training".format(args.gpu)) 123 | 124 | if args.distributed: 125 | if args.dist_url == "env://" and args.rank == -1: 126 | args.rank = int(os.environ["RANK"]) 127 | if args.multiprocessing_distributed: 128 | # For multiprocessing distributed training, rank needs to be the 129 | # global rank among all the processes 130 | args.rank = args.rank * ngpus_per_node + gpu 131 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 132 | world_size=args.world_size, rank=args.rank) 133 | # create model 134 | if args.pretrained: 135 | print("=> using pre-trained model '{}'".format(args.arch)) 136 | model = models.__dict__[args.arch](pretrained=True) 137 | else: 138 | print("=> creating model '{}'".format(args.arch)) 139 | # original pytorch models 140 | #model = models.__dict__[args.arch]() 141 | # custom models 142 | if args.arch == "resnet18": 143 | from eca_resnet_multi_scale import eca_resnet18 144 | model = eca_resnet18(1000) 145 | elif args.arch == "resnet34": 146 | print("ResNet34") 147 | from eca_resnet_multi_scale import eca_resnet34 148 | model = eca_resnet34(1000) 149 | elif args.arch == "resnet50": 150 | print("ResNet50") 151 | from eca_resnet_multi_scale import eca_resnet50 152 | model = eca_resnet50(1000) 153 | 154 | if not torch.cuda.is_available(): 155 | print('using CPU, this will be slow') 156 | elif args.distributed: 157 | # For multiprocessing distributed, DistributedDataParallel constructor 158 | # should always set the single device scope, otherwise, 159 | # DistributedDataParallel will use all available devices. 
160 | if args.gpu is not None: 161 | torch.cuda.set_device(args.gpu) 162 | model.cuda(args.gpu) 163 | # When using a single GPU per process and per 164 | # DistributedDataParallel, we need to divide the batch size 165 | # ourselves based on the total number of GPUs we have 166 | args.batch_size = int(args.batch_size / ngpus_per_node) 167 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 168 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 169 | else: 170 | model.cuda() 171 | # DistributedDataParallel will divide and allocate batch_size to all 172 | # available GPUs if device_ids are not set 173 | model = torch.nn.parallel.DistributedDataParallel(model) 174 | elif args.gpu is not None: 175 | torch.cuda.set_device(args.gpu) 176 | model = model.cuda(args.gpu) 177 | else: 178 | # DataParallel will divide and allocate batch_size to all available GPUs 179 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 180 | model.features = torch.nn.DataParallel(model.features) 181 | model.cuda() 182 | else: 183 | model = torch.nn.DataParallel(model).cuda() 184 | 185 | # define loss function (criterion) and optimizer 186 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 187 | 188 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 189 | momentum=args.momentum, 190 | weight_decay=args.weight_decay) 191 | 192 | # optionally resume from a checkpoint 193 | if args.resume: 194 | if os.path.isfile(args.resume): 195 | print("=> loading checkpoint '{}'".format(args.resume)) 196 | if args.gpu is None: 197 | checkpoint = torch.load(args.resume) 198 | else: 199 | # Map model to be loaded to specified single gpu. 200 | loc = 'cuda:{}'.format(args.gpu) 201 | checkpoint = torch.load(args.resume, map_location=loc) 202 | args.start_epoch = checkpoint['epoch'] 203 | best_acc1 = checkpoint['best_acc1'] 204 | if args.gpu is not None: 205 | # best_acc1 may be from a checkpoint from a different GPU 206 | best_acc1 = best_acc1.to(args.gpu) 207 | model.load_state_dict(checkpoint['state_dict']) 208 | optimizer.load_state_dict(checkpoint['optimizer']) 209 | print("=> loaded checkpoint '{}' (epoch {})" 210 | .format(args.resume, checkpoint['epoch'])) 211 | else: 212 | print("=> no checkpoint found at '{}'".format(args.resume)) 213 | 214 | cudnn.benchmark = True 215 | 216 | # Data loading code 217 | traindir = os.path.join(args.data, 'train') 218 | valdir = os.path.join(args.data, 'val') 219 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 220 | std=[0.229, 0.224, 0.225]) 221 | 222 | train_dataset = datasets.ImageFolder( 223 | traindir, 224 | transforms.Compose([ 225 | transforms.RandomResizedCrop(224), 226 | transforms.RandomHorizontalFlip(), 227 | transforms.ToTensor(), 228 | normalize, 229 | ])) 230 | 231 | if args.distributed: 232 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 233 | else: 234 | train_sampler = None 235 | 236 | train_loader = torch.utils.data.DataLoader( 237 | train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), 238 | num_workers=args.workers, pin_memory=True, sampler=train_sampler) 239 | 240 | val_loader = torch.utils.data.DataLoader( 241 | datasets.ImageFolder(valdir, transforms.Compose([ 242 | transforms.Resize(256), 243 | transforms.CenterCrop(224), 244 | transforms.ToTensor(), 245 | normalize, 246 | ])), 247 | batch_size=args.batch_size, shuffle=False, 248 | num_workers=args.workers, pin_memory=True) 249 | 250 | if args.evaluate: 251 | validate(val_loader, 
model, criterion, args) 252 | return 253 | 254 | for epoch in range(args.start_epoch, args.epochs): 255 | if args.distributed: 256 | train_sampler.set_epoch(epoch) 257 | adjust_learning_rate(optimizer, epoch, args) 258 | 259 | # train for one epoch 260 | train(train_loader, model, criterion, optimizer, epoch, args) 261 | 262 | # evaluate on validation set 263 | acc1 = validate(val_loader, model, criterion, args) 264 | 265 | # remember best acc@1 and save checkpoint 266 | is_best = acc1 > best_acc1 267 | best_acc1 = max(acc1, best_acc1) 268 | 269 | if not args.multiprocessing_distributed or (args.multiprocessing_distributed 270 | and args.rank % ngpus_per_node == 0): 271 | save_checkpoint({ 272 | 'epoch': epoch + 1, 273 | 'arch': args.arch, 274 | 'state_dict': model.state_dict(), 275 | 'best_acc1': best_acc1, 276 | 'optimizer' : optimizer.state_dict(), 277 | }, is_best) 278 | 279 | 280 | def train(train_loader, model, criterion, optimizer, epoch, args): 281 | batch_time = AverageMeter('Time', ':6.3f') 282 | data_time = AverageMeter('Data', ':6.3f') 283 | losses = AverageMeter('Loss', ':.4e') 284 | top1 = AverageMeter('Acc@1', ':6.2f') 285 | top5 = AverageMeter('Acc@5', ':6.2f') 286 | progress = ProgressMeter( 287 | len(train_loader), 288 | [batch_time, data_time, losses, top1, top5], 289 | prefix="Epoch: [{}]".format(epoch)) 290 | 291 | # switch to train mode 292 | model.train() 293 | 294 | end = time.time() 295 | for i, (images, target) in tqdm(enumerate(train_loader)): 296 | # measure data loading time 297 | data_time.update(time.time() - end) 298 | 299 | if args.gpu is not None: 300 | images = images.cuda(args.gpu, non_blocking=True) 301 | if torch.cuda.is_available(): 302 | target = target.cuda(args.gpu, non_blocking=True) 303 | 304 | # compute output 305 | output = model(images) 306 | loss = criterion(output, target) 307 | 308 | # measure accuracy and record loss 309 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 310 | losses.update(loss.item(), images.size(0)) 311 | top1.update(acc1[0], images.size(0)) 312 | top5.update(acc5[0], images.size(0)) 313 | 314 | # compute gradient and do SGD step 315 | optimizer.zero_grad() 316 | loss.backward() 317 | optimizer.step() 318 | 319 | # measure elapsed time 320 | batch_time.update(time.time() - end) 321 | end = time.time() 322 | 323 | if i % args.print_freq == 0 and False: 324 | progress.display(i) 325 | 326 | 327 | def validate(val_loader, model, criterion, args): 328 | batch_time = AverageMeter('Time', ':6.3f') 329 | losses = AverageMeter('Loss', ':.4e') 330 | top1 = AverageMeter('Acc@1', ':6.2f') 331 | top5 = AverageMeter('Acc@5', ':6.2f') 332 | progress = ProgressMeter( 333 | len(val_loader), 334 | [batch_time, losses, top1, top5], 335 | prefix='Test: ') 336 | 337 | # switch to evaluate mode 338 | model.eval() 339 | 340 | with torch.no_grad(): 341 | end = time.time() 342 | for i, (images, target) in enumerate(val_loader): 343 | if args.gpu is not None: 344 | images = images.cuda(args.gpu, non_blocking=True) 345 | if torch.cuda.is_available(): 346 | target = target.cuda(args.gpu, non_blocking=True) 347 | 348 | # compute output 349 | output = model(images) 350 | loss = criterion(output, target) 351 | 352 | # measure accuracy and record loss 353 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 354 | losses.update(loss.item(), images.size(0)) 355 | top1.update(acc1[0], images.size(0)) 356 | top5.update(acc5[0], images.size(0)) 357 | 358 | # measure elapsed time 359 | batch_time.update(time.time() - end) 360 | end = time.time() 361 | 
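# NOTE: per-batch progress output is disabled by the `and False` guard below;
# remove the guard to restore the periodic ProgressMeter display.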
362 | if i % args.print_freq == 0 and False: 363 | progress.display(i) 364 | 365 | # TODO: this should also be done with the ProgressMeter 366 | print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' 367 | .format(top1=top1, top5=top5)) 368 | 369 | return top1.avg 370 | 371 | 372 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 373 | log_file = open("log.txt","a+") 374 | torch.save(state, filename) 375 | log_file.write("Last epoch is " + str(state['epoch'])) 376 | if is_best: 377 | shutil.copyfile(filename, 'model_best.pth.tar') 378 | log_file.write("\n best epoch at: " + str(state['epoch']) + " " + str(state['best_acc1'])) 379 | 380 | 381 | 382 | class AverageMeter(object): 383 | """Computes and stores the average and current value""" 384 | def __init__(self, name, fmt=':f'): 385 | self.name = name 386 | self.fmt = fmt 387 | self.reset() 388 | 389 | def reset(self): 390 | self.val = 0 391 | self.avg = 0 392 | self.sum = 0 393 | self.count = 0 394 | 395 | def update(self, val, n=1): 396 | self.val = val 397 | self.sum += val * n 398 | self.count += n 399 | self.avg = self.sum / self.count 400 | 401 | def __str__(self): 402 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 403 | return fmtstr.format(**self.__dict__) 404 | 405 | 406 | class ProgressMeter(object): 407 | def __init__(self, num_batches, meters, prefix=""): 408 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 409 | self.meters = meters 410 | self.prefix = prefix 411 | 412 | def display(self, batch): 413 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 414 | entries += [str(meter) for meter in self.meters] 415 | print('\t'.join(entries)) 416 | 417 | def _get_batch_fmtstr(self, num_batches): 418 | num_digits = len(str(num_batches // 1)) 419 | fmt = '{:' + str(num_digits) + 'd}' 420 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 421 | 422 | 423 | def adjust_learning_rate(optimizer, epoch, args): 424 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 425 | lr = args.lr * (0.1 ** (epoch // 30)) 426 | for param_group in optimizer.param_groups: 427 | param_group['lr'] = lr 428 | 429 | 430 | def accuracy(output, target, topk=(1,)): 431 | """Computes the accuracy over the k top predictions for the specified values of k""" 432 | with torch.no_grad(): 433 | maxk = max(topk) 434 | batch_size = target.size(0) 435 | 436 | _, pred = output.topk(maxk, 1, True, True) 437 | pred = pred.t() 438 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 439 | 440 | res = [] 441 | for k in topk: 442 | correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) 443 | res.append(correct_k.mul_(100.0 / batch_size)) 444 | return res 445 | 446 | 447 | if __name__ == '__main__': 448 | main() 449 | -------------------------------------------------------------------------------- /Code/benchmarking/inceptionv4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ inceptionv4 in pytorch 3 | 4 | 5 | [1] Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi 6 | 7 | Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning 8 | https://arxiv.org/abs/1602.07261 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | class BasicConv2d(nn.Module): 15 | 16 | def __init__(self, input_channels, output_channels, **kwargs): 17 | super().__init__() 18 | self.conv = nn.Conv2d(input_channels, output_channels, bias=False, **kwargs) 19 | self.bn = 
nn.BatchNorm2d(output_channels) 20 | self.relu = nn.ReLU(inplace=True) 21 | 22 | def forward(self, x): 23 | x = self.conv(x) 24 | x = self.bn(x) 25 | x = self.relu(x) 26 | 27 | return x 28 | 29 | class Inception_Stem(nn.Module): 30 | 31 | #"""Figure 3. The schema for stem of the pure Inception-v4 and 32 | #Inception-ResNet-v2 networks. This is the input part of those 33 | #networks.""" 34 | def __init__(self, input_channels): 35 | super().__init__() 36 | self.conv1 = nn.Sequential( 37 | BasicConv2d(input_channels, 32, kernel_size=3), 38 | BasicConv2d(32, 32, kernel_size=3, padding=1), 39 | BasicConv2d(32, 64, kernel_size=3, padding=1) 40 | ) 41 | 42 | self.branch3x3_conv = BasicConv2d(64, 96, kernel_size=3, padding=1) 43 | self.branch3x3_pool = nn.MaxPool2d(3, stride=1, padding=1) 44 | 45 | self.branch7x7a = nn.Sequential( 46 | BasicConv2d(160, 64, kernel_size=1), 47 | BasicConv2d(64, 64, kernel_size=(7, 1), padding=(3, 0)), 48 | BasicConv2d(64, 64, kernel_size=(1, 7), padding=(0, 3)), 49 | BasicConv2d(64, 96, kernel_size=3, padding=1) 50 | ) 51 | 52 | self.branch7x7b = nn.Sequential( 53 | BasicConv2d(160, 64, kernel_size=1), 54 | BasicConv2d(64, 96, kernel_size=3, padding=1) 55 | ) 56 | 57 | self.branchpoola = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 58 | self.branchpoolb = BasicConv2d(192, 192, kernel_size=3, stride=1, padding=1) 59 | 60 | def forward(self, x): 61 | 62 | x = self.conv1(x) 63 | 64 | x = [ 65 | self.branch3x3_conv(x), 66 | self.branch3x3_pool(x) 67 | ] 68 | x = torch.cat(x, 1) 69 | 70 | x = [ 71 | self.branch7x7a(x), 72 | self.branch7x7b(x) 73 | ] 74 | x = torch.cat(x, 1) 75 | 76 | x = [ 77 | self.branchpoola(x), 78 | self.branchpoolb(x) 79 | ] 80 | 81 | x = torch.cat(x, 1) 82 | 83 | return x 84 | 85 | class InceptionA(nn.Module): 86 | 87 | #"""Figure 4. The schema for 35 × 35 grid modules of the pure 88 | #Inception-v4 network. This is the Inception-A block of Figure 9.""" 89 | def __init__(self, input_channels): 90 | super().__init__() 91 | 92 | self.branch3x3stack = nn.Sequential( 93 | BasicConv2d(input_channels, 64, kernel_size=1), 94 | BasicConv2d(64, 96, kernel_size=3, padding=1), 95 | BasicConv2d(96, 96, kernel_size=3, padding=1) 96 | ) 97 | 98 | self.branch3x3 = nn.Sequential( 99 | BasicConv2d(input_channels, 64, kernel_size=1), 100 | BasicConv2d(64, 96, kernel_size=3, padding=1) 101 | ) 102 | 103 | self.branch1x1 = BasicConv2d(input_channels, 96, kernel_size=1) 104 | 105 | self.branchpool = nn.Sequential( 106 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 107 | BasicConv2d(input_channels, 96, kernel_size=1) 108 | ) 109 | 110 | def forward(self, x): 111 | 112 | x = [ 113 | self.branch3x3stack(x), 114 | self.branch3x3(x), 115 | self.branch1x1(x), 116 | self.branchpool(x) 117 | ] 118 | 119 | return torch.cat(x, 1) 120 | 121 | class ReductionA(nn.Module): 122 | 123 | #"""Figure 7. The schema for 35 × 35 to 17 × 17 reduction module. 124 | #Different variants of this blocks (with various number of filters) 125 | #are used in Figure 9, and 15 in each of the new Inception(-v4, - ResNet-v1, 126 | #-ResNet-v2) variants presented in this paper. The k, l, m, n numbers 127 | #represent filter bank sizes which can be looked up in Table 1. 
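# In this file, InceptionV4 builds ReductionA with the default filter sizes
# (k, l, m, n) = (192, 224, 256, 384), so its concatenated output has
# input_channels + n + m = 384 + 384 + 256 = 1024 channels (see InceptionV4.__init__ below).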
128 | def __init__(self, input_channels, k, l, m, n): 129 | 130 | super().__init__() 131 | self.branch3x3stack = nn.Sequential( 132 | BasicConv2d(input_channels, k, kernel_size=1), 133 | BasicConv2d(k, l, kernel_size=3, padding=1), 134 | BasicConv2d(l, m, kernel_size=3, stride=2) 135 | ) 136 | 137 | self.branch3x3 = BasicConv2d(input_channels, n, kernel_size=3, stride=2) 138 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2) 139 | self.output_channels = input_channels + n + m 140 | 141 | def forward(self, x): 142 | 143 | x = [ 144 | self.branch3x3stack(x), 145 | self.branch3x3(x), 146 | self.branchpool(x) 147 | ] 148 | 149 | return torch.cat(x, 1) 150 | 151 | class InceptionB(nn.Module): 152 | 153 | #"""Figure 5. The schema for 17 × 17 grid modules of the pure Inception-v4 network. 154 | #This is the Inception-B block of Figure 9.""" 155 | def __init__(self, input_channels): 156 | super().__init__() 157 | 158 | self.branch7x7stack = nn.Sequential( 159 | BasicConv2d(input_channels, 192, kernel_size=1), 160 | BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3)), 161 | BasicConv2d(192, 224, kernel_size=(7, 1), padding=(3, 0)), 162 | BasicConv2d(224, 224, kernel_size=(1, 7), padding=(0, 3)), 163 | BasicConv2d(224, 256, kernel_size=(7, 1), padding=(3, 0)) 164 | ) 165 | 166 | self.branch7x7 = nn.Sequential( 167 | BasicConv2d(input_channels, 192, kernel_size=1), 168 | BasicConv2d(192, 224, kernel_size=(1, 7), padding=(0, 3)), 169 | BasicConv2d(224, 256, kernel_size=(7, 1), padding=(3, 0)) 170 | ) 171 | 172 | self.branch1x1 = BasicConv2d(input_channels, 384, kernel_size=1) 173 | 174 | self.branchpool = nn.Sequential( 175 | nn.AvgPool2d(3, stride=1, padding=1), 176 | BasicConv2d(input_channels, 128, kernel_size=1) 177 | ) 178 | 179 | def forward(self, x): 180 | x = [ 181 | self.branch1x1(x), 182 | self.branch7x7(x), 183 | self.branch7x7stack(x), 184 | self.branchpool(x) 185 | ] 186 | 187 | return torch.cat(x, 1) 188 | 189 | class ReductionB(nn.Module): 190 | 191 | #"""Figure 8. The schema for 17 × 17 to 8 × 8 grid-reduction mod- ule. 192 | #This is the reduction module used by the pure Inception-v4 network in 193 | #Figure 9.""" 194 | def __init__(self, input_channels): 195 | 196 | super().__init__() 197 | self.branch7x7 = nn.Sequential( 198 | BasicConv2d(input_channels, 256, kernel_size=1), 199 | BasicConv2d(256, 256, kernel_size=(1, 7), padding=(0, 3)), 200 | BasicConv2d(256, 320, kernel_size=(7, 1), padding=(3, 0)), 201 | BasicConv2d(320, 320, kernel_size=3, stride=2, padding=1) 202 | ) 203 | 204 | self.branch3x3 = nn.Sequential( 205 | BasicConv2d(input_channels, 192, kernel_size=1), 206 | BasicConv2d(192, 192, kernel_size=3, stride=2, padding=1) 207 | ) 208 | 209 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 210 | 211 | def forward(self, x): 212 | 213 | x = [ 214 | self.branch3x3(x), 215 | self.branch7x7(x), 216 | self.branchpool(x) 217 | ] 218 | 219 | return torch.cat(x, 1) 220 | 221 | class InceptionC(nn.Module): 222 | 223 | def __init__(self, input_channels): 224 | #"""Figure 6. The schema for 8×8 grid modules of the pure 225 | #Inceptionv4 network. 
This is the Inception-C block of Figure 9.""" 226 | 227 | super().__init__() 228 | 229 | self.branch3x3stack = nn.Sequential( 230 | BasicConv2d(input_channels, 384, kernel_size=1), 231 | BasicConv2d(384, 448, kernel_size=(1, 3), padding=(0, 1)), 232 | BasicConv2d(448, 512, kernel_size=(3, 1), padding=(1, 0)), 233 | ) 234 | self.branch3x3stacka = BasicConv2d(512, 256, kernel_size=(1, 3), padding=(0, 1)) 235 | self.branch3x3stackb = BasicConv2d(512, 256, kernel_size=(3, 1), padding=(1, 0)) 236 | 237 | self.branch3x3 = BasicConv2d(input_channels, 384, kernel_size=1) 238 | self.branch3x3a = BasicConv2d(384, 256, kernel_size=(3, 1), padding=(1, 0)) 239 | self.branch3x3b = BasicConv2d(384, 256, kernel_size=(1, 3), padding=(0, 1)) 240 | 241 | self.branch1x1 = BasicConv2d(input_channels, 256, kernel_size=1) 242 | 243 | self.branchpool = nn.Sequential( 244 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 245 | BasicConv2d(input_channels, 256, kernel_size=1) 246 | ) 247 | 248 | def forward(self, x): 249 | branch3x3stack_output = self.branch3x3stack(x) 250 | branch3x3stack_output = [ 251 | self.branch3x3stacka(branch3x3stack_output), 252 | self.branch3x3stackb(branch3x3stack_output) 253 | ] 254 | branch3x3stack_output = torch.cat(branch3x3stack_output, 1) 255 | 256 | branch3x3_output = self.branch3x3(x) 257 | branch3x3_output = [ 258 | self.branch3x3a(branch3x3_output), 259 | self.branch3x3b(branch3x3_output) 260 | ] 261 | branch3x3_output = torch.cat(branch3x3_output, 1) 262 | 263 | branch1x1_output = self.branch1x1(x) 264 | 265 | branchpool = self.branchpool(x) 266 | 267 | output = [ 268 | branch1x1_output, 269 | branch3x3_output, 270 | branch3x3stack_output, 271 | branchpool 272 | ] 273 | 274 | return torch.cat(output, 1) 275 | 276 | class InceptionV4(nn.Module): 277 | 278 | def __init__(self, A, B, C, k=192, l=224, m=256, n=384, class_nums=100): 279 | 280 | super().__init__() 281 | self.stem = Inception_Stem(3) 282 | self.inception_a = self._generate_inception_module(384, 384, A, InceptionA) 283 | self.reduction_a = ReductionA(384, k, l, m, n) 284 | output_channels = self.reduction_a.output_channels 285 | self.inception_b = self._generate_inception_module(output_channels, 1024, B, InceptionB) 286 | self.reduction_b = ReductionB(1024) 287 | self.inception_c = self._generate_inception_module(1536, 1536, C, InceptionC) 288 | self.avgpool = nn.AvgPool2d(7) 289 | 290 | #"""Dropout (keep 0.8)""" 291 | self.dropout = nn.Dropout2d(1 - 0.8) 292 | self.linear = nn.Linear(1536, class_nums) 293 | 294 | def forward(self, x): 295 | x = self.stem(x) 296 | x = self.inception_a(x) 297 | x = self.reduction_a(x) 298 | x = self.inception_b(x) 299 | x = self.reduction_b(x) 300 | x = self.inception_c(x) 301 | x = self.avgpool(x) 302 | x = self.dropout(x) 303 | x = x.view(-1, 1536) 304 | x = self.linear(x) 305 | 306 | return x 307 | 308 | @staticmethod 309 | def _generate_inception_module(input_channels, output_channels, block_num, block): 310 | 311 | layers = nn.Sequential() 312 | for l in range(block_num): 313 | layers.add_module("{}_{}".format(block.__name__, l), block(input_channels)) 314 | input_channels = output_channels 315 | 316 | return layers 317 | 318 | class InceptionResNetA(nn.Module): 319 | 320 | #"""Figure 16. 
The schema for 35 × 35 grid (Inception-ResNet-A) 321 | #module of the Inception-ResNet-v2 network.""" 322 | def __init__(self, input_channels): 323 | 324 | super().__init__() 325 | self.branch3x3stack = nn.Sequential( 326 | BasicConv2d(input_channels, 32, kernel_size=1), 327 | BasicConv2d(32, 48, kernel_size=3, padding=1), 328 | BasicConv2d(48, 64, kernel_size=3, padding=1) 329 | ) 330 | 331 | self.branch3x3 = nn.Sequential( 332 | BasicConv2d(input_channels, 32, kernel_size=1), 333 | BasicConv2d(32, 32, kernel_size=3, padding=1) 334 | ) 335 | 336 | self.branch1x1 = BasicConv2d(input_channels, 32, kernel_size=1) 337 | 338 | self.reduction1x1 = nn.Conv2d(128, 384, kernel_size=1) 339 | self.shortcut = nn.Conv2d(input_channels, 384, kernel_size=1) 340 | self.bn = nn.BatchNorm2d(384) 341 | self.relu = nn.ReLU(inplace=True) 342 | 343 | def forward(self, x): 344 | 345 | residual = [ 346 | self.branch1x1(x), 347 | self.branch3x3(x), 348 | self.branch3x3stack(x) 349 | ] 350 | 351 | residual = torch.cat(residual, 1) 352 | residual = self.reduction1x1(residual) 353 | shortcut = self.shortcut(x) 354 | 355 | output = self.bn(shortcut + residual) 356 | output = self.relu(output) 357 | 358 | return output 359 | 360 | class InceptionResNetB(nn.Module): 361 | 362 | #"""Figure 17. The schema for 17 × 17 grid (Inception-ResNet-B) module of 363 | #the Inception-ResNet-v2 network.""" 364 | def __init__(self, input_channels): 365 | 366 | super().__init__() 367 | self.branch7x7 = nn.Sequential( 368 | BasicConv2d(input_channels, 128, kernel_size=1), 369 | BasicConv2d(128, 160, kernel_size=(1, 7), padding=(0, 3)), 370 | BasicConv2d(160, 192, kernel_size=(7, 1), padding=(3, 0)) 371 | ) 372 | 373 | self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1) 374 | 375 | self.reduction1x1 = nn.Conv2d(384, 1154, kernel_size=1) 376 | self.shortcut = nn.Conv2d(input_channels, 1154, kernel_size=1) 377 | 378 | self.bn = nn.BatchNorm2d(1154) 379 | self.relu = nn.ReLU(inplace=True) 380 | 381 | def forward(self, x): 382 | residual = [ 383 | self.branch1x1(x), 384 | self.branch7x7(x) 385 | ] 386 | 387 | residual = torch.cat(residual, 1) 388 | 389 | #"""In general we picked some scaling factors between 0.1 and 0.3 to scale the residuals 390 | #before their being added to the accumulated layer activations (cf. Figure 20).""" 391 | residual = self.reduction1x1(residual) * 0.1 392 | 393 | shortcut = self.shortcut(x) 394 | 395 | output = self.bn(residual + shortcut) 396 | output = self.relu(output) 397 | 398 | return output 399 | 400 | 401 | class InceptionResNetC(nn.Module): 402 | 403 | def __init__(self, input_channels): 404 | 405 | #Figure 19. 
The schema for 8×8 grid (Inception-ResNet-C) 406 | #module of the Inception-ResNet-v2 network.""" 407 | super().__init__() 408 | self.branch3x3 = nn.Sequential( 409 | BasicConv2d(input_channels, 192, kernel_size=1), 410 | BasicConv2d(192, 224, kernel_size=(1, 3), padding=(0, 1)), 411 | BasicConv2d(224, 256, kernel_size=(3, 1), padding=(1, 0)) 412 | ) 413 | 414 | self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1) 415 | self.reduction1x1 = nn.Conv2d(448, 2048, kernel_size=1) 416 | self.shorcut = nn.Conv2d(input_channels, 2048, kernel_size=1) 417 | self.bn = nn.BatchNorm2d(2048) 418 | self.relu = nn.ReLU(inplace=True) 419 | 420 | def forward(self, x): 421 | residual = [ 422 | self.branch1x1(x), 423 | self.branch3x3(x) 424 | ] 425 | 426 | residual = torch.cat(residual, 1) 427 | residual = self.reduction1x1(residual) * 0.1 428 | 429 | shorcut = self.shorcut(x) 430 | 431 | output = self.bn(shorcut + residual) 432 | output = self.relu(output) 433 | 434 | return output 435 | 436 | class InceptionResNetReductionA(nn.Module): 437 | 438 | #"""Figure 7. The schema for 35 × 35 to 17 × 17 reduction module. 439 | #Different variants of this blocks (with various number of filters) 440 | #are used in Figure 9, and 15 in each of the new Inception(-v4, - ResNet-v1, 441 | #-ResNet-v2) variants presented in this paper. The k, l, m, n numbers 442 | #represent filter bank sizes which can be looked up in Table 1. 443 | def __init__(self, input_channels, k, l, m, n): 444 | 445 | super().__init__() 446 | self.branch3x3stack = nn.Sequential( 447 | BasicConv2d(input_channels, k, kernel_size=1), 448 | BasicConv2d(k, l, kernel_size=3, padding=1), 449 | BasicConv2d(l, m, kernel_size=3, stride=2) 450 | ) 451 | 452 | self.branch3x3 = BasicConv2d(input_channels, n, kernel_size=3, stride=2) 453 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2) 454 | self.output_channels = input_channels + n + m 455 | 456 | def forward(self, x): 457 | 458 | x = [ 459 | self.branch3x3stack(x), 460 | self.branch3x3(x), 461 | self.branchpool(x) 462 | ] 463 | 464 | return torch.cat(x, 1) 465 | 466 | class InceptionResNetReductionB(nn.Module): 467 | 468 | #"""Figure 18. The schema for 17 × 17 to 8 × 8 grid-reduction module. 
469 | #Reduction-B module used by the wider Inception-ResNet-v1 network in 470 | #Figure 15.""" 471 | #I believe it was a typo(Inception-ResNet-v1 should be Inception-ResNet-v2) 472 | def __init__(self, input_channels): 473 | 474 | super().__init__() 475 | self.branchpool = nn.MaxPool2d(3, stride=2) 476 | 477 | self.branch3x3a = nn.Sequential( 478 | BasicConv2d(input_channels, 256, kernel_size=1), 479 | BasicConv2d(256, 384, kernel_size=3, stride=2) 480 | ) 481 | 482 | self.branch3x3b = nn.Sequential( 483 | BasicConv2d(input_channels, 256, kernel_size=1), 484 | BasicConv2d(256, 288, kernel_size=3, stride=2) 485 | ) 486 | 487 | self.branch3x3stack = nn.Sequential( 488 | BasicConv2d(input_channels, 256, kernel_size=1), 489 | BasicConv2d(256, 288, kernel_size=3, padding=1), 490 | BasicConv2d(288, 320, kernel_size=3, stride=2) 491 | ) 492 | 493 | def forward(self, x): 494 | x = [ 495 | self.branch3x3a(x), 496 | self.branch3x3b(x), 497 | self.branch3x3stack(x), 498 | self.branchpool(x) 499 | ] 500 | 501 | x = torch.cat(x, 1) 502 | return x 503 | 504 | class InceptionResNetV2(nn.Module): 505 | 506 | def __init__(self, A, B, C, k=256, l=256, m=384, n=384, class_nums=100): 507 | super().__init__() 508 | self.stem = Inception_Stem(3) 509 | self.inception_resnet_a = self._generate_inception_module(384, 384, A, InceptionResNetA) 510 | self.reduction_a = InceptionResNetReductionA(384, k, l, m, n) 511 | output_channels = self.reduction_a.output_channels 512 | self.inception_resnet_b = self._generate_inception_module(output_channels, 1154, B, InceptionResNetB) 513 | self.reduction_b = InceptionResNetReductionB(1154) 514 | self.inception_resnet_c = self._generate_inception_module(2146, 2048, C, InceptionResNetC) 515 | 516 | #6x6 featuresize 517 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 518 | #"""Dropout (keep 0.8)""" 519 | self.dropout = nn.Dropout2d(1 - 0.8) 520 | self.linear = nn.Linear(2048, class_nums) 521 | 522 | def forward(self, x): 523 | x = self.stem(x) 524 | x = self.inception_resnet_a(x) 525 | x = self.reduction_a(x) 526 | x = self.inception_resnet_b(x) 527 | x = self.reduction_b(x) 528 | x = self.inception_resnet_c(x) 529 | x = self.avgpool(x) 530 | x = self.dropout(x) 531 | x = x.view(-1, 2048) 532 | x = self.linear(x) 533 | 534 | return x 535 | 536 | @staticmethod 537 | def _generate_inception_module(input_channels, output_channels, block_num, block): 538 | 539 | layers = nn.Sequential() 540 | for l in range(block_num): 541 | layers.add_module("{}_{}".format(block.__name__, l), block(input_channels)) 542 | input_channels = output_channels 543 | 544 | return layers 545 | 546 | def inceptionv4(): 547 | return InceptionV4(4, 7, 3) 548 | 549 | def inception_resnet_v2(): 550 | return InceptionResNetV2(5, 10, 5) 551 | -------------------------------------------------------------------------------- /Code/benchmarking/senet.py: -------------------------------------------------------------------------------- 1 | """senet in pytorch 2 | 3 | 4 | 5 | [1] Jie Hu, Li Shen, Samuel Albanie, Gang Sun, Enhua Wu 6 | 7 | Squeeze-and-Excitation Networks 8 | https://arxiv.org/abs/1709.01507 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | def single_list(x): 17 | """ If an Element is a single instead of a list, when a list is expected it created a single element list""" 18 | if x.__class__.__name__ is 'Tensor': 19 | return [x] 20 | else: 21 | return x 22 | 23 | class BasicResidualSEBlock(nn.Module): 24 | expansion = 1 25 | # 
[global_local_attention_addition, global_attention_addition, global_local_attention_concat, global_attention_concat]
26 |     # [global_local_attention_concat_learnable, global_local_attention_addition_learnable]
27 |     # [global_local_attention_learnable_learnable, global_local_attention_learnable_learnable_att]
28 |     # [standard_local_attention, identity_local_attention, pre_local_attention]
29 |     # [multi_scale_conv1d]
30 |     exp_name = 'global_local_attention_concat_learnable'
31 |     def __init__(self, in_channels, out_channels, stride, block_num, r=16, bottleneck=False):  # bottleneck is forwarded from SEResNet._make_stage
32 |         super().__init__()
33 | 
34 |         if ('concat' not in self.exp_name) and (self.exp_name != "global_local_attention_learnable_learnable") and ("multi_scale" not in self.exp_name):
35 |             block_num = 1
36 |         if bottleneck:
37 |             self.expansion = 4
38 |             self.residual = nn.Sequential(
39 |                 nn.Conv2d(in_channels, out_channels, 1),
40 |                 nn.BatchNorm2d(out_channels),
41 |                 nn.ReLU(),
42 | 
43 |                 nn.Conv2d(out_channels, out_channels, 3, stride=stride, padding=1),
44 |                 nn.BatchNorm2d(out_channels),
45 |                 nn.ReLU(),
46 | 
47 |                 nn.Conv2d(out_channels, out_channels * self.expansion, 1),
48 |                 nn.BatchNorm2d(out_channels * self.expansion),
49 |                 nn.ReLU()
50 |             )
51 |         else:
52 |             self.residual = nn.Sequential(
53 |                 nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias = False),
54 |                 nn.BatchNorm2d(out_channels),
55 |                 nn.ReLU(),
56 | 
57 |                 nn.Conv2d(out_channels, out_channels * self.expansion, 3, padding=1, bias = False),
58 |                 nn.BatchNorm2d(out_channels * self.expansion)
59 |             )
60 | 
61 |         self.shortcut = nn.Sequential()
62 |         if stride != 1 or in_channels != out_channels * self.expansion:
63 |             self.shortcut = nn.Sequential(
64 |                 nn.Conv2d(in_channels, out_channels * self.expansion, 1, stride=stride, bias = False),
65 |                 nn.BatchNorm2d(out_channels * self.expansion)
66 |             )
67 | 
68 |         self.squeeze = nn.AdaptiveAvgPool2d(1)
69 | 
70 |         if "multi_scale" in self.exp_name and block_num==1:
71 |             self.excitation2 = nn.Sequential(
72 |                 nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias = False),
73 |                 nn.ReLU(),
74 |                 nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias = False),
75 |                 nn.Sigmoid()
76 |             )
77 | 
78 |         if "multi_scale" in self.exp_name and block_num>1:
79 |             self.multi_scale_Conv1d = nn.Sequential(
80 |                 nn.Conv1d(in_channels=1, out_channels=1, kernel_size=block_num, stride=block_num,
81 |                           padding=0, bias = False),
82 |                 nn.Sigmoid()
83 |             )
84 |         if "multi_scale" in self.exp_name:
85 |             return
86 | 
87 |         self.excitation2 = nn.Sequential(
88 |             nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias = False),
89 |             nn.ReLU(),
90 |             nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias = False),
91 |             nn.Sigmoid()
92 |         )
93 |         if 'global_local_attention_learnable_learnable' == self.exp_name:
94 |             self.in_1_1_conv = nn.Sequential(
95 |                 nn.Conv2d(out_channels * self.expansion * block_num, out_channels * self.expansion, 1, bias = False),
96 |                 nn.ReLU()
97 |             )
98 |             block_num = 1
99 |         if 'standard' not in self.exp_name:
100 |             self.excitation1 = nn.Sequential(
101 |                 nn.Linear(out_channels * self.expansion * block_num, out_channels * self.expansion // r, bias = False),
102 |                 nn.ReLU(),
103 |                 nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias = False),
104 |                 nn.Sigmoid()
105 |             )
106 | 
107 |         if 'standard' not in self.exp_name:
108 |             self.fc = nn.Sequential(
109 |                 nn.Linear(out_channels * self.expansion *2, out_channels * self.expansion, bias
= False), 110 | nn.Sigmoid() 111 | ) 112 | 113 | if 'global_local_attention_learnable_learnable_att' == self.exp_name: 114 | self.att = nn.MultiheadAttention(embed_dim =1, num_heads=1, kdim=64, vdim=64) 115 | 116 | def forward(self, x): 117 | 118 | if self.exp_name is 'global_local_attention_addition': 119 | if x.__class__.__name__ is 'Tensor': 120 | current_input = x 121 | shortcut = self.shortcut(current_input) 122 | residual = self.residual(current_input) 123 | squeeze2 = self.squeeze(residual) 124 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 125 | excitation2 = self.excitation2(squeeze2) 126 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 127 | output = residual * excitation2.expand_as(residual) + shortcut 128 | return (F.relu(output), [residual]) 129 | else : 130 | current_input = x[0] 131 | previous_inputs = x[1] 132 | shortcut = self.shortcut(current_input) 133 | residual = self.residual(current_input) 134 | new_connection = residual 135 | for input_ in previous_inputs: 136 | #if input_.shape[2] != residual.shape[2]: 137 | # input_ = F.adaptive_avg_pool2d(input_, residual.shape[2].item()) 138 | new_connection += input_ 139 | squeeze1 = self.squeeze(new_connection) 140 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 141 | excitation1 = self.excitation1(squeeze1) 142 | # excitation1 = excitation1.view(new_connection.size(0), new_connection.size(1), 1, 1) 143 | squeeze2 = self.squeeze(residual) 144 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 145 | excitation2 = self.excitation2(squeeze2) 146 | # excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 147 | 148 | local_global_mean = torch.mean(torch.stack([excitation1, excitation2]), 0) 149 | local_global_mean = local_global_mean.view(residual.size(0), residual.size(1), 1, 1) 150 | output = residual * local_global_mean + shortcut 151 | previous_inputs.append(residual) 152 | x = (F.relu(output), previous_inputs) 153 | return x 154 | elif self.exp_name is 'global_local_attention_addition_learnable': 155 | if x.__class__.__name__ is 'Tensor': 156 | current_input = x 157 | shortcut = self.shortcut(current_input) 158 | residual = self.residual(current_input) 159 | squeeze2 = self.squeeze(residual) 160 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 161 | excitation2 = self.excitation2(squeeze2) 162 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 163 | output = residual * excitation2.expand_as(residual) + shortcut 164 | return (F.relu(output), [residual]) 165 | else : 166 | current_input = x[0] 167 | previous_inputs = x[1] 168 | shortcut = self.shortcut(current_input) 169 | residual = self.residual(current_input) 170 | new_connection = residual 171 | for input_ in previous_inputs: 172 | #if input_.shape[2] != residual.shape[2]: 173 | # input_ = F.adaptive_avg_pool2d(input_, residual.shape[2].item()) 174 | new_connection += input_ 175 | squeeze1 = self.squeeze(new_connection) 176 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 177 | excitation1 = self.excitation1(squeeze1) 178 | squeeze2 = self.squeeze(residual) 179 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 180 | excitation2 = self.excitation2(squeeze2) 181 | local_global = torch.cat([excitation1, excitation2], dim = 1) 182 | local_global = self.fc(local_global) 183 | local_global = local_global.view(residual.size(0), residual.size(1), 1, 1) 184 | output = residual * local_global + shortcut 185 | previous_inputs.append(residual) 186 | x = (F.relu(output), previous_inputs) 187 | return x 188 | elif 
self.exp_name is 'global_attention_addition': 189 | if x.__class__.__name__ is 'Tensor': 190 | current_input = x 191 | shortcut = self.shortcut(current_input) 192 | residual = self.residual(current_input) 193 | output = residual + shortcut 194 | return (F.relu(output), [residual]) 195 | else : 196 | current_input = x[0] 197 | previous_inputs = x[1] 198 | shortcut = self.shortcut(current_input) 199 | residual = self.residual(current_input) 200 | new_connection = residual 201 | for input_ in previous_inputs: 202 | #if input_.shape[2] != residual.shape[2]: 203 | # input_ = F.adaptive_avg_pool2d(input_, residual.shape[2].item()) 204 | new_connection += input_ 205 | squeeze1 = self.squeeze(new_connection) 206 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 207 | excitation1 = self.excitation1(squeeze1) 208 | excitation1 = excitation1.view(new_connection.size(0), new_connection.size(1), 1, 1) 209 | output = residual * excitation1.expand_as(residual) + shortcut 210 | previous_inputs.append(residual) 211 | x = (F.relu(output), previous_inputs) 212 | return x 213 | elif self.exp_name is 'global_local_attention_concat': 214 | if x.__class__.__name__ is 'Tensor': 215 | current_input = x 216 | shortcut = self.shortcut(current_input) 217 | residual = self.residual(current_input) 218 | squeeze2 = self.squeeze(residual) 219 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 220 | excitation2 = self.excitation2(squeeze2) 221 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 222 | output = residual * excitation2.expand_as(residual) + shortcut 223 | return (F.relu(output), [residual]) 224 | else : 225 | current_input = x[0] 226 | previous_inputs = x[1] 227 | shortcut = self.shortcut(current_input) 228 | residual = self.residual(current_input) 229 | previous_inputs.append(residual) 230 | new_connection = torch.cat(previous_inputs, dim = 1) 231 | squeeze1 = self.squeeze(new_connection) 232 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 233 | excitation1 = self.excitation1(squeeze1) 234 | # excitation1 = excitation1.view(residual.size(0), residual.size(1), 1, 1) 235 | squeeze2 = self.squeeze(residual) 236 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 237 | excitation2 = self.excitation2(squeeze2) 238 | # excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 239 | local_global_mean = torch.mean(torch.stack([excitation1, excitation2]), 0) 240 | local_global_mean = local_global_mean.view(residual.size(0), residual.size(1), 1, 1) 241 | output = residual * local_global_mean + shortcut 242 | 243 | x = (F.relu(output), previous_inputs) 244 | 245 | return x 246 | elif self.exp_name is 'global_local_attention_concat_learnable': 247 | if x.__class__.__name__ is 'Tensor': 248 | current_input = x 249 | shortcut = self.shortcut(current_input) 250 | residual = self.residual(current_input) 251 | squeeze2 = self.squeeze(residual) 252 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 253 | excitation2 = self.excitation2(squeeze2) 254 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 255 | output = residual * excitation2.expand_as(residual) + shortcut 256 | return (F.relu(output), [residual]) 257 | else : 258 | current_input = x[0] 259 | previous_inputs = x[1] 260 | shortcut = self.shortcut(current_input) 261 | residual = self.residual(current_input) 262 | previous_inputs.append(residual) 263 | new_connection = torch.cat(previous_inputs, dim = 1) 264 | squeeze1 = self.squeeze(new_connection) 265 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 266 | 
excitation1 = self.excitation1(squeeze1) 267 | # excitation1 = excitation1.view(residual.size(0), residual.size(1), 1, 1) 268 | squeeze2 = self.squeeze(residual) 269 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 270 | excitation2 = self.excitation2(squeeze2) 271 | # excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 272 | local_global = torch.cat([excitation1, excitation2], dim = 1) 273 | local_global = self.fc(local_global) 274 | local_global = local_global.view(residual.size(0), residual.size(1), 1, 1) 275 | output = residual * local_global + shortcut 276 | 277 | x = (F.relu(output), previous_inputs) 278 | 279 | return x 280 | elif self.exp_name is 'global_attention_concat': 281 | if x.__class__.__name__ is 'Tensor': 282 | current_input = x 283 | shortcut = self.shortcut(current_input) 284 | residual = self.residual(current_input) 285 | output = residual + shortcut 286 | return (F.relu(output), [residual]) 287 | else : 288 | current_input = x[0] 289 | previous_inputs = x[1] 290 | shortcut = self.shortcut(current_input) 291 | residual = self.residual(current_input) 292 | previous_inputs.append(residual) 293 | new_connection = torch.cat(previous_inputs, dim = 1) 294 | squeeze1 = self.squeeze(new_connection) 295 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 296 | excitation1 = self.excitation1(squeeze1) 297 | excitation1 = excitation1.view(residual.size(0), residual.size(1), 1, 1) 298 | output = residual * excitation1.expand_as(residual) + shortcut 299 | x = (F.relu(output), previous_inputs) 300 | return x 301 | elif self.exp_name is 'multi_scale_conv1d': 302 | if x.__class__.__name__ is 'Tensor': 303 | current_input = x 304 | shortcut = self.shortcut(current_input) 305 | residual = self.residual(current_input) 306 | squeeze2 = self.squeeze(residual) 307 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 308 | excitation2 = self.excitation2(squeeze2) 309 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 310 | output = residual * (1+excitation2.expand_as(residual)) + shortcut 311 | squeezed = self.squeeze(residual) 312 | squeezed = squeezed.view(squeezed.size(0), -1) # [N, C] 313 | return (F.relu(output), [squeezed]) 314 | else : 315 | current_input = x[0] 316 | previous_inputs = x[1] 317 | shortcut = self.shortcut(current_input) 318 | residual = self.residual(current_input) 319 | squeezed = self.squeeze(residual) 320 | squeezed = squeezed.view(squeezed.size(0), -1) # [N, C] 321 | previous_inputs.append(squeezed) # [old, new] 322 | new_connection = torch.stack(previous_inputs) # [S, N, C] 323 | new_connection = new_connection.permute(1,2,0).contiguous() # [N, C, S] 324 | new_connection = new_connection.view(new_connection.shape[0], -1).unsqueeze(-1) # [N, C*S, 1] 325 | new_connection = new_connection.permute(0,2,1) # [N, 1, C*S][N, Cin, L] 326 | scales = self.multi_scale_Conv1d(new_connection) 327 | scales = scales.view(residual.size(0), residual.size(1), 1, 1) 328 | output = residual * (1+scales) + shortcut 329 | x = (F.relu(output), previous_inputs) 330 | return x 331 | 332 | elif self.exp_name is 'standard_local_attention': 333 | if x.__class__.__name__ is 'Tensor': 334 | current_input = x 335 | else: 336 | current_input = x[0] 337 | 338 | shortcut = self.shortcut(current_input) 339 | residual = self.residual(current_input) 340 | 341 | squeeze = self.squeeze(residual) 342 | squeeze = squeeze.view(squeeze.size(0), -1) 343 | excitation = self.excitation2(squeeze) 344 | excitation = excitation.view(residual.size(0), residual.size(1), 1, 1) 
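            # standard SE recalibration: scale this block's residual by its own channel gate; no cross-block descriptors are carried forward (hence the empty list returned below)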
345 | 
346 |             output = residual * excitation.expand_as(residual) + shortcut
347 | 
348 |             return (F.relu(output), [])
349 | 
350 |         elif self.exp_name == "identity_local_attention":
351 |             if x.__class__.__name__ == 'Tensor':
352 |                 current_input = x
353 |             else:
354 |                 current_input = x[0]
355 | 
356 |             shortcut = self.shortcut(current_input)
357 | 
358 |             squeeze = self.squeeze(shortcut)
359 |             squeeze = squeeze.view(squeeze.size(0), -1)
360 |             excitation = self.excitation2(squeeze)
361 |             excitation = excitation.view(shortcut.size(0), shortcut.size(1), 1, 1)
362 | 
363 |             residual = self.residual(current_input)
364 | 
365 |             output = shortcut * excitation.expand_as(shortcut) + residual
366 | 
367 |             return (F.relu(output), [])
368 | 
369 |         elif self.exp_name == "pre_local_attention":
370 |             if x.__class__.__name__ == 'Tensor':
371 |                 current_input = x
372 |             else:
373 |                 current_input = x[0]
374 | 
375 |             shortcut = self.shortcut(current_input)
376 | 
377 |             squeeze = self.squeeze(current_input)
378 |             squeeze = squeeze.view(squeeze.size(0), -1)
379 |             excitation = self.excitation2(squeeze)
380 |             excitation = excitation.view(current_input.size(0), current_input.size(1), 1, 1)
381 |             y = current_input * excitation.expand_as(current_input)
382 | 
383 |             residual = self.residual(y)
384 | 
385 |             output = residual + shortcut
386 | 
387 |             return (F.relu(output), [])
388 | 
389 | 
390 | class SEResNet(nn.Module):
391 |     def __init__(self, block, block_num, class_num=120, bottleneck=False):
392 |         super().__init__()
393 |         self.in_channels = 64
394 |         self.pre = nn.Sequential(
395 |             nn.Conv2d(3, 64, 3, padding=1),
396 |             nn.BatchNorm2d(64),
397 |             nn.ReLU()
398 |         )
399 |         self.stage1 = self._make_stage(block, block_num[0], 64, 1, bottleneck=bottleneck)
400 |         self.stage2 = self._make_stage(block, block_num[1], 128, 2, bottleneck=bottleneck)
401 |         self.stage3 = self._make_stage(block, block_num[2], 256, 2, bottleneck=bottleneck)
402 |         self.stage4 = self._make_stage(block, block_num[3], 512, 2, bottleneck=bottleneck)
403 |         self.linear = nn.Linear(self.in_channels, class_num)
404 | 
405 |     def forward(self, x):
406 |         x = self.pre(x)
407 |         x = self.stage1(x)
408 |         x = self.stage2(x[0])
409 |         x = self.stage3(x[0])
410 |         x = self.stage4(x[0])
411 |         x = F.adaptive_avg_pool2d(x[0], 1)
412 |         x = x.view(x.size(0), -1)
413 |         x = self.linear(x)
414 |         return x
415 | 
416 |     def _make_stage(self, block, num, out_channels, stride, bottleneck=False):
417 |         layers = []
418 |         layers.append(block(self.in_channels, out_channels, stride, 1, bottleneck=bottleneck))
419 |         self.in_channels = out_channels * layers[-1].expansion  # instance expansion: 4 when bottleneck, else the class default of 1
420 |         for i in range(1, num):
421 |             layers.append(block(self.in_channels, out_channels, 1, i + 1, bottleneck=bottleneck))
422 |         return nn.Sequential(*layers)
423 | 
424 | 
425 | def seresnet18(num_classes):
426 |     return SEResNet(BasicResidualSEBlock, [2, 2, 2, 2], class_num = num_classes)
427 | 
428 | 
429 | def seresnet34(num_classes):
430 |     return SEResNet(BasicResidualSEBlock, [3, 4, 6, 3], class_num = num_classes)
431 | 
432 | 
433 | def seresnet50(num_classes):
434 |     return SEResNet(BasicResidualSEBlock, [3, 4, 6, 3], class_num = num_classes, bottleneck=True)
435 | 
436 | 
437 | def seresnet101(num_classes):
438 |     return SEResNet(BasicResidualSEBlock, [3, 4, 23, 3], class_num = num_classes, bottleneck=True)
439 | 
440 | 
441 | def seresnet152(num_classes):
442 |     return SEResNet(BasicResidualSEBlock, [3, 8, 36, 3], class_num = num_classes, bottleneck=True)
443 | 
--------------------------------------------------------------------------------
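For readers who want the core of the multi_scale_conv1d variant without the surrounding branching, the following is a minimal, self-contained sketch (illustrative only; the class name and test shapes are chosen here, not taken from the repository). Each block contributes a globally average-pooled channel descriptor of shape [N, C]; the descriptors of the current and all previous blocks in a stage are stacked and fused by a single 1-D convolution whose kernel size and stride equal the number of descriptors, yielding one attention weight per channel.

import torch
import torch.nn as nn


class MultiScaleChannelAttention(nn.Module):
    """Fuse per-block channel descriptors with one Conv1d (sketch)."""

    def __init__(self, num_descriptors):
        super().__init__()
        # Sliding with kernel_size = stride = num_descriptors visits each
        # channel's group of descriptors exactly once -> one weight per channel.
        self.fuse = nn.Sequential(
            nn.Conv1d(1, 1, kernel_size=num_descriptors, stride=num_descriptors, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, descriptors):
        # descriptors: list of S tensors, each [N, C] (one per block seen so far)
        x = torch.stack(descriptors)              # [S, N, C]
        x = x.permute(1, 2, 0).contiguous()       # [N, C, S]
        n, c, s = x.shape
        x = x.view(n, 1, c * s)                   # [N, 1, C*S], channel-major layout
        scales = self.fuse(x)                     # [N, 1, C]
        return scales.view(n, c, 1, 1)            # broadcastable over [N, C, H, W]


# toy check with three 64-channel descriptors
att = MultiScaleChannelAttention(num_descriptors=3)
descs = [torch.randn(2, 64) for _ in range(3)]
print(att(descs).shape)  # torch.Size([2, 64, 1, 1])

In the block above these weights are applied to the feature map as residual * (1 + scales) + shortcut before the ReLU.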
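A quick smoke test for the factory functions at the end of senet.py. It assumes the benchmarking directory is on the Python path (so senet imports as a plain module) and uses a 64x64 tiny-ImageNet-sized input; the tuple threading between blocks (feature map plus the list of previous descriptors) is handled inside SEResNet.forward, so callers only see a plain logits tensor.

import torch
from senet import seresnet18   # assumes Code/benchmarking is on PYTHONPATH

model = seresnet18(num_classes=200)    # tiny-ImageNet has 200 classes
model.eval()

x = torch.randn(2, 3, 64, 64)
with torch.no_grad():
    logits = model(x)
print(logits.shape)                    # torch.Size([2, 200])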