├── Figures
├── gradcam.jpg
├── scaling.png
├── Top-1_ACC.jpg
├── gradcam2.jpg
├── EMCA_Algorithm.PNG
├── EMCA_and_integration_CVPR.png
├── EMCA_archeticture_only_CVPR.png
├── EMCA_integration_only_CVPR.png
└── Revisit_Channel_Attention_dense_connection.png
├── Code
├── benchmarking
│   ├── __pycache__
│   │   ├── cbam.cpython-37.pyc
│   │   ├── resnet.cpython-37.pyc
│   │   ├── senet.cpython-37.pyc
│   │   ├── densenet.cpython-37.pyc
│   │   ├── self_att.cpython-37.pyc
│   │   └── resnet_cbam.cpython-37.pyc
│   ├── vgg.py
│   ├── squeezenet.py
│   ├── mobilenetv2.py
│   ├── cbam.py
│   ├── preactresnet.py
│   ├── resnext.py
│   ├── googlenet.py
│   ├── shufflenetv2.py
│   ├── resnet.py
│   ├── mobilenet.py
│   ├── xception.py
│   ├── resnet_cbam.py
│   ├── rir.py
│   ├── shufflenet.py
│   ├── densenet.py
│   ├── nasnet.py
│   ├── inceptionv3.py
│   ├── attention.py
│   ├── inceptionv4.py
│   └── senet.py
├── conf
│   ├── __init__.py
│   └── global_settings.py
├── dataset.py
├── train.py
└── train_imagenet.py
└── README.md
/Figures/gradcam.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/gradcam.jpg
--------------------------------------------------------------------------------
/Figures/scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/scaling.png
--------------------------------------------------------------------------------
/Figures/Top-1_ACC.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/Top-1_ACC.jpg
--------------------------------------------------------------------------------
/Figures/gradcam2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/gradcam2.jpg
--------------------------------------------------------------------------------
/Figures/EMCA_Algorithm.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/EMCA_Algorithm.PNG
--------------------------------------------------------------------------------
/Figures/EMCA_and_integration_CVPR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/EMCA_and_integration_CVPR.png
--------------------------------------------------------------------------------
/Figures/EMCA_archeticture_only_CVPR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/EMCA_archeticture_only_CVPR.png
--------------------------------------------------------------------------------
/Figures/EMCA_integration_only_CVPR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/EMCA_integration_only_CVPR.png
--------------------------------------------------------------------------------
/Code/benchmarking/__pycache__/cbam.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/cbam.cpython-37.pyc
--------------------------------------------------------------------------------
/Code/benchmarking/__pycache__/resnet.cpython-37.pyc:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/resnet.cpython-37.pyc -------------------------------------------------------------------------------- /Code/benchmarking/__pycache__/senet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/senet.cpython-37.pyc -------------------------------------------------------------------------------- /Code/benchmarking/__pycache__/densenet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/densenet.cpython-37.pyc -------------------------------------------------------------------------------- /Code/benchmarking/__pycache__/self_att.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/self_att.cpython-37.pyc -------------------------------------------------------------------------------- /Figures/Revisit_Channel_Attention_dense_connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Figures/Revisit_Channel_Attention_dense_connection.png -------------------------------------------------------------------------------- /Code/benchmarking/__pycache__/resnet_cbam.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eslambakr/EMCA/HEAD/Code/benchmarking/__pycache__/resnet_cbam.cpython-37.pyc -------------------------------------------------------------------------------- /Code/conf/__init__.py: -------------------------------------------------------------------------------- 1 | """ dynamically load settings 2 | 3 | author baiyu 4 | """ 5 | import conf.global_settings as settings 6 | 7 | class Settings: 8 | def __init__(self, settings): 9 | 10 | for attr in dir(settings): 11 | if attr.isupper(): 12 | setattr(self, attr, getattr(settings, attr)) 13 | 14 | settings = Settings(settings) -------------------------------------------------------------------------------- /Code/conf/global_settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | #CIFAR100 dataset path (python version) 5 | #CIFAR100_PATH = '/nfs/private/cifar100/cifar-100-python' 6 | 7 | #mean and std of cifar100 dataset 8 | CIFAR100_TRAIN_MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343) 9 | CIFAR100_TRAIN_STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404) 10 | 11 | #CIFAR100_TEST_MEAN = (0.5088964127604166, 0.48739301317401956, 0.44194221124387256) 12 | #CIFAR100_TEST_STD = (0.2682515741720801, 0.2573637364478126, 0.2770957707973042) 13 | 14 | # data_set type 15 | data_type = "tiny-imagenet" 16 | if data_type == "cifar100": 17 | IMG_SIZE = 32 18 | elif data_type == "tiny-imagenet": 19 | IMG_SIZE = 64 20 | elif data_type == "dogs": 21 | IMG_SIZE = 128 22 | elif data_type == "imagenet": 23 | IMG_SIZE = 224 24 | 25 | #directory to save weights file 26 | CHECKPOINT_PATH = 'checkpoint' 27 | 28 | #total training epoches 29 | EPOCH = 230 30 | MILESTONES = [60, 120, 160, 200] 31 | 32 | #initial learning rate 33 | 
#INIT_LR = 0.1 34 | 35 | #time of we run the script 36 | TIME_NOW = 'tiny_imagenet_self_local_ch_att_simple_3att' 37 | 38 | #tensorboard log dir 39 | LOG_DIR = 'runs' 40 | 41 | #save weights file per SAVE_EPOCH epoch 42 | SAVE_EPOCH = 100 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Code/benchmarking/vgg.py: -------------------------------------------------------------------------------- 1 | """vgg in pytorch 2 | 3 | 4 | [1] Karen Simonyan, Andrew Zisserman 5 | 6 | Very Deep Convolutional Networks for Large-Scale Image Recognition. 7 | https://arxiv.org/abs/1409.1556v6 8 | """ 9 | '''VGG11/13/16/19 in Pytorch.''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | cfg = { 15 | 'A' : [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 16 | 'B' : [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 17 | 'D' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 18 | 'E' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'] 19 | } 20 | 21 | class VGG(nn.Module): 22 | 23 | def __init__(self, features, num_class=100): 24 | super().__init__() 25 | self.features = features 26 | 27 | self.classifier = nn.Sequential( 28 | nn.Linear(512, 4096), 29 | nn.ReLU(inplace=True), 30 | nn.Dropout(), 31 | nn.Linear(4096, 4096), 32 | nn.ReLU(inplace=True), 33 | nn.Dropout(), 34 | nn.Linear(4096, num_class) 35 | ) 36 | 37 | def forward(self, x): 38 | output = self.features(x) 39 | output = output.view(output.size()[0], -1) 40 | output = self.classifier(output) 41 | 42 | return output 43 | 44 | def make_layers(cfg, batch_norm=False): 45 | layers = [] 46 | 47 | input_channel = 3 48 | for l in cfg: 49 | if l == 'M': 50 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 51 | continue 52 | 53 | layers += [nn.Conv2d(input_channel, l, kernel_size=3, padding=1)] 54 | 55 | if batch_norm: 56 | layers += [nn.BatchNorm2d(l)] 57 | 58 | layers += [nn.ReLU(inplace=True)] 59 | input_channel = l 60 | 61 | return nn.Sequential(*layers) 62 | 63 | def vgg11_bn(): 64 | return VGG(make_layers(cfg['A'], batch_norm=True)) 65 | 66 | def vgg13_bn(): 67 | return VGG(make_layers(cfg['B'], batch_norm=True)) 68 | 69 | def vgg16_bn(): 70 | return VGG(make_layers(cfg['D'], batch_norm=True)) 71 | 72 | def vgg19_bn(): 73 | return VGG(make_layers(cfg['E'], batch_norm=True)) 74 | 75 | 76 | -------------------------------------------------------------------------------- /Code/benchmarking/squeezenet.py: -------------------------------------------------------------------------------- 1 | """squeezenet in pytorch 2 | 3 | 4 | 5 | [1] Song Han, Jeff Pool, John Tran, William J. 
Dally 6 | 7 | squeezenet: Learning both Weights and Connections for Efficient Neural Networks 8 | https://arxiv.org/abs/1506.02626 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | 15 | class Fire(nn.Module): 16 | 17 | def __init__(self, in_channel, out_channel, squzee_channel): 18 | 19 | super().__init__() 20 | self.squeeze = nn.Sequential( 21 | nn.Conv2d(in_channel, squzee_channel, 1), 22 | nn.BatchNorm2d(squzee_channel), 23 | nn.ReLU(inplace=True) 24 | ) 25 | 26 | self.expand_1x1 = nn.Sequential( 27 | nn.Conv2d(squzee_channel, int(out_channel / 2), 1), 28 | nn.BatchNorm2d(int(out_channel / 2)), 29 | nn.ReLU(inplace=True) 30 | ) 31 | 32 | self.expand_3x3 = nn.Sequential( 33 | nn.Conv2d(squzee_channel, int(out_channel / 2), 3, padding=1), 34 | nn.BatchNorm2d(int(out_channel / 2)), 35 | nn.ReLU(inplace=True) 36 | ) 37 | 38 | def forward(self, x): 39 | 40 | x = self.squeeze(x) 41 | x = torch.cat([ 42 | self.expand_1x1(x), 43 | self.expand_3x3(x) 44 | ], 1) 45 | 46 | return x 47 | 48 | class SqueezeNet(nn.Module): 49 | 50 | """mobile net with simple bypass""" 51 | def __init__(self, class_num=100): 52 | 53 | super().__init__() 54 | self.stem = nn.Sequential( 55 | nn.Conv2d(3, 96, 3, padding=1), 56 | nn.BatchNorm2d(96), 57 | nn.ReLU(inplace=True), 58 | nn.MaxPool2d(2, 2) 59 | ) 60 | 61 | self.fire2 = Fire(96, 128, 16) 62 | self.fire3 = Fire(128, 128, 16) 63 | self.fire4 = Fire(128, 256, 32) 64 | self.fire5 = Fire(256, 256, 32) 65 | self.fire6 = Fire(256, 384, 48) 66 | self.fire7 = Fire(384, 384, 48) 67 | self.fire8 = Fire(384, 512, 64) 68 | self.fire9 = Fire(512, 512, 64) 69 | 70 | self.conv10 = nn.Conv2d(512, class_num, 1) 71 | self.avg = nn.AdaptiveAvgPool2d(1) 72 | self.maxpool = nn.MaxPool2d(2, 2) 73 | 74 | def forward(self, x): 75 | x = self.stem(x) 76 | 77 | f2 = self.fire2(x) 78 | f3 = self.fire3(f2) + f2 79 | f4 = self.fire4(f3) 80 | f4 = self.maxpool(f4) 81 | 82 | f5 = self.fire5(f4) + f4 83 | f6 = self.fire6(f5) 84 | f7 = self.fire7(f6) + f6 85 | f8 = self.fire8(f7) 86 | f8 = self.maxpool(f8) 87 | 88 | f9 = self.fire9(f8) 89 | c10 = self.conv10(f9) 90 | 91 | x = self.avg(c10) 92 | x = x.view(x.size(0), -1) 93 | 94 | return x 95 | 96 | def squeezenet(class_num=100): 97 | return SqueezeNet(class_num=class_num) 98 | -------------------------------------------------------------------------------- /Code/benchmarking/mobilenetv2.py: -------------------------------------------------------------------------------- 1 | """mobilenetv2 in pytorch 2 | 3 | 4 | 5 | [1] Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 6 | 7 | MobileNetV2: Inverted Residuals and Linear Bottlenecks 8 | https://arxiv.org/abs/1801.04381 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | class LinearBottleNeck(nn.Module): 17 | 18 | def __init__(self, in_channels, out_channels, stride, t=6, class_num=100): 19 | super().__init__() 20 | 21 | self.residual = nn.Sequential( 22 | nn.Conv2d(in_channels, in_channels * t, 1), 23 | nn.BatchNorm2d(in_channels * t), 24 | nn.ReLU6(inplace=True), 25 | 26 | nn.Conv2d(in_channels * t, in_channels * t, 3, stride=stride, padding=1, groups=in_channels * t), 27 | nn.BatchNorm2d(in_channels * t), 28 | nn.ReLU6(inplace=True), 29 | 30 | nn.Conv2d(in_channels * t, out_channels, 1), 31 | nn.BatchNorm2d(out_channels) 32 | ) 33 | 34 | self.stride = stride 35 | self.in_channels = in_channels 36 | self.out_channels = out_channels 37 | 38 | def forward(self, x): 39 | 40 | residual = 
self.residual(x) 41 | 42 | if self.stride == 1 and self.in_channels == self.out_channels: 43 | residual += x 44 | 45 | return residual 46 | 47 | class MobileNetV2(nn.Module): 48 | 49 | def __init__(self, class_num=100): 50 | super().__init__() 51 | 52 | self.pre = nn.Sequential( 53 | nn.Conv2d(3, 32, 1, padding=1), 54 | nn.BatchNorm2d(32), 55 | nn.ReLU6(inplace=True) 56 | ) 57 | 58 | self.stage1 = LinearBottleNeck(32, 16, 1, 1) 59 | self.stage2 = self._make_stage(2, 16, 24, 2, 6) 60 | self.stage3 = self._make_stage(3, 24, 32, 2, 6) 61 | self.stage4 = self._make_stage(4, 32, 64, 2, 6) 62 | self.stage5 = self._make_stage(3, 64, 96, 1, 6) 63 | self.stage6 = self._make_stage(3, 96, 160, 1, 6) 64 | self.stage7 = LinearBottleNeck(160, 320, 1, 6) 65 | 66 | self.conv1 = nn.Sequential( 67 | nn.Conv2d(320, 1280, 1), 68 | nn.BatchNorm2d(1280), 69 | nn.ReLU6(inplace=True) 70 | ) 71 | 72 | self.conv2 = nn.Conv2d(1280, class_num, 1) 73 | 74 | def forward(self, x): 75 | x = self.pre(x) 76 | x = self.stage1(x) 77 | x = self.stage2(x) 78 | x = self.stage3(x) 79 | x = self.stage4(x) 80 | x = self.stage5(x) 81 | x = self.stage6(x) 82 | x = self.stage7(x) 83 | x = self.conv1(x) 84 | x = F.adaptive_avg_pool2d(x, 1) 85 | x = self.conv2(x) 86 | x = x.view(x.size(0), -1) 87 | 88 | return x 89 | 90 | def _make_stage(self, repeat, in_channels, out_channels, stride, t): 91 | 92 | layers = [] 93 | layers.append(LinearBottleNeck(in_channels, out_channels, stride, t)) 94 | 95 | while repeat - 1: 96 | layers.append(LinearBottleNeck(out_channels, out_channels, 1, t)) 97 | repeat -= 1 98 | 99 | return nn.Sequential(*layers) 100 | 101 | def mobilenetv2(): 102 | return MobileNetV2() -------------------------------------------------------------------------------- /Code/benchmarking/cbam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class BasicConv(nn.Module): 7 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 8 | super(BasicConv, self).__init__() 9 | self.out_channels = out_planes 10 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 12 | self.relu = nn.ReLU() if relu else None 13 | 14 | def forward(self, x): 15 | x = self.conv(x) 16 | if self.bn is not None: 17 | x = self.bn(x) 18 | if self.relu is not None: 19 | x = self.relu(x) 20 | return x 21 | 22 | class Flatten(nn.Module): 23 | def forward(self, x): 24 | return x.view(x.size(0), -1) 25 | 26 | class ChannelGate(nn.Module): 27 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): 28 | super(ChannelGate, self).__init__() 29 | self.gate_channels = gate_channels 30 | self.mlp = nn.Sequential( 31 | Flatten(), 32 | nn.Linear(gate_channels, gate_channels // reduction_ratio), 33 | nn.ReLU(), 34 | nn.Linear(gate_channels // reduction_ratio, gate_channels) 35 | ) 36 | self.pool_types = pool_types 37 | def forward(self, x): 38 | channel_att_sum = None 39 | for pool_type in self.pool_types: 40 | if pool_type=='avg': 41 | avg_pool = F.avg_pool2d( x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 42 | channel_att_raw = self.mlp( avg_pool ) 43 | elif pool_type=='max': 44 | max_pool = F.max_pool2d( x, (x.size(2), x.size(3)), 
stride=(x.size(2), x.size(3))) 45 | channel_att_raw = self.mlp( max_pool ) 46 | elif pool_type=='lp': 47 | lp_pool = F.lp_pool2d( x, 2, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 48 | channel_att_raw = self.mlp( lp_pool ) 49 | elif pool_type=='lse': 50 | # LSE pool only 51 | lse_pool = logsumexp_2d(x) 52 | channel_att_raw = self.mlp( lse_pool ) 53 | 54 | if channel_att_sum is None: 55 | channel_att_sum = channel_att_raw 56 | else: 57 | channel_att_sum = channel_att_sum + channel_att_raw 58 | 59 | scale = F.sigmoid( channel_att_sum ).unsqueeze(2).unsqueeze(3).expand_as(x) 60 | return x * scale 61 | 62 | def logsumexp_2d(tensor): 63 | tensor_flatten = tensor.view(tensor.size(0), tensor.size(1), -1) 64 | s, _ = torch.max(tensor_flatten, dim=2, keepdim=True) 65 | outputs = s + (tensor_flatten - s).exp().sum(dim=2, keepdim=True).log() 66 | return outputs 67 | 68 | class ChannelPool(nn.Module): 69 | def forward(self, x): 70 | return torch.cat( (torch.max(x,1)[0].unsqueeze(1), torch.mean(x,1).unsqueeze(1)), dim=1 ) 71 | 72 | class SpatialGate(nn.Module): 73 | def __init__(self): 74 | super(SpatialGate, self).__init__() 75 | kernel_size = 7 76 | self.compress = ChannelPool() 77 | self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size-1) // 2, relu=False) 78 | def forward(self, x): 79 | x_compress = self.compress(x) 80 | x_out = self.spatial(x_compress) 81 | scale = F.sigmoid(x_out) # broadcasting 82 | return x * scale 83 | 84 | class CBAM(nn.Module): 85 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max'], no_spatial=False, no_channel=False): 86 | super(CBAM, self).__init__() 87 | self.no_channel=no_channel 88 | if not no_channel: 89 | self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types) 90 | self.no_spatial=no_spatial 91 | if not no_spatial: 92 | self.SpatialGate = SpatialGate() 93 | def forward(self, x): 94 | if not self.no_channel: 95 | x_out = self.ChannelGate(x) 96 | else: 97 | x_out = x 98 | if not self.no_spatial: 99 | x_out = self.SpatialGate(x_out) 100 | return x_out 101 | -------------------------------------------------------------------------------- /Code/benchmarking/preactresnet.py: -------------------------------------------------------------------------------- 1 | """preactresnet in pytorch 2 | 3 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 4 | 5 | Identity Mappings in Deep Residual Networks 6 | https://arxiv.org/abs/1603.05027 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | class PreActBasic(nn.Module): 14 | 15 | expansion = 1 16 | def __init__(self, in_channels, out_channels, stride): 17 | super().__init__() 18 | self.residual = nn.Sequential( 19 | nn.BatchNorm2d(in_channels), 20 | nn.ReLU(inplace=True), 21 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1), 22 | nn.BatchNorm2d(out_channels), 23 | nn.ReLU(inplace=True), 24 | nn.Conv2d(out_channels, out_channels * PreActBasic.expansion, kernel_size=3, padding=1) 25 | ) 26 | 27 | self.shortcut = nn.Sequential() 28 | if stride != 1 or in_channels != out_channels * PreActBasic.expansion: 29 | self.shortcut = nn.Conv2d(in_channels, out_channels * PreActBasic.expansion, 1, stride=stride) 30 | 31 | def forward(self, x): 32 | 33 | res = self.residual(x) 34 | shortcut = self.shortcut(x) 35 | 36 | return res + shortcut 37 | 38 | 39 | class PreActBottleNeck(nn.Module): 40 | 41 | expansion = 4 42 | def __init__(self, in_channels, out_channels, stride): 43 | 
super().__init__() 44 | 45 | self.residual = nn.Sequential( 46 | nn.BatchNorm2d(in_channels), 47 | nn.ReLU(inplace=True), 48 | nn.Conv2d(in_channels, out_channels, 1, stride=stride), 49 | 50 | nn.BatchNorm2d(out_channels), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(out_channels, out_channels, 3, padding=1), 53 | 54 | nn.BatchNorm2d(out_channels), 55 | nn.ReLU(inplace=True), 56 | nn.Conv2d(out_channels, out_channels * PreActBottleNeck.expansion, 1) 57 | ) 58 | 59 | self.shortcut = nn.Sequential() 60 | 61 | if stride != 1 or in_channels != out_channels * PreActBottleNeck.expansion: 62 | self.shortcut = nn.Conv2d(in_channels, out_channels * PreActBottleNeck.expansion, 1, stride=stride) 63 | 64 | def forward(self, x): 65 | 66 | res = self.residual(x) 67 | shortcut = self.shortcut(x) 68 | 69 | return res + shortcut 70 | 71 | class PreActResNet(nn.Module): 72 | 73 | def __init__(self, block, num_block, class_num=100): 74 | super().__init__() 75 | self.input_channels = 64 76 | 77 | self.pre = nn.Sequential( 78 | nn.Conv2d(3, 64, 3, padding=1), 79 | nn.BatchNorm2d(64), 80 | nn.ReLU(inplace=True) 81 | ) 82 | 83 | self.stage1 = self._make_layers(block, num_block[0], 64, 1) 84 | self.stage2 = self._make_layers(block, num_block[1], 128, 2) 85 | self.stage3 = self._make_layers(block, num_block[2], 256, 2) 86 | self.stage4 = self._make_layers(block, num_block[3], 512, 2) 87 | 88 | self.linear = nn.Linear(self.input_channels, class_num) 89 | 90 | def _make_layers(self, block, block_num, out_channels, stride): 91 | layers = [] 92 | 93 | layers.append(block(self.input_channels, out_channels, stride)) 94 | self.input_channels = out_channels * block.expansion 95 | 96 | while block_num - 1: 97 | layers.append(block(self.input_channels, out_channels, 1)) 98 | self.input_channels = out_channels * block.expansion 99 | block_num -= 1 100 | 101 | return nn.Sequential(*layers) 102 | 103 | def forward(self, x): 104 | x = self.pre(x) 105 | 106 | x = self.stage1(x) 107 | x = self.stage2(x) 108 | x = self.stage3(x) 109 | x = self.stage4(x) 110 | 111 | x = F.adaptive_avg_pool2d(x, 1) 112 | x = x.view(x.size(0), -1) 113 | x = self.linear(x) 114 | 115 | return x 116 | 117 | def preactresnet18(): 118 | return PreActResNet(PreActBasic, [2, 2, 2, 2]) 119 | 120 | def preactresnet34(): 121 | return PreActResNet(PreActBasic, [3, 4, 6, 3]) 122 | 123 | def preactresnet50(): 124 | return PreActResNet(PreActBottleNeck, [3, 4, 6, 3]) 125 | 126 | def preactresnet101(): 127 | return PreActResNet(PreActBottleNeck, [3, 4, 23, 3]) 128 | 129 | def preactresnet152(): 130 | return PreActResNet(PreActBottleNeck, [3, 8, 36, 3]) 131 | 132 | -------------------------------------------------------------------------------- /Code/benchmarking/resnext.py: -------------------------------------------------------------------------------- 1 | """resnext in pytorch 2 | 3 | 4 | 5 | [1] Saining Xie, Ross Girshick, Piotr Dollár, Zhuowen Tu, Kaiming He. 6 | 7 | Aggregated Residual Transformations for Deep Neural Networks 8 | https://arxiv.org/abs/1611.05431 9 | """ 10 | 11 | import math 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | #only implements ResNext bottleneck c 17 | 18 | 19 | #"""This strategy exposes a new dimension, which we call “cardinality” 20 | #(the size of the set of transformations), as an essential factor 21 | #in addition to the dimensions of depth and width.""" 22 | CARDINALITY = 32 23 | DEPTH = 4 24 | BASEWIDTH = 64 25 | 26 | #"""The grouped convolutional layer in Fig. 
3(c) performs 32 groups 27 | #of convolutions whose input and output channels are 4-dimensional. 28 | #The grouped convolutional layer concatenates them as the outputs 29 | #of the layer.""" 30 | 31 | class ResNextBottleNeckC(nn.Module): 32 | 33 | def __init__(self, in_channels, out_channels, stride): 34 | super().__init__() 35 | 36 | C = CARDINALITY #How many groups a feature map was splitted into 37 | 38 | #"""We note that the input/output width of the template is fixed as 39 | #256-d (Fig. 3), We note that the input/output width of the template 40 | #is fixed as 256-d (Fig. 3), and all widths are dou- bled each time 41 | #when the feature map is subsampled (see Table 1).""" 42 | D = int(DEPTH * out_channels / BASEWIDTH) #number of channels per group 43 | self.split_transforms = nn.Sequential( 44 | nn.Conv2d(in_channels, C * D, kernel_size=1, groups=C, bias=False), 45 | nn.BatchNorm2d(C * D), 46 | nn.ReLU(inplace=True), 47 | nn.Conv2d(C * D, C * D, kernel_size=3, stride=stride, groups=C, padding=1, bias=False), 48 | nn.BatchNorm2d(C * D), 49 | nn.ReLU(inplace=True), 50 | nn.Conv2d(C * D, out_channels * 4, kernel_size=1, bias=False), 51 | nn.BatchNorm2d(out_channels * 4), 52 | ) 53 | 54 | self.shortcut = nn.Sequential() 55 | 56 | if stride != 1 or in_channels != out_channels * 4: 57 | self.shortcut = nn.Sequential( 58 | nn.Conv2d(in_channels, out_channels * 4, stride=stride, kernel_size=1, bias=False), 59 | nn.BatchNorm2d(out_channels * 4) 60 | ) 61 | 62 | def forward(self, x): 63 | return F.relu(self.split_transforms(x) + self.shortcut(x)) 64 | 65 | class ResNext(nn.Module): 66 | 67 | def __init__(self, block, num_blocks, class_names=100): 68 | super().__init__() 69 | self.in_channels = 64 70 | 71 | self.conv1 = nn.Sequential( 72 | nn.Conv2d(3, 64, 3, stride=1, padding=1, bias=False), 73 | nn.BatchNorm2d(64), 74 | nn.ReLU(inplace=True) 75 | ) 76 | 77 | self.conv2 = self._make_layer(block, num_blocks[0], 64, 1) 78 | self.conv3 = self._make_layer(block, num_blocks[1], 128, 2) 79 | self.conv4 = self._make_layer(block, num_blocks[2], 256, 2) 80 | self.conv5 = self._make_layer(block, num_blocks[3], 512, 2) 81 | self.avg = nn.AdaptiveAvgPool2d((1, 1)) 82 | self.fc = nn.Linear(512 * 4, 100) 83 | 84 | def forward(self, x): 85 | x = self.conv1(x) 86 | x = self.conv2(x) 87 | x = self.conv3(x) 88 | x = self.conv4(x) 89 | x = self.conv5(x) 90 | x = self.avg(x) 91 | x = x.view(x.size(0), -1) 92 | x = self.fc(x) 93 | return x 94 | 95 | def _make_layer(self, block, num_block, out_channels, stride): 96 | """Building resnext block 97 | Args: 98 | block: block type(default resnext bottleneck c) 99 | num_block: number of blocks per layer 100 | out_channels: output channels per block 101 | stride: block stride 102 | 103 | Returns: 104 | a resnext layer 105 | """ 106 | strides = [stride] + [1] * (num_block - 1) 107 | layers = [] 108 | for stride in strides: 109 | layers.append(block(self.in_channels, out_channels, stride)) 110 | self.in_channels = out_channels * 4 111 | 112 | return nn.Sequential(*layers) 113 | 114 | def resnext50(): 115 | """ return a resnext50(c32x4d) network 116 | """ 117 | return ResNext(ResNextBottleNeckC, [3, 4, 6, 3]) 118 | 119 | def resnext101(): 120 | """ return a resnext101(c32x4d) network 121 | """ 122 | return ResNext(ResNextBottleNeckC, [3, 4, 23, 3]) 123 | 124 | def resnext152(): 125 | """ return a resnext101(c32x4d) network 126 | """ 127 | return ResNext(ResNextBottleNeckC, [3, 4, 36, 3]) 128 | 129 | 130 | 131 | 
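
Usage note (added for illustration, not part of the original repository): each of the benchmarking models above is a plain torch.nn.Module factory, so a minimal smoke test on a CIFAR-100-sized input might look like the sketch below. It assumes the interpreter is started inside Code/benchmarking/ so the import resolves; the batch size of 2 is arbitrary, and the 32x32 resolution matches the cifar100 branch of Code/conf/global_settings.py.

import torch
from resnext import resnext50   # any other factory here (e.g. vgg16_bn, squeezenet) works the same way

model = resnext50()                    # ResNeXt-50 (32x4d) with the hard-coded 100-class head defined above
model.eval()
dummy = torch.randn(2, 3, 32, 32)      # CIFAR-100-sized batch; IMG_SIZE = 32 in global_settings.py
with torch.no_grad():
    logits = model(dummy)
print(logits.shape)                    # expected: torch.Size([2, 100])
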
-------------------------------------------------------------------------------- /Code/benchmarking/googlenet.py: -------------------------------------------------------------------------------- 1 | """google net in pytorch 2 | 3 | 4 | 5 | [1] Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 6 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 7 | 8 | Going Deeper with Convolutions 9 | https://arxiv.org/abs/1409.4842v1 10 | """ 11 | 12 | import torch 13 | import torch.nn as nn 14 | 15 | class Inception(nn.Module): 16 | def __init__(self, input_channels, n1x1, n3x3_reduce, n3x3, n5x5_reduce, n5x5, pool_proj): 17 | super().__init__() 18 | 19 | #1x1conv branch 20 | self.b1 = nn.Sequential( 21 | nn.Conv2d(input_channels, n1x1, kernel_size=1), 22 | nn.BatchNorm2d(n1x1), 23 | nn.ReLU(inplace=True) 24 | ) 25 | 26 | #1x1conv -> 3x3conv branch 27 | self.b2 = nn.Sequential( 28 | nn.Conv2d(input_channels, n3x3_reduce, kernel_size=1), 29 | nn.BatchNorm2d(n3x3_reduce), 30 | nn.ReLU(inplace=True), 31 | nn.Conv2d(n3x3_reduce, n3x3, kernel_size=3, padding=1), 32 | nn.BatchNorm2d(n3x3), 33 | nn.ReLU(inplace=True) 34 | ) 35 | 36 | #1x1conv -> 5x5conv branch 37 | #we use 2 3x3 conv filters stacked instead 38 | #of 1 5x5 filters to obtain the same receptive 39 | #field with fewer parameters 40 | self.b3 = nn.Sequential( 41 | nn.Conv2d(input_channels, n5x5_reduce, kernel_size=1), 42 | nn.BatchNorm2d(n5x5_reduce), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(n5x5_reduce, n5x5, kernel_size=3, padding=1), 45 | nn.BatchNorm2d(n5x5, n5x5), 46 | nn.ReLU(inplace=True), 47 | nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1), 48 | nn.BatchNorm2d(n5x5), 49 | nn.ReLU(inplace=True) 50 | ) 51 | 52 | #3x3pooling -> 1x1conv 53 | #same conv 54 | self.b4 = nn.Sequential( 55 | nn.MaxPool2d(3, stride=1, padding=1), 56 | nn.Conv2d(input_channels, pool_proj, kernel_size=1), 57 | nn.BatchNorm2d(pool_proj), 58 | nn.ReLU(inplace=True) 59 | ) 60 | 61 | def forward(self, x): 62 | return torch.cat([self.b1(x), self.b2(x), self.b3(x), self.b4(x)], dim=1) 63 | 64 | 65 | class GoogleNet(nn.Module): 66 | 67 | def __init__(self, num_class=100): 68 | super().__init__() 69 | self.prelayer = nn.Sequential( 70 | nn.Conv2d(3, 192, kernel_size=3, padding=1), 71 | nn.BatchNorm2d(192), 72 | nn.ReLU(inplace=True) 73 | ) 74 | 75 | #although we only use 1 conv layer as prelayer, 76 | #we still use name a3, b3....... 
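#(comment added for clarity; not in the original source) each Inception block
#concatenates its four branches, so its output width is n1x1 + n3x3 + n5x5 + pool_proj:
#a3 below emits 64 + 128 + 32 + 32 = 256 channels, exactly the input width of b3,
#and b3 emits 128 + 192 + 96 + 64 = 480 channels, matching a4's input.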
77 | self.a3 = Inception(192, 64, 96, 128, 16, 32, 32) 78 | self.b3 = Inception(256, 128, 128, 192, 32, 96, 64) 79 | 80 | #"""In general, an Inception network is a network consisting of 81 | #modules of the above type stacked upon each other, with occasional 82 | #max-pooling layers with stride 2 to halve the resolution of the 83 | #grid""" 84 | self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) 85 | 86 | self.a4 = Inception(480, 192, 96, 208, 16, 48, 64) 87 | self.b4 = Inception(512, 160, 112, 224, 24, 64, 64) 88 | self.c4 = Inception(512, 128, 128, 256, 24, 64, 64) 89 | self.d4 = Inception(512, 112, 144, 288, 32, 64, 64) 90 | self.e4 = Inception(528, 256, 160, 320, 32, 128, 128) 91 | 92 | self.a5 = Inception(832, 256, 160, 320, 32, 128, 128) 93 | self.b5 = Inception(832, 384, 192, 384, 48, 128, 128) 94 | 95 | #input feature size: 8*8*1024 96 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 97 | self.dropout = nn.Dropout2d(p=0.4) 98 | self.linear = nn.Linear(1024, num_class) 99 | 100 | def forward(self, x): 101 | output = self.prelayer(x) 102 | output = self.a3(output) 103 | output = self.b3(output) 104 | 105 | output = self.maxpool(output) 106 | 107 | output = self.a4(output) 108 | output = self.b4(output) 109 | output = self.c4(output) 110 | output = self.d4(output) 111 | output = self.e4(output) 112 | 113 | output = self.maxpool(output) 114 | 115 | output = self.a5(output) 116 | output = self.b5(output) 117 | 118 | #"""It was found that a move from fully connected layers to 119 | #average pooling improved the top-1 accuracy by about 0.6%, 120 | #however the use of dropout remained essential even after 121 | #removing the fully connected layers.""" 122 | output = self.avgpool(output) 123 | output = self.dropout(output) 124 | output = output.view(output.size()[0], -1) 125 | output = self.linear(output) 126 | 127 | return output 128 | 129 | def googlenet(): 130 | return GoogleNet() 131 | 132 | 133 | -------------------------------------------------------------------------------- /Code/benchmarking/shufflenetv2.py: -------------------------------------------------------------------------------- 1 | """shufflenetv2 in pytorch 2 | 3 | 4 | 5 | [1] Ningning Ma, Xiangyu Zhang, Hai-Tao Zheng, Jian Sun 6 | 7 | ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design 8 | https://arxiv.org/abs/1807.11164 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | def channel_split(x, split): 17 | """split a tensor into two pieces along channel dimension 18 | Args: 19 | x: input tensor 20 | split:(int) channel size for each pieces 21 | """ 22 | assert x.size(1) == split * 2 23 | return torch.split(x, split, dim=1) 24 | 25 | def channel_shuffle(x, groups): 26 | """channel shuffle operation 27 | Args: 28 | x: input tensor 29 | groups: input branch number 30 | """ 31 | 32 | batch_size, channels, height, width = x.size() 33 | channels_per_group = int(channels / groups) 34 | 35 | x = x.view(batch_size, groups, channels_per_group, height, width) 36 | x = x.transpose(1, 2).contiguous() 37 | x = x.view(batch_size, -1, height, width) 38 | 39 | return x 40 | 41 | class ShuffleUnit(nn.Module): 42 | 43 | def __init__(self, in_channels, out_channels, stride): 44 | super().__init__() 45 | 46 | self.stride = stride 47 | self.in_channels = in_channels 48 | self.out_channels = out_channels 49 | 50 | if stride != 1 or in_channels != out_channels: 51 | self.residual = nn.Sequential( 52 | nn.Conv2d(in_channels, in_channels, 1), 53 | 
nn.BatchNorm2d(in_channels), 54 | nn.ReLU(inplace=True), 55 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 56 | nn.BatchNorm2d(in_channels), 57 | nn.Conv2d(in_channels, int(out_channels / 2), 1), 58 | nn.BatchNorm2d(int(out_channels / 2)), 59 | nn.ReLU(inplace=True) 60 | ) 61 | 62 | self.shortcut = nn.Sequential( 63 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 64 | nn.BatchNorm2d(in_channels), 65 | nn.Conv2d(in_channels, int(out_channels / 2), 1), 66 | nn.BatchNorm2d(int(out_channels / 2)), 67 | nn.ReLU(inplace=True) 68 | ) 69 | else: 70 | self.shortcut = nn.Sequential() 71 | 72 | in_channels = int(in_channels / 2) 73 | self.residual = nn.Sequential( 74 | nn.Conv2d(in_channels, in_channels, 1), 75 | nn.BatchNorm2d(in_channels), 76 | nn.ReLU(inplace=True), 77 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 78 | nn.BatchNorm2d(in_channels), 79 | nn.Conv2d(in_channels, in_channels, 1), 80 | nn.BatchNorm2d(in_channels), 81 | nn.ReLU(inplace=True) 82 | ) 83 | 84 | 85 | def forward(self, x): 86 | 87 | if self.stride == 1 and self.out_channels == self.in_channels: 88 | shortcut, residual = channel_split(x, int(self.in_channels / 2)) 89 | else: 90 | shortcut = x 91 | residual = x 92 | 93 | shortcut = self.shortcut(shortcut) 94 | residual = self.residual(residual) 95 | x = torch.cat([shortcut, residual], dim=1) 96 | x = channel_shuffle(x, 2) 97 | 98 | return x 99 | 100 | class ShuffleNetV2(nn.Module): 101 | 102 | def __init__(self, ratio=1, class_num=100): 103 | super().__init__() 104 | if ratio == 0.5: 105 | out_channels = [48, 96, 192, 1024] 106 | elif ratio == 1: 107 | out_channels = [116, 232, 464, 1024] 108 | elif ratio == 1.5: 109 | out_channels = [176, 352, 704, 1024] 110 | elif ratio == 2: 111 | out_channels = [244, 488, 976, 2048] 112 | else: 113 | ValueError('unsupported ratio number') 114 | 115 | self.pre = nn.Sequential( 116 | nn.Conv2d(3, 24, 3, padding=1), 117 | nn.BatchNorm2d(24) 118 | ) 119 | 120 | self.stage2 = self._make_stage(24, out_channels[0], 3) 121 | self.stage3 = self._make_stage(out_channels[0], out_channels[1], 7) 122 | self.stage4 = self._make_stage(out_channels[1], out_channels[2], 3) 123 | self.conv5 = nn.Sequential( 124 | nn.Conv2d(out_channels[2], out_channels[3], 1), 125 | nn.BatchNorm2d(out_channels[3]), 126 | nn.ReLU(inplace=True) 127 | ) 128 | 129 | self.fc = nn.Linear(out_channels[3], class_num) 130 | 131 | def forward(self, x): 132 | x = self.pre(x) 133 | x = self.stage2(x) 134 | x = self.stage3(x) 135 | x = self.stage4(x) 136 | x = self.conv5(x) 137 | x = F.adaptive_avg_pool2d(x, 1) 138 | x = x.view(x.size(0), -1) 139 | x = self.fc(x) 140 | 141 | return x 142 | 143 | def _make_stage(self, in_channels, out_channels, repeat): 144 | layers = [] 145 | layers.append(ShuffleUnit(in_channels, out_channels, 2)) 146 | 147 | while repeat: 148 | layers.append(ShuffleUnit(out_channels, out_channels, 1)) 149 | repeat -= 1 150 | 151 | return nn.Sequential(*layers) 152 | 153 | def shufflenetv2(): 154 | return ShuffleNetV2() 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /Code/benchmarking/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class BasicBlock(nn.Module): 5 | """Basic Block for resnet 18 and resnet 34 6 | 7 | """ 8 | 9 | #BasicBlock and BottleNeck block 10 | #have different output 
size 11 | #we use class attribute expansion 12 | #to distinct 13 | expansion = 1 14 | 15 | def __init__(self, in_channels, out_channels, stride=1): 16 | super().__init__() 17 | 18 | #residual function 19 | self.residual_function = nn.Sequential( 20 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), 21 | nn.BatchNorm2d(out_channels), 22 | nn.ReLU(inplace=True), 23 | nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False), 24 | nn.BatchNorm2d(out_channels * BasicBlock.expansion) 25 | ) 26 | 27 | #shortcut 28 | self.shortcut = nn.Sequential() 29 | 30 | #the shortcut output dimension is not the same with residual function 31 | #use 1*1 convolution to match the dimension 32 | if stride != 1 or in_channels != BasicBlock.expansion * out_channels: 33 | self.shortcut = nn.Sequential( 34 | nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False), 35 | nn.BatchNorm2d(out_channels * BasicBlock.expansion) 36 | ) 37 | 38 | def forward(self, x): 39 | return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x)) 40 | 41 | class BottleNeck(nn.Module): 42 | """Residual block for resnet over 50 layers 43 | 44 | """ 45 | expansion = 4 46 | def __init__(self, in_channels, out_channels, stride=1): 47 | super().__init__() 48 | self.residual_function = nn.Sequential( 49 | nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False), 50 | nn.BatchNorm2d(out_channels), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False), 53 | nn.BatchNorm2d(out_channels), 54 | nn.ReLU(inplace=True), 55 | nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, bias=False), 56 | nn.BatchNorm2d(out_channels * BottleNeck.expansion), 57 | ) 58 | 59 | self.shortcut = nn.Sequential() 60 | 61 | if stride != 1 or in_channels != out_channels * BottleNeck.expansion: 62 | self.shortcut = nn.Sequential( 63 | nn.Conv2d(in_channels, out_channels * BottleNeck.expansion, stride=stride, kernel_size=1, bias=False), 64 | nn.BatchNorm2d(out_channels * BottleNeck.expansion) 65 | ) 66 | 67 | def forward(self, x): 68 | return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x)) 69 | 70 | class ResNet(nn.Module): 71 | 72 | def __init__(self, block, num_block, num_classes=120): 73 | super().__init__() 74 | 75 | self.in_channels = 64 76 | 77 | self.conv1 = nn.Sequential( 78 | nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False), 79 | nn.BatchNorm2d(64), 80 | nn.ReLU(inplace=True)) 81 | #we use a different inputsize than the original paper 82 | #so conv2_x's stride is 1 83 | self.conv2_x = self._make_layer(block, 64, num_block[0], 1) 84 | self.conv3_x = self._make_layer(block, 128, num_block[1], 2) 85 | self.conv4_x = self._make_layer(block, 256, num_block[2], 2) 86 | self.conv5_x = self._make_layer(block, 512, num_block[3], 2) 87 | self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 88 | self.fc = nn.Linear(512 * block.expansion, num_classes) 89 | 90 | def _make_layer(self, block, out_channels, num_blocks, stride): 91 | """make resnet layers(by layer i didnt mean this 'layer' was the 92 | same as a neuron netowork layer, ex. 
conv layer), one layer may 93 | contain more than one residual block 94 | 95 | Args: 96 | block: block type, basic block or bottle neck block 97 | out_channels: output depth channel number of this layer 98 | num_blocks: how many blocks per layer 99 | stride: the stride of the first block of this layer 100 | 101 | Return: 102 | return a resnet layer 103 | """ 104 | 105 | # we have num_block blocks per layer, the first block 106 | # could be 1 or 2, other blocks would always be 1 107 | strides = [stride] + [1] * (num_blocks - 1) 108 | layers = [] 109 | for stride in strides: 110 | layers.append(block(self.in_channels, out_channels, stride)) 111 | self.in_channels = out_channels * block.expansion 112 | 113 | return nn.Sequential(*layers) 114 | 115 | def forward(self, x): 116 | output = self.conv1(x) 117 | output = self.conv2_x(output) 118 | output = self.conv3_x(output) 119 | output = self.conv4_x(output) 120 | output = self.conv5_x(output) 121 | output = self.avg_pool(output) 122 | output = output.view(output.size(0), -1) 123 | output = self.fc(output) 124 | 125 | return output 126 | 127 | def resnet18(num_classes): 128 | """ return a ResNet 18 object 129 | """ 130 | return ResNet(BasicBlock, [2, 2, 2, 2], num_classes = num_classes) 131 | 132 | def resnet34(num_classes): 133 | """ return a ResNet 34 object 134 | """ 135 | return ResNet(BasicBlock, [3, 4, 6, 3], num_classes = num_classes) 136 | 137 | def resnet50(num_classes): 138 | """ return a ResNet 50 object 139 | """ 140 | return ResNet(BottleNeck, [3, 4, 6, 3], num_classes = num_classes) 141 | 142 | def resnet101(num_classes): 143 | """ return a ResNet 101 object 144 | """ 145 | return ResNet(BottleNeck, [3, 4, 23, 3], num_classes = num_classes) 146 | 147 | def resnet152(num_classes): 148 | """ return a ResNet 152 object 149 | """ 150 | return ResNet(BottleNeck, [3, 8, 36, 3], num_classes = num_classes) 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /Code/benchmarking/mobilenet.py: -------------------------------------------------------------------------------- 1 | """mobilenet in pytorch 2 | 3 | 4 | 5 | [1] Andrew G. 
Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 6 | 7 | MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications 8 | https://arxiv.org/abs/1704.04861 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | 15 | class DepthSeperabelConv2d(nn.Module): 16 | 17 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 18 | super().__init__() 19 | self.depthwise = nn.Sequential( 20 | nn.Conv2d( 21 | input_channels, 22 | input_channels, 23 | kernel_size, 24 | groups=input_channels, 25 | **kwargs), 26 | nn.BatchNorm2d(input_channels), 27 | nn.ReLU(inplace=True) 28 | ) 29 | 30 | self.pointwise = nn.Sequential( 31 | nn.Conv2d(input_channels, output_channels, 1), 32 | nn.BatchNorm2d(output_channels), 33 | nn.ReLU(inplace=True) 34 | ) 35 | 36 | def forward(self, x): 37 | x = self.depthwise(x) 38 | x = self.pointwise(x) 39 | 40 | return x 41 | 42 | 43 | class BasicConv2d(nn.Module): 44 | 45 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 46 | 47 | super().__init__() 48 | self.conv = nn.Conv2d( 49 | input_channels, output_channels, kernel_size, **kwargs) 50 | self.bn = nn.BatchNorm2d(output_channels) 51 | self.relu = nn.ReLU(inplace=True) 52 | 53 | def forward(self, x): 54 | x = self.conv(x) 55 | x = self.bn(x) 56 | x = self.relu(x) 57 | 58 | return x 59 | 60 | 61 | class MobileNet(nn.Module): 62 | 63 | """ 64 | Args: 65 | width multipler: The role of the width multiplier α is to thin 66 | a network uniformly at each layer. For a given 67 | layer and width multiplier α, the number of 68 | input channels M becomes αM and the number of 69 | output channels N becomes αN. 70 | """ 71 | 72 | def __init__(self, width_multiplier=1, class_num=100): 73 | super().__init__() 74 | 75 | alpha = width_multiplier 76 | self.stem = nn.Sequential( 77 | BasicConv2d(3, int(32 * alpha), 3, padding=1, bias=False), 78 | DepthSeperabelConv2d( 79 | int(32 * alpha), 80 | int(64 * alpha), 81 | 3, 82 | padding=1, 83 | bias=False 84 | ) 85 | ) 86 | 87 | #downsample 88 | self.conv1 = nn.Sequential( 89 | DepthSeperabelConv2d( 90 | int(64 * alpha), 91 | int(128 * alpha), 92 | 3, 93 | stride=2, 94 | padding=1, 95 | bias=False 96 | ), 97 | DepthSeperabelConv2d( 98 | int(128 * alpha), 99 | int(128 * alpha), 100 | 3, 101 | padding=1, 102 | bias=False 103 | ) 104 | ) 105 | 106 | #downsample 107 | self.conv2 = nn.Sequential( 108 | DepthSeperabelConv2d( 109 | int(128 * alpha), 110 | int(256 * alpha), 111 | 3, 112 | stride=2, 113 | padding=1, 114 | bias=False 115 | ), 116 | DepthSeperabelConv2d( 117 | int(256 * alpha), 118 | int(256 * alpha), 119 | 3, 120 | padding=1, 121 | bias=False 122 | ) 123 | ) 124 | 125 | #downsample 126 | self.conv3 = nn.Sequential( 127 | DepthSeperabelConv2d( 128 | int(256 * alpha), 129 | int(512 * alpha), 130 | 3, 131 | stride=2, 132 | padding=1, 133 | bias=False 134 | ), 135 | 136 | DepthSeperabelConv2d( 137 | int(512 * alpha), 138 | int(512 * alpha), 139 | 3, 140 | padding=1, 141 | bias=False 142 | ), 143 | DepthSeperabelConv2d( 144 | int(512 * alpha), 145 | int(512 * alpha), 146 | 3, 147 | padding=1, 148 | bias=False 149 | ), 150 | DepthSeperabelConv2d( 151 | int(512 * alpha), 152 | int(512 * alpha), 153 | 3, 154 | padding=1, 155 | bias=False 156 | ), 157 | DepthSeperabelConv2d( 158 | int(512 * alpha), 159 | int(512 * alpha), 160 | 3, 161 | padding=1, 162 | bias=False 163 | ), 164 | DepthSeperabelConv2d( 165 | int(512 * alpha), 166 | int(512 * alpha), 167 | 3, 168 
| padding=1, 169 | bias=False 170 | ) 171 | ) 172 | 173 | #downsample 174 | self.conv4 = nn.Sequential( 175 | DepthSeperabelConv2d( 176 | int(512 * alpha), 177 | int(1024 * alpha), 178 | 3, 179 | stride=2, 180 | padding=1, 181 | bias=False 182 | ), 183 | DepthSeperabelConv2d( 184 | int(1024 * alpha), 185 | int(1024 * alpha), 186 | 3, 187 | padding=1, 188 | bias=False 189 | ) 190 | ) 191 | 192 | self.fc = nn.Linear(int(1024 * alpha), class_num) 193 | self.avg = nn.AdaptiveAvgPool2d(1) 194 | 195 | def forward(self, x): 196 | x = self.stem(x) 197 | 198 | x = self.conv1(x) 199 | x = self.conv2(x) 200 | x = self.conv3(x) 201 | x = self.conv4(x) 202 | 203 | x = self.avg(x) 204 | x = x.view(x.size(0), -1) 205 | x = self.fc(x) 206 | return x 207 | 208 | 209 | def mobilenet(alpha=1, class_num=100): 210 | return MobileNet(alpha, class_num) 211 | 212 | -------------------------------------------------------------------------------- /Code/benchmarking/xception.py: -------------------------------------------------------------------------------- 1 | """xception in pytorch 2 | 3 | 4 | [1] François Chollet 5 | 6 | Xception: Deep Learning with Depthwise Separable Convolutions 7 | https://arxiv.org/abs/1610.02357 8 | """ 9 | 10 | import torch 11 | import torch.nn as nn 12 | 13 | class SeperableConv2d(nn.Module): 14 | 15 | #***Figure 4. An “extreme” version of our Inception module, 16 | #with one spatial convolution per output channel of the 1x1 17 | #convolution.""" 18 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 19 | 20 | super().__init__() 21 | self.depthwise = nn.Conv2d( 22 | input_channels, 23 | input_channels, 24 | kernel_size, 25 | groups=input_channels, 26 | bias=False, 27 | **kwargs 28 | ) 29 | 30 | self.pointwise = nn.Conv2d(input_channels, output_channels, 1, bias=False) 31 | 32 | def forward(self, x): 33 | x = self.depthwise(x) 34 | x = self.pointwise(x) 35 | 36 | return x 37 | 38 | class EntryFlow(nn.Module): 39 | 40 | def __init__(self): 41 | 42 | super().__init__() 43 | self.conv1 = nn.Sequential( 44 | nn.Conv2d(3, 32, 3, padding=1, bias=False), 45 | nn.BatchNorm2d(32), 46 | nn.ReLU(inplace=True) 47 | ) 48 | 49 | self.conv2 = nn.Sequential( 50 | nn.Conv2d(32, 64, 3, padding=1, bias=False), 51 | nn.BatchNorm2d(64), 52 | nn.ReLU(inplace=True) 53 | ) 54 | 55 | self.conv3_residual = nn.Sequential( 56 | SeperableConv2d(64, 128, 3, padding=1), 57 | nn.BatchNorm2d(128), 58 | nn.ReLU(inplace=True), 59 | SeperableConv2d(128, 128, 3, padding=1), 60 | nn.BatchNorm2d(128), 61 | nn.MaxPool2d(3, stride=2, padding=1), 62 | ) 63 | 64 | self.conv3_shortcut = nn.Sequential( 65 | nn.Conv2d(64, 128, 1, stride=2), 66 | nn.BatchNorm2d(128), 67 | ) 68 | 69 | self.conv4_residual = nn.Sequential( 70 | nn.ReLU(inplace=True), 71 | SeperableConv2d(128, 256, 3, padding=1), 72 | nn.BatchNorm2d(256), 73 | nn.ReLU(inplace=True), 74 | SeperableConv2d(256, 256, 3, padding=1), 75 | nn.BatchNorm2d(256), 76 | nn.MaxPool2d(3, stride=2, padding=1) 77 | ) 78 | 79 | self.conv4_shortcut = nn.Sequential( 80 | nn.Conv2d(128, 256, 1, stride=2), 81 | nn.BatchNorm2d(256), 82 | ) 83 | 84 | #no downsampling 85 | self.conv5_residual = nn.Sequential( 86 | nn.ReLU(inplace=True), 87 | SeperableConv2d(256, 728, 3, padding=1), 88 | nn.BatchNorm2d(728), 89 | nn.ReLU(inplace=True), 90 | SeperableConv2d(728, 728, 3, padding=1), 91 | nn.BatchNorm2d(728), 92 | nn.MaxPool2d(3, 1, padding=1) 93 | ) 94 | 95 | #no downsampling 96 | self.conv5_shortcut = nn.Sequential( 97 | nn.Conv2d(256, 728, 1), 98 | nn.BatchNorm2d(728) 99 
| ) 100 | 101 | def forward(self, x): 102 | x = self.conv1(x) 103 | x = self.conv2(x) 104 | residual = self.conv3_residual(x) 105 | shortcut = self.conv3_shortcut(x) 106 | x = residual + shortcut 107 | residual = self.conv4_residual(x) 108 | shortcut = self.conv4_shortcut(x) 109 | x = residual + shortcut 110 | residual = self.conv5_residual(x) 111 | shortcut = self.conv5_shortcut(x) 112 | x = residual + shortcut 113 | 114 | return x 115 | 116 | class MiddleFLowBlock(nn.Module): 117 | 118 | def __init__(self): 119 | super().__init__() 120 | 121 | self.shortcut = nn.Sequential() 122 | self.conv1 = nn.Sequential( 123 | nn.ReLU(inplace=True), 124 | SeperableConv2d(728, 728, 3, padding=1), 125 | nn.BatchNorm2d(728) 126 | ) 127 | self.conv2 = nn.Sequential( 128 | nn.ReLU(inplace=True), 129 | SeperableConv2d(728, 728, 3, padding=1), 130 | nn.BatchNorm2d(728) 131 | ) 132 | self.conv3 = nn.Sequential( 133 | nn.ReLU(inplace=True), 134 | SeperableConv2d(728, 728, 3, padding=1), 135 | nn.BatchNorm2d(728) 136 | ) 137 | 138 | def forward(self, x): 139 | residual = self.conv1(x) 140 | residual = self.conv2(residual) 141 | residual = self.conv3(residual) 142 | 143 | shortcut = self.shortcut(x) 144 | 145 | return shortcut + residual 146 | 147 | class MiddleFlow(nn.Module): 148 | def __init__(self, block): 149 | super().__init__() 150 | 151 | #"""then through the middle flow which is repeated eight times""" 152 | self.middel_block = self._make_flow(block, 8) 153 | 154 | def forward(self, x): 155 | x = self.middel_block(x) 156 | return x 157 | 158 | def _make_flow(self, block, times): 159 | flows = [] 160 | for i in range(times): 161 | flows.append(block()) 162 | 163 | return nn.Sequential(*flows) 164 | 165 | 166 | class ExitFLow(nn.Module): 167 | 168 | def __init__(self): 169 | super().__init__() 170 | self.residual = nn.Sequential( 171 | nn.ReLU(), 172 | SeperableConv2d(728, 728, 3, padding=1), 173 | nn.BatchNorm2d(728), 174 | nn.ReLU(), 175 | SeperableConv2d(728, 1024, 3, padding=1), 176 | nn.BatchNorm2d(1024), 177 | nn.MaxPool2d(3, stride=2, padding=1) 178 | ) 179 | 180 | self.shortcut = nn.Sequential( 181 | nn.Conv2d(728, 1024, 1, stride=2), 182 | nn.BatchNorm2d(1024) 183 | ) 184 | 185 | self.conv = nn.Sequential( 186 | SeperableConv2d(1024, 1536, 3, padding=1), 187 | nn.BatchNorm2d(1536), 188 | nn.ReLU(inplace=True), 189 | SeperableConv2d(1536, 2048, 3, padding=1), 190 | nn.BatchNorm2d(2048), 191 | nn.ReLU(inplace=True) 192 | ) 193 | 194 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 195 | 196 | def forward(self, x): 197 | shortcut = self.shortcut(x) 198 | residual = self.residual(x) 199 | output = shortcut + residual 200 | output = self.conv(output) 201 | output = self.avgpool(output) 202 | 203 | return output 204 | 205 | class Xception(nn.Module): 206 | 207 | def __init__(self, block, num_class=100): 208 | super().__init__() 209 | self.entry_flow = EntryFlow() 210 | self.middel_flow = MiddleFlow(block) 211 | self.exit_flow = ExitFLow() 212 | 213 | self.fc = nn.Linear(2048, num_class) 214 | 215 | def forward(self, x): 216 | x = self.entry_flow(x) 217 | x = self.middel_flow(x) 218 | x = self.exit_flow(x) 219 | x = x.view(x.size(0), -1) 220 | x = self.fc(x) 221 | 222 | return x 223 | 224 | def xception(): 225 | return Xception(MiddleFLowBlock) 226 | 227 | 228 | -------------------------------------------------------------------------------- /Code/benchmarking/resnet_cbam.py: -------------------------------------------------------------------------------- 1 | """senet in pytorch 2 | 3 | 4 | 5 | [1] Jie 
Hu, Li Shen, Samuel Albanie, Gang Sun, Enhua Wu 6 | 7 | Squeeze-and-Excitation Networks 8 | https://arxiv.org/abs/1709.01507 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from models.cbam import CBAM 15 | 16 | 17 | def single_list(x): 18 | """ If an Element is a single instead of a list, when a list is expected it created a single element list""" 19 | if x.__class__.__name__ is 'Tensor': 20 | return [x] 21 | else: 22 | return x 23 | 24 | class BasicResidualSEBlock(nn.Module): 25 | expansion = 1 26 | # [global_local_attention_addition, global_attention_addition, global_local_attention_concat, global_attention_concat] 27 | # [global_local_attention_concat_learnable, global_local_attention_addition_learnable] 28 | # [standard_local_attention, identity_local_attention, pre_local_attention] 29 | exp_name = 'standard_cbam' 30 | def __init__(self, in_channels, out_channels, stride, block_num, r=16): 31 | super().__init__() 32 | if not 'concat' in self.exp_name: 33 | block_num = 1 34 | self.residual = nn.Sequential( 35 | nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias = False), 36 | nn.BatchNorm2d(out_channels), 37 | nn.ReLU(), 38 | 39 | nn.Conv2d(out_channels, out_channels * self.expansion, 3, padding=1, bias = False), 40 | nn.BatchNorm2d(out_channels * self.expansion) 41 | ) 42 | 43 | self.shortcut = nn.Sequential() 44 | if stride != 1 or in_channels != out_channels * self.expansion: 45 | self.shortcut = nn.Sequential( 46 | nn.Conv2d(in_channels, out_channels * self.expansion, 1, stride=stride, bias = False), 47 | nn.BatchNorm2d(out_channels * self.expansion) 48 | ) 49 | 50 | self.cbam = CBAM(out_channels * self.expansion * block_num, no_spatial=False, no_channel=False) 51 | 52 | def forward(self, x): 53 | if self.exp_name is 'standard_cbam': 54 | if x.__class__.__name__ is 'Tensor': 55 | current_input = x 56 | else: 57 | current_input = x[0] 58 | 59 | shortcut = self.shortcut(current_input) 60 | residual = self.residual(current_input) 61 | 62 | residual = self.cbam(residual) 63 | 64 | output = residual + shortcut 65 | 66 | return (F.relu(output), []) 67 | 68 | 69 | class BottleneckResidualSEBlock(nn.Module): 70 | expansion = 4 71 | 72 | def __init__(self, in_channels, out_channels, stride, r=16): 73 | super().__init__() 74 | 75 | self.residual = nn.Sequential( 76 | nn.Conv2d(in_channels, out_channels, 1), 77 | nn.BatchNorm2d(out_channels), 78 | nn.ReLU(), 79 | 80 | nn.Conv2d(out_channels, out_channels, 3, stride=stride, padding=1), 81 | nn.BatchNorm2d(out_channels), 82 | nn.ReLU(), 83 | 84 | nn.Conv2d(out_channels, out_channels * self.expansion, 1), 85 | nn.BatchNorm2d(out_channels * self.expansion), 86 | nn.ReLU() 87 | ) 88 | 89 | self.squeeze = nn.AdaptiveAvgPool2d(1) 90 | self.excitation1 = nn.Sequential( 91 | nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r), 92 | nn.ReLU(), 93 | nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion), 94 | nn.Sigmoid() 95 | ) 96 | self.excitation2 = nn.Sequential( 97 | nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r), 98 | nn.ReLU(), 99 | nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion), 100 | nn.Sigmoid() 101 | ) 102 | 103 | self.shortcut = nn.Sequential() 104 | if stride != 1 or in_channels != out_channels * self.expansion: 105 | self.shortcut = nn.Sequential( 106 | nn.Conv2d(in_channels, out_channels * self.expansion, 1, stride=stride), 107 | nn.BatchNorm2d(out_channels * 
self.expansion) 108 | ) 109 | 110 | def forward(self, x): 111 | x = single_list(x) 112 | current_input = x[-1] 113 | shortcut = self.shortcut(current_input) 114 | 115 | residual = self.residual(current_input) 116 | new_connection = residual 117 | print(len(x)) 118 | for input_ in x[: -1]: 119 | new_connection += input_ 120 | squeeze1 = self.squeeze(new_connection) 121 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 122 | excitation1 = self.excitation1(squeeze1) 123 | excitation1 = excitation1.view(new_connection.size(0), new_connection.size(1), 1, 1) 124 | squeeze2 = self.squeeze(residual) 125 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 126 | excitation2 = self.excitation2(squeeze2) 127 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 128 | 129 | 130 | output = residual * excitation1.expand_as(residual) * excitation2.expand_as(residual) + shortcut 131 | x.append(F.relu(output)) 132 | return x 133 | 134 | 135 | class SEResNet(nn.Module): 136 | 137 | def __init__(self, block, block_num, class_num=120): 138 | super().__init__() 139 | 140 | self.in_channels = 64 141 | 142 | self.pre = nn.Sequential( 143 | nn.Conv2d(3, 64, 3, padding=1), 144 | nn.BatchNorm2d(64), 145 | nn.ReLU() 146 | ) 147 | 148 | self.stage1 = self._make_stage(block, block_num[0], 64, 1) 149 | self.stage2 = self._make_stage(block, block_num[1], 128, 2) 150 | self.stage3 = self._make_stage(block, block_num[2], 256, 2) 151 | self.stage4 = self._make_stage(block, block_num[3], 512, 2) 152 | 153 | self.linear = nn.Linear(self.in_channels, class_num) 154 | 155 | def forward(self, x): 156 | x = self.pre(x) 157 | 158 | x = self.stage1(x) 159 | 160 | x = self.stage2(x[0]) 161 | x = self.stage3(x[0]) 162 | x = self.stage4(x[0]) 163 | x = F.adaptive_avg_pool2d(x[0], 1) 164 | x = x.view(x.size(0), -1) 165 | x = self.linear(x) 166 | 167 | return x 168 | 169 | def _make_stage(self, block, num, out_channels, stride): 170 | layers = [] 171 | layers.append(block(self.in_channels, out_channels, stride, 1)) 172 | self.in_channels = out_channels * block.expansion 173 | 174 | for i in range(1, num): 175 | layers.append(block(self.in_channels, out_channels, 1, i + 1)) 176 | 177 | return nn.Sequential(*layers) 178 | 179 | 180 | def seresnet18(num_classes): 181 | return SEResNet(BasicResidualSEBlock, [2, 2, 2, 2], class_num = num_classes) 182 | 183 | 184 | def seresnet34(num_classes): 185 | return SEResNet(BasicResidualSEBlock, [3, 4, 6, 3], class_num = num_classes) 186 | 187 | 188 | def seresnet50(num_classes): 189 | return SEResNet(BottleneckResidualSEBlock, [3, 4, 6, 3], class_num = num_classes) 190 | 191 | 192 | def seresnet101(num_classes): 193 | return SEResNet(BottleneckResidualSEBlock, [3, 4, 23, 3], class_num = num_classes) 194 | 195 | 196 | def seresnet152(num_classes): 197 | return SEResNet(BottleneckResidualSEBlock, [3, 8, 36, 3], class_num = num_classes) 198 | -------------------------------------------------------------------------------- /Code/benchmarking/rir.py: -------------------------------------------------------------------------------- 1 | """resnet in resnet in pytorch 2 | 3 | 4 | 5 | [1] Sasha Targ, Diogo Almeida, Kevin Lyman. 
6 | 7 | Resnet in Resnet: Generalizing Residual Architectures 8 | https://arxiv.org/abs/1603.08029v1 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | #geralized 15 | class ResnetInit(nn.Module): 16 | def __init__(self, in_channel, out_channel, stride): 17 | super().__init__() 18 | 19 | #"""The modular unit of the generalized residual network architecture is a 20 | #generalized residual block consisting of parallel states for a residual stream, 21 | #r, which contains identity shortcut connections and is similar to the structure 22 | #of a residual block from the original ResNet with a single convolutional layer 23 | #(parameters W l,r→r ) 24 | self.residual_stream_conv = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 25 | 26 | #"""and a transient stream, t, which is a standard convolutional layer 27 | #(W l,t→t ).""" 28 | self.transient_stream_conv = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 29 | 30 | #"""Two additional sets of convolutional filters in each block (W l,r→t , W l,t→r ) 31 | #also transfer information across streams.""" 32 | self.residual_stream_conv_across = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 33 | 34 | #"""We use equal numbers of filters for the residual and transient streams of the 35 | #generalized residual network, but optimizing this hyperparameter could lead to 36 | #further potential improvements.""" 37 | self.transient_stream_conv_across = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 38 | 39 | self.residual_bn_relu = nn.Sequential( 40 | nn.BatchNorm2d(out_channel), 41 | nn.ReLU(inplace=True) 42 | ) 43 | 44 | self.transient_bn_relu = nn.Sequential( 45 | nn.BatchNorm2d(out_channel), 46 | nn.ReLU(inplace=True) 47 | ) 48 | 49 | #"""The form of the shortcut connection can be an identity function with 50 | #the appropriate padding or a projection as in He et al. 
(2015b).""" 51 | self.short_cut = nn.Sequential() 52 | if in_channel != out_channel or stride != 1: 53 | self.short_cut = nn.Sequential( 54 | nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride) 55 | ) 56 | 57 | 58 | def forward(self, x): 59 | x_residual, x_transient = x 60 | residual_r_r = self.residual_stream_conv(x_residual) 61 | residual_r_t = self.residual_stream_conv_across(x_residual) 62 | residual_shortcut = self.short_cut(x_residual) 63 | 64 | transient_t_t = self.transient_stream_conv(x_transient) 65 | transient_t_r = self.transient_stream_conv_across(x_transient) 66 | 67 | #transient_t_t = self.transient_stream_conv(x_residual) 68 | #transient_t_r = self.transient_stream_conv_across(x_residual) 69 | #"""Same-stream and cross-stream activations are summed (along with the 70 | #shortcut connection for the residual stream) before applying batch 71 | #normalization and ReLU nonlinearities (together σ) to get the output 72 | #states of the block (Equation 1) (Ioffe & Szegedy, 2015).""" 73 | x_residual = self.residual_bn_relu(residual_r_r + transient_t_r + residual_shortcut) 74 | x_transient = self.transient_bn_relu(residual_r_t + transient_t_t) 75 | 76 | return x_residual, x_transient 77 | 78 | 79 | 80 | class RiRBlock(nn.Module): 81 | def __init__(self, in_channel, out_channel, layer_num, stride, layer=ResnetInit): 82 | super().__init__() 83 | self.resnetinit = self._make_layers(in_channel, out_channel, layer_num, stride) 84 | 85 | #self.short_cut = nn.Sequential() 86 | #if stride != 1 or in_channel != out_channel: 87 | # self.short_cut = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride) 88 | 89 | def forward(self, x): 90 | x_residual, x_transient = self.resnetinit(x) 91 | #x_residual = x_residual + self.short_cut(x[0]) 92 | #x_transient = x_transient + self.short_cut(x[1]) 93 | 94 | return (x_residual, x_transient) 95 | 96 | #"""Replacing each of the convolutional layers within a residual 97 | #block from the original ResNet (Figure 1a) with a generalized residual block 98 | #(Figure 1b) leads us to a new architecture we call ResNet in ResNet (RiR) 99 | #(Figure 1d).""" 100 | def _make_layers(self, in_channel, out_channel, layer_num, stride, layer=ResnetInit): 101 | strides = [stride] + [1] * (layer_num - 1) 102 | layers = nn.Sequential() 103 | for index, s in enumerate(strides): 104 | layers.add_module("generalized layers{}".format(index), layer(in_channel, out_channel, s)) 105 | in_channel = out_channel 106 | 107 | return layers 108 | 109 | class ResnetInResneet(nn.Module): 110 | def __init__(self, num_classes=100): 111 | super().__init__() 112 | base = int(96 / 2) 113 | self.residual_pre_conv = nn.Sequential( 114 | nn.Conv2d(3, base, 3, padding=1), 115 | nn.BatchNorm2d(base), 116 | nn.ReLU(inplace=True) 117 | ) 118 | self.transient_pre_conv = nn.Sequential( 119 | nn.Conv2d(3, base, 3, padding=1), 120 | nn.BatchNorm2d(base), 121 | nn.ReLU(inplace=True) 122 | ) 123 | 124 | self.rir1 = RiRBlock(base, base, 2, 1) 125 | self.rir2 = RiRBlock(base, base, 2, 1) 126 | self.rir3 = RiRBlock(base, base * 2, 2, 2) 127 | self.rir4 = RiRBlock(base * 2, base * 2, 2, 1) 128 | self.rir5 = RiRBlock(base * 2, base * 2, 2, 1) 129 | self.rir6 = RiRBlock(base * 2, base * 4, 2, 2) 130 | self.rir7 = RiRBlock(base * 4, base * 4, 2, 1) 131 | self.rir8 = RiRBlock(base * 4, base * 4, 2, 1) 132 | 133 | self.conv1 = nn.Sequential( 134 | nn.Conv2d(384, num_classes, kernel_size=3, stride=2), #without this convolution, loss will soon be nan 135 | nn.BatchNorm2d(num_classes), 136 | 
nn.ReLU(inplace=True), 137 | ) 138 | 139 | self.classifier = nn.Sequential( 140 | nn.Linear(900, 450), 141 | nn.ReLU(), 142 | nn.Dropout(), 143 | nn.Linear(450, 100), 144 | ) 145 | 146 | self._weight_init() 147 | 148 | def forward(self, x): 149 | x_residual = self.residual_pre_conv(x) 150 | x_transient = self.transient_pre_conv(x) 151 | 152 | x_residual, x_transient = self.rir1((x_residual, x_transient)) 153 | x_residual, x_transient = self.rir2((x_residual, x_transient)) 154 | x_residual, x_transient = self.rir3((x_residual, x_transient)) 155 | x_residual, x_transient = self.rir4((x_residual, x_transient)) 156 | x_residual, x_transient = self.rir5((x_residual, x_transient)) 157 | x_residual, x_transient = self.rir6((x_residual, x_transient)) 158 | x_residual, x_transient = self.rir7((x_residual, x_transient)) 159 | x_residual, x_transient = self.rir8((x_residual, x_transient)) 160 | h = torch.cat([x_residual, x_transient], 1) 161 | h = self.conv1(h) 162 | h = h.view(h.size()[0], -1) 163 | h = self.classifier(h) 164 | 165 | return h 166 | 167 | def _weight_init(self): 168 | for m in self.modules(): 169 | if isinstance(m, nn.Conv2d): 170 | torch.nn.init.kaiming_normal(m.weight) 171 | m.bias.data.fill_(0.01) 172 | 173 | 174 | def resnet_in_resnet(): 175 | return ResnetInResneet() 176 | 177 | #from torch.autograd import Variable 178 | # 179 | #net = resnet_in_resnet() 180 | #print(net(Variable(torch.randn(3, 3, 32, 32))).shape) 181 | -------------------------------------------------------------------------------- /Code/benchmarking/shufflenet.py: -------------------------------------------------------------------------------- 1 | """shufflenet in pytorch 2 | 3 | 4 | 5 | [1] Xiangyu Zhang, Xinyu Zhou, Mengxiao Lin, Jian Sun. 6 | 7 | ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices 8 | https://arxiv.org/abs/1707.01083v2 9 | """ 10 | 11 | from functools import partial 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | 17 | class BasicConv2d(nn.Module): 18 | 19 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 20 | super().__init__() 21 | self.conv = nn.Conv2d(input_channels, output_channels, kernel_size, **kwargs) 22 | self.bn = nn.BatchNorm2d(output_channels) 23 | self.relu = nn.ReLU(inplace=True) 24 | 25 | def forward(self, x): 26 | x = self.conv(x) 27 | x = self.bn(x) 28 | x = self.relu(x) 29 | return x 30 | 31 | class ChannelShuffle(nn.Module): 32 | 33 | def __init__(self, groups): 34 | super().__init__() 35 | self.groups = groups 36 | 37 | def forward(self, x): 38 | batchsize, channels, height, width = x.data.size() 39 | channels_per_group = int(channels / self.groups) 40 | 41 | #"""suppose a convolutional layer with g groups whose output has 42 | #g x n channels; we first reshape the output channel dimension 43 | #into (g, n)""" 44 | x = x.view(batchsize, self.groups, channels_per_group, height, width) 45 | 46 | #"""transposing and then flattening it back as the input of next layer.""" 47 | x = x.transpose(1, 2).contiguous() 48 | x = x.view(batchsize, -1, height, width) 49 | 50 | return x 51 | 52 | class DepthwiseConv2d(nn.Module): 53 | 54 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 55 | super().__init__() 56 | self.depthwise = nn.Sequential( 57 | nn.Conv2d(input_channels, output_channels, kernel_size, **kwargs), 58 | nn.BatchNorm2d(output_channels) 59 | ) 60 | 61 | def forward(self, x): 62 | return self.depthwise(x) 63 | 64 | class PointwiseConv2d(nn.Module): 65 | def __init__(self, 
input_channels, output_channels, **kwargs): 66 | super().__init__() 67 | self.pointwise = nn.Sequential( 68 | nn.Conv2d(input_channels, output_channels, 1, **kwargs), 69 | nn.BatchNorm2d(output_channels) 70 | ) 71 | 72 | def forward(self, x): 73 | return self.pointwise(x) 74 | 75 | class ShuffleNetUnit(nn.Module): 76 | 77 | def __init__(self, input_channels, output_channels, stage, stride, groups): 78 | super().__init__() 79 | 80 | #"""Similar to [9], we set the number of bottleneck channels to 1/4 81 | #of the output channels for each ShuffleNet unit.""" 82 | self.bottlneck = nn.Sequential( 83 | PointwiseConv2d( 84 | input_channels, 85 | int(output_channels / 4), 86 | groups=groups 87 | ), 88 | nn.ReLU(inplace=True) 89 | ) 90 | 91 | #"""Note that for Stage 2, we do not apply group convolution on the first pointwise 92 | #layer because the number of input channels is relatively small.""" 93 | if stage == 2: 94 | self.bottlneck = nn.Sequential( 95 | PointwiseConv2d( 96 | input_channels, 97 | int(output_channels / 4), 98 | groups=groups 99 | ), 100 | nn.ReLU(inplace=True) 101 | ) 102 | 103 | self.channel_shuffle = ChannelShuffle(groups) 104 | 105 | self.depthwise = DepthwiseConv2d( 106 | int(output_channels / 4), 107 | int(output_channels / 4), 108 | 3, 109 | groups=int(output_channels / 4), 110 | stride=stride, 111 | padding=1 112 | ) 113 | 114 | self.expand = PointwiseConv2d( 115 | int(output_channels / 4), 116 | output_channels, 117 | groups=groups 118 | ) 119 | 120 | self.relu = nn.ReLU(inplace=True) 121 | self.fusion = self._add 122 | self.shortcut = nn.Sequential() 123 | 124 | #"""As for the case where ShuffleNet is applied with stride, 125 | #we simply make two modifications (see Fig 2 (c)): 126 | #(i) add a 3 × 3 average pooling on the shortcut path; 127 | #(ii) replace the element-wise addition with channel concatenation, 128 | #which makes it easy to enlarge channel dimension with little extra 129 | #computation cost. 
130 | if stride != 1 or input_channels != output_channels: 131 | self.shortcut = nn.AvgPool2d(3, stride=2, padding=1) 132 | 133 | self.expand = PointwiseConv2d( 134 | int(output_channels / 4), 135 | output_channels - input_channels, 136 | groups=groups 137 | ) 138 | 139 | self.fusion = self._cat 140 | 141 | def _add(self, x, y): 142 | return torch.add(x, y) 143 | 144 | def _cat(self, x, y): 145 | return torch.cat([x, y], dim=1) 146 | 147 | def forward(self, x): 148 | shortcut = self.shortcut(x) 149 | 150 | shuffled = self.bottlneck(x) 151 | shuffled = self.channel_shuffle(shuffled) 152 | shuffled = self.depthwise(shuffled) 153 | shuffled = self.expand(shuffled) 154 | 155 | output = self.fusion(shortcut, shuffled) 156 | output = self.relu(output) 157 | 158 | return output 159 | 160 | class ShuffleNet(nn.Module): 161 | 162 | def __init__(self, num_blocks, num_classes=100, groups=3): 163 | super().__init__() 164 | 165 | if groups == 1: 166 | out_channels = [24, 144, 288, 567] 167 | elif groups == 2: 168 | out_channels = [24, 200, 400, 800] 169 | elif groups == 3: 170 | out_channels = [24, 240, 480, 960] 171 | elif groups == 4: 172 | out_channels = [24, 272, 544, 1088] 173 | elif groups == 8: 174 | out_channels = [24, 384, 768, 1536] 175 | 176 | self.conv1 = BasicConv2d(3, out_channels[0], 3, padding=1, stride=1) 177 | self.input_channels = out_channels[0] 178 | 179 | self.stage2 = self._make_stage( 180 | ShuffleNetUnit, 181 | num_blocks[0], 182 | out_channels[1], 183 | stride=2, 184 | stage=2, 185 | groups=groups 186 | ) 187 | 188 | self.stage3 = self._make_stage( 189 | ShuffleNetUnit, 190 | num_blocks[1], 191 | out_channels[2], 192 | stride=2, 193 | stage=3, 194 | groups=groups 195 | ) 196 | 197 | self.stage4 = self._make_stage( 198 | ShuffleNetUnit, 199 | num_blocks[2], 200 | out_channels[3], 201 | stride=2, 202 | stage=4, 203 | groups=groups 204 | ) 205 | 206 | self.avg = nn.AdaptiveAvgPool2d((1, 1)) 207 | self.fc = nn.Linear(out_channels[3], num_classes) 208 | 209 | def forward(self, x): 210 | x = self.conv1(x) 211 | x = self.stage2(x) 212 | x = self.stage3(x) 213 | x = self.stage4(x) 214 | x = self.avg(x) 215 | x = x.view(x.size(0), -1) 216 | x = self.fc(x) 217 | 218 | return x 219 | 220 | def _make_stage(self, block, num_blocks, output_channels, stride, stage, groups): 221 | """make shufflenet stage 222 | 223 | Args: 224 | block: block type, shuffle unit 225 | out_channels: output depth channel number of this stage 226 | num_blocks: how many blocks per stage 227 | stride: the stride of the first block of this stage 228 | stage: stage index 229 | groups: group number of group convolution 230 | Return: 231 | return a shuffle net stage 232 | """ 233 | strides = [stride] + [1] * (num_blocks - 1) 234 | 235 | stage = [] 236 | 237 | for stride in strides: 238 | stage.append( 239 | block( 240 | self.input_channels, 241 | output_channels, 242 | stride=stride, 243 | stage=stage, 244 | groups=groups 245 | ) 246 | ) 247 | self.input_channels = output_channels 248 | 249 | return nn.Sequential(*stage) 250 | 251 | def shufflenet(): 252 | return ShuffleNet([4, 8, 4]) 253 | 254 | 255 | 256 | 257 | -------------------------------------------------------------------------------- /Code/dataset.py: -------------------------------------------------------------------------------- 1 | """ train and test dataset 2 | 3 | author baiyu 4 | """ 5 | import os 6 | import sys 7 | import pickle 8 | import matplotlib.pyplot as plt 9 | import numpy 10 | import torch 11 | from torch.utils.data import Dataset 12 | from 
torchvision import datasets, transforms 13 | import torchvision 14 | 15 | def get_dataloaders(batch_size, dataset): 16 | print(dataset) 17 | if dataset == 'dogs': 18 | image_transforms = { 19 | # Train uses data augmentation 20 | 'train': 21 | transforms.Compose([ 22 | transforms.RandomResizedCrop(size=135, scale=(0.95, 1.0)), 23 | transforms.RandomRotation(degrees=15), 24 | transforms.ColorJitter(), 25 | transforms.RandomHorizontalFlip(), 26 | transforms.CenterCrop(size=128), # Image net standards 27 | transforms.ToTensor(), 28 | transforms.Normalize([0.485, 0.456, 0.406], 29 | [0.229, 0.224, 0.225]) # Imagenet standards 30 | ]), 31 | 'test': 32 | transforms.Compose([ 33 | transforms.Resize(size=128), 34 | transforms.CenterCrop(size=128), 35 | transforms.ToTensor(), 36 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 37 | ]) 38 | } 39 | all_data = datasets.ImageFolder(root='data/archive/images/Images/') 40 | torch.manual_seed(42) 41 | train_data_len = int(len(all_data)*0.8) 42 | valid_data_len = int((len(all_data) - train_data_len)) 43 | train_data, val_data = torch.utils.data.random_split(all_data, [train_data_len, valid_data_len]) 44 | train_data.dataset.transform = image_transforms['train'] 45 | val_data.dataset.transform = image_transforms['test'] 46 | print(len(train_data), len(val_data)) 47 | 48 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True) 49 | val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True) 50 | 51 | return {'train': train_loader, 'val' :val_loader} 52 | 53 | elif dataset == 'imagenet': 54 | data_path = "/mnt/4T_2/imagenet_dataset" 55 | traindir = os.path.join(data_path, 'train') 56 | valdir = os.path.join(data_path, 'val') 57 | image_transforms = { 58 | 'train': 59 | transforms.Compose([transforms.RandomRotation(degrees=15), transforms.ColorJitter(), 60 | transforms.RandomHorizontalFlip(), transforms.RandomResizedCrop(224), 61 | transforms.ToTensor(), 62 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]), 63 | 'test': 64 | transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), 65 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 66 | } 67 | train_dataset = datasets.ImageFolder(traindir, image_transforms["train"]) 68 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 69 | num_workers=30, pin_memory=True) 70 | val_dataset = datasets.ImageFolder(valdir, image_transforms["test"]) 71 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 72 | num_workers=30, pin_memory=True) 73 | print(len(train_dataset), len(val_dataset)) 74 | return {'train': train_loader, 'val': val_loader} 75 | 76 | elif dataset == 'tiny-imagenet': 77 | image_transforms = { 78 | # Train uses data augmentation 79 | 'train': 80 | transforms.Compose([ 81 | transforms.RandomRotation(degrees=15), 82 | transforms.ColorJitter(), 83 | transforms.RandomHorizontalFlip(), 84 | transforms.CenterCrop(size=64), # Image net standards 85 | transforms.ToTensor(), 86 | transforms.Normalize([0.485, 0.456, 0.406], 87 | [0.229, 0.224, 0.225]) # Imagenet standards 88 | ]), 89 | 'test': 90 | transforms.Compose([ 91 | transforms.Resize(size=64), 92 | transforms.CenterCrop(size=64), 93 | transforms.ToTensor(), 94 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 95 | ]) 96 | } 97 | all_data = 
datasets.ImageFolder(root='data/tiny-imagenet/tiny-imagenet-200/train') 98 | torch.manual_seed(42) 99 | train_data_len = int(len(all_data)*0.8) 100 | valid_data_len = int((len(all_data) - train_data_len)) 101 | train_data, val_data = torch.utils.data.random_split(all_data, [train_data_len, valid_data_len]) 102 | train_data.dataset.transform = image_transforms['train'] 103 | val_data.dataset.transform = image_transforms['test'] 104 | print(len(train_data), len(val_data)) 105 | 106 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True) 107 | val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True) 108 | 109 | return {'train': train_loader, 'val' :val_loader} 110 | 111 | elif dataset == 'cifar100': 112 | mean = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343) 113 | std = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404) 114 | image_transforms = { 115 | 'train': 116 | transforms.Compose([ 117 | #transforms.ToPILImage(), 118 | transforms.RandomCrop(32, padding=4), 119 | transforms.RandomHorizontalFlip(), 120 | transforms.RandomRotation(15), 121 | transforms.ToTensor(), 122 | transforms.Normalize(mean, std) 123 | ]), 124 | 'test': 125 | transforms.Compose([ 126 | transforms.ToTensor(), 127 | transforms.Normalize(mean, std) 128 | ]) 129 | } 130 | cifar100_training = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=image_transforms['train']) 131 | train_loader = torch.utils.data.DataLoader( 132 | cifar100_training, shuffle=True, batch_size=batch_size) 133 | 134 | cifar100_test = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=image_transforms['test']) 135 | val_loader = torch.utils.data.DataLoader( 136 | cifar100_test, shuffle=True, batch_size=batch_size) 137 | 138 | return {'train': train_loader, 'val' :val_loader} 139 | 140 | elif dataset == 'cifar10': 141 | mean = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343) 142 | std = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404) 143 | image_transforms = { 144 | 'train': 145 | transforms.Compose([ 146 | #transforms.ToPILImage(), 147 | transforms.RandomCrop(32, padding=4), 148 | transforms.RandomHorizontalFlip(), 149 | transforms.RandomRotation(15), 150 | transforms.ToTensor(), 151 | transforms.Normalize(mean, std) 152 | ]), 153 | 'test': 154 | transforms.Compose([ 155 | transforms.ToTensor(), 156 | transforms.Normalize(mean, std) 157 | ]) 158 | } 159 | cifar10_training = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=image_transforms['train']) 160 | train_loader = torch.utils.data.DataLoader( 161 | cifar10_training, shuffle=True, batch_size=batch_size) 162 | 163 | cifar10_test = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=image_transforms['test']) 164 | val_loader = torch.utils.data.DataLoader( 165 | cifar10_test, shuffle=True, batch_size=batch_size) 166 | 167 | return {'train': train_loader, 'val' :val_loader} 168 | 169 | 170 | elif dataset == 'caltech': 171 | image_transforms = { 172 | # Train uses data augmentation 173 | 'train': 174 | transforms.Compose([ 175 | transforms.Resize((128,128)), 176 | transforms.ColorJitter(), 177 | transforms.RandomHorizontalFlip(), 178 | transforms.ToTensor(), 179 | transforms.Normalize([0.485, 0.456, 0.406], 180 | [0.229, 0.224, 0.225]) # Imagenet standards 181 | ]), 182 | 'test': 183 | transforms.Compose([ 184 | transforms.Resize((128, 128)), 185 | 
transforms.ToTensor(), 186 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 187 | ]) 188 | } 189 | all_data = datasets.ImageFolder(root='data/caltech/256_ObjectCategories') 190 | torch.manual_seed(42) 191 | train_data_len = int(len(all_data)*0.8) 192 | valid_data_len = int((len(all_data) - train_data_len)) 193 | train_data, val_data = torch.utils.data.random_split(all_data, [train_data_len, valid_data_len]) 194 | train_data.dataset.transform = image_transforms['train'] 195 | val_data.dataset.transform = image_transforms['test'] 196 | print(len(train_data), len(val_data)) 197 | 198 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True) 199 | val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True) 200 | 201 | return {'train': train_loader, 'val' :val_loader} 202 | 203 | 204 | 205 | else: 206 | print('This dataset isn\'t supported yet') 207 | -------------------------------------------------------------------------------- /Code/benchmarking/densenet.py: -------------------------------------------------------------------------------- 1 | """dense net in pytorch 2 | 3 | 4 | 5 | [1] Gao Huang, Zhuang Liu, Laurens van der Maaten, Kilian Q. Weinberger. 6 | 7 | Densely Connected Convolutional Networks 8 | https://arxiv.org/abs/1608.06993v5 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | 15 | 16 | #"""Bottleneck layers. Although each layer only produces k 17 | #output feature-maps, it typically has many more inputs. It 18 | #has been noted in [37, 11] that a 1×1 convolution can be in- 19 | #troduced as bottleneck layer before each 3×3 convolution 20 | #to reduce the number of input feature-maps, and thus to 21 | #improve computational efficiency.""" 22 | class Bottleneck(nn.Module): 23 | def __init__(self, in_channels, growth_rate, inner_channels_list): 24 | super().__init__() 25 | # """In our experiments, we let each 1×1 convolution 26 | # produce 4k feature-maps.""" 27 | inner_channel = 4 * growth_rate 28 | 29 | # """We find this design especially effective for DenseNet and 30 | # we refer to our network with such a bottleneck layer, i.e., 31 | # to the BN-ReLU-Conv(1×1)-BN-ReLU-Conv(3×3) version of H ` , 32 | # as DenseNet-B.""" 33 | self.bottle_neck = nn.Sequential( 34 | nn.BatchNorm2d(in_channels), 35 | nn.ReLU(inplace=True), 36 | nn.Conv2d(in_channels, inner_channel, kernel_size=1, bias=False), 37 | nn.BatchNorm2d(inner_channel), 38 | nn.ReLU(inplace=True), 39 | nn.Conv2d(inner_channel, growth_rate, kernel_size=3, padding=1, bias=False) 40 | ) 41 | 42 | r = 16 43 | # out_channels = in_channels 44 | self.expansion = 1 45 | # self.squeeze1 = nn.AdaptiveAvgPool2d(1) 46 | # self.excitation1 = nn.Sequential( 47 | # nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias=False), 48 | # nn.ReLU(inplace=True), 49 | # nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias=False), 50 | # nn.Sigmoid() 51 | # ) 52 | out_channels = growth_rate 53 | self.squeeze2 = nn.AdaptiveAvgPool2d(1) 54 | self.excitation2 = nn.Sequential( 55 | nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias=False), 56 | nn.ReLU(inplace=True), 57 | nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias=False), 58 | nn.Sigmoid() 59 | ) 60 | 61 | def forward(self, x): 62 | # accuracy 74% with bias 63 | se_input = self.bottle_neck(x) 64 | squeeze = self.squeeze2(se_input) 65 | squeeze = squeeze.view(squeeze.size(0), 
-1) 66 | excitation = self.excitation2(squeeze) 67 | excitation = excitation.view(se_input.size(0), se_input.size(1), 1, 1) 68 | x2 = se_input * excitation.expand_as(se_input) 69 | x = torch.cat([x, x2], 1) 70 | return x 71 | 72 | #se_block before concat 73 | # class Bottleneck(nn.Module): 74 | # def __init__(self, in_channels, growth_rate, inner_channels_list): 75 | # super().__init__() 76 | # #"""In our experiments, we let each 1×1 convolution 77 | # #produce 4k feature-maps.""" 78 | # inner_channel = 4 * growth_rate 79 | 80 | # #"""We find this design especially effective for DenseNet and 81 | # #we refer to our network with such a bottleneck layer, i.e., 82 | # #to the BN-ReLU-Conv(1×1)-BN-ReLU-Conv(3×3) version of H ` , 83 | # #as DenseNet-B.""" 84 | # self.bottle_neck = nn.Sequential( 85 | # nn.BatchNorm2d(in_channels), 86 | # nn.ReLU(inplace=True), 87 | # nn.Conv2d(in_channels, inner_channel, kernel_size=1, bias=False), 88 | # nn.BatchNorm2d(inner_channel), 89 | # nn.ReLU(inplace=True), 90 | # nn.Conv2d(inner_channel, growth_rate, kernel_size=3, padding=1, bias=False) 91 | # ) 92 | # self.inner_channels_list = inner_channels_list.copy() 93 | # r = 16 94 | # self.expansion = 1 95 | # self.squeeze = [] 96 | # self.excitation = [] 97 | # for i in range(len(self.inner_channels_list)): 98 | # if i == 0: 99 | # out_channels = self.inner_channels_list[0] 100 | # else: 101 | # out_channels = growth_rate 102 | # self.squeeze = nn.AdaptiveAvgPool2d(1) 103 | # self.excitation.append(nn.Sequential( 104 | # nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias = False), 105 | # nn.ReLU(inplace=True), 106 | # nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias = False), 107 | # nn.Sigmoid() 108 | # )) 109 | # self.excitation = nn.ModuleList(self.excitation) 110 | 111 | # def forward(self, x): 112 | # x_scaled = [] 113 | # #sequential 114 | # for i in range(len(self.inner_channels_list)): 115 | # if i == 0: 116 | # se_input = x[:, :self.inner_channels_list[0], :, :] 117 | # else: 118 | # se_input = x[:, self.inner_channels_list[i-1]: self.inner_channels_list[i], :, :] 119 | # squeeze = self.squeeze(se_input) 120 | # squeeze = squeeze.view(squeeze.size(0), -1) 121 | # excitation = self.excitation[i](squeeze) 122 | # excitation = excitation.view(se_input.size(0), se_input.size(1), 1, 1) 123 | # x1 = se_input * excitation.expand_as(se_input) 124 | # x_scaled.append(x1) 125 | # x_scaled = torch.cat(x_scaled, 1) 126 | # return torch.cat([x_scaled, self.bottle_neck(x_scaled)], 1) 127 | 128 | #"""We refer to layers between blocks as transition 129 | #layers, which do convolution and pooling.""" 130 | class Transition(nn.Module): 131 | def __init__(self, in_channels, out_channels): 132 | super().__init__() 133 | #"""The transition layers used in our experiments 134 | #consist of a batch normalization layer and an 1×1 135 | #convolutional layer followed by a 2×2 average pooling 136 | #layer""". 
137 | self.down_sample = nn.Sequential( 138 | nn.BatchNorm2d(in_channels), 139 | nn.Conv2d(in_channels, out_channels, 1, bias=False), 140 | nn.AvgPool2d(2, stride=2) 141 | ) 142 | 143 | def forward(self, x): 144 | return self.down_sample(x) 145 | 146 | #DesneNet-BC 147 | #B stands for bottleneck layer(BN-RELU-CONV(1x1)-BN-RELU-CONV(3x3)) 148 | #C stands for compression factor(0<=theta<=1) 149 | class DenseNet(nn.Module): 150 | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_class=100): 151 | super().__init__() 152 | self.growth_rate = growth_rate 153 | 154 | #"""Before entering the first dense block, a convolution 155 | #with 16 (or twice the growth rate for DenseNet-BC) 156 | #output channels is performed on the input images.""" 157 | inner_channels = 2 * growth_rate 158 | 159 | #For convolutional layers with kernel size 3×3, each 160 | #side of the inputs is zero-padded by one pixel to keep 161 | #the feature-map size fixed. 162 | self.conv1 = nn.Conv2d(3, inner_channels, kernel_size=3, padding=1, bias=False) 163 | 164 | self.features = nn.Sequential() 165 | inner_channels_list = [inner_channels] 166 | for index in range(len(nblocks) - 1): 167 | self.features.add_module("dense_block_layer_{}".format(index), self._make_dense_layers(block, inner_channels, nblocks[index], inner_channels_list)) 168 | inner_channels += growth_rate * nblocks[index] 169 | 170 | #"""If a dense block contains m feature-maps, we let the 171 | #following transition layer generate θm output feature- 172 | #maps, where 0 < θ ≤ 1 is referred to as the compression 173 | #fac-tor. 174 | out_channels = int(reduction * inner_channels) # int() will automatic floor the value 175 | self.features.add_module("transition_layer_{}".format(index), Transition(inner_channels, out_channels)) 176 | inner_channels = out_channels 177 | inner_channels_list = [inner_channels] 178 | 179 | self.features.add_module("dense_block{}".format(len(nblocks) - 1), self._make_dense_layers(block, inner_channels, nblocks[len(nblocks)-1], inner_channels_list)) 180 | inner_channels += growth_rate * nblocks[len(nblocks) - 1] 181 | self.features.add_module('bn', nn.BatchNorm2d(inner_channels)) 182 | self.features.add_module('relu', nn.ReLU(inplace=True)) 183 | 184 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 185 | 186 | self.linear = nn.Linear(inner_channels, num_class) 187 | 188 | def forward(self, x): 189 | output = self.conv1(x) 190 | output = self.features(output) 191 | output = self.avgpool(output) 192 | output = output.view(output.size()[0], -1) 193 | output = self.linear(output) 194 | return output 195 | 196 | def _make_dense_layers(self, block, in_channels, nblocks, inner_channels_list): 197 | dense_block = nn.Sequential() 198 | for index in range(nblocks): 199 | dense_block.add_module('bottle_neck_layer_{}'.format(index), block(in_channels, self.growth_rate, inner_channels_list)) 200 | in_channels += self.growth_rate 201 | inner_channels_list.append(in_channels) 202 | return dense_block 203 | 204 | def densenet121(): 205 | return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32) 206 | 207 | def densenet169(): 208 | return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32) 209 | 210 | def densenet201(): 211 | return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32) 212 | 213 | def densenet161(): 214 | return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48) 215 | 216 | -------------------------------------------------------------------------------- /Code/benchmarking/nasnet.py: 
-------------------------------------------------------------------------------- 1 | """nasnet in pytorch 2 | 3 | 4 | 5 | [1] Barret Zoph, Vijay Vasudevan, Jonathon Shlens, Quoc V. Le 6 | 7 | Learning Transferable Architectures for Scalable Image Recognition 8 | https://arxiv.org/abs/1707.07012 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | class SeperableConv2d(nn.Module): 15 | 16 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 17 | 18 | super().__init__() 19 | self.depthwise = nn.Conv2d( 20 | input_channels, 21 | input_channels, 22 | kernel_size, 23 | groups=input_channels, 24 | **kwargs 25 | ) 26 | 27 | self.pointwise = nn.Conv2d( 28 | input_channels, 29 | output_channels, 30 | 1 31 | ) 32 | def forward(self, x): 33 | x = self.depthwise(x) 34 | x = self.pointwise(x) 35 | 36 | return x 37 | 38 | class SeperableBranch(nn.Module): 39 | 40 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 41 | """Adds 2 blocks of [relu-separable conv-batchnorm].""" 42 | super().__init__() 43 | self.block1 = nn.Sequential( 44 | nn.ReLU(), 45 | SeperableConv2d(input_channels, output_channels, kernel_size, **kwargs), 46 | nn.BatchNorm2d(output_channels) 47 | ) 48 | 49 | self.block2 = nn.Sequential( 50 | nn.ReLU(), 51 | SeperableConv2d(output_channels, output_channels, kernel_size, stride=1, padding=int(kernel_size / 2)), 52 | nn.BatchNorm2d(output_channels) 53 | ) 54 | 55 | def forward(self, x): 56 | x = self.block1(x) 57 | x = self.block2(x) 58 | 59 | return x 60 | 61 | class Fit(nn.Module): 62 | """Make the cell outputs compatible 63 | 64 | Args: 65 | prev_filters: filter number of tensor prev, needs to be modified 66 | filters: filter number of normal cell branch output filters 67 | """ 68 | 69 | def __init__(self, prev_filters, filters): 70 | super().__init__() 71 | self.relu = nn.ReLU() 72 | 73 | self.p1 = nn.Sequential( 74 | nn.AvgPool2d(1, stride=2), 75 | nn.Conv2d(prev_filters, int(filters / 2), 1) 76 | ) 77 | 78 | #make sure there is no information loss 79 | self.p2 = nn.Sequential( 80 | nn.ConstantPad2d((0, 1, 0, 1), 0), 81 | nn.ConstantPad2d((-1, 0, -1, 0), 0), #cropping 82 | nn.AvgPool2d(1, stride=2), 83 | nn.Conv2d(prev_filters, int(filters / 2), 1) 84 | ) 85 | 86 | self.bn = nn.BatchNorm2d(filters) 87 | 88 | self.dim_reduce = nn.Sequential( 89 | nn.ReLU(), 90 | nn.Conv2d(prev_filters, filters, 1), 91 | nn.BatchNorm2d(filters) 92 | ) 93 | 94 | self.filters = filters 95 | 96 | def forward(self, inputs): 97 | x, prev = inputs 98 | if prev is None: 99 | return x 100 | 101 | #image size does not match 102 | elif x.size(2) != prev.size(2): 103 | prev = self.relu(prev) 104 | p1 = self.p1(prev) 105 | p2 = self.p2(prev) 106 | prev = torch.cat([p1, p2], 1) 107 | prev = self.bn(prev) 108 | 109 | elif prev.size(1) != self.filters: 110 | prev = self.dim_reduce(prev) 111 | 112 | return prev 113 | 114 | 115 | class NormalCell(nn.Module): 116 | 117 | def __init__(self, x_in, prev_in, output_channels): 118 | super().__init__() 119 | 120 | self.dem_reduce = nn.Sequential( 121 | nn.ReLU(), 122 | nn.Conv2d(x_in, output_channels, 1, bias=False), 123 | nn.BatchNorm2d(output_channels) 124 | ) 125 | 126 | self.block1_left = SeperableBranch( 127 | output_channels, 128 | output_channels, 129 | kernel_size=3, 130 | padding=1, 131 | bias=False 132 | ) 133 | self.block1_right = nn.Sequential() 134 | 135 | self.block2_left = SeperableBranch( 136 | output_channels, 137 | output_channels, 138 | kernel_size=3, 139 | padding=1, 140 | bias=False 141 | ) 
142 | self.block2_right = SeperableBranch( 143 | output_channels, 144 | output_channels, 145 | kernel_size=5, 146 | padding=2, 147 | bias=False 148 | ) 149 | 150 | self.block3_left = nn.AvgPool2d(3, stride=1, padding=1) 151 | self.block3_right = nn.Sequential() 152 | 153 | self.block4_left = nn.AvgPool2d(3, stride=1, padding=1) 154 | self.block4_right = nn.AvgPool2d(3, stride=1, padding=1) 155 | 156 | self.block5_left = SeperableBranch( 157 | output_channels, 158 | output_channels, 159 | kernel_size=5, 160 | padding=2, 161 | bias=False 162 | ) 163 | self.block5_right = SeperableBranch( 164 | output_channels, 165 | output_channels, 166 | kernel_size=3, 167 | padding=1, 168 | bias=False 169 | ) 170 | 171 | self.fit = Fit(prev_in, output_channels) 172 | 173 | def forward(self, x): 174 | x, prev = x 175 | 176 | #return transformed x as new x, and original x as prev 177 | #only prev tensor needs to be modified 178 | prev = self.fit((x, prev)) 179 | 180 | h = self.dem_reduce(x) 181 | 182 | x1 = self.block1_left(h) + self.block1_right(h) 183 | x2 = self.block2_left(prev) + self.block2_right(h) 184 | x3 = self.block3_left(h) + self.block3_right(h) 185 | x4 = self.block4_left(prev) + self.block4_right(prev) 186 | x5 = self.block5_left(prev) + self.block5_right(prev) 187 | 188 | return torch.cat([prev, x1, x2, x3, x4, x5], 1), x 189 | 190 | class ReductionCell(nn.Module): 191 | 192 | def __init__(self, x_in, prev_in, output_channels): 193 | super().__init__() 194 | 195 | self.dim_reduce = nn.Sequential( 196 | nn.ReLU(), 197 | nn.Conv2d(x_in, output_channels, 1), 198 | nn.BatchNorm2d(output_channels) 199 | ) 200 | 201 | #block1 202 | self.layer1block1_left = SeperableBranch(output_channels, output_channels, 7, stride=2, padding=3) 203 | self.layer1block1_right = SeperableBranch(output_channels, output_channels, 5, stride=2, padding=2) 204 | 205 | #block2 206 | self.layer1block2_left = nn.MaxPool2d(3, stride=2, padding=1) 207 | self.layer1block2_right = SeperableBranch(output_channels, output_channels, 7, stride=2, padding=3) 208 | 209 | #block3 210 | self.layer1block3_left = nn.AvgPool2d(3, 2, 1) 211 | self.layer1block3_right = SeperableBranch(output_channels, output_channels, 5, stride=2, padding=2) 212 | 213 | #block5 214 | self.layer2block1_left = nn.MaxPool2d(3, 2, 1) 215 | self.layer2block1_right = SeperableBranch(output_channels, output_channels, 3, stride=1, padding=1) 216 | 217 | #block4 218 | self.layer2block2_left = nn.AvgPool2d(3, 1, 1) 219 | self.layer2block2_right = nn.Sequential() 220 | 221 | self.fit = Fit(prev_in, output_channels) 222 | 223 | def forward(self, x): 224 | x, prev = x 225 | prev = self.fit((x, prev)) 226 | 227 | h = self.dim_reduce(x) 228 | 229 | layer1block1 = self.layer1block1_left(prev) + self.layer1block1_right(h) 230 | layer1block2 = self.layer1block2_left(h) + self.layer1block2_right(prev) 231 | layer1block3 = self.layer1block3_left(h) + self.layer1block3_right(prev) 232 | layer2block1 = self.layer2block1_left(h) + self.layer2block1_right(layer1block1) 233 | layer2block2 = self.layer2block2_left(layer1block1) + self.layer2block2_right(layer1block2) 234 | 235 | return torch.cat([ 236 | layer1block2, #https://github.com/keras-team/keras-applications/blob/master/keras_applications/nasnet.py line 739 237 | layer1block3, 238 | layer2block1, 239 | layer2block2 240 | ], 1), x 241 | 242 | 243 | class NasNetA(nn.Module): 244 | 245 | def __init__(self, repeat_cell_num, reduction_num, filters, stemfilter, class_num=100): 246 | super().__init__() 247 | 248 | self.stem = 
nn.Sequential( 249 | nn.Conv2d(3, stemfilter, 3, padding=1, bias=False), 250 | nn.BatchNorm2d(stemfilter) 251 | ) 252 | 253 | self.prev_filters = stemfilter 254 | self.x_filters = stemfilter 255 | self.filters = filters 256 | 257 | self.cell_layers = self._make_layers(repeat_cell_num, reduction_num) 258 | 259 | self.relu = nn.ReLU() 260 | self.avg = nn.AdaptiveAvgPool2d(1) 261 | self.fc = nn.Linear(self.filters * 6, class_num) 262 | 263 | 264 | def _make_normal(self, block, repeat, output): 265 | """make normal cell 266 | Args: 267 | block: cell type 268 | repeat: number of repeated normal cell 269 | output: output filters for each branch in normal cell 270 | Returns: 271 | stacked normal cells 272 | """ 273 | 274 | layers = [] 275 | for r in range(repeat): 276 | layers.append(block(self.x_filters, self.prev_filters, output)) 277 | self.prev_filters = self.x_filters 278 | self.x_filters = output * 6 #concatenate 6 branches 279 | 280 | return layers 281 | 282 | def _make_reduction(self, block, output): 283 | """make normal cell 284 | Args: 285 | block: cell type 286 | output: output filters for each branch in reduction cell 287 | Returns: 288 | reduction cell 289 | """ 290 | 291 | reduction = block(self.x_filters, self.prev_filters, output) 292 | self.prev_filters = self.x_filters 293 | self.x_filters = output * 4 #stack for 4 branches 294 | 295 | return reduction 296 | 297 | def _make_layers(self, repeat_cell_num, reduction_num): 298 | 299 | layers = [] 300 | for i in range(reduction_num): 301 | 302 | layers.extend(self._make_normal(NormalCell, repeat_cell_num, self.filters)) 303 | self.filters *= 2 304 | layers.append(self._make_reduction(ReductionCell, self.filters)) 305 | 306 | layers.extend(self._make_normal(NormalCell, repeat_cell_num, self.filters)) 307 | 308 | return nn.Sequential(*layers) 309 | 310 | 311 | def forward(self, x): 312 | 313 | x = self.stem(x) 314 | prev = None 315 | x, prev = self.cell_layers((x, prev)) 316 | x = self.relu(x) 317 | x = self.avg(x) 318 | x = x.view(x.size(0), -1) 319 | x = self.fc(x) 320 | 321 | return x 322 | 323 | 324 | def nasnet(): 325 | 326 | #stem filters must be 44, it's a pytorch workaround, cant change to other number 327 | return NasNetA(4, 2, 44, 44) 328 | 329 | -------------------------------------------------------------------------------- /Code/benchmarking/inceptionv3.py: -------------------------------------------------------------------------------- 1 | """ inceptionv3 in pytorch 2 | 3 | 4 | [1] Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, Zbigniew Wojna 5 | 6 | Rethinking the Inception Architecture for Computer Vision 7 | https://arxiv.org/abs/1512.00567v3 8 | """ 9 | 10 | import torch 11 | import torch.nn as nn 12 | 13 | 14 | class BasicConv2d(nn.Module): 15 | 16 | def __init__(self, input_channels, output_channels, **kwargs): 17 | super().__init__() 18 | self.conv = nn.Conv2d(input_channels, output_channels, bias=False, **kwargs) 19 | self.bn = nn.BatchNorm2d(output_channels) 20 | self.relu = nn.ReLU(inplace=True) 21 | 22 | def forward(self, x): 23 | x = self.conv(x) 24 | x = self.bn(x) 25 | x = self.relu(x) 26 | 27 | return x 28 | 29 | #same naive inception module 30 | class InceptionA(nn.Module): 31 | 32 | def __init__(self, input_channels, pool_features): 33 | super().__init__() 34 | self.branch1x1 = BasicConv2d(input_channels, 64, kernel_size=1) 35 | 36 | self.branch5x5 = nn.Sequential( 37 | BasicConv2d(input_channels, 48, kernel_size=1), 38 | BasicConv2d(48, 64, kernel_size=5, padding=2) 39 | ) 40 | 41 | 
self.branch3x3 = nn.Sequential( 42 | BasicConv2d(input_channels, 64, kernel_size=1), 43 | BasicConv2d(64, 96, kernel_size=3, padding=1), 44 | BasicConv2d(96, 96, kernel_size=3, padding=1) 45 | ) 46 | 47 | self.branchpool = nn.Sequential( 48 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 49 | BasicConv2d(input_channels, pool_features, kernel_size=3, padding=1) 50 | ) 51 | 52 | def forward(self, x): 53 | 54 | #x -> 1x1(same) 55 | branch1x1 = self.branch1x1(x) 56 | 57 | #x -> 1x1 -> 5x5(same) 58 | branch5x5 = self.branch5x5(x) 59 | #branch5x5 = self.branch5x5_2(branch5x5) 60 | 61 | #x -> 1x1 -> 3x3 -> 3x3(same) 62 | branch3x3 = self.branch3x3(x) 63 | 64 | #x -> pool -> 1x1(same) 65 | branchpool = self.branchpool(x) 66 | 67 | outputs = [branch1x1, branch5x5, branch3x3, branchpool] 68 | 69 | return torch.cat(outputs, 1) 70 | 71 | #downsample 72 | #Factorization into smaller convolutions 73 | class InceptionB(nn.Module): 74 | 75 | def __init__(self, input_channels): 76 | super().__init__() 77 | 78 | self.branch3x3 = BasicConv2d(input_channels, 384, kernel_size=3, stride=2) 79 | 80 | self.branch3x3stack = nn.Sequential( 81 | BasicConv2d(input_channels, 64, kernel_size=1), 82 | BasicConv2d(64, 96, kernel_size=3, padding=1), 83 | BasicConv2d(96, 96, kernel_size=3, stride=2) 84 | ) 85 | 86 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2) 87 | 88 | def forward(self, x): 89 | 90 | #x - > 3x3(downsample) 91 | branch3x3 = self.branch3x3(x) 92 | 93 | #x -> 3x3 -> 3x3(downsample) 94 | branch3x3stack = self.branch3x3stack(x) 95 | 96 | #x -> avgpool(downsample) 97 | branchpool = self.branchpool(x) 98 | 99 | #"""We can use two parallel stride 2 blocks: P and C. P is a pooling 100 | #layer (either average or maximum pooling) the activation, both of 101 | #them are stride 2 the filter banks of which are concatenated as in 102 | #figure 10.""" 103 | outputs = [branch3x3, branch3x3stack, branchpool] 104 | 105 | return torch.cat(outputs, 1) 106 | 107 | #Factorizing Convolutions with Large Filter Size 108 | class InceptionC(nn.Module): 109 | def __init__(self, input_channels, channels_7x7): 110 | super().__init__() 111 | self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1) 112 | 113 | c7 = channels_7x7 114 | 115 | #In theory, we could go even further and argue that one can replace any n × n 116 | #convolution by a 1 × n convolution followed by a n × 1 convolution and the 117 | #computational cost saving increases dramatically as n grows (see figure 6). 
118 | self.branch7x7 = nn.Sequential( 119 | BasicConv2d(input_channels, c7, kernel_size=1), 120 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 121 | BasicConv2d(c7, 192, kernel_size=(1, 7), padding=(0, 3)) 122 | ) 123 | 124 | self.branch7x7stack = nn.Sequential( 125 | BasicConv2d(input_channels, c7, kernel_size=1), 126 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 127 | BasicConv2d(c7, c7, kernel_size=(1, 7), padding=(0, 3)), 128 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 129 | BasicConv2d(c7, 192, kernel_size=(1, 7), padding=(0, 3)) 130 | ) 131 | 132 | self.branch_pool = nn.Sequential( 133 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 134 | BasicConv2d(input_channels, 192, kernel_size=1), 135 | ) 136 | 137 | def forward(self, x): 138 | 139 | #x -> 1x1(same) 140 | branch1x1 = self.branch1x1(x) 141 | 142 | #x -> 1layer 1*7 and 7*1 (same) 143 | branch7x7 = self.branch7x7(x) 144 | 145 | #x-> 2layer 1*7 and 7*1(same) 146 | branch7x7stack = self.branch7x7stack(x) 147 | 148 | #x-> avgpool (same) 149 | branchpool = self.branch_pool(x) 150 | 151 | outputs = [branch1x1, branch7x7, branch7x7stack, branchpool] 152 | 153 | return torch.cat(outputs, 1) 154 | 155 | class InceptionD(nn.Module): 156 | 157 | def __init__(self, input_channels): 158 | super().__init__() 159 | 160 | self.branch3x3 = nn.Sequential( 161 | BasicConv2d(input_channels, 192, kernel_size=1), 162 | BasicConv2d(192, 320, kernel_size=3, stride=2) 163 | ) 164 | 165 | self.branch7x7 = nn.Sequential( 166 | BasicConv2d(input_channels, 192, kernel_size=1), 167 | BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3)), 168 | BasicConv2d(192, 192, kernel_size=(7, 1), padding=(3, 0)), 169 | BasicConv2d(192, 192, kernel_size=3, stride=2) 170 | ) 171 | 172 | self.branchpool = nn.AvgPool2d(kernel_size=3, stride=2) 173 | 174 | def forward(self, x): 175 | 176 | #x -> 1x1 -> 3x3(downsample) 177 | branch3x3 = self.branch3x3(x) 178 | 179 | #x -> 1x1 -> 1x7 -> 7x1 -> 3x3 (downsample) 180 | branch7x7 = self.branch7x7(x) 181 | 182 | #x -> avgpool (downsample) 183 | branchpool = self.branchpool(x) 184 | 185 | outputs = [branch3x3, branch7x7, branchpool] 186 | 187 | return torch.cat(outputs, 1) 188 | 189 | 190 | #same 191 | class InceptionE(nn.Module): 192 | def __init__(self, input_channels): 193 | super().__init__() 194 | self.branch1x1 = BasicConv2d(input_channels, 320, kernel_size=1) 195 | 196 | self.branch3x3_1 = BasicConv2d(input_channels, 384, kernel_size=1) 197 | self.branch3x3_2a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1)) 198 | self.branch3x3_2b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0)) 199 | 200 | self.branch3x3stack_1 = BasicConv2d(input_channels, 448, kernel_size=1) 201 | self.branch3x3stack_2 = BasicConv2d(448, 384, kernel_size=3, padding=1) 202 | self.branch3x3stack_3a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1)) 203 | self.branch3x3stack_3b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0)) 204 | 205 | self.branch_pool = nn.Sequential( 206 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 207 | BasicConv2d(input_channels, 192, kernel_size=1) 208 | ) 209 | 210 | def forward(self, x): 211 | 212 | #x -> 1x1 (same) 213 | branch1x1 = self.branch1x1(x) 214 | 215 | # x -> 1x1 -> 3x1 216 | # x -> 1x1 -> 1x3 217 | # concatenate(3x1, 1x3) 218 | #"""7. Inception modules with expanded the filter bank outputs. 
219 | #This architecture is used on the coarsest (8 × 8) grids to promote 220 | #high dimensional representations, as suggested by principle 221 | #2 of Section 2.""" 222 | branch3x3 = self.branch3x3_1(x) 223 | branch3x3 = [ 224 | self.branch3x3_2a(branch3x3), 225 | self.branch3x3_2b(branch3x3) 226 | ] 227 | branch3x3 = torch.cat(branch3x3, 1) 228 | 229 | # x -> 1x1 -> 3x3 -> 1x3 230 | # x -> 1x1 -> 3x3 -> 3x1 231 | #concatenate(1x3, 3x1) 232 | branch3x3stack = self.branch3x3stack_1(x) 233 | branch3x3stack = self.branch3x3stack_2(branch3x3stack) 234 | branch3x3stack = [ 235 | self.branch3x3stack_3a(branch3x3stack), 236 | self.branch3x3stack_3b(branch3x3stack) 237 | ] 238 | branch3x3stack = torch.cat(branch3x3stack, 1) 239 | 240 | branchpool = self.branch_pool(x) 241 | 242 | outputs = [branch1x1, branch3x3, branch3x3stack, branchpool] 243 | 244 | return torch.cat(outputs, 1) 245 | 246 | class InceptionV3(nn.Module): 247 | 248 | def __init__(self, num_classes=100): 249 | super().__init__() 250 | self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, padding=1) 251 | self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3, padding=1) 252 | self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1) 253 | self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1) 254 | self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3) 255 | 256 | #naive inception module 257 | self.Mixed_5b = InceptionA(192, pool_features=32) 258 | self.Mixed_5c = InceptionA(256, pool_features=64) 259 | self.Mixed_5d = InceptionA(288, pool_features=64) 260 | 261 | #downsample 262 | self.Mixed_6a = InceptionB(288) 263 | 264 | self.Mixed_6b = InceptionC(768, channels_7x7=128) 265 | self.Mixed_6c = InceptionC(768, channels_7x7=160) 266 | self.Mixed_6d = InceptionC(768, channels_7x7=160) 267 | self.Mixed_6e = InceptionC(768, channels_7x7=192) 268 | 269 | #downsample 270 | self.Mixed_7a = InceptionD(768) 271 | 272 | self.Mixed_7b = InceptionE(1280) 273 | self.Mixed_7c = InceptionE(2048) 274 | 275 | #6*6 feature size 276 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 277 | self.dropout = nn.Dropout2d() 278 | self.linear = nn.Linear(2048, num_classes) 279 | 280 | def forward(self, x): 281 | 282 | #32 -> 30 283 | x = self.Conv2d_1a_3x3(x) 284 | x = self.Conv2d_2a_3x3(x) 285 | x = self.Conv2d_2b_3x3(x) 286 | x = self.Conv2d_3b_1x1(x) 287 | x = self.Conv2d_4a_3x3(x) 288 | 289 | #30 -> 30 290 | x = self.Mixed_5b(x) 291 | x = self.Mixed_5c(x) 292 | x = self.Mixed_5d(x) 293 | 294 | #30 -> 14 295 | #Efficient Grid Size Reduction to avoid representation 296 | #bottleneck 297 | x = self.Mixed_6a(x) 298 | 299 | #14 -> 14 300 | #"""In practice, we have found that employing this factorization does not 301 | #work well on early layers, but it gives very good results on medium 302 | #grid-sizes (On m × m feature maps, where m ranges between 12 and 20). 
303 | #On that level, very good results can be achieved by using 1 × 7 convolutions 304 | #followed by 7 × 1 convolutions.""" 305 | x = self.Mixed_6b(x) 306 | x = self.Mixed_6c(x) 307 | x = self.Mixed_6d(x) 308 | x = self.Mixed_6e(x) 309 | 310 | #14 -> 6 311 | #Efficient Grid Size Reduction 312 | x = self.Mixed_7a(x) 313 | 314 | #6 -> 6 315 | #We are using this solution only on the coarsest grid, 316 | #since that is the place where producing high dimensional 317 | #sparse representation is the most critical as the ratio of 318 | #local processing (by 1 × 1 convolutions) is increased compared 319 | #to the spatial aggregation.""" 320 | x = self.Mixed_7b(x) 321 | x = self.Mixed_7c(x) 322 | 323 | #6 -> 1 324 | x = self.avgpool(x) 325 | x = self.dropout(x) 326 | x = x.view(x.size(0), -1) 327 | x = self.linear(x) 328 | return x 329 | 330 | 331 | def inceptionv3(): 332 | return InceptionV3() 333 | 334 | 335 | 336 | -------------------------------------------------------------------------------- /Code/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torchvision 11 | import torchvision.transforms as transforms 12 | 13 | from torch.utils.data import DataLoader 14 | from dataset import get_dataloaders 15 | #from dataset import * 16 | from torch.autograd import Variable 17 | 18 | from tensorboardX import SummaryWriter 19 | 20 | from conf import settings 21 | from utils import get_network, get_training_dataloader, get_test_dataloader, WarmUpLR 22 | from nni.compression.torch import AGP_Pruner, Pruner 23 | from nni.compression.torch.pruning.weight_masker import WeightMasker 24 | from nni.compression.torch.pruning.structured_pruning import ActivationFilterPrunerMasker, StructuredWeightMasker 25 | 26 | import matplotlib.pyplot as plt 27 | import timeit 28 | from tqdm import tqdm 29 | 30 | 31 | def draw_weights(weights, index): 32 | print(weights.shape) 33 | weights = weights.cpu().numpy() 34 | for i in range(len(weights)): 35 | fig = plt.figure() 36 | plt.bar(np.arange(len(weights[i])), weights[i]) 37 | #plt.show() 38 | fig.savefig("scaling/scaling" + str(index)) 39 | print("min = ", min(weights[i])) 40 | print("# of zeros = ", np.count_nonzero(weights[i] == 0)) 41 | 42 | class MyMasker(StructuredWeightMasker): 43 | def calc_mask(self, sparsity, wrapper, wrapper_idx=None): 44 | weight = wrapper.module.weight.data 45 | bias = None 46 | if hasattr(wrapper.module, 'bias') and wrapper.module.bias is not None: 47 | bias = wrapper.module.bias.data 48 | 49 | if wrapper.weight_mask is None: 50 | mask_weight = torch.ones(weight.size()).type_as(weight).detach() 51 | else: 52 | mask_weight = wrapper.weight_mask.clone() 53 | if bias is not None: 54 | if wrapper.bias_mask is None: 55 | mask_bias = torch.ones(bias.size()).type_as(bias).detach() 56 | else: 57 | mask_bias = wrapper.bias_mask.clone() 58 | else: 59 | mask_bias = None 60 | mask = {'weight_mask': mask_weight, 'bias_mask': mask_bias} 61 | 62 | filters = weight.size(0) 63 | num_prune = int(filters * sparsity) 64 | if filters < 2 or num_prune < 1: 65 | return mask 66 | # weight*mask_weight: apply base mask for iterative pruning 67 | return self.get_mask(mask, weight*mask_weight, num_prune, wrapper, wrapper_idx) 68 | 69 | def get_mask(self, base_mask, weight, num_prune, wrapper, wrapper_idx): 70 | 
print(activation[list(activation.keys())[wrapper_idx]]) 71 | print(activation[list(activation.keys())[wrapper_idx]].shape) 72 | mask = torch.mean(activation[list(activation.keys())[wrapper_idx]], dim = 0, keepdims = True) 73 | draw_weights(mask, wrapper_idx) 74 | threshold = torch.topk(mask[0], k = num_prune, dim = 0, largest=False)[0].max() 75 | mask_weight = torch.gt(mask[0], threshold)[:, None, None, None].expand_as(weight).type_as(weight) 76 | mask_bias = torch.gt(mask[0], threshold).type_as(weight).detach() if base_mask['bias_mask'] is not None else None 77 | return {'weight_mask': mask_weight.detach(), 'bias_mask': mask_bias} 78 | 79 | 80 | 81 | class MyPruner(Pruner): 82 | def __init__(self, model, config_list, optimizer): 83 | super().__init__(model, config_list, optimizer) 84 | self.set_wrappers_attribute("if_calculated", False) 85 | # construct a weight masker instance 86 | self.masker = MyMasker(model, self) 87 | 88 | def calc_mask(self, wrapper, wrapper_idx=None): 89 | sparsity = wrapper.config['sparsity'] 90 | if wrapper.if_calculated: 91 | # Already pruned, do not prune again as a one-shot pruner 92 | return None 93 | else: 94 | # call your masker to actually calcuate the mask for this layer 95 | masks = self.masker.calc_mask(sparsity=sparsity, wrapper=wrapper, wrapper_idx=wrapper_idx) 96 | wrapper.if_calculated = True 97 | return masks 98 | 99 | def train(epoch): 100 | 101 | net.train() 102 | for batch_index, (images, labels) in enumerate(training_loader): 103 | if epoch <= args.warm: 104 | warmup_scheduler.step() 105 | 106 | images = Variable(images) 107 | labels = Variable(labels) 108 | 109 | labels = labels.cuda() 110 | images = images.cuda() 111 | 112 | optimizer.zero_grad() 113 | outputs = net(images) 114 | loss = loss_function(outputs, labels) 115 | loss.backward() 116 | optimizer.step() 117 | 118 | n_iter = (epoch - 1) * len(training_loader) + batch_index + 1 119 | 120 | last_layer = list(net.children())[-1] 121 | for name, para in last_layer.named_parameters(): 122 | if 'weight' in name: 123 | writer.add_scalar('LastLayerGradients/grad_norm2_weights', para.grad.norm(), n_iter) 124 | if 'bias' in name: 125 | writer.add_scalar('LastLayerGradients/grad_norm2_bias', para.grad.norm(), n_iter) 126 | 127 | print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format( 128 | loss.item(), 129 | optimizer.param_groups[0]['lr'], 130 | epoch=epoch, 131 | trained_samples=batch_index * args.b + len(images), 132 | total_samples=len(training_loader.dataset) 133 | )) 134 | 135 | #update training loss for each iteration 136 | writer.add_scalar('Train/loss', loss.item(), n_iter) 137 | 138 | for name, param in net.named_parameters(): 139 | layer, attr = os.path.splitext(name) 140 | attr = attr[1:] 141 | writer.add_histogram("{}/{}".format(layer, attr), param, epoch) 142 | 143 | 144 | def Average(lst): 145 | return sum(lst) / len(lst) 146 | 147 | 148 | def eval_training(epoch): 149 | net.eval() 150 | 151 | test_loss = 0.0 # cost function error 152 | correct = 0.0 153 | inference_time=[] 154 | 155 | # calculate FLOPS: 156 | from thop import profile 157 | 158 | macs, params = profile(net, inputs=(torch.randn(1, 3, settings.IMG_SIZE, settings.IMG_SIZE).cuda(), )) 159 | print("macs = ", macs) 160 | print("params = ", params) 161 | 162 | for (images, labels) in tqdm(test_loader): 163 | with torch.no_grad(): 164 | start = timeit.default_timer() 165 | images = Variable(images) 166 | labels = Variable(labels) 167 | 168 | images = images.cuda() 169 | 
labels = labels.cuda() 170 | 171 | outputs = net(images) 172 | stop = timeit.default_timer() 173 | inference_time.append(stop-start) 174 | loss = loss_function(outputs, labels) 175 | test_loss += loss.item() 176 | _, preds = outputs.max(1) 177 | correct += preds.eq(labels).sum() 178 | 179 | print("FPS = ", 1/Average(inference_time)) 180 | 181 | 182 | print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format( 183 | test_loss / len(test_loader.dataset), 184 | correct.float() / len(test_loader.dataset) 185 | )) 186 | print() 187 | 188 | #add informations to tensorboard 189 | writer.add_scalar('Test/Average loss', test_loss / len(test_loader.dataset), epoch) 190 | writer.add_scalar('Test/Accuracy', correct.float() / len(test_loader.dataset), epoch) 191 | 192 | return correct.float() / len(test_loader.dataset) 193 | 194 | num_classes = {'dogs': 120, 'tiny-imagenet': 200, 'cifar100': 100, 'cifar10': 10, 'caltech': 257, 'imagenet': 1000} 195 | if __name__ == '__main__': 196 | # #config for pruner 197 | # config_list = [{ 198 | # 'initial_sparsity': 0.0, 199 | # 'final_sparsity': 0.8, 200 | # 'start_epoch': 0, 201 | # 'end_epoch': 200, 202 | # 'frequency': 1, 203 | # 'op_types': ['Conv2d'] 204 | # }] 205 | parser = argparse.ArgumentParser() 206 | parser.add_argument('-net', type=str, required=True, help='net type') 207 | parser.add_argument('-gpu', type=bool, default=True, help='use gpu or not') 208 | parser.add_argument('-w', type=int, default=8, help='number of workers for dataloader') 209 | parser.add_argument('-b', type=int, default=32, help='batch size for dataloader') 210 | parser.add_argument('-s', type=bool, default=True, help='whether shuffle the dataset') 211 | parser.add_argument('-warm', type=int, default=1, help='warm up training phase') 212 | parser.add_argument('-lr', type=float, default=0.1, help='initial learning rate') 213 | parser.add_argument('-weights', type=str, default='', help='the weights file you want to load') 214 | parser.add_argument('-data', type=str, default='dogs', help='the weights file you want to load') 215 | args = parser.parse_args() 216 | 217 | net = get_network(args, use_gpu=args.gpu, num_classes = num_classes[args.data]) 218 | # print number of paramters 219 | pytorch_total_params = sum(p.numel() for p in net.parameters()) 220 | print("number of network paramters are ", pytorch_total_params) 221 | pytorch_total_params = sum(p.numel() for p in net.parameters() if p.requires_grad) 222 | print("number of network Trainable paramters are ", pytorch_total_params) 223 | 224 | 225 | if args.weights != '': 226 | net.load_state_dict(torch.load(args.weights), args.gpu) 227 | print('loaded checkpoint') 228 | 229 | dataloaders = get_dataloaders(args.b, args.data) 230 | #data preprocessing:aset) 231 | #data preprocessing: 232 | training_loader = dataloaders['train'] 233 | 234 | test_loader = dataloaders['val'] 235 | 236 | loss_function = nn.CrossEntropyLoss() 237 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) 238 | #""" 239 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.2, patience=9, 240 | verbose=True, threshold=0.001, threshold_mode='rel', 241 | cooldown=0, min_lr=1e-6, eps=1e-08) 242 | #""" 243 | #train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2) #learning rate decay 244 | iter_per_epoch = len(training_loader) 245 | warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm) 246 | checkpoint_path = 
os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW) 247 | # #pruner 248 | # pruner = AGP_Pruner(net, config_list, optimizer, pruning_algorithm='l1') 249 | # pruner.compress() 250 | # activation = {} 251 | # conv_layers = [] 252 | # def get_activation(name): 253 | # def hook(model, input, output): 254 | # activation[name] = output.detach() 255 | # return hook 256 | # for name, module in net.named_modules(): 257 | # # print(name) 258 | # if len(name) > 1 and name[-1] == '3' and 'excitation' in name: 259 | # module.register_forward_hook(get_activation(name)) 260 | # elif len(name) > 1 and (name[-13 : ] == "bottle_neck.5" or name[-10:] == 'residual.6'): 261 | # conv_layers.append(name) 262 | 263 | # print(conv_layers) 264 | # # print(net) 265 | # config_list = [{ 266 | # 'sparsity': 0.5, 267 | # 'op_types': ['Conv2d'], 268 | # 'op_names': conv_layers}] 269 | 270 | # print(net) 271 | 272 | # use tensorboard 273 | if not os.path.exists(settings.LOG_DIR): 274 | os.mkdir(settings.LOG_DIR) 275 | writer = SummaryWriter(log_dir=os.path.join( 276 | settings.LOG_DIR, args.net, settings.TIME_NOW)) 277 | input_tensor = torch.Tensor(1, 3, settings.IMG_SIZE, settings.IMG_SIZE).cuda() 278 | writer.add_graph(net, Variable(input_tensor, requires_grad=True)) 279 | # pruner = MyPruner(net, config_list, optimizer) 280 | 281 | #acc = eval_training(0) 282 | # pruner.compress() 283 | # pruner.export_model(model_path='test.pth', mask_path='test.pth') 284 | # exit() 285 | #create checkpoint folder to save model 286 | if not os.path.exists(checkpoint_path): 287 | os.makedirs(checkpoint_path) 288 | checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth') 289 | 290 | best_acc = 0.0 291 | for epoch in range(1, settings.EPOCH): 292 | #update pruner 293 | # pruner.update_epoch(epoch) 294 | #if epoch > args.warm: 295 | # train_scheduler.step(epoch) 296 | 297 | 298 | train(epoch) 299 | acc = eval_training(epoch) 300 | 301 | if epoch > args.warm: 302 | scheduler.step(acc) 303 | 304 | #start to save best performance model after learning rate decay to 0.01 305 | if best_acc < acc: 306 | torch.save(net.state_dict(), checkpoint_path.format(net=args.net, epoch=epoch, type='best')) 307 | best_acc = acc 308 | continue 309 | 310 | if not epoch % settings.SAVE_EPOCH: 311 | torch.save(net.state_dict(), checkpoint_path.format(net=args.net, epoch=epoch, type='regular')) 312 | # pruner.export_model(model_path='model_l1_freq1.pth', mask_path='mask_l1_freq1.pth') 313 | writer.close() 314 | -------------------------------------------------------------------------------- /Code/benchmarking/attention.py: -------------------------------------------------------------------------------- 1 | """residual attention network in pytorch 2 | 3 | 4 | 5 | [1] Fei Wang, Mengqing Jiang, Chen Qian, Shuo Yang, Cheng Li, Honggang Zhang, Xiaogang Wang, Xiaoou Tang 6 | 7 | Residual Attention Network for Image Classification 8 | https://arxiv.org/abs/1704.06904 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | #"""The Attention Module is built by pre-activation Residual Unit [11] with the 16 | #number of channels in each stage is the same as ResNet [10].""" 17 | 18 | class PreActResidualUnit(nn.Module): 19 | """PreAct Residual Unit 20 | Args: 21 | in_channels: residual unit input channel number 22 | out_channels: residual unit output channel numebr 23 | stride: stride of residual unit when stride = 2, downsample the featuremap 24 | """ 25 | 26 | def __init__(self, 
in_channels, out_channels, stride): 27 | super().__init__() 28 | 29 | bottleneck_channels = int(out_channels / 4) 30 | self.residual_function = nn.Sequential( 31 | #1x1 conv 32 | nn.BatchNorm2d(in_channels), 33 | nn.ReLU(inplace=True), 34 | nn.Conv2d(in_channels, bottleneck_channels, 1, stride), 35 | 36 | #3x3 conv 37 | nn.BatchNorm2d(bottleneck_channels), 38 | nn.ReLU(inplace=True), 39 | nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1), 40 | 41 | #1x1 conv 42 | nn.BatchNorm2d(bottleneck_channels), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(bottleneck_channels, out_channels, 1) 45 | ) 46 | 47 | self.shortcut = nn.Sequential() 48 | if stride != 2 or (in_channels != out_channels): 49 | self.shortcut = nn.Conv2d(in_channels, out_channels, 1, stride=stride) 50 | 51 | def forward(self, x): 52 | 53 | res = self.residual_function(x) 54 | shortcut = self.shortcut(x) 55 | 56 | return res + shortcut 57 | 58 | class AttentionModule1(nn.Module): 59 | 60 | def __init__(self, in_channels, out_channels, p=1, t=2, r=1): 61 | super().__init__() 62 | #"""The hyperparameter p denotes the number of preprocessing Residual 63 | #Units before splitting into trunk branch and mask branch. t denotes 64 | #the number of Residual Units in trunk branch. r denotes the number of 65 | #Residual Units between adjacent pooling layer in the mask branch.""" 66 | assert in_channels == out_channels 67 | 68 | self.pre = self._make_residual(in_channels, out_channels, p) 69 | self.trunk = self._make_residual(in_channels, out_channels, t) 70 | self.soft_resdown1 = self._make_residual(in_channels, out_channels, r) 71 | self.soft_resdown2 = self._make_residual(in_channels, out_channels, r) 72 | self.soft_resdown3 = self._make_residual(in_channels, out_channels, r) 73 | self.soft_resdown4 = self._make_residual(in_channels, out_channels, r) 74 | 75 | self.soft_resup1 = self._make_residual(in_channels, out_channels, r) 76 | self.soft_resup2 = self._make_residual(in_channels, out_channels, r) 77 | self.soft_resup3 = self._make_residual(in_channels, out_channels, r) 78 | self.soft_resup4 = self._make_residual(in_channels, out_channels, r) 79 | 80 | self.shortcut_short = PreActResidualUnit(in_channels, out_channels, 1) 81 | self.shortcut_long = PreActResidualUnit(in_channels, out_channels, 1) 82 | 83 | self.sigmoid = nn.Sequential( 84 | nn.BatchNorm2d(out_channels), 85 | nn.ReLU(inplace=True), 86 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 87 | nn.BatchNorm2d(out_channels), 88 | nn.ReLU(inplace=True), 89 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 90 | nn.Sigmoid() 91 | ) 92 | 93 | self.last = self._make_residual(in_channels, out_channels, p) 94 | 95 | def forward(self, x): 96 | ###We make the size of the smallest output map in each mask branch 7*7 to be consistent 97 | #with the smallest trunk output map size. 98 | ###Thus 3,2,1 max-pooling layers are used in mask branch with input size 56 * 56, 28 * 28, 14 * 14 respectively. 
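# Forward pass: the trunk branch (x_t) and the soft mask branch (x_s) are computed in parallel.
# The mask branch applies three max-pool + residual-unit stages (keeping two intermediate
# shortcuts), then mirrors them with soft_resup units, F.interpolate upsampling and skip
# additions back to the input resolution, and ends in the 1x1-conv/sigmoid head.
# The two branches are fused with attention residual learning: out = (1 + mask) * trunk.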
99 | x = self.pre(x) 100 | input_size = (x.size(2), x.size(3)) 101 | 102 | x_t = self.trunk(x) 103 | 104 | #first downsample out 28 105 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 106 | x_s = self.soft_resdown1(x_s) 107 | 108 | #28 shortcut 109 | shape1 = (x_s.size(2), x_s.size(3)) 110 | shortcut_long = self.shortcut_long(x_s) 111 | 112 | #seccond downsample out 14 113 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 114 | x_s = self.soft_resdown2(x_s) 115 | 116 | #14 shortcut 117 | shape2 = (x_s.size(2), x_s.size(3)) 118 | shortcut_short = self.soft_resdown3(x_s) 119 | 120 | #third downsample out 7 121 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 122 | x_s = self.soft_resdown3(x_s) 123 | 124 | #mid 125 | x_s = self.soft_resdown4(x_s) 126 | x_s = self.soft_resup1(x_s) 127 | 128 | #first upsample out 14 129 | x_s = self.soft_resup2(x_s) 130 | x_s = F.interpolate(x_s, size=shape2) 131 | x_s += shortcut_short 132 | 133 | #second upsample out 28 134 | x_s = self.soft_resup3(x_s) 135 | x_s = F.interpolate(x_s, size=shape1) 136 | x_s += shortcut_long 137 | 138 | #thrid upsample out 54 139 | x_s = self.soft_resup4(x_s) 140 | x_s = F.interpolate(x_s, size=input_size) 141 | 142 | x_s = self.sigmoid(x_s) 143 | x = (1 + x_s) * x_t 144 | x = self.last(x) 145 | 146 | return x 147 | 148 | def _make_residual(self, in_channels, out_channels, p): 149 | 150 | layers = [] 151 | for _ in range(p): 152 | layers.append(PreActResidualUnit(in_channels, out_channels, 1)) 153 | 154 | return nn.Sequential(*layers) 155 | 156 | class AttentionModule2(nn.Module): 157 | 158 | def __init__(self, in_channels, out_channels, p=1, t=2, r=1): 159 | super().__init__() 160 | #"""The hyperparameter p denotes the number of preprocessing Residual 161 | #Units before splitting into trunk branch and mask branch. t denotes 162 | #the number of Residual Units in trunk branch. 
r denotes the number of 163 | #Residual Units between adjacent pooling layer in the mask branch.""" 164 | assert in_channels == out_channels 165 | 166 | self.pre = self._make_residual(in_channels, out_channels, p) 167 | self.trunk = self._make_residual(in_channels, out_channels, t) 168 | self.soft_resdown1 = self._make_residual(in_channels, out_channels, r) 169 | self.soft_resdown2 = self._make_residual(in_channels, out_channels, r) 170 | self.soft_resdown3 = self._make_residual(in_channels, out_channels, r) 171 | 172 | self.soft_resup1 = self._make_residual(in_channels, out_channels, r) 173 | self.soft_resup2 = self._make_residual(in_channels, out_channels, r) 174 | self.soft_resup3 = self._make_residual(in_channels, out_channels, r) 175 | 176 | self.shortcut = PreActResidualUnit(in_channels, out_channels, 1) 177 | 178 | self.sigmoid = nn.Sequential( 179 | nn.BatchNorm2d(out_channels), 180 | nn.ReLU(inplace=True), 181 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 182 | nn.BatchNorm2d(out_channels), 183 | nn.ReLU(inplace=True), 184 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 185 | nn.Sigmoid() 186 | ) 187 | 188 | self.last = self._make_residual(in_channels, out_channels, p) 189 | 190 | def forward(self, x): 191 | x = self.pre(x) 192 | input_size = (x.size(2), x.size(3)) 193 | 194 | x_t = self.trunk(x) 195 | 196 | #first downsample out 14 197 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 198 | x_s = self.soft_resdown1(x_s) 199 | 200 | #14 shortcut 201 | shape1 = (x_s.size(2), x_s.size(3)) 202 | shortcut = self.shortcut(x_s) 203 | 204 | #seccond downsample out 7 205 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 206 | x_s = self.soft_resdown2(x_s) 207 | 208 | #mid 209 | x_s = self.soft_resdown3(x_s) 210 | x_s = self.soft_resup1(x_s) 211 | 212 | #first upsample out 14 213 | x_s = self.soft_resup2(x_s) 214 | x_s = F.interpolate(x_s, size=shape1) 215 | x_s += shortcut 216 | 217 | #second upsample out 28 218 | x_s = self.soft_resup3(x_s) 219 | x_s = F.interpolate(x_s, size=input_size) 220 | 221 | x_s = self.sigmoid(x_s) 222 | x = (1 + x_s) * x_t 223 | x = self.last(x) 224 | 225 | return x 226 | 227 | def _make_residual(self, in_channels, out_channels, p): 228 | 229 | layers = [] 230 | for _ in range(p): 231 | layers.append(PreActResidualUnit(in_channels, out_channels, 1)) 232 | 233 | return nn.Sequential(*layers) 234 | 235 | class AttentionModule3(nn.Module): 236 | 237 | def __init__(self, in_channels, out_channels, p=1, t=2, r=1): 238 | super().__init__() 239 | 240 | assert in_channels == out_channels 241 | 242 | self.pre = self._make_residual(in_channels, out_channels, p) 243 | self.trunk = self._make_residual(in_channels, out_channels, t) 244 | self.soft_resdown1 = self._make_residual(in_channels, out_channels, r) 245 | self.soft_resdown2 = self._make_residual(in_channels, out_channels, r) 246 | 247 | self.soft_resup1 = self._make_residual(in_channels, out_channels, r) 248 | self.soft_resup2 = self._make_residual(in_channels, out_channels, r) 249 | 250 | self.shortcut = PreActResidualUnit(in_channels, out_channels, 1) 251 | 252 | self.sigmoid = nn.Sequential( 253 | nn.BatchNorm2d(out_channels), 254 | nn.ReLU(inplace=True), 255 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 256 | nn.BatchNorm2d(out_channels), 257 | nn.ReLU(inplace=True), 258 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 259 | nn.Sigmoid() 260 | ) 261 | 262 | self.last = self._make_residual(in_channels, out_channels, p) 263 | 264 | def forward(self, x): 265 | x 
= self.pre(x) 266 | input_size = (x.size(2), x.size(3)) 267 | 268 | x_t = self.trunk(x) 269 | 270 | #first downsample out 14 271 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 272 | x_s = self.soft_resdown1(x_s) 273 | 274 | #mid 275 | x_s = self.soft_resdown2(x_s) 276 | x_s = self.soft_resup1(x_s) 277 | 278 | #first upsample out 14 279 | x_s = self.soft_resup2(x_s) 280 | x_s = F.interpolate(x_s, size=input_size) 281 | 282 | x_s = self.sigmoid(x_s) 283 | x = (1 + x_s) * x_t 284 | x = self.last(x) 285 | 286 | return x 287 | 288 | def _make_residual(self, in_channels, out_channels, p): 289 | 290 | layers = [] 291 | for _ in range(p): 292 | layers.append(PreActResidualUnit(in_channels, out_channels, 1)) 293 | 294 | return nn.Sequential(*layers) 295 | 296 | class Attention(nn.Module): 297 | """residual attention network 298 | Args: 299 | block_num: attention module number for each stage 300 | """ 301 | 302 | def __init__(self, block_num, class_num=100): 303 | 304 | super().__init__() 305 | self.pre_conv = nn.Sequential( 306 | nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1), 307 | nn.BatchNorm2d(64), 308 | nn.ReLU(inplace=True) 309 | ) 310 | 311 | self.stage1 = self._make_stage(64, 256, block_num[0], AttentionModule1) 312 | self.stage2 = self._make_stage(256, 512, block_num[1], AttentionModule2) 313 | self.stage3 = self._make_stage(512, 1024, block_num[2], AttentionModule3) 314 | self.stage4 = nn.Sequential( 315 | PreActResidualUnit(1024, 2048, 2), 316 | PreActResidualUnit(2048, 2048, 1), 317 | PreActResidualUnit(2048, 2048, 1) 318 | ) 319 | self.avg = nn.AdaptiveAvgPool2d(1) 320 | self.linear = nn.Linear(2048, class_num) 321 | 322 | def forward(self, x): 323 | x = self.pre_conv(x) 324 | x = self.stage1(x) 325 | x = self.stage2(x) 326 | x = self.stage3(x) 327 | x = self.stage4(x) 328 | x = self.avg(x) 329 | x = x.view(x.size(0), -1) 330 | x = self.linear(x) 331 | 332 | return x 333 | 334 | def _make_stage(self, in_channels, out_channels, num, block): 335 | 336 | layers = [] 337 | layers.append(PreActResidualUnit(in_channels, out_channels, 2)) 338 | 339 | for _ in range(num): 340 | layers.append(block(out_channels, out_channels)) 341 | 342 | return nn.Sequential(*layers) 343 | 344 | def attention56(): 345 | return Attention([1, 1, 1]) 346 | 347 | def attention92(): 348 | return Attention([1, 2, 3]) 349 | 350 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EMCA 2 | This is the original PyTorch implementation of our paper "EMCA: Efficient Multi-Scale Channel Attention Module" 3 | ## 1- Abstract: 4 | Attention mechanisms have been explored with CNNs, both across the spatial and channel dimensions. However, all the existing methods devote the attention modules to capturing local interactions from a uni-scale. This paper tackles the following question: Can one consolidate multi-scale aggregation while learning channel attention more efficiently? To this end, we avail channel-wise attention over multiple feature scales, which empirically shows its aptitude to replace the limited local and uni-scale attention modules. EMCA is lightweight and can efficiently model the global context; further, it is easily integrated into any feed-forward CNN architecture and trained in an end-to-end fashion. We validate our novel architecture through comprehensive experiments on image classification, object detection and instance segmentation with different backbones. 
Our experiments show consistent gains in performance against their counterparts, where our proposed module, named EMCA, outperforms other channel attention techniques in the accuracy and latency trade-off. We also conduct experiments that probe the robustness of the learned representations. 5 | 6 | ## 2- Motivation: 7 | ### 2.1- Avoid Dense Integration Intuition: 8 | ![Revisit Architecture](Figures/Revisit_Channel_Attention_dense_connection.png) 9 | ### 2.2- Avoid Dense Integration Results: 10 | |Method|Model|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights| 11 | |:----:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:| 12 | | | | *SE* ||||| *ECA* ||||| *SRM* ||||| 13 | | ALL | |187| 11.231 | 70.59 | 89.78 | xx| 192 | 11.148 | 70.75 | 89.74 | xx| 154 | 11.152 | 70.96 | 89.81|xx| 14 | | First | R-18 |204| 11.189| 70.91 | 89.96 | xx| 212 | 11.148 | 70.63 | 89.85 | xx| 165 | 11.150 | 71.31 | 90.07|xx| 15 | | Last | |204| 11.189| 70.92 | 89.83 | xx| 212 | 11.148 | 70.81 | 89.84 | xx| 165 | 11.150 | 71.04 | 90.00|xx| 16 | | All | |101| 20.938| 73.87 | 91.65 | xx| 107 | 20.788 | 74.13 | 91.68 | xx| 82 | 20.795 | 73.98 | 91.68 |xx| 17 | | First | R-34 |122| 20.829| 73.84 | 91.64 | xx| 122 | 20.788 | 74.20 | 91.84 | xx| 96 | 20.790 | 74.51 | 91.91 |xx| 18 | | Last | |122| 20.829| 73.64 | 91.49 | xx| 122 | 20.788 | 73.75 | 91.47 | xx| 96 | 20.790 | 73.63 | 91.44 |xx| 19 | | All | |90| 26.772 | 76.80 | 93.39 | xx| 87 | 24.373 | 77.12 | 93.68 | xx| 71 | 24.402 | 77.13 | 93.51 |xx| 20 | | First | R-50 |97| 25.037| 76.56 | 93.28 | xx| 98 | 24.373 | 77.02 | 93.49 | xx| 81 | 24.380 | 76.98 | 93.41 |xx| 21 | | Last | |97| 25.037| 75.71 | 92.60 | xx| 98 | 24.373 | 76.37 | 93.18 | xx| 81 | 24.380 | 76.73 | 93.26 |xx| 22 | 23 | 24 | 25 | ## 3- EMCA Architecture: 26 | ### 3.1- Multi-Scale Incorporation: 27 | ![EMCA Architecture](Figures/EMCA_archeticture_only_CVPR.png) 28 | 29 | ### 3.2- Integrating EMCA Module: 30 | ![Integrating EMCA Module](Figures/EMCA_integration_only_CVPR.png) 31 | 32 | ### 3.3- EMCA Algorithm: 33 | ![Pseudo Code](Figures/EMCA_Algorithm.PNG) 34 | 35 | ## 4- HeatMap Visualization: 36 | ![HeatMap Visualization](Figures/gradcam.jpg) 37 | ![HeatMap Visualization](Figures/gradcam2.jpg) 38 | 39 | ## 5- Scales Visualization: 40 | ![Scales Visualization](Figures/scaling.png) 41 | 42 | ## 6- Top-1 Accuracy Visualization: 43 | ![Top-1 Accuracy Visualization](Figures/Top-1_ACC.jpg) 44 | 45 | ## 7- Results: 46 | |S|N'_i-j|Model|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights|FPS|#.P (M)|Top-1(%)|Top-5(%)|Weights| 47 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:| 48 | | | | | *SE* ||||| *ECA* ||||| *SRM* ||||| 49 | N/A | N/A | R-18 | 187 | 11.231 |70.59 | 89.78 |xx | 192| 11.148 | 70.75 | 89.74 |xx| 154 | 11.152 | 70.96 | 89.81 |xx | 50 | 0| 0 | | 204 | 11.189 | 70.91 | 89.96 |xx| 212 | 11.148 | 70.63 | 89.85 |xx| 165 | 11.150 | 71.31 | 90.07 |xx| 51 | 1| 1 | | 156 | 11.189 | 71.02 | 89.98 |xx | 174 | 11.148 | 70.83| 89.96 |xx | 123 | 11.150| 71.20| 90.00 |xx | 52 | 1| N_i-j | | 160 | 11.190 | 71.00 | 90.00 |xx| 170 | 11.148 | 71.04 | 89.99|xx| 113 | 11.150 | 71.02| 90.00 |xx| 53 | i-1| 1 | | 153| 11.190| 71.02 |90.12 |xx | 169 | 11.148 | 70.59| 89.78|xx | 113 | 11.150 | 71.00 | 89.81|xx | 54 | N/A | N/A | R-34 | 101 | 20.938 | 73.87 | 91.65 |xx | 107 | 20.788 | 
74.13 | 91.68 |xx | 82 | 20.795 | 73.98 | 91.68|xx | 55 | 0 | 0 | | 122 | 20.829 | 73.84 | 91.64 |xx | 122 | 20.788 | 74.20 | 91.84 |xx| 96 | 20.790 | 74.51 | 91.91 |xx | 56 | 1 | 1 | | 109 | 20.829 | 74.33 | 91.89 |xx | 109 | 20.788 | 74.39 | 91.81 |xx | 82 | 20.790 | 74.39 | 91.77|xx | 57 | 1 | N_i-j | | 107 | 20.829 | 74.40 | 91.89|xx | 107 | 20.788 | 74.46 | 91.70 |xx | 81 | 20.790 | 74.38 | 91.87|xx | 58 | i-1 | 1 | | 103 | 20.829 | 74.02 | 91.74 |xx| 108 | 20.788 | 74.14 | 91.81 |xx| 80 | 20.790 | 74.57 | 91.90 |xx | 59 | N/A | N/A | R-50 | 90 | 26.772 | 76.80 | 93.39 |xx| 87 | 24.373 | 77.12 |xx| 93.68 | 71| 24.402 | 77.13 | 93.51|xx| 60 | 0|0 | | 97 | 25.037 | 76.56 | 93.28|xx | 98 | 24.373 | 77.02 | 93.49 |xx| 81 | 24.380 | 76.98 | 93.41 |xx| 61 | 1 | 1 | | 88 | 25.037 |77.10 |93.49 |xx| 94 | 24.373 | 76.98 | 93.55 |xx| 70 | 24.380 | 77.00 | 93.72 |xx| 62 | 1 | N_i-j | | 90 | 25.037 | 77.33 | 93.52 |xx | 92 | 24.373 | 77.13 | 93.49 |xx | 70 | 24.380 | 77.20 | 93.54|xx | 63 | i-1 | 1 | |89 | 25.037 | 76.85 |93.42 |xx | 91 | 24.373 | 76.82 | 93.41 |xx | 71 | 24.380 |77.05 | 93.50 |xx| 64 | 65 | 66 | 67 | 68 | |S | N'_i-j |Model | FPS | \#.P (M) | Top-1 | Top-5 | FPS | \#.P (M) | Top-1 | Top-5 | FPS |\#.P (M) | Top-1 | Top-5| 69 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-----:| 70 | | | | | *SE* |||| *ECA* |||| *SRM* |||| 71 | N/A | N/A | R-18 | 187 | 11.231 | 70.59 | 89.78 | 192 | 11.148 | 70.75 | 89.74 | 154 | 11.152 | 70.96 | 89.81 72 | 0 | 0 | | 204 | 11.189 | 70.91 | 89.96 | 212 | 11.148 | 70.63 | 89.85 |165 | 11.150 | 71.31 | 90.07 73 | 1, | 1 | | 156 | 11.189 | 71.02 | 89.98 | 174 | 11.148 | 70.83 | 89.96 | 123 | 11.150 | 71.20 | 90.00 74 | 1| N_i-j | | 160 | 11.190 | 71.00 | 90.00 |170 | 11.148 | 71.04 | 89.99 | 113 | 11.150 | 71.02 | 90.00 75 | i-1 | 1 | | 153 | 11.190 | 71.02 | 90.12 | 169 | 11.148 | 70.59 | 89.78 |113 | 11.150 | 71.00 | 89.81 76 | N/A, | N/A | R-34 | 101 | 20.938 | 73.87 | 91.65 |107 | 20.788 | 74.13 | 91.68 | 82 | 20.795 | 73.98 | 91.68 77 | 0, | 0 | | 122 | 20.829 | 73.84 | 91.64 | 122 | 20.788 | 74.20 | 91.84 | 96 | 20.790 | 74.51 | 91.91 78 | 1, | 1 | | 109 | 20.829 | 74.33 | 91.89 | 109 | 20.788 | 74.39 | 91.81 | 82 | 20.790 | 74.39 | 91.77 79 | 1, | N_i-j | | 107 | 20.829 | 74.40 | 91.89 | 107 | 20.788 | 74.46 | 91.70 | 81 | 20.790 | 74.38 | 91.87 80 | i-1, | 1 | | 103 | 20.829 | 74.02 | 91.74 | 108 | 20.788 | 74.14 | 91.81| 80 | 20.790 | 74.57 | 91.90 81 | N/A, | N/A | R-50 | 90 | 26.772 | 76.80 | 93.39 | 87 | 24.373 | 77.12 | 93.68 | 71 | 24.402 | 77.13 | 93.51 82 | 0, | 0 | | 97 | 25.037 | 76.56 | 93.28 | 98 | 24.373 | 77.02 | 93.49 | 81 | 24.380 | 76.98 | 93.41 83 | 1, | 1 | | 88 | 25.037 | 77.10 | 93.49| 94 | 24.373 | 76.98 | 93.55 | 70 | 24.380 | 77.00 | 93.72 84 | 1, | N_i-j | | 90 | 25.037 | 77.33 | 93.52 | 92 | 24.373 | 77.13 | 93.49 | 70 | 24.380 | 77.20 | 93.54 85 | i-1 | 1 | | 89 | 25.037 | 76.85 | 93.42 | 91 | 24.373 | 76.82 | 93.41 | 71 | 24.380 | 77.05 | 93.50 86 | 87 | 88 | Methods |Model | \#.P (M) | GFLOPs | Top-1(RI) | Top-5 | FPS | FPS* | FPS** 89 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:| 90 | ResNet | R-18 | 11.148 | 1.694 | 70.40 | 89.45 | 270 | 23552 | 859 91 | +SENet | | 11.231 | 1.695 | 70.59 | 89.78 | 187 | 21760 | 839 92 | +EMCA-SE | |11.190 |1.695 |71.00(215) |90.00 | 160 | 17313 | 813 93 | +ECANet | | 11.148 | 1.695 | 70.78 | 89.92 | 192 | 22287 | 848 94 | +ECANet* | | 11.148 | 1.695 | 70.75 | 89.74 | 192 | 22287 | 848 95 | +EMCA-ECA | |11.148 
|1.695 | 71.04(83) |89.99 | 170 | 19023 | 833 96 | +SRM* | | 11.152 | 1.695 | 70.96 | 89.81 | 154 | 18794 | 823 97 | +EMCA-SRM | | 11.150 | 1.694 |71.02(10) |90.00 | 113 | 15190 | 803 98 | ResNet | R-34 | 20.788 | 3.419 | 73.31 | 91.40 | 168 | 19712 | 840 99 | +SENet | | 20.938 | 3.421 | 73.87 | 91.65 | 101 | 14279 | 805 100 | +EMCA-SE | |20.829 |3.421 | 74.41 (96) | 91.90 | 107 |14372 |812 101 | +ECANet | | 20.788 | 3.420 | 74.21 | 91.83 | 107 | 14067 | 825 102 | +ECANet* | | 20.788 | 3.420 | 74.13 | 91.68 | 107 | 14067 | 825 103 | +EMCA-ECA | |20.788 | 3.421 |74.46 (40) |91.70 | 107 |14080 | 822 104 | +SRM* | | 20.795 | 3.419 | 73.98 | 91.68 | 82 | 12655 | 803 105 | +EMCA-SRM | |20.790 |3.419 |74.38 (59) |91.87 | 81 | 12579 | 795 106 | ResNet | R-50 | 24.373 | 3.829 | 75.89 | 92.85 | 124 | 10032 | 668 107 | +SENet | | 26.772 | 3.837 | 76.80 | 93.39 | 90 | 8156 | 597 108 | +EMCA-SE | |25.037 |3.835 |77.33 (58) |93.52 | 90 | 8099 | 589 109 | +ECANet| | 24.373 | 3.834 | 77.48 | 93.68 | 87 | 8517 | 591 110 | +ECANet * | | 24.373 | 3.834 | 77.12 | 93.68 | 87 | 8517 | 591 111 | +EMCA-ECA | |24.373 |3.834 | 77.13 (1) | 93.49 | 92 |8615 |600 112 | +SRM *| | 24.402 | 3.829 | 77.13 | 93.51 | 71 | 6745 | 536 113 | +EMCA-SRM | | 24.380 |3.829 |77.20 (6) |93.54 | 70 | 6698 | 532 114 | 115 | 116 | 117 | Methods |Model | \#.P (M) | GFLOPs | Top-1 | Top-5 | FPS | FPS* | FPS** 118 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:| 119 | ResNet | R-18 | 11.148 | 1.694 | 70.40 | 89.45 | 270 | 23552 | 859 120 | SENet | | 11.231 | 1.695 | 70.59 | 89.78 | 187 | 21760 | 839 121 | ECANet* | | 11.148 | 1.695 | 70.75 | 89.74 | 192 |22287 |839 122 | SRM* | | 11.152 | 1.694 | 70.96 | 89.81 | 154 | 18794 | 823 123 | FCANet* | | 11.231 | 1.694 | 70.98 | 90.00 | 119 | 17680 | 808 124 | BAM | | 11.712 | 1.821 | 75.98 | 92.82 | 91 | 7159 | 527 125 | CBAM | | 11.234 | 1.695 | 70.73 | 89.91 | 104 | 8734 | 789 126 | EMCA-ECA | |11.148 | 1.695 | 71.04 | 89.99 | 170 | 19023 | 833 127 | EMCA-SRM | | 11.150 | 1.694 |71.02 | 90.00 | 113 | 15190 | 803 128 | EMCA-SE | | 11.190 | 1.695 |71.00 | 90.00 | 160 | 17313 | 813 129 | ResNet |R-34 | 20.788 | 3.419 | 73.31 | 91.4 | 168 | 19712 | 840 130 | SENet | | 20.938 | 3.421 | 73.87 | 91.65 | 101 | 14279 | 805 131 | ECANet* | | 20.788 | 3.420 | 74.13 | 91.68 | 107 | 14067 | 825 132 | SRM* | | 20.795 | 3.419 | 73.98 | 91.68 | 82 | 12655 | 803 133 | FCANet* | | 20.938 | 3.419 | 74.18 | 91.75 | 87 | 13094 | 812 134 | CBAM | | 20.943 | 3.420 | 74.01 | 91.76 | 59 | 12001 | 760 135 | EMCA-ECA | |20.788 | 3.421 |74.46 | 91.70 | 107 | 14080 | 822 136 | EMCA-SRM | | 20.790 |3.419 |74.38 |91.87 | 81 | 12579 | 795 137 | EMCA-SE | | 20.829 | 3.421 | 74.41 | 91.90 | 107 |14372 | 812 138 | ResNet| R-50 | 24.373 | 3.829 | 75.89 | 92.85 | 124 | 10032 | 668 139 | SENet | | 26.772 | 3.837 | 76.80 | 93.39 | 90 | 8156 | 597 140 | ECANet* | | 24.373 | 3.834 | 77.12 | 93.68 | 87 | 8517 | 591 141 | SRM* | | 24.402 | 3.829 | 77.13 | 93.51 | 71 | 6745 | 536 142 | FCANet* | | 26.772 | 3.831 | 77.27 | 93.70 | 74 | 7984 | 549 143 | EPSANet* | | 21.517 | 3.373 | 77.31 |93.72 | 28 | 802 | 388 144 | SANet* | | 24.373 | 3.832 | 77.25 | 93.66 | 68 | 6670 | 406 145 | A^2Nets | | 33.006 | 6.502 | 77.00 | 93.50 | N/A | N/A | N/A 146 | BAM | | 25.92 | 3.946 | 75.98 | 92.82 | 91 | 7159 | 527 147 | CBAM | | 26.775 | 3.837 |77.34 | 93.69 | 55 | 2460 | 208 148 | EMCA-ECA | |24.373 | 3.834 | 77.13 | 93.49 | 92 |8615 |600 149 | EMCA-SRM | | 24.380 |3.829 | 77.20 | 93.54 | 71 | 6698 | 532 150 | EMCA-SE | | 25.037 | 
3.835 |77.33 | 93.52 | 90 | 8099 | 589 151 | 152 | 153 | 154 | |Methods | Detectors | \#.P (M) | GFLOPs | AP | AP_50 | AP_75 | AP_S | AP_M | AP_L 155 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:|:-:| 156 | ResNet-50 | | 41.53 | 207.07 | 36.4 | 58.2 | 39.2 | 21.8 | 40.0 | 46.2 157 | +SE | | 44.02 | 207.18 | 37.7 | 60.1 | 40.9 | 22.9 | 41.9 | 48.2 158 | EMCA+SE | | 42.56 | 207.18 | 38.1 |60.6 | 50.2 | 23.6 |42.2 | 48.4 159 | +ECA | | 41.53 | 207.18 | 38.0 | 60.6 | 40.9 | 23.4 | 42.1 | 48.0 160 | +EMCA+ECA |Faster R-CNN | 41.53 | 207.18 | 38.2 |60.9 | 50.0 | 23.7 | 42.2 | 48.2 161 | ResNet-50 | | 44.18 | 275.58 | 37.2 | 58.9 | 40.3 | 22.2 | 40.7 | 48.0 162 | +1 NL | |46.50 | 288.70 | 38.0 | 59.8 | 41.0 | N/A | N/A | N/A 163 | +GC | | 46.90 | 279.60 | 39.4 | 61.6 | 42.4 | N/A | N/A | N/A 164 | +SE | | 46.67 | 275.69 | 38.7 | 60.9 | 42.1 | 23.4 | 42.7 | 50.0 165 | +EMCA+SE | | 45.13 | 275.69 | 39.0 |61.4 | 42.3 | 23.7 |42.9 | 50.1 166 | +ECA | | 44.18 | 275.69 | 39.0 | 61.3 | 42.1 | 24.2 | 42.8 | 49.9 167 | +EMCA+ECA |Mask R-CNN | 44.18 | 275.69 | 39.1 |61.5 | 42.1 | 24.4 |42.9 | 49.9 168 | ResNet-50 | | 37.74 | 239.32 | 35.6 | 55.5 | 38.2 | 20.0 | 39.6 | 46.8 169 | +SE | | 40.23 | 239.43 | 37.1 | 57.2 | 39.9 | 21.2 | 40.7 | 49.3 170 | +EMCA+SE | | 38.88 | 239.43 | 37.2 |57.4 | 39.9 | 21.2 | 40.7 | 49.3 171 | +ECA| | 37.74 | 239.43 | 37.3 | 57.7 | 39.6 | 21.9 | 41.3 | 48.9 172 | +EMCA+ECA | RetinaNet | 37.74 | 239.43 | 37.3 |57.8 | 39.6 | 21.9 | 41.3 | 48.9 173 | 174 | 175 | Methods | \#.P (M) | GFLOPs | AP | AP_50 | AP_75 | AP_S | AP_M | AP_L| 176 | |:-:|:---:|:---:|:-:|:-----:|:------:|:------:|:-----:|:-:| 177 | ResNet-50 | 44.18 | 275.58 | 34.1 | 55.5 | 36.2 | 16.1 | 36.7 | 50.0 178 | +SE | 46.67 | 275.69 | 35.4 | 57.4 | 37.8 | 17.1 | 38.6 | 51.8 179 | +EMCA+SE | 45.13 | 275.69 | 35.7 | 58.1 | 38.0 |17.8 | 39.0 | 51.9 180 | +ECA | 44.18 | 275.69 | 35.6 | 58.1 | 37.7 | 17.6 | 39.0 | 51.8 181 | +EMCA+ECA | 44.18 | 275.69 | 35.7 | 58.4 | 37.7 | 17.9 | 39.1 | 51.9 182 | 183 | 184 | 185 | 186 | # Citation 187 | -------------------------------------------------------------------------------- /Code/train_imagenet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import shutil 5 | import time 6 | import warnings 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.parallel 11 | import torch.backends.cudnn as cudnn 12 | import torch.distributed as dist 13 | import torch.optim 14 | import torch.multiprocessing as mp 15 | import torch.utils.data 16 | import torch.utils.data.distributed 17 | import torchvision.transforms as transforms 18 | import torchvision.datasets as datasets 19 | import torchvision.models as models 20 | from tqdm import tqdm 21 | 22 | os.environ["NCCL_DEBUG"] = "INFO" 23 | model_names = sorted(name for name in models.__dict__ 24 | if name.islower() and not name.startswith("__") 25 | and callable(models.__dict__[name])) 26 | 27 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 28 | parser.add_argument('data', metavar='DIR', 29 | help='path to dataset') 30 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 31 | choices=model_names, 32 | help='model architecture: ' + 33 | ' | '.join(model_names) + 34 | ' (default: resnet18)') 35 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 36 | help='number of data loading workers (default: 4)') 37 | parser.add_argument('--epochs', default=200, type=int, 
metavar='N', 38 | help='number of total epochs to run') 39 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 40 | help='manual epoch number (useful on restarts)') 41 | parser.add_argument('-b', '--batch-size', default=256, type=int, 42 | metavar='N', 43 | help='mini-batch size (default: 256), this is the total ' 44 | 'batch size of all GPUs on the current node when ' 45 | 'using Data Parallel or Distributed Data Parallel') 46 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 47 | metavar='LR', help='initial learning rate', dest='lr') 48 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 49 | help='momentum') 50 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 51 | metavar='W', help='weight decay (default: 1e-4)', 52 | dest='weight_decay') 53 | parser.add_argument('-p', '--print-freq', default=10, type=int, 54 | metavar='N', help='print frequency (default: 10)') 55 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 56 | help='path to latest checkpoint (default: none)') 57 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 58 | help='evaluate model on validation set') 59 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 60 | help='use pre-trained model') 61 | parser.add_argument('--world-size', default=-1, type=int, 62 | help='number of nodes for distributed training') 63 | parser.add_argument('--rank', default=-1, type=int, 64 | help='node rank for distributed training') 65 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 66 | help='url used to set up distributed training') 67 | parser.add_argument('--dist-backend', default='nccl', type=str, 68 | help='distributed backend') 69 | parser.add_argument('--seed', default=None, type=int, 70 | help='seed for initializing training. ') 71 | parser.add_argument('--gpu', default=None, type=int, 72 | help='GPU id to use.') 73 | parser.add_argument('--multiprocessing-distributed', action='store_true', 74 | help='Use multi-processing distributed training to launch ' 75 | 'N processes per node, which has N GPUs. This is the ' 76 | 'fastest way to use PyTorch for either single node or ' 77 | 'multi node data parallel training') 78 | 79 | best_acc1 = 0 80 | 81 | 82 | def main(): 83 | args = parser.parse_args() 84 | 85 | if args.seed is not None: 86 | random.seed(args.seed) 87 | torch.manual_seed(args.seed) 88 | cudnn.deterministic = True 89 | warnings.warn('You have chosen to seed training. ' 90 | 'This will turn on the CUDNN deterministic setting, ' 91 | 'which can slow down your training considerably! ' 92 | 'You may see unexpected behavior when restarting ' 93 | 'from checkpoints.') 94 | 95 | if args.gpu is not None: 96 | warnings.warn('You have chosen a specific GPU. 
This will completely ' 97 | 'disable data parallelism.') 98 | 99 | if args.dist_url == "env://" and args.world_size == -1: 100 | args.world_size = int(os.environ["WORLD_SIZE"]) 101 | 102 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 103 | 104 | ngpus_per_node = torch.cuda.device_count() 105 | if args.multiprocessing_distributed: 106 | # Since we have ngpus_per_node processes per node, the total world_size 107 | # needs to be adjusted accordingly 108 | args.world_size = ngpus_per_node * args.world_size 109 | # Use torch.multiprocessing.spawn to launch distributed processes: the 110 | # main_worker process function 111 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 112 | else: 113 | # Simply call main_worker function 114 | main_worker(args.gpu, ngpus_per_node, args) 115 | 116 | 117 | def main_worker(gpu, ngpus_per_node, args): 118 | global best_acc1 119 | args.gpu = gpu 120 | 121 | if args.gpu is not None: 122 | print("Use GPU: {} for training".format(args.gpu)) 123 | 124 | if args.distributed: 125 | if args.dist_url == "env://" and args.rank == -1: 126 | args.rank = int(os.environ["RANK"]) 127 | if args.multiprocessing_distributed: 128 | # For multiprocessing distributed training, rank needs to be the 129 | # global rank among all the processes 130 | args.rank = args.rank * ngpus_per_node + gpu 131 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 132 | world_size=args.world_size, rank=args.rank) 133 | # create model 134 | if args.pretrained: 135 | print("=> using pre-trained model '{}'".format(args.arch)) 136 | model = models.__dict__[args.arch](pretrained=True) 137 | else: 138 | print("=> creating model '{}'".format(args.arch)) 139 | # original pytorch models 140 | #model = models.__dict__[args.arch]() 141 | # custom models 142 | if args.arch == "resnet18": 143 | from eca_resnet_multi_scale import eca_resnet18 144 | model = eca_resnet18(1000) 145 | elif args.arch == "resnet34": 146 | print("ResNet34") 147 | from eca_resnet_multi_scale import eca_resnet34 148 | model = eca_resnet34(1000) 149 | elif args.arch == "resnet50": 150 | print("ResNet50") 151 | from eca_resnet_multi_scale import eca_resnet50 152 | model = eca_resnet50(1000) 153 | 154 | if not torch.cuda.is_available(): 155 | print('using CPU, this will be slow') 156 | elif args.distributed: 157 | # For multiprocessing distributed, DistributedDataParallel constructor 158 | # should always set the single device scope, otherwise, 159 | # DistributedDataParallel will use all available devices. 
160 | if args.gpu is not None: 161 | torch.cuda.set_device(args.gpu) 162 | model.cuda(args.gpu) 163 | # When using a single GPU per process and per 164 | # DistributedDataParallel, we need to divide the batch size 165 | # ourselves based on the total number of GPUs we have 166 | args.batch_size = int(args.batch_size / ngpus_per_node) 167 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 168 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 169 | else: 170 | model.cuda() 171 | # DistributedDataParallel will divide and allocate batch_size to all 172 | # available GPUs if device_ids are not set 173 | model = torch.nn.parallel.DistributedDataParallel(model) 174 | elif args.gpu is not None: 175 | torch.cuda.set_device(args.gpu) 176 | model = model.cuda(args.gpu) 177 | else: 178 | # DataParallel will divide and allocate batch_size to all available GPUs 179 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 180 | model.features = torch.nn.DataParallel(model.features) 181 | model.cuda() 182 | else: 183 | model = torch.nn.DataParallel(model).cuda() 184 | 185 | # define loss function (criterion) and optimizer 186 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 187 | 188 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 189 | momentum=args.momentum, 190 | weight_decay=args.weight_decay) 191 | 192 | # optionally resume from a checkpoint 193 | if args.resume: 194 | if os.path.isfile(args.resume): 195 | print("=> loading checkpoint '{}'".format(args.resume)) 196 | if args.gpu is None: 197 | checkpoint = torch.load(args.resume) 198 | else: 199 | # Map model to be loaded to specified single gpu. 200 | loc = 'cuda:{}'.format(args.gpu) 201 | checkpoint = torch.load(args.resume, map_location=loc) 202 | args.start_epoch = checkpoint['epoch'] 203 | best_acc1 = checkpoint['best_acc1'] 204 | if args.gpu is not None: 205 | # best_acc1 may be from a checkpoint from a different GPU 206 | best_acc1 = best_acc1.to(args.gpu) 207 | model.load_state_dict(checkpoint['state_dict']) 208 | optimizer.load_state_dict(checkpoint['optimizer']) 209 | print("=> loaded checkpoint '{}' (epoch {})" 210 | .format(args.resume, checkpoint['epoch'])) 211 | else: 212 | print("=> no checkpoint found at '{}'".format(args.resume)) 213 | 214 | cudnn.benchmark = True 215 | 216 | # Data loading code 217 | traindir = os.path.join(args.data, 'train') 218 | valdir = os.path.join(args.data, 'val') 219 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 220 | std=[0.229, 0.224, 0.225]) 221 | 222 | train_dataset = datasets.ImageFolder( 223 | traindir, 224 | transforms.Compose([ 225 | transforms.RandomResizedCrop(224), 226 | transforms.RandomHorizontalFlip(), 227 | transforms.ToTensor(), 228 | normalize, 229 | ])) 230 | 231 | if args.distributed: 232 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 233 | else: 234 | train_sampler = None 235 | 236 | train_loader = torch.utils.data.DataLoader( 237 | train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), 238 | num_workers=args.workers, pin_memory=True, sampler=train_sampler) 239 | 240 | val_loader = torch.utils.data.DataLoader( 241 | datasets.ImageFolder(valdir, transforms.Compose([ 242 | transforms.Resize(256), 243 | transforms.CenterCrop(224), 244 | transforms.ToTensor(), 245 | normalize, 246 | ])), 247 | batch_size=args.batch_size, shuffle=False, 248 | num_workers=args.workers, pin_memory=True) 249 | 250 | if args.evaluate: 251 | validate(val_loader, 
model, criterion, args) 252 | return 253 | 254 | for epoch in range(args.start_epoch, args.epochs): 255 | if args.distributed: 256 | train_sampler.set_epoch(epoch) 257 | adjust_learning_rate(optimizer, epoch, args) 258 | 259 | # train for one epoch 260 | train(train_loader, model, criterion, optimizer, epoch, args) 261 | 262 | # evaluate on validation set 263 | acc1 = validate(val_loader, model, criterion, args) 264 | 265 | # remember best acc@1 and save checkpoint 266 | is_best = acc1 > best_acc1 267 | best_acc1 = max(acc1, best_acc1) 268 | 269 | if not args.multiprocessing_distributed or (args.multiprocessing_distributed 270 | and args.rank % ngpus_per_node == 0): 271 | save_checkpoint({ 272 | 'epoch': epoch + 1, 273 | 'arch': args.arch, 274 | 'state_dict': model.state_dict(), 275 | 'best_acc1': best_acc1, 276 | 'optimizer' : optimizer.state_dict(), 277 | }, is_best) 278 | 279 | 280 | def train(train_loader, model, criterion, optimizer, epoch, args): 281 | batch_time = AverageMeter('Time', ':6.3f') 282 | data_time = AverageMeter('Data', ':6.3f') 283 | losses = AverageMeter('Loss', ':.4e') 284 | top1 = AverageMeter('Acc@1', ':6.2f') 285 | top5 = AverageMeter('Acc@5', ':6.2f') 286 | progress = ProgressMeter( 287 | len(train_loader), 288 | [batch_time, data_time, losses, top1, top5], 289 | prefix="Epoch: [{}]".format(epoch)) 290 | 291 | # switch to train mode 292 | model.train() 293 | 294 | end = time.time() 295 | for i, (images, target) in tqdm(enumerate(train_loader)): 296 | # measure data loading time 297 | data_time.update(time.time() - end) 298 | 299 | if args.gpu is not None: 300 | images = images.cuda(args.gpu, non_blocking=True) 301 | if torch.cuda.is_available(): 302 | target = target.cuda(args.gpu, non_blocking=True) 303 | 304 | # compute output 305 | output = model(images) 306 | loss = criterion(output, target) 307 | 308 | # measure accuracy and record loss 309 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 310 | losses.update(loss.item(), images.size(0)) 311 | top1.update(acc1[0], images.size(0)) 312 | top5.update(acc5[0], images.size(0)) 313 | 314 | # compute gradient and do SGD step 315 | optimizer.zero_grad() 316 | loss.backward() 317 | optimizer.step() 318 | 319 | # measure elapsed time 320 | batch_time.update(time.time() - end) 321 | end = time.time() 322 | 323 | if i % args.print_freq == 0 and False: 324 | progress.display(i) 325 | 326 | 327 | def validate(val_loader, model, criterion, args): 328 | batch_time = AverageMeter('Time', ':6.3f') 329 | losses = AverageMeter('Loss', ':.4e') 330 | top1 = AverageMeter('Acc@1', ':6.2f') 331 | top5 = AverageMeter('Acc@5', ':6.2f') 332 | progress = ProgressMeter( 333 | len(val_loader), 334 | [batch_time, losses, top1, top5], 335 | prefix='Test: ') 336 | 337 | # switch to evaluate mode 338 | model.eval() 339 | 340 | with torch.no_grad(): 341 | end = time.time() 342 | for i, (images, target) in enumerate(val_loader): 343 | if args.gpu is not None: 344 | images = images.cuda(args.gpu, non_blocking=True) 345 | if torch.cuda.is_available(): 346 | target = target.cuda(args.gpu, non_blocking=True) 347 | 348 | # compute output 349 | output = model(images) 350 | loss = criterion(output, target) 351 | 352 | # measure accuracy and record loss 353 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 354 | losses.update(loss.item(), images.size(0)) 355 | top1.update(acc1[0], images.size(0)) 356 | top5.update(acc5[0], images.size(0)) 357 | 358 | # measure elapsed time 359 | batch_time.update(time.time() - end) 360 | end = time.time() 361 | 
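# NOTE: per-batch progress output is disabled by the `and False` guard below;
# remove the guard to restore the periodic ProgressMeter display.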
362 | if i % args.print_freq == 0 and False: 363 | progress.display(i) 364 | 365 | # TODO: this should also be done with the ProgressMeter 366 | print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' 367 | .format(top1=top1, top5=top5)) 368 | 369 | return top1.avg 370 | 371 | 372 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 373 | log_file = open("log.txt","a+") 374 | torch.save(state, filename) 375 | log_file.write("Last epoch is " + str(state['epoch'])) 376 | if is_best: 377 | shutil.copyfile(filename, 'model_best.pth.tar') 378 | log_file.write("\n best epoch at: " + str(state['epoch']) + " " + str(state['best_acc1'])) 379 | 380 | 381 | 382 | class AverageMeter(object): 383 | """Computes and stores the average and current value""" 384 | def __init__(self, name, fmt=':f'): 385 | self.name = name 386 | self.fmt = fmt 387 | self.reset() 388 | 389 | def reset(self): 390 | self.val = 0 391 | self.avg = 0 392 | self.sum = 0 393 | self.count = 0 394 | 395 | def update(self, val, n=1): 396 | self.val = val 397 | self.sum += val * n 398 | self.count += n 399 | self.avg = self.sum / self.count 400 | 401 | def __str__(self): 402 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 403 | return fmtstr.format(**self.__dict__) 404 | 405 | 406 | class ProgressMeter(object): 407 | def __init__(self, num_batches, meters, prefix=""): 408 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 409 | self.meters = meters 410 | self.prefix = prefix 411 | 412 | def display(self, batch): 413 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 414 | entries += [str(meter) for meter in self.meters] 415 | print('\t'.join(entries)) 416 | 417 | def _get_batch_fmtstr(self, num_batches): 418 | num_digits = len(str(num_batches // 1)) 419 | fmt = '{:' + str(num_digits) + 'd}' 420 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 421 | 422 | 423 | def adjust_learning_rate(optimizer, epoch, args): 424 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 425 | lr = args.lr * (0.1 ** (epoch // 30)) 426 | for param_group in optimizer.param_groups: 427 | param_group['lr'] = lr 428 | 429 | 430 | def accuracy(output, target, topk=(1,)): 431 | """Computes the accuracy over the k top predictions for the specified values of k""" 432 | with torch.no_grad(): 433 | maxk = max(topk) 434 | batch_size = target.size(0) 435 | 436 | _, pred = output.topk(maxk, 1, True, True) 437 | pred = pred.t() 438 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 439 | 440 | res = [] 441 | for k in topk: 442 | correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) 443 | res.append(correct_k.mul_(100.0 / batch_size)) 444 | return res 445 | 446 | 447 | if __name__ == '__main__': 448 | main() 449 | -------------------------------------------------------------------------------- /Code/benchmarking/inceptionv4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ inceptionv4 in pytorch 3 | 4 | 5 | [1] Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi 6 | 7 | Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning 8 | https://arxiv.org/abs/1602.07261 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | class BasicConv2d(nn.Module): 15 | 16 | def __init__(self, input_channels, output_channels, **kwargs): 17 | super().__init__() 18 | self.conv = nn.Conv2d(input_channels, output_channels, bias=False, **kwargs) 19 | self.bn = 
nn.BatchNorm2d(output_channels) 20 | self.relu = nn.ReLU(inplace=True) 21 | 22 | def forward(self, x): 23 | x = self.conv(x) 24 | x = self.bn(x) 25 | x = self.relu(x) 26 | 27 | return x 28 | 29 | class Inception_Stem(nn.Module): 30 | 31 | #"""Figure 3. The schema for stem of the pure Inception-v4 and 32 | #Inception-ResNet-v2 networks. This is the input part of those 33 | #networks.""" 34 | def __init__(self, input_channels): 35 | super().__init__() 36 | self.conv1 = nn.Sequential( 37 | BasicConv2d(input_channels, 32, kernel_size=3), 38 | BasicConv2d(32, 32, kernel_size=3, padding=1), 39 | BasicConv2d(32, 64, kernel_size=3, padding=1) 40 | ) 41 | 42 | self.branch3x3_conv = BasicConv2d(64, 96, kernel_size=3, padding=1) 43 | self.branch3x3_pool = nn.MaxPool2d(3, stride=1, padding=1) 44 | 45 | self.branch7x7a = nn.Sequential( 46 | BasicConv2d(160, 64, kernel_size=1), 47 | BasicConv2d(64, 64, kernel_size=(7, 1), padding=(3, 0)), 48 | BasicConv2d(64, 64, kernel_size=(1, 7), padding=(0, 3)), 49 | BasicConv2d(64, 96, kernel_size=3, padding=1) 50 | ) 51 | 52 | self.branch7x7b = nn.Sequential( 53 | BasicConv2d(160, 64, kernel_size=1), 54 | BasicConv2d(64, 96, kernel_size=3, padding=1) 55 | ) 56 | 57 | self.branchpoola = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 58 | self.branchpoolb = BasicConv2d(192, 192, kernel_size=3, stride=1, padding=1) 59 | 60 | def forward(self, x): 61 | 62 | x = self.conv1(x) 63 | 64 | x = [ 65 | self.branch3x3_conv(x), 66 | self.branch3x3_pool(x) 67 | ] 68 | x = torch.cat(x, 1) 69 | 70 | x = [ 71 | self.branch7x7a(x), 72 | self.branch7x7b(x) 73 | ] 74 | x = torch.cat(x, 1) 75 | 76 | x = [ 77 | self.branchpoola(x), 78 | self.branchpoolb(x) 79 | ] 80 | 81 | x = torch.cat(x, 1) 82 | 83 | return x 84 | 85 | class InceptionA(nn.Module): 86 | 87 | #"""Figure 4. The schema for 35 × 35 grid modules of the pure 88 | #Inception-v4 network. This is the Inception-A block of Figure 9.""" 89 | def __init__(self, input_channels): 90 | super().__init__() 91 | 92 | self.branch3x3stack = nn.Sequential( 93 | BasicConv2d(input_channels, 64, kernel_size=1), 94 | BasicConv2d(64, 96, kernel_size=3, padding=1), 95 | BasicConv2d(96, 96, kernel_size=3, padding=1) 96 | ) 97 | 98 | self.branch3x3 = nn.Sequential( 99 | BasicConv2d(input_channels, 64, kernel_size=1), 100 | BasicConv2d(64, 96, kernel_size=3, padding=1) 101 | ) 102 | 103 | self.branch1x1 = BasicConv2d(input_channels, 96, kernel_size=1) 104 | 105 | self.branchpool = nn.Sequential( 106 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 107 | BasicConv2d(input_channels, 96, kernel_size=1) 108 | ) 109 | 110 | def forward(self, x): 111 | 112 | x = [ 113 | self.branch3x3stack(x), 114 | self.branch3x3(x), 115 | self.branch1x1(x), 116 | self.branchpool(x) 117 | ] 118 | 119 | return torch.cat(x, 1) 120 | 121 | class ReductionA(nn.Module): 122 | 123 | #"""Figure 7. The schema for 35 × 35 to 17 × 17 reduction module. 124 | #Different variants of this blocks (with various number of filters) 125 | #are used in Figure 9, and 15 in each of the new Inception(-v4, - ResNet-v1, 126 | #-ResNet-v2) variants presented in this paper. The k, l, m, n numbers 127 | #represent filter bank sizes which can be looked up in Table 1. 
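# In this file, InceptionV4 builds ReductionA with the default filter sizes
# (k, l, m, n) = (192, 224, 256, 384), so its concatenated output has
# input_channels + n + m = 384 + 384 + 256 = 1024 channels (see InceptionV4.__init__ below).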
128 | def __init__(self, input_channels, k, l, m, n): 129 | 130 | super().__init__() 131 | self.branch3x3stack = nn.Sequential( 132 | BasicConv2d(input_channels, k, kernel_size=1), 133 | BasicConv2d(k, l, kernel_size=3, padding=1), 134 | BasicConv2d(l, m, kernel_size=3, stride=2) 135 | ) 136 | 137 | self.branch3x3 = BasicConv2d(input_channels, n, kernel_size=3, stride=2) 138 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2) 139 | self.output_channels = input_channels + n + m 140 | 141 | def forward(self, x): 142 | 143 | x = [ 144 | self.branch3x3stack(x), 145 | self.branch3x3(x), 146 | self.branchpool(x) 147 | ] 148 | 149 | return torch.cat(x, 1) 150 | 151 | class InceptionB(nn.Module): 152 | 153 | #"""Figure 5. The schema for 17 × 17 grid modules of the pure Inception-v4 network. 154 | #This is the Inception-B block of Figure 9.""" 155 | def __init__(self, input_channels): 156 | super().__init__() 157 | 158 | self.branch7x7stack = nn.Sequential( 159 | BasicConv2d(input_channels, 192, kernel_size=1), 160 | BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3)), 161 | BasicConv2d(192, 224, kernel_size=(7, 1), padding=(3, 0)), 162 | BasicConv2d(224, 224, kernel_size=(1, 7), padding=(0, 3)), 163 | BasicConv2d(224, 256, kernel_size=(7, 1), padding=(3, 0)) 164 | ) 165 | 166 | self.branch7x7 = nn.Sequential( 167 | BasicConv2d(input_channels, 192, kernel_size=1), 168 | BasicConv2d(192, 224, kernel_size=(1, 7), padding=(0, 3)), 169 | BasicConv2d(224, 256, kernel_size=(7, 1), padding=(3, 0)) 170 | ) 171 | 172 | self.branch1x1 = BasicConv2d(input_channels, 384, kernel_size=1) 173 | 174 | self.branchpool = nn.Sequential( 175 | nn.AvgPool2d(3, stride=1, padding=1), 176 | BasicConv2d(input_channels, 128, kernel_size=1) 177 | ) 178 | 179 | def forward(self, x): 180 | x = [ 181 | self.branch1x1(x), 182 | self.branch7x7(x), 183 | self.branch7x7stack(x), 184 | self.branchpool(x) 185 | ] 186 | 187 | return torch.cat(x, 1) 188 | 189 | class ReductionB(nn.Module): 190 | 191 | #"""Figure 8. The schema for 17 × 17 to 8 × 8 grid-reduction mod- ule. 192 | #This is the reduction module used by the pure Inception-v4 network in 193 | #Figure 9.""" 194 | def __init__(self, input_channels): 195 | 196 | super().__init__() 197 | self.branch7x7 = nn.Sequential( 198 | BasicConv2d(input_channels, 256, kernel_size=1), 199 | BasicConv2d(256, 256, kernel_size=(1, 7), padding=(0, 3)), 200 | BasicConv2d(256, 320, kernel_size=(7, 1), padding=(3, 0)), 201 | BasicConv2d(320, 320, kernel_size=3, stride=2, padding=1) 202 | ) 203 | 204 | self.branch3x3 = nn.Sequential( 205 | BasicConv2d(input_channels, 192, kernel_size=1), 206 | BasicConv2d(192, 192, kernel_size=3, stride=2, padding=1) 207 | ) 208 | 209 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 210 | 211 | def forward(self, x): 212 | 213 | x = [ 214 | self.branch3x3(x), 215 | self.branch7x7(x), 216 | self.branchpool(x) 217 | ] 218 | 219 | return torch.cat(x, 1) 220 | 221 | class InceptionC(nn.Module): 222 | 223 | def __init__(self, input_channels): 224 | #"""Figure 6. The schema for 8×8 grid modules of the pure 225 | #Inceptionv4 network. 
This is the Inception-C block of Figure 9.""" 226 | 227 | super().__init__() 228 | 229 | self.branch3x3stack = nn.Sequential( 230 | BasicConv2d(input_channels, 384, kernel_size=1), 231 | BasicConv2d(384, 448, kernel_size=(1, 3), padding=(0, 1)), 232 | BasicConv2d(448, 512, kernel_size=(3, 1), padding=(1, 0)), 233 | ) 234 | self.branch3x3stacka = BasicConv2d(512, 256, kernel_size=(1, 3), padding=(0, 1)) 235 | self.branch3x3stackb = BasicConv2d(512, 256, kernel_size=(3, 1), padding=(1, 0)) 236 | 237 | self.branch3x3 = BasicConv2d(input_channels, 384, kernel_size=1) 238 | self.branch3x3a = BasicConv2d(384, 256, kernel_size=(3, 1), padding=(1, 0)) 239 | self.branch3x3b = BasicConv2d(384, 256, kernel_size=(1, 3), padding=(0, 1)) 240 | 241 | self.branch1x1 = BasicConv2d(input_channels, 256, kernel_size=1) 242 | 243 | self.branchpool = nn.Sequential( 244 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 245 | BasicConv2d(input_channels, 256, kernel_size=1) 246 | ) 247 | 248 | def forward(self, x): 249 | branch3x3stack_output = self.branch3x3stack(x) 250 | branch3x3stack_output = [ 251 | self.branch3x3stacka(branch3x3stack_output), 252 | self.branch3x3stackb(branch3x3stack_output) 253 | ] 254 | branch3x3stack_output = torch.cat(branch3x3stack_output, 1) 255 | 256 | branch3x3_output = self.branch3x3(x) 257 | branch3x3_output = [ 258 | self.branch3x3a(branch3x3_output), 259 | self.branch3x3b(branch3x3_output) 260 | ] 261 | branch3x3_output = torch.cat(branch3x3_output, 1) 262 | 263 | branch1x1_output = self.branch1x1(x) 264 | 265 | branchpool = self.branchpool(x) 266 | 267 | output = [ 268 | branch1x1_output, 269 | branch3x3_output, 270 | branch3x3stack_output, 271 | branchpool 272 | ] 273 | 274 | return torch.cat(output, 1) 275 | 276 | class InceptionV4(nn.Module): 277 | 278 | def __init__(self, A, B, C, k=192, l=224, m=256, n=384, class_nums=100): 279 | 280 | super().__init__() 281 | self.stem = Inception_Stem(3) 282 | self.inception_a = self._generate_inception_module(384, 384, A, InceptionA) 283 | self.reduction_a = ReductionA(384, k, l, m, n) 284 | output_channels = self.reduction_a.output_channels 285 | self.inception_b = self._generate_inception_module(output_channels, 1024, B, InceptionB) 286 | self.reduction_b = ReductionB(1024) 287 | self.inception_c = self._generate_inception_module(1536, 1536, C, InceptionC) 288 | self.avgpool = nn.AvgPool2d(7) 289 | 290 | #"""Dropout (keep 0.8)""" 291 | self.dropout = nn.Dropout2d(1 - 0.8) 292 | self.linear = nn.Linear(1536, class_nums) 293 | 294 | def forward(self, x): 295 | x = self.stem(x) 296 | x = self.inception_a(x) 297 | x = self.reduction_a(x) 298 | x = self.inception_b(x) 299 | x = self.reduction_b(x) 300 | x = self.inception_c(x) 301 | x = self.avgpool(x) 302 | x = self.dropout(x) 303 | x = x.view(-1, 1536) 304 | x = self.linear(x) 305 | 306 | return x 307 | 308 | @staticmethod 309 | def _generate_inception_module(input_channels, output_channels, block_num, block): 310 | 311 | layers = nn.Sequential() 312 | for l in range(block_num): 313 | layers.add_module("{}_{}".format(block.__name__, l), block(input_channels)) 314 | input_channels = output_channels 315 | 316 | return layers 317 | 318 | class InceptionResNetA(nn.Module): 319 | 320 | #"""Figure 16. 
The schema for 35 × 35 grid (Inception-ResNet-A) 321 | #module of the Inception-ResNet-v2 network.""" 322 | def __init__(self, input_channels): 323 | 324 | super().__init__() 325 | self.branch3x3stack = nn.Sequential( 326 | BasicConv2d(input_channels, 32, kernel_size=1), 327 | BasicConv2d(32, 48, kernel_size=3, padding=1), 328 | BasicConv2d(48, 64, kernel_size=3, padding=1) 329 | ) 330 | 331 | self.branch3x3 = nn.Sequential( 332 | BasicConv2d(input_channels, 32, kernel_size=1), 333 | BasicConv2d(32, 32, kernel_size=3, padding=1) 334 | ) 335 | 336 | self.branch1x1 = BasicConv2d(input_channels, 32, kernel_size=1) 337 | 338 | self.reduction1x1 = nn.Conv2d(128, 384, kernel_size=1) 339 | self.shortcut = nn.Conv2d(input_channels, 384, kernel_size=1) 340 | self.bn = nn.BatchNorm2d(384) 341 | self.relu = nn.ReLU(inplace=True) 342 | 343 | def forward(self, x): 344 | 345 | residual = [ 346 | self.branch1x1(x), 347 | self.branch3x3(x), 348 | self.branch3x3stack(x) 349 | ] 350 | 351 | residual = torch.cat(residual, 1) 352 | residual = self.reduction1x1(residual) 353 | shortcut = self.shortcut(x) 354 | 355 | output = self.bn(shortcut + residual) 356 | output = self.relu(output) 357 | 358 | return output 359 | 360 | class InceptionResNetB(nn.Module): 361 | 362 | #"""Figure 17. The schema for 17 × 17 grid (Inception-ResNet-B) module of 363 | #the Inception-ResNet-v2 network.""" 364 | def __init__(self, input_channels): 365 | 366 | super().__init__() 367 | self.branch7x7 = nn.Sequential( 368 | BasicConv2d(input_channels, 128, kernel_size=1), 369 | BasicConv2d(128, 160, kernel_size=(1, 7), padding=(0, 3)), 370 | BasicConv2d(160, 192, kernel_size=(7, 1), padding=(3, 0)) 371 | ) 372 | 373 | self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1) 374 | 375 | self.reduction1x1 = nn.Conv2d(384, 1154, kernel_size=1) 376 | self.shortcut = nn.Conv2d(input_channels, 1154, kernel_size=1) 377 | 378 | self.bn = nn.BatchNorm2d(1154) 379 | self.relu = nn.ReLU(inplace=True) 380 | 381 | def forward(self, x): 382 | residual = [ 383 | self.branch1x1(x), 384 | self.branch7x7(x) 385 | ] 386 | 387 | residual = torch.cat(residual, 1) 388 | 389 | #"""In general we picked some scaling factors between 0.1 and 0.3 to scale the residuals 390 | #before their being added to the accumulated layer activations (cf. Figure 20).""" 391 | residual = self.reduction1x1(residual) * 0.1 392 | 393 | shortcut = self.shortcut(x) 394 | 395 | output = self.bn(residual + shortcut) 396 | output = self.relu(output) 397 | 398 | return output 399 | 400 | 401 | class InceptionResNetC(nn.Module): 402 | 403 | def __init__(self, input_channels): 404 | 405 | #Figure 19. 
The schema for 8×8 grid (Inception-ResNet-C) 406 | #module of the Inception-ResNet-v2 network.""" 407 | super().__init__() 408 | self.branch3x3 = nn.Sequential( 409 | BasicConv2d(input_channels, 192, kernel_size=1), 410 | BasicConv2d(192, 224, kernel_size=(1, 3), padding=(0, 1)), 411 | BasicConv2d(224, 256, kernel_size=(3, 1), padding=(1, 0)) 412 | ) 413 | 414 | self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1) 415 | self.reduction1x1 = nn.Conv2d(448, 2048, kernel_size=1) 416 | self.shorcut = nn.Conv2d(input_channels, 2048, kernel_size=1) 417 | self.bn = nn.BatchNorm2d(2048) 418 | self.relu = nn.ReLU(inplace=True) 419 | 420 | def forward(self, x): 421 | residual = [ 422 | self.branch1x1(x), 423 | self.branch3x3(x) 424 | ] 425 | 426 | residual = torch.cat(residual, 1) 427 | residual = self.reduction1x1(residual) * 0.1 428 | 429 | shorcut = self.shorcut(x) 430 | 431 | output = self.bn(shorcut + residual) 432 | output = self.relu(output) 433 | 434 | return output 435 | 436 | class InceptionResNetReductionA(nn.Module): 437 | 438 | #"""Figure 7. The schema for 35 × 35 to 17 × 17 reduction module. 439 | #Different variants of this blocks (with various number of filters) 440 | #are used in Figure 9, and 15 in each of the new Inception(-v4, - ResNet-v1, 441 | #-ResNet-v2) variants presented in this paper. The k, l, m, n numbers 442 | #represent filter bank sizes which can be looked up in Table 1. 443 | def __init__(self, input_channels, k, l, m, n): 444 | 445 | super().__init__() 446 | self.branch3x3stack = nn.Sequential( 447 | BasicConv2d(input_channels, k, kernel_size=1), 448 | BasicConv2d(k, l, kernel_size=3, padding=1), 449 | BasicConv2d(l, m, kernel_size=3, stride=2) 450 | ) 451 | 452 | self.branch3x3 = BasicConv2d(input_channels, n, kernel_size=3, stride=2) 453 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2) 454 | self.output_channels = input_channels + n + m 455 | 456 | def forward(self, x): 457 | 458 | x = [ 459 | self.branch3x3stack(x), 460 | self.branch3x3(x), 461 | self.branchpool(x) 462 | ] 463 | 464 | return torch.cat(x, 1) 465 | 466 | class InceptionResNetReductionB(nn.Module): 467 | 468 | #"""Figure 18. The schema for 17 × 17 to 8 × 8 grid-reduction module. 
469 | #Reduction-B module used by the wider Inception-ResNet-v1 network in 470 | #Figure 15.""" 471 | #I believe it was a typo(Inception-ResNet-v1 should be Inception-ResNet-v2) 472 | def __init__(self, input_channels): 473 | 474 | super().__init__() 475 | self.branchpool = nn.MaxPool2d(3, stride=2) 476 | 477 | self.branch3x3a = nn.Sequential( 478 | BasicConv2d(input_channels, 256, kernel_size=1), 479 | BasicConv2d(256, 384, kernel_size=3, stride=2) 480 | ) 481 | 482 | self.branch3x3b = nn.Sequential( 483 | BasicConv2d(input_channels, 256, kernel_size=1), 484 | BasicConv2d(256, 288, kernel_size=3, stride=2) 485 | ) 486 | 487 | self.branch3x3stack = nn.Sequential( 488 | BasicConv2d(input_channels, 256, kernel_size=1), 489 | BasicConv2d(256, 288, kernel_size=3, padding=1), 490 | BasicConv2d(288, 320, kernel_size=3, stride=2) 491 | ) 492 | 493 | def forward(self, x): 494 | x = [ 495 | self.branch3x3a(x), 496 | self.branch3x3b(x), 497 | self.branch3x3stack(x), 498 | self.branchpool(x) 499 | ] 500 | 501 | x = torch.cat(x, 1) 502 | return x 503 | 504 | class InceptionResNetV2(nn.Module): 505 | 506 | def __init__(self, A, B, C, k=256, l=256, m=384, n=384, class_nums=100): 507 | super().__init__() 508 | self.stem = Inception_Stem(3) 509 | self.inception_resnet_a = self._generate_inception_module(384, 384, A, InceptionResNetA) 510 | self.reduction_a = InceptionResNetReductionA(384, k, l, m, n) 511 | output_channels = self.reduction_a.output_channels 512 | self.inception_resnet_b = self._generate_inception_module(output_channels, 1154, B, InceptionResNetB) 513 | self.reduction_b = InceptionResNetReductionB(1154) 514 | self.inception_resnet_c = self._generate_inception_module(2146, 2048, C, InceptionResNetC) 515 | 516 | #6x6 featuresize 517 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 518 | #"""Dropout (keep 0.8)""" 519 | self.dropout = nn.Dropout2d(1 - 0.8) 520 | self.linear = nn.Linear(2048, class_nums) 521 | 522 | def forward(self, x): 523 | x = self.stem(x) 524 | x = self.inception_resnet_a(x) 525 | x = self.reduction_a(x) 526 | x = self.inception_resnet_b(x) 527 | x = self.reduction_b(x) 528 | x = self.inception_resnet_c(x) 529 | x = self.avgpool(x) 530 | x = self.dropout(x) 531 | x = x.view(-1, 2048) 532 | x = self.linear(x) 533 | 534 | return x 535 | 536 | @staticmethod 537 | def _generate_inception_module(input_channels, output_channels, block_num, block): 538 | 539 | layers = nn.Sequential() 540 | for l in range(block_num): 541 | layers.add_module("{}_{}".format(block.__name__, l), block(input_channels)) 542 | input_channels = output_channels 543 | 544 | return layers 545 | 546 | def inceptionv4(): 547 | return InceptionV4(4, 7, 3) 548 | 549 | def inception_resnet_v2(): 550 | return InceptionResNetV2(5, 10, 5) 551 | -------------------------------------------------------------------------------- /Code/benchmarking/senet.py: -------------------------------------------------------------------------------- 1 | """senet in pytorch 2 | 3 | 4 | 5 | [1] Jie Hu, Li Shen, Samuel Albanie, Gang Sun, Enhua Wu 6 | 7 | Squeeze-and-Excitation Networks 8 | https://arxiv.org/abs/1709.01507 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | def single_list(x): 17 | """ If an Element is a single instead of a list, when a list is expected it created a single element list""" 18 | if x.__class__.__name__ is 'Tensor': 19 | return [x] 20 | else: 21 | return x 22 | 23 | class BasicResidualSEBlock(nn.Module): 24 | expansion = 1 25 | # 
[global_local_attention_addition, global_attention_addition, global_local_attention_concat, global_attention_concat]
26 |     # [global_local_attention_concat_learnable, global_local_attention_addition_learnable]
27 |     # [global_local_attention_learnable_learnable, global_local_attention_learnable_learnable_att]
28 |     # [standard_local_attention, identity_local_attention, pre_local_attention]
29 |     # [multi_scale_conv1d]
30 |     exp_name = 'global_local_attention_concat_learnable'
31 |     def __init__(self, in_channels, out_channels, stride, block_num, r=16, bottleneck=False):  # bottleneck is forwarded from SEResNet._make_stage
32 |         super().__init__()
33 | 
34 |         if ('concat' not in self.exp_name) and (self.exp_name != "global_local_attention_learnable_learnable") and ("multi_scale" not in self.exp_name):
35 |             block_num = 1
36 |         if bottleneck:
37 |             self.expansion = 4
38 |             self.residual = nn.Sequential(
39 |                 nn.Conv2d(in_channels, out_channels, 1),
40 |                 nn.BatchNorm2d(out_channels),
41 |                 nn.ReLU(),
42 | 
43 |                 nn.Conv2d(out_channels, out_channels, 3, stride=stride, padding=1),
44 |                 nn.BatchNorm2d(out_channels),
45 |                 nn.ReLU(),
46 | 
47 |                 nn.Conv2d(out_channels, out_channels * self.expansion, 1),
48 |                 nn.BatchNorm2d(out_channels * self.expansion),
49 |                 nn.ReLU()
50 |             )
51 |         else:
52 |             self.residual = nn.Sequential(
53 |                 nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias = False),
54 |                 nn.BatchNorm2d(out_channels),
55 |                 nn.ReLU(),
56 | 
57 |                 nn.Conv2d(out_channels, out_channels * self.expansion, 3, padding=1, bias = False),
58 |                 nn.BatchNorm2d(out_channels * self.expansion)
59 |             )
60 | 
61 |         self.shortcut = nn.Sequential()
62 |         if stride != 1 or in_channels != out_channels * self.expansion:
63 |             self.shortcut = nn.Sequential(
64 |                 nn.Conv2d(in_channels, out_channels * self.expansion, 1, stride=stride, bias = False),
65 |                 nn.BatchNorm2d(out_channels * self.expansion)
66 |             )
67 | 
68 |         self.squeeze = nn.AdaptiveAvgPool2d(1)
69 | 
70 |         if "multi_scale" in self.exp_name and block_num==1:
71 |             self.excitation2 = nn.Sequential(
72 |                 nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias = False),
73 |                 nn.ReLU(),
74 |                 nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias = False),
75 |                 nn.Sigmoid()
76 |             )
77 | 
78 |         if "multi_scale" in self.exp_name and block_num>1:
79 |             self.multi_scale_Conv1d = nn.Sequential(
80 |                 nn.Conv1d(in_channels=1, out_channels=1, kernel_size=block_num, stride=block_num,
81 |                           padding=0, bias = False),
82 |                 nn.Sigmoid()
83 |             )
84 |         if "multi_scale" in self.exp_name:
85 |             return
86 | 
87 |         self.excitation2 = nn.Sequential(
88 |             nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r, bias = False),
89 |             nn.ReLU(),
90 |             nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias = False),
91 |             nn.Sigmoid()
92 |         )
93 |         if 'global_local_attention_learnable_learnable' == self.exp_name:
94 |             self.in_1_1_conv = nn.Sequential(
95 |                 nn.Conv2d(out_channels * self.expansion * block_num, out_channels * self.expansion, 1, bias = False),
96 |                 nn.ReLU()
97 |             )
98 |             block_num = 1
99 |         if 'standard' not in self.exp_name:
100 |             self.excitation1 = nn.Sequential(
101 |                 nn.Linear(out_channels * self.expansion * block_num, out_channels * self.expansion // r, bias = False),
102 |                 nn.ReLU(),
103 |                 nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion, bias = False),
104 |                 nn.Sigmoid()
105 |             )
106 | 
107 |         if 'standard' not in self.exp_name:
108 |             self.fc = nn.Sequential(
109 |                 nn.Linear(out_channels * self.expansion *2, out_channels * self.expansion, bias
= False), 110 | nn.Sigmoid() 111 | ) 112 | 113 | if 'global_local_attention_learnable_learnable_att' == self.exp_name: 114 | self.att = nn.MultiheadAttention(embed_dim =1, num_heads=1, kdim=64, vdim=64) 115 | 116 | def forward(self, x): 117 | 118 | if self.exp_name is 'global_local_attention_addition': 119 | if x.__class__.__name__ is 'Tensor': 120 | current_input = x 121 | shortcut = self.shortcut(current_input) 122 | residual = self.residual(current_input) 123 | squeeze2 = self.squeeze(residual) 124 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 125 | excitation2 = self.excitation2(squeeze2) 126 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 127 | output = residual * excitation2.expand_as(residual) + shortcut 128 | return (F.relu(output), [residual]) 129 | else : 130 | current_input = x[0] 131 | previous_inputs = x[1] 132 | shortcut = self.shortcut(current_input) 133 | residual = self.residual(current_input) 134 | new_connection = residual 135 | for input_ in previous_inputs: 136 | #if input_.shape[2] != residual.shape[2]: 137 | # input_ = F.adaptive_avg_pool2d(input_, residual.shape[2].item()) 138 | new_connection += input_ 139 | squeeze1 = self.squeeze(new_connection) 140 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 141 | excitation1 = self.excitation1(squeeze1) 142 | # excitation1 = excitation1.view(new_connection.size(0), new_connection.size(1), 1, 1) 143 | squeeze2 = self.squeeze(residual) 144 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 145 | excitation2 = self.excitation2(squeeze2) 146 | # excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 147 | 148 | local_global_mean = torch.mean(torch.stack([excitation1, excitation2]), 0) 149 | local_global_mean = local_global_mean.view(residual.size(0), residual.size(1), 1, 1) 150 | output = residual * local_global_mean + shortcut 151 | previous_inputs.append(residual) 152 | x = (F.relu(output), previous_inputs) 153 | return x 154 | elif self.exp_name is 'global_local_attention_addition_learnable': 155 | if x.__class__.__name__ is 'Tensor': 156 | current_input = x 157 | shortcut = self.shortcut(current_input) 158 | residual = self.residual(current_input) 159 | squeeze2 = self.squeeze(residual) 160 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 161 | excitation2 = self.excitation2(squeeze2) 162 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 163 | output = residual * excitation2.expand_as(residual) + shortcut 164 | return (F.relu(output), [residual]) 165 | else : 166 | current_input = x[0] 167 | previous_inputs = x[1] 168 | shortcut = self.shortcut(current_input) 169 | residual = self.residual(current_input) 170 | new_connection = residual 171 | for input_ in previous_inputs: 172 | #if input_.shape[2] != residual.shape[2]: 173 | # input_ = F.adaptive_avg_pool2d(input_, residual.shape[2].item()) 174 | new_connection += input_ 175 | squeeze1 = self.squeeze(new_connection) 176 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 177 | excitation1 = self.excitation1(squeeze1) 178 | squeeze2 = self.squeeze(residual) 179 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 180 | excitation2 = self.excitation2(squeeze2) 181 | local_global = torch.cat([excitation1, excitation2], dim = 1) 182 | local_global = self.fc(local_global) 183 | local_global = local_global.view(residual.size(0), residual.size(1), 1, 1) 184 | output = residual * local_global + shortcut 185 | previous_inputs.append(residual) 186 | x = (F.relu(output), previous_inputs) 187 | return x 188 | elif 
self.exp_name is 'global_attention_addition': 189 | if x.__class__.__name__ is 'Tensor': 190 | current_input = x 191 | shortcut = self.shortcut(current_input) 192 | residual = self.residual(current_input) 193 | output = residual + shortcut 194 | return (F.relu(output), [residual]) 195 | else : 196 | current_input = x[0] 197 | previous_inputs = x[1] 198 | shortcut = self.shortcut(current_input) 199 | residual = self.residual(current_input) 200 | new_connection = residual 201 | for input_ in previous_inputs: 202 | #if input_.shape[2] != residual.shape[2]: 203 | # input_ = F.adaptive_avg_pool2d(input_, residual.shape[2].item()) 204 | new_connection += input_ 205 | squeeze1 = self.squeeze(new_connection) 206 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 207 | excitation1 = self.excitation1(squeeze1) 208 | excitation1 = excitation1.view(new_connection.size(0), new_connection.size(1), 1, 1) 209 | output = residual * excitation1.expand_as(residual) + shortcut 210 | previous_inputs.append(residual) 211 | x = (F.relu(output), previous_inputs) 212 | return x 213 | elif self.exp_name is 'global_local_attention_concat': 214 | if x.__class__.__name__ is 'Tensor': 215 | current_input = x 216 | shortcut = self.shortcut(current_input) 217 | residual = self.residual(current_input) 218 | squeeze2 = self.squeeze(residual) 219 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 220 | excitation2 = self.excitation2(squeeze2) 221 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 222 | output = residual * excitation2.expand_as(residual) + shortcut 223 | return (F.relu(output), [residual]) 224 | else : 225 | current_input = x[0] 226 | previous_inputs = x[1] 227 | shortcut = self.shortcut(current_input) 228 | residual = self.residual(current_input) 229 | previous_inputs.append(residual) 230 | new_connection = torch.cat(previous_inputs, dim = 1) 231 | squeeze1 = self.squeeze(new_connection) 232 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 233 | excitation1 = self.excitation1(squeeze1) 234 | # excitation1 = excitation1.view(residual.size(0), residual.size(1), 1, 1) 235 | squeeze2 = self.squeeze(residual) 236 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 237 | excitation2 = self.excitation2(squeeze2) 238 | # excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 239 | local_global_mean = torch.mean(torch.stack([excitation1, excitation2]), 0) 240 | local_global_mean = local_global_mean.view(residual.size(0), residual.size(1), 1, 1) 241 | output = residual * local_global_mean + shortcut 242 | 243 | x = (F.relu(output), previous_inputs) 244 | 245 | return x 246 | elif self.exp_name is 'global_local_attention_concat_learnable': 247 | if x.__class__.__name__ is 'Tensor': 248 | current_input = x 249 | shortcut = self.shortcut(current_input) 250 | residual = self.residual(current_input) 251 | squeeze2 = self.squeeze(residual) 252 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 253 | excitation2 = self.excitation2(squeeze2) 254 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 255 | output = residual * excitation2.expand_as(residual) + shortcut 256 | return (F.relu(output), [residual]) 257 | else : 258 | current_input = x[0] 259 | previous_inputs = x[1] 260 | shortcut = self.shortcut(current_input) 261 | residual = self.residual(current_input) 262 | previous_inputs.append(residual) 263 | new_connection = torch.cat(previous_inputs, dim = 1) 264 | squeeze1 = self.squeeze(new_connection) 265 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 266 | 
excitation1 = self.excitation1(squeeze1) 267 | # excitation1 = excitation1.view(residual.size(0), residual.size(1), 1, 1) 268 | squeeze2 = self.squeeze(residual) 269 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 270 | excitation2 = self.excitation2(squeeze2) 271 | # excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 272 | local_global = torch.cat([excitation1, excitation2], dim = 1) 273 | local_global = self.fc(local_global) 274 | local_global = local_global.view(residual.size(0), residual.size(1), 1, 1) 275 | output = residual * local_global + shortcut 276 | 277 | x = (F.relu(output), previous_inputs) 278 | 279 | return x 280 | elif self.exp_name is 'global_attention_concat': 281 | if x.__class__.__name__ is 'Tensor': 282 | current_input = x 283 | shortcut = self.shortcut(current_input) 284 | residual = self.residual(current_input) 285 | output = residual + shortcut 286 | return (F.relu(output), [residual]) 287 | else : 288 | current_input = x[0] 289 | previous_inputs = x[1] 290 | shortcut = self.shortcut(current_input) 291 | residual = self.residual(current_input) 292 | previous_inputs.append(residual) 293 | new_connection = torch.cat(previous_inputs, dim = 1) 294 | squeeze1 = self.squeeze(new_connection) 295 | squeeze1 = squeeze1.view(squeeze1.size(0), -1) 296 | excitation1 = self.excitation1(squeeze1) 297 | excitation1 = excitation1.view(residual.size(0), residual.size(1), 1, 1) 298 | output = residual * excitation1.expand_as(residual) + shortcut 299 | x = (F.relu(output), previous_inputs) 300 | return x 301 | elif self.exp_name is 'multi_scale_conv1d': 302 | if x.__class__.__name__ is 'Tensor': 303 | current_input = x 304 | shortcut = self.shortcut(current_input) 305 | residual = self.residual(current_input) 306 | squeeze2 = self.squeeze(residual) 307 | squeeze2 = squeeze2.view(squeeze2.size(0), -1) 308 | excitation2 = self.excitation2(squeeze2) 309 | excitation2 = excitation2.view(residual.size(0), residual.size(1), 1, 1) 310 | output = residual * (1+excitation2.expand_as(residual)) + shortcut 311 | squeezed = self.squeeze(residual) 312 | squeezed = squeezed.view(squeezed.size(0), -1) # [N, C] 313 | return (F.relu(output), [squeezed]) 314 | else : 315 | current_input = x[0] 316 | previous_inputs = x[1] 317 | shortcut = self.shortcut(current_input) 318 | residual = self.residual(current_input) 319 | squeezed = self.squeeze(residual) 320 | squeezed = squeezed.view(squeezed.size(0), -1) # [N, C] 321 | previous_inputs.append(squeezed) # [old, new] 322 | new_connection = torch.stack(previous_inputs) # [S, N, C] 323 | new_connection = new_connection.permute(1,2,0).contiguous() # [N, C, S] 324 | new_connection = new_connection.view(new_connection.shape[0], -1).unsqueeze(-1) # [N, C*S, 1] 325 | new_connection = new_connection.permute(0,2,1) # [N, 1, C*S][N, Cin, L] 326 | scales = self.multi_scale_Conv1d(new_connection) 327 | scales = scales.view(residual.size(0), residual.size(1), 1, 1) 328 | output = residual * (1+scales) + shortcut 329 | x = (F.relu(output), previous_inputs) 330 | return x 331 | 332 | elif self.exp_name is 'standard_local_attention': 333 | if x.__class__.__name__ is 'Tensor': 334 | current_input = x 335 | else: 336 | current_input = x[0] 337 | 338 | shortcut = self.shortcut(current_input) 339 | residual = self.residual(current_input) 340 | 341 | squeeze = self.squeeze(residual) 342 | squeeze = squeeze.view(squeeze.size(0), -1) 343 | excitation = self.excitation2(squeeze) 344 | excitation = excitation.view(residual.size(0), residual.size(1), 1, 1) 
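            # standard SE recalibration: scale this block's residual by its own channel gate; no cross-block descriptors are carried forward (hence the empty list returned below)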
345 | 
346 |             output = residual * excitation.expand_as(residual) + shortcut
347 | 
348 |             return (F.relu(output), [])
349 | 
350 |         elif self.exp_name == "identity_local_attention":
351 |             if x.__class__.__name__ == 'Tensor':
352 |                 current_input = x
353 |             else:
354 |                 current_input = x[0]
355 | 
356 |             shortcut = self.shortcut(current_input)
357 | 
358 |             squeeze = self.squeeze(shortcut)
359 |             squeeze = squeeze.view(squeeze.size(0), -1)
360 |             excitation = self.excitation2(squeeze)
361 |             excitation = excitation.view(shortcut.size(0), shortcut.size(1), 1, 1)
362 | 
363 |             residual = self.residual(current_input)
364 | 
365 |             output = shortcut * excitation.expand_as(shortcut) + residual
366 | 
367 |             return (F.relu(output), [])
368 | 
369 |         elif self.exp_name == "pre_local_attention":
370 |             if x.__class__.__name__ == 'Tensor':
371 |                 current_input = x
372 |             else:
373 |                 current_input = x[0]
374 | 
375 |             shortcut = self.shortcut(current_input)
376 | 
377 |             squeeze = self.squeeze(current_input)
378 |             squeeze = squeeze.view(squeeze.size(0), -1)
379 |             excitation = self.excitation2(squeeze)
380 |             excitation = excitation.view(current_input.size(0), current_input.size(1), 1, 1)
381 |             y = current_input * excitation.expand_as(current_input)
382 | 
383 |             residual = self.residual(y)
384 | 
385 |             output = residual + shortcut
386 | 
387 |             return (F.relu(output), [])
388 | 
389 | 
390 | class SEResNet(nn.Module):
391 |     def __init__(self, block, block_num, class_num=120, bottleneck=False):
392 |         super().__init__()
393 |         self.in_channels = 64
394 |         self.pre = nn.Sequential(
395 |             nn.Conv2d(3, 64, 3, padding=1),
396 |             nn.BatchNorm2d(64),
397 |             nn.ReLU()
398 |         )
399 |         self.stage1 = self._make_stage(block, block_num[0], 64, 1, bottleneck=bottleneck)
400 |         self.stage2 = self._make_stage(block, block_num[1], 128, 2, bottleneck=bottleneck)
401 |         self.stage3 = self._make_stage(block, block_num[2], 256, 2, bottleneck=bottleneck)
402 |         self.stage4 = self._make_stage(block, block_num[3], 512, 2, bottleneck=bottleneck)
403 |         self.linear = nn.Linear(self.in_channels, class_num)
404 | 
405 |     def forward(self, x):
406 |         x = self.pre(x)
407 |         x = self.stage1(x)
408 |         x = self.stage2(x[0])
409 |         x = self.stage3(x[0])
410 |         x = self.stage4(x[0])
411 |         x = F.adaptive_avg_pool2d(x[0], 1)
412 |         x = x.view(x.size(0), -1)
413 |         x = self.linear(x)
414 |         return x
415 | 
416 |     def _make_stage(self, block, num, out_channels, stride, bottleneck=False):
417 |         layers = []
418 |         layers.append(block(self.in_channels, out_channels, stride, 1, bottleneck=bottleneck))
419 |         self.in_channels = out_channels * layers[-1].expansion  # instance expansion: 4 when bottleneck, else the class default of 1
420 |         for i in range(1, num):
421 |             layers.append(block(self.in_channels, out_channels, 1, i + 1, bottleneck=bottleneck))
422 |         return nn.Sequential(*layers)
423 | 
424 | 
425 | def seresnet18(num_classes):
426 |     return SEResNet(BasicResidualSEBlock, [2, 2, 2, 2], class_num = num_classes)
427 | 
428 | 
429 | def seresnet34(num_classes):
430 |     return SEResNet(BasicResidualSEBlock, [3, 4, 6, 3], class_num = num_classes)
431 | 
432 | 
433 | def seresnet50(num_classes):
434 |     return SEResNet(BasicResidualSEBlock, [3, 4, 6, 3], class_num = num_classes, bottleneck=True)
435 | 
436 | 
437 | def seresnet101(num_classes):
438 |     return SEResNet(BasicResidualSEBlock, [3, 4, 23, 3], class_num = num_classes, bottleneck=True)
439 | 
440 | 
441 | def seresnet152(num_classes):
442 |     return SEResNet(BasicResidualSEBlock, [3, 8, 36, 3], class_num = num_classes, bottleneck=True)
443 | 
--------------------------------------------------------------------------------
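For readers who want the core of the multi_scale_conv1d variant without the surrounding branching, the following is a minimal, self-contained sketch (illustrative only; the class name and test shapes are chosen here, not taken from the repository). Each block contributes a globally average-pooled channel descriptor of shape [N, C]; the descriptors of the current and all previous blocks in a stage are stacked and fused by a single 1-D convolution whose kernel size and stride equal the number of descriptors, yielding one attention weight per channel.

import torch
import torch.nn as nn


class MultiScaleChannelAttention(nn.Module):
    """Fuse per-block channel descriptors with one Conv1d (sketch)."""

    def __init__(self, num_descriptors):
        super().__init__()
        # Sliding with kernel_size = stride = num_descriptors visits each
        # channel's group of descriptors exactly once -> one weight per channel.
        self.fuse = nn.Sequential(
            nn.Conv1d(1, 1, kernel_size=num_descriptors, stride=num_descriptors, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, descriptors):
        # descriptors: list of S tensors, each [N, C] (one per block seen so far)
        x = torch.stack(descriptors)              # [S, N, C]
        x = x.permute(1, 2, 0).contiguous()       # [N, C, S]
        n, c, s = x.shape
        x = x.view(n, 1, c * s)                   # [N, 1, C*S], channel-major layout
        scales = self.fuse(x)                     # [N, 1, C]
        return scales.view(n, c, 1, 1)            # broadcastable over [N, C, H, W]


# toy check with three 64-channel descriptors
att = MultiScaleChannelAttention(num_descriptors=3)
descs = [torch.randn(2, 64) for _ in range(3)]
print(att(descs).shape)  # torch.Size([2, 64, 1, 1])

In the block above these weights are applied to the feature map as residual * (1 + scales) + shortcut before the ReLU.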
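A quick smoke test for the factory functions at the end of senet.py. It assumes the benchmarking directory is on the Python path (so senet imports as a plain module) and uses a 64x64 tiny-ImageNet-sized input; the tuple threading between blocks (feature map plus the list of previous descriptors) is handled inside SEResNet.forward, so callers only see a plain logits tensor.

import torch
from senet import seresnet18   # assumes Code/benchmarking is on PYTHONPATH

model = seresnet18(num_classes=200)    # tiny-ImageNet has 200 classes
model.eval()

x = torch.randn(2, 3, 64, 64)
with torch.no_grad():
    logits = model(x)
print(logits.shape)                    # torch.Size([2, 200])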