├── README.md
├── resnet.py
└── tsm_util.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Temporal-Shift-Module
 2 | 
 3 | Unofficial implementation for paper Temporal Shift Module
 4 | for Efficient Video Understanding [[1]](#tsm).
 5 | 
 6 | To use this implementation, simply import the utility with:
 7 | 
 8 | ```python
 9 | from tsm_util as tsm
10 | ```
11 | 
12 | Here we use 2D CNN baseline from Temporal Segment Networks
13 | [[2]](#tsn) with 2D CNN backbone of ResNet [[3]](#resnet).
14 | 
15 | ## Reference
16 | 
17 | \[1\] [Temporal Shift Module for EfficientVideo Understanding](https://arxiv.org/abs/1811.08383)
18 | 
19 | \[2\] [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)
20 | 
21 | \[3\] [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)


--------------------------------------------------------------------------------
/resnet.py:
--------------------------------------------------------------------------------
  1 | """
  2 | An example combining `Temporal Shift Module` with `ResNet`. This implementation
  3 | is based on `Temporal Segment Networks`, which merges temporal dimension into
  4 | batch, i.e. inputs [N*T, C, H, W]. Here we show the case with residual connections
  5 | and zero padding with 8 frames as input.
  6 | """
  7 | import torch.nn as nn
  8 | from tsm_util import tsm
  9 | import torch.utils.model_zoo as model_zoo
 10 | 
 11 | 
 12 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
 13 |            'resnet152']
 14 | 
 15 | 
 16 | model_urls = {
 17 |     'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
 18 |     'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
 19 |     'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
 20 |     'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
 21 |     'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
 22 | }
 23 | 
 24 | 
 25 | def conv3x3(in_planes, out_planes, stride=1):
 26 |     """3x3 convolution with padding"""
 27 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
 28 |                      padding=1, bias=False)
 29 | 
 30 | 
 31 | def conv1x1(in_planes, out_planes, stride=1):
 32 |     """1x1 convolution"""
 33 |     return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
 34 | 
 35 | 
 36 | class BasicBlock(nn.Module):
 37 |     expansion = 1
 38 | 
 39 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 40 |         super(BasicBlock, self).__init__()
 41 |         self.conv1 = conv3x3(inplanes, planes, stride)
 42 |         self.bn1 = nn.BatchNorm2d(planes)
 43 |         self.relu = nn.ReLU(inplace=True)
 44 |         self.conv2 = conv3x3(planes, planes)
 45 |         self.bn2 = nn.BatchNorm2d(planes)
 46 |         self.downsample = downsample
 47 |         self.stride = stride
 48 | 
 49 |     def forward(self, x):
 50 |         identity = x
 51 | 
 52 |         out = tsm(x, 8, 'zero')
 53 |         out = self.conv1(out)
 54 |         out = self.bn1(out)
 55 |         out = self.relu(out)
 56 | 
 57 |         out = self.conv2(out)
 58 |         out = self.bn2(out)
 59 | 
 60 |         if self.downsample is not None:
 61 |             identity = self.downsample(x)
 62 | 
 63 |         out += identity
 64 |         out = self.relu(out)
 65 | 
 66 |         return out
 67 | 
 68 | 
 69 | class Bottleneck(nn.Module):
 70 |     expansion = 4
 71 | 
 72 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 73 |         super(Bottleneck, self).__init__()
 74 |         self.conv1 = conv1x1(inplanes, planes)
 75 |         self.bn1 = nn.BatchNorm2d(planes)
 76 |         self.conv2 = conv3x3(planes, planes, stride)
 77 |         self.bn2 = nn.BatchNorm2d(planes)
 78 |         self.conv3 = conv1x1(planes, planes * self.expansion)
 79 |         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
 80 |         self.relu = nn.ReLU(inplace=True)
 81 |         self.downsample = downsample
 82 |         self.stride = stride
 83 | 
 84 |     def forward(self, x):
 85 |         identity = x
 86 | 
 87 |         out = tsm(x, 8, 'zero')
 88 |         out = self.conv1(out)
 89 |         out = self.bn1(out)
 90 |         out = self.relu(out)
 91 | 
 92 |         out = self.conv2(out)
 93 |         out = self.bn2(out)
 94 |         out = self.relu(out)
 95 | 
 96 |         out = self.conv3(out)
 97 |         out = self.bn3(out)
 98 | 
 99 |         if self.downsample is not None:
100 |             identity = self.downsample(x)
101 | 
102 |         out += identity
103 |         out = self.relu(out)
104 | 
105 |         return out
106 | 
107 | 
108 | class ResNet(nn.Module):
109 | 
110 |     def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):
111 |         super(ResNet, self).__init__()
112 |         self.inplanes = 64
113 |         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
114 |                                bias=False)
115 |         self.bn1 = nn.BatchNorm2d(64)
116 |         self.relu = nn.ReLU(inplace=True)
117 |         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
118 |         self.layer1 = self._make_layer(block, 64, layers[0])
119 |         self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
120 |         self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
121 |         self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
122 |         self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
123 |         self.fc = nn.Linear(512 * block.expansion, num_classes)
124 | 
125 |         for m in self.modules():
126 |             if isinstance(m, nn.Conv2d):
127 |                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
128 |             elif isinstance(m, nn.BatchNorm2d):
129 |                 nn.init.constant_(m.weight, 1)
130 |                 nn.init.constant_(m.bias, 0)
131 | 
132 |         # Zero-initialize the last BN in each residual branch,
133 |         # so that the residual branch starts with zeros, and each residual block behaves like an identity.
134 |         # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
135 |         if zero_init_residual:
136 |             for m in self.modules():
137 |                 if isinstance(m, Bottleneck):
138 |                     nn.init.constant_(m.bn3.weight, 0)
139 |                 elif isinstance(m, BasicBlock):
140 |                     nn.init.constant_(m.bn2.weight, 0)
141 | 
142 |     def _make_layer(self, block, planes, blocks, stride=1):
143 |         downsample = None
144 |         if stride != 1 or self.inplanes != planes * block.expansion:
145 |             downsample = nn.Sequential(
146 |                 conv1x1(self.inplanes, planes * block.expansion, stride),
147 |                 nn.BatchNorm2d(planes * block.expansion),
148 |             )
149 | 
150 |         layers = []
151 |         layers.append(block(self.inplanes, planes, stride, downsample))
152 |         self.inplanes = planes * block.expansion
153 |         for _ in range(1, blocks):
154 |             layers.append(block(self.inplanes, planes))
155 | 
156 |         return nn.Sequential(*layers)
157 | 
158 |     def forward(self, x):
159 |         x = self.conv1(x)
160 |         x = self.bn1(x)
161 |         x = self.relu(x)
162 |         x = self.maxpool(x)
163 | 
164 |         x = self.layer1(x)
165 |         x = self.layer2(x)
166 |         x = self.layer3(x)
167 |         x = self.layer4(x)
168 | 
169 |         x = self.avgpool(x)
170 |         x = x.view(x.size(0), -1)
171 |         x = self.fc(x)
172 | 
173 |         return x
174 | 
175 | 
176 | def resnet18(pretrained=False, **kwargs):
177 |     """Constructs a ResNet-18 model.
178 | 
179 |     Args:
180 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
181 |     """
182 |     model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
183 |     if pretrained:
184 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
185 |     return model
186 | 
187 | 
188 | def resnet34(pretrained=False, **kwargs):
189 |     """Constructs a ResNet-34 model.
190 | 
191 |     Args:
192 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
193 |     """
194 |     model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
195 |     if pretrained:
196 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
197 |     return model
198 | 
199 | 
200 | def resnet50(pretrained=False, **kwargs):
201 |     """Constructs a ResNet-50 model.
202 | 
203 |     Args:
204 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
205 |     """
206 |     model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
207 |     if pretrained:
208 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
209 |     return model
210 | 
211 | 
212 | def resnet101(pretrained=False, **kwargs):
213 |     """Constructs a ResNet-101 model.
214 | 
215 |     Args:
216 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
217 |     """
218 |     model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
219 |     if pretrained:
220 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
221 |     return model
222 | 
223 | 
224 | def resnet152(pretrained=False, **kwargs):
225 |     """Constructs a ResNet-152 model.
226 | 
227 |     Args:
228 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
229 |     """
230 |     model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
231 |     if pretrained:
232 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
233 |     return model
234 | 


--------------------------------------------------------------------------------
/tsm_util.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | def tsm(tensor, duration, version='zero'):
 5 |     # tensor [N*T, C, H, W]
 6 |     size = tensor.size()
 7 |     tensor = tensor.view((-1, duration) + size[1:])
 8 |     # tensor [N, T, C, H, W]
 9 |     pre_tensor, post_tensor, peri_tensor = tensor.split([size[1] // 4,
10 |                                                          size[1] // 4,
11 |                                                          size[1] // 2], dim=2)
12 |     if version == 'zero':
13 |         pre_tensor  = F.pad(pre_tensor,  (0, 0, 0, 0, 0, 0, 1, 0))[:,  :-1, ...]
14 |         post_tensor = F.pad(post_tensor, (0, 0, 0, 0, 0, 0, 0, 1))[:, 1:  , ...]
15 |     elif version == 'circulant':
16 |         pre_tensor  = torch.cat((pre_tensor [:, -1:  , ...],
17 |                                  pre_tensor [:,   :-1, ...]), dim=1)
18 |         post_tensor = torch.cat((post_tensor[:,  1:  , ...],
19 |                                  post_tensor[:,   :1 , ...]), dim=1)
20 |     else:
21 |         raise ValueError('Unknown TSM version: {}'.format(version))
22 |     return torch.cat((pre_tensor, post_tensor, peri_tensor), dim=2).view(size)
23 | 


--------------------------------------------------------------------------------