├── LICENSE
├── Readme
├── ShuffleNet.py
├── ShuffleNet_1g8_Top1_67.408_Top5_87.258.pth.tar
└── eval.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2018, ericsun99
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/Readme:
--------------------------------------------------------------------------------
 1 | ShuffleNet-1g8-Pytorch 
 2 | Introduction
 3 | 
 4 | This is a PyTorch implementation of faceplusplus's ShuffleNet-1g8. For details, please read the following paper: 
 5 | 	ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices
 6 | 
 7 | Pretrained Models on ImageNet
 8 | 
 9 | We provide a pretrained ShuffleNet-1g8 model on ImageNet, which achieves nearly the same accuracy as the original one reported in the paper.
10 | 
11 | The top-1/5 accuracy rates by using single center crop (crop size: 224x224, image size: 256xN): 
12 | Network 	Top-1 	Top-5	Top-1(reported in the paper) 
13 | ShuffleNet-1g8 	67.408 	87.258	67.60
14 | 
15 | Evaluate Models: python eval.py -a shufflenet --evaluate ./ShuffleNet_1g8_Top1_67.408_Top5_87.258.pth.tar ./ILSVRC2012/
16 | 
17 | Dataset preparation: refer to https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset
18 | 


--------------------------------------------------------------------------------
/ShuffleNet.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from torch.autograd import Variable
  5 | from collections import OrderedDict
  6 | from torch.nn import init
  7 | 
  8 | 
  9 | def conv3x3(in_channels, out_channels, stride=1, 
 10 |             padding=1, bias=True, groups=1):    
 11 |     """3x3 convolution with padding
 12 |     """
 13 |     return nn.Conv2d(
 14 |         in_channels, 
 15 |         out_channels, 
 16 |         kernel_size=3, 
 17 |         stride=stride,
 18 |         padding=padding,
 19 |         bias=bias,
 20 |         groups=groups)
 21 | 
 22 | 
 23 | def conv1x1(in_channels, out_channels, groups=1):
 24 |     """1x1 convolution with padding
 25 |     - Normal pointwise convolution When groups == 1
 26 |     - Grouped pointwise convolution when groups > 1
 27 |     """
 28 |     return nn.Conv2d(
 29 |         in_channels, 
 30 |         out_channels, 
 31 |         kernel_size=1, 
 32 |         groups=groups,
 33 |         stride=1)
 34 | 
 35 | 
 36 | def channel_shuffle(x, groups):
 37 |     batchsize, num_channels, height, width = x.data.size()
 38 | 
 39 |     channels_per_group = num_channels // groups
 40 |     
 41 |     # reshape
 42 |     x = x.view(batchsize, groups, 
 43 |         channels_per_group, height, width)
 44 | 
 45 |     # transpose
 46 |     # - contiguous() required if transpose() is used before view().
 47 |     #   See https://github.com/pytorch/pytorch/issues/764
 48 |     x = torch.transpose(x, 1, 2).contiguous()
 49 | 
 50 |     # flatten
 51 |     x = x.view(batchsize, -1, height, width)
 52 | 
 53 |     return x
 54 | 
 55 | 
 56 | class ShuffleUnit(nn.Module):
 57 |     def __init__(self, in_channels, out_channels, groups=3,
 58 |                  grouped_conv=True, combine='add'):
 59 |         
 60 |         super(ShuffleUnit, self).__init__()
 61 | 
 62 |         self.in_channels = in_channels
 63 |         self.out_channels = out_channels
 64 |         self.grouped_conv = grouped_conv
 65 |         self.combine = combine
 66 |         self.groups = groups
 67 |         self.bottleneck_channels = self.out_channels // 4
 68 | 
 69 |         # define the type of ShuffleUnit
 70 |         if self.combine == 'add':
 71 |             # ShuffleUnit Figure 2b
 72 |             self.depthwise_stride = 1
 73 |             self._combine_func = self._add
 74 |         elif self.combine == 'concat':
 75 |             # ShuffleUnit Figure 2c
 76 |             self.depthwise_stride = 2
 77 |             self._combine_func = self._concat
 78 |             
 79 |             # ensure output of concat has the same channels as 
 80 |             # original output channels.
 81 |             self.out_channels -= self.in_channels
 82 |         else:
 83 |             raise ValueError("Cannot combine tensors with \"{}\"" \
 84 |                              "Only \"add\" and \"concat\" are" \
 85 |                              "supported".format(self.combine))
 86 | 
 87 |         # Use a 1x1 grouped or non-grouped convolution to reduce input channels
 88 |         # to bottleneck channels, as in a ResNet bottleneck module.
 89 |         # NOTE: Do not use group convolution for the first conv1x1 in Stage 2.
 90 |         self.first_1x1_groups = self.groups if grouped_conv else 1
 91 | 
 92 |         self.g_conv_1x1_compress = self._make_grouped_conv1x1(
 93 |             self.in_channels,
 94 |             self.bottleneck_channels,
 95 |             self.first_1x1_groups,
 96 |             batch_norm=True,
 97 |             relu=True
 98 |             )
 99 | 
100 |         # 3x3 depthwise convolution followed by batch normalization
101 |         self.depthwise_conv3x3 = conv3x3(
102 |             self.bottleneck_channels, self.bottleneck_channels,
103 |             stride=self.depthwise_stride, groups=self.bottleneck_channels)
104 |         self.bn_after_depthwise = nn.BatchNorm2d(self.bottleneck_channels)
105 | 
106 |         # Use 1x1 grouped convolution to expand from 
107 |         # bottleneck_channels to out_channels
108 |         self.g_conv_1x1_expand = self._make_grouped_conv1x1(
109 |             self.bottleneck_channels,
110 |             self.out_channels,
111 |             self.groups,
112 |             batch_norm=True,
113 |             relu=False
114 |             )
115 | 
116 | 
117 |     @staticmethod
118 |     def _add(x, out):
119 |         # residual connection
120 |         return x + out
121 | 
122 | 
123 |     @staticmethod
124 |     def _concat(x, out):
125 |         # concatenate along channel axis
126 |         return torch.cat((x, out), 1)
127 | 
128 | 
129 |     def _make_grouped_conv1x1(self, in_channels, out_channels, groups,
130 |         batch_norm=True, relu=False):
131 | 
132 |         modules = OrderedDict()
133 | 
134 |         conv = conv1x1(in_channels, out_channels, groups=groups)
135 |         modules['conv1x1'] = conv
136 | 
137 |         if batch_norm:
138 |             modules['batch_norm'] = nn.BatchNorm2d(out_channels)
139 |         if relu:
140 |             modules['relu'] = nn.ReLU()
141 |         if len(modules) > 1:
142 |             return nn.Sequential(modules)
143 |         else:
144 |             return conv
145 | 
146 | 
147 |     def forward(self, x):
148 |         # save for combining later with output
149 |         residual = x
150 | 
151 |         if self.combine == 'concat':
152 |             residual = F.avg_pool2d(residual, kernel_size=3, 
153 |                 stride=2, padding=1)
154 | 
155 |         out = self.g_conv_1x1_compress(x)
156 |         out = channel_shuffle(out, self.groups)
157 |         out = self.depthwise_conv3x3(out)
158 |         out = self.bn_after_depthwise(out)
159 |         out = self.g_conv_1x1_expand(out)
160 |         
161 |         out = self._combine_func(residual, out)
162 |         return F.relu(out)
163 | 
164 | 
class ShuffleNet(nn.Module):
    """ShuffleNet implementation.

    The network is a stride-2 3x3 convolution, a stride-2 max-pool, three
    stages of ShuffleUnits, global average pooling and a linear classifier.
    """

    def __init__(self, groups=3, in_channels=3, num_classes=1000):
        """ShuffleNet constructor.

        Arguments:
            groups (int, optional): number of groups to be used in grouped
                1x1 convolutions in each ShuffleUnit. Default is 3 for best
                performance according to original paper.
            in_channels (int, optional): number of channels in the input tensor.
                Default is 3 for RGB image inputs.
            num_classes (int, optional): number of classes to predict. Default
                is 1000 for ImageNet.

        Raises:
            ValueError: if ``groups`` is not one of 1, 2, 3, 4 or 8.
        """
        super(ShuffleNet, self).__init__()

        self.groups = groups
        self.stage_repeats = [3, 7, 3]
        self.in_channels = in_channels
        self.num_classes = num_classes

        # Output channels per stage, keyed by group count.
        # index 0 is invalid and should never be called.
        # only used for indexing convenience.
        stage_channels_by_groups = {
            # 576 (not 567) is the last-stage width listed for g=1 in
            # Table 1 of the ShuffleNet paper.
            1: [-1, 24, 144, 288, 576],
            2: [-1, 24, 200, 400, 800],
            3: [-1, 24, 240, 480, 960],
            4: [-1, 24, 272, 544, 1088],
            8: [-1, 24, 384, 768, 1536],
        }
        if groups not in stage_channels_by_groups:
            raise ValueError(
                "{} groups is not supported for "
                "1x1 Grouped Convolutions".format(groups))
        self.stage_out_channels = stage_channels_by_groups[groups]

        # Stage 1 always has 24 output channels
        self.conv1 = conv3x3(self.in_channels,
                             self.stage_out_channels[1], # stage 1
                             stride=2)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stage 2
        self.stage2 = self._make_stage(2)
        # Stage 3
        self.stage3 = self._make_stage(3)
        # Stage 4
        self.stage4 = self._make_stage(4)

        # Global pooling:
        # Undefined as PyTorch's functional API can be used for on-the-fly
        # shape inference if input size is not ImageNet's 224x224

        # Fully-connected classification layer
        num_inputs = self.stage_out_channels[-1]
        self.fc = nn.Linear(num_inputs, self.num_classes)


    def _make_stage(self, stage):
        """Build stage ``stage`` (2, 3 or 4) as an ``nn.Sequential``.

        The stage is one 'concat' ShuffleUnit followed by
        ``self.stage_repeats[stage - 2]`` 'add' ShuffleUnits.
        """
        modules = OrderedDict()
        stage_name = "ShuffleUnit_Stage{}".format(stage)

        # First ShuffleUnit in the stage
        # 1. non-grouped 1x1 convolution (i.e. pointwise convolution)
        #   is used in Stage 2. Group convolutions used everywhere else.
        grouped_conv = stage > 2

        # 2. concatenation unit is always used.
        first_module = ShuffleUnit(
            self.stage_out_channels[stage-1],
            self.stage_out_channels[stage],
            groups=self.groups,
            grouped_conv=grouped_conv,
            combine='concat'
            )
        modules[stage_name+"_0"] = first_module

        # add more ShuffleUnits depending on pre-defined number of repeats
        for i in range(self.stage_repeats[stage-2]):
            name = stage_name + "_{}".format(i+1)
            module = ShuffleUnit(
                self.stage_out_channels[stage],
                self.stage_out_channels[stage],
                groups=self.groups,
                grouped_conv=True,
                combine='add'
                )
            modules[name] = module

        return nn.Sequential(modules)


    def forward(self, x):
        """Return per-class log-probabilities (``log_softmax`` over dim 1)."""
        x = self.conv1(x)
        x = self.maxpool(x)

        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)

        # global average pooling layer: the kernel covers the whole
        # remaining spatial extent, whatever the input resolution was.
        x = F.avg_pool2d(x, x.size()[-2:])

        # flatten for input to fully-connected layer
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return F.log_softmax(x, dim=1)
278 | 
279 | 
if __name__ == "__main__":
    # Testing: constructing the default network must not raise.
    model = ShuffleNet()
284 | 


--------------------------------------------------------------------------------
/ShuffleNet_1g8_Top1_67.408_Top5_87.258.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ericsun99/ShuffleNet-1g8-Pytorch/4b82cd8aacb64ed58cafe6d9cb96956a4215016c/ShuffleNet_1g8_Top1_67.408_Top5_87.258.pth.tar


--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import shutil
  4 | import time
  5 | 
  6 | import torch
  7 | import torch.nn as nn
  8 | import torch.nn.parallel
  9 | import torch.backends.cudnn as cudnn
 10 | import torch.distributed as dist
 11 | import torch.optim
 12 | import torch.utils.data
 13 | import torch.utils.data.distributed
 14 | import torchvision.transforms as transforms
 15 | import torchvision.datasets as datasets
 16 | import torchvision.models as models
 17 | from ShuffleNet import ShuffleNet
 18 | 
 19 | model_names = sorted(name for name in models.__dict__
 20 |     if name.islower() and not name.startswith("__")
 21 |     and callable(models.__dict__[name]))
 22 | 
 23 | model_names.append('shufflenet')
 24 | 
 25 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
 26 | parser.add_argument('data', metavar='DIR',
 27 |                     help='path to dataset')
 28 | parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18',
 29 |                     choices=model_names,
 30 |                     help='model architecture: ' +
 31 |                         ' | '.join(model_names) +
 32 |                         ' (default: resnet18)')
 33 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
 34 |                     help='number of data loading workers (default: 4)')
 35 | parser.add_argument('--epochs', default=90, type=int, metavar='N',
 36 |                     help='number of total epochs to run')
 37 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
 38 |                     help='manual epoch number (useful on restarts)')
 39 | parser.add_argument('-b', '--batch-size', default=256, type=int,
 40 |                     metavar='N', help='mini-batch size (default: 256)')
 41 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
 42 |                     metavar='LR', help='initial learning rate')
 43 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
 44 |                     help='momentum')
 45 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
 46 |                     metavar='W', help='weight decay (default: 1e-4)')
 47 | parser.add_argument('--print-freq', '-p', default=10, type=int,
 48 |                     metavar='N', help='print frequency (default: 10)')
 49 | parser.add_argument('--resume', default='', type=str, metavar='PATH',
 50 |                     help='path to latest checkpoint (default: none)')
 51 | parser.add_argument('--evaluate', default='', type=str, metavar='PATH',
 52 |                     help='path to evaluate model (default: none)')
 53 | parser.add_argument('--pretrained', dest='pretrained', action='store_true',
 54 |                     help='use pre-trained model')
 55 | parser.add_argument('--world-size', default=1, type=int,
 56 |                     help='number of distributed processes')
 57 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
 58 |                     help='url used to set up distributed training')
 59 | parser.add_argument('--dist-backend', default='gloo', type=str,
 60 |                     help='distributed backend')
 61 | 
 62 | 
 63 | def main():
 64 |     global args
 65 |     args = parser.parse_args()
 66 | 
 67 |     args.distributed = args.world_size > 1
 68 | 
 69 |     if args.distributed:
 70 |         dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
 71 |                                 world_size=args.world_size)
 72 | 
 73 |     # create model
 74 |     if args.pretrained:
 75 |         print("=> using pre-trained model '{}'".format(args.arch))
 76 |         model = models.__dict__[args.arch](pretrained=True)
 77 |     else:
 78 |         print("=> creating model '{}'".format(args.arch))
 79 |         if args.arch.startswith('mobilenetv1'):
 80 |             model = MobileNetV1()
 81 |         elif args.arch.startswith('mobilenetv2'):
 82 |             model = MobileNetV2()
 83 |         elif args.arch.startswith('shufflenet'):
 84 |             model = ShuffleNet(groups=8)
 85 |         else:
 86 |             model = models.__dict__[args.arch]()
 87 | 			
 88 | 	print(model)
 89 | 
 90 |     if not args.distributed:
 91 |         if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
 92 |             model.features = torch.nn.DataParallel(model.features)
 93 |             model.cuda()
 94 |         else:
 95 |             model = torch.nn.DataParallel(model).cuda()
 96 |     else:
 97 |         model.cuda()
 98 |         model = torch.nn.parallel.DistributedDataParallel(model)
 99 | 
100 |     # define loss function (criterion) and optimizer
101 |     criterion = nn.CrossEntropyLoss().cuda()
102 | 
103 |     if args.evaluate:
104 | 	if os.path.isfile(args.evaluate):
105 |             print("=> loading model '{}'".format(args.evaluate))
106 |             model.load_state_dict(torch.load(args.evaluate))
107 |         else:
108 |             print("=> no model found at '{}'".format(args.evaluate))
109 | 
110 |     cudnn.benchmark = True
111 | 
112 |     # Data loading code
113 |     valdir = os.path.join(args.data, 'val')
114 |     normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
115 |                                      std=[0.229, 0.224, 0.225])
116 | 
117 |     val_loader = torch.utils.data.DataLoader(
118 |         datasets.ImageFolder(valdir, transforms.Compose([
119 |             transforms.Resize(256),
120 |             transforms.CenterCrop(224),
121 |             transforms.ToTensor(),
122 |             normalize,
123 |         ])),
124 |         batch_size=args.batch_size, shuffle=False,
125 |         num_workers=args.workers, pin_memory=True)
126 | 
127 |     if args.evaluate:
128 |         validate(val_loader, model, criterion)
129 |         return
130 | 
131 | 
def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and print running statistics.

    Arguments:
        val_loader: iterable yielding ``(input, target)`` batches.
        model: network to evaluate; switched to eval mode here.
        criterion: loss applied to ``(output, target)``.

    Returns:
        float: average top-1 precision (in percent) over all batches,
        weighted by batch size.

    Progress is printed every ``args.print_freq`` batches; ``args`` is the
    module-level namespace populated by ``main``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    # Inference only: torch.no_grad() replaces Variable(..., volatile=True),
    # which current PyTorch no longer supports.
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            # `async` is a reserved word since Python 3.7; `non_blocking`
            # is the keyword PyTorch provides in its place.
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss; .item() turns the
            # single-element tensors into plain Python floats.
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            batch_size = images.size(0)
            losses.update(loss.item(), batch_size)
            top1.update(prec1.item(), batch_size)
            top5.update(prec5.item(), batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       i, len(val_loader), batch_time=batch_time, loss=losses,
                       top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg
174 | 
175 | 
class AverageMeter(object):
    """Keeps the most recent value of a metric and its running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the current value, average, sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted as ``n`` samples."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
192 | 
193 | 
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Arguments:
        output (Tensor): per-class scores, one row per sample.
        target (Tensor): 1-D tensor of class indices, one per sample.
        topk (tuple of int, optional): the values of k to evaluate.
            Default is ``(1,)``.

    Returns:
        list of Tensor: one single-element tensor per entry of ``topk``
        holding the percentage of samples whose target is among the k
        highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the maxk best classes per sample, transposed so that row r
    # holds every sample's rank-r prediction.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # `correct` comes from a transposed tensor and may be
        # non-contiguous; view() refuses to flatten such a slice, so make
        # it contiguous first.
        correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
208 | 
# Script entry point: run the evaluation only when executed directly.
if __name__ == "__main__":
    main()
211 | 


--------------------------------------------------------------------------------