├── LICENSE ├── Readme ├── ShuffleNet.py ├── ShuffleNet_1g8_Top1_67.408_Top5_87.258.pth.tar └── eval.py /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, ericsun99 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /Readme: -------------------------------------------------------------------------------- 1 | ShuffleNet-1g8-Pytorch 2 | Introduction 3 | 4 | This is a Pytorch implementation of faceplusplus's ShuffleNet-1g8. 
For details, please read the following paper: 5 | ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices 6 | 7 | Pretrained Models on ImageNet 8 | 9 | We provide pretrained ShuffleNet-1g8 models on ImageNet, which achieve nearly the same accuracy as the original ones reported in the paper. 10 | 11 | The top-1/5 accuracy rates using a single center crop (crop size: 224x224, image size: 256xN): 12 | Network Top-1 Top-5 Top-1(reported in the paper) 13 | ShuffleNet-1g8 67.408 87.258 67.60 14 | 15 | Evaluate Models python eval.py -a shufflenet --evaluate ./ShuffleNet_1g8_Top1_67.408_Top5_87.258.pth.tar ./ILSVRC2012/ 16 | 17 | Dataset prepare Refer to https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset 18 | -------------------------------------------------------------------------------- /ShuffleNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from collections import OrderedDict 6 | from torch.nn import init 7 | 8 | 9 | def conv3x3(in_channels, out_channels, stride=1, 10 | padding=1, bias=True, groups=1): 11 | """3x3 convolution with padding 12 | """ 13 | return nn.Conv2d( 14 | in_channels, 15 | out_channels, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=padding, 19 | bias=bias, 20 | groups=groups) 21 | 22 | 23 | def conv1x1(in_channels, out_channels, groups=1): 24 | """1x1 convolution with padding 25 | - Normal pointwise convolution When groups == 1 26 | - Grouped pointwise convolution when groups > 1 27 | """ 28 | return nn.Conv2d( 29 | in_channels, 30 | out_channels, 31 | kernel_size=1, 32 | groups=groups, 33 | stride=1) 34 | 35 | 36 | def channel_shuffle(x, groups): 37 | batchsize, num_channels, height, width = x.data.size() 38 | 39 | channels_per_group = num_channels // groups 40 | 41 | # reshape 42 | x = x.view(batchsize, groups, 43 
| channels_per_group, height, width) 44 | 45 | # transpose 46 | # - contiguous() required if transpose() is used before view(). 47 | # See https://github.com/pytorch/pytorch/issues/764 48 | x = torch.transpose(x, 1, 2).contiguous() 49 | 50 | # flatten 51 | x = x.view(batchsize, -1, height, width) 52 | 53 | return x 54 | 55 | 56 | class ShuffleUnit(nn.Module): 57 | def __init__(self, in_channels, out_channels, groups=3, 58 | grouped_conv=True, combine='add'): 59 | 60 | super(ShuffleUnit, self).__init__() 61 | 62 | self.in_channels = in_channels 63 | self.out_channels = out_channels 64 | self.grouped_conv = grouped_conv 65 | self.combine = combine 66 | self.groups = groups 67 | self.bottleneck_channels = self.out_channels // 4 68 | 69 | # define the type of ShuffleUnit 70 | if self.combine == 'add': 71 | # ShuffleUnit Figure 2b 72 | self.depthwise_stride = 1 73 | self._combine_func = self._add 74 | elif self.combine == 'concat': 75 | # ShuffleUnit Figure 2c 76 | self.depthwise_stride = 2 77 | self._combine_func = self._concat 78 | 79 | # ensure output of concat has the same channels as 80 | # original output channels. 81 | self.out_channels -= self.in_channels 82 | else: 83 | raise ValueError("Cannot combine tensors with \"{}\"" \ 84 | "Only \"add\" and \"concat\" are" \ 85 | "supported".format(self.combine)) 86 | 87 | # Use a 1x1 grouped or non-grouped convolution to reduce input channels 88 | # to bottleneck channels, as in a ResNet bottleneck module. 89 | # NOTE: Do not use group convolution for the first conv1x1 in Stage 2. 
90 | self.first_1x1_groups = self.groups if grouped_conv else 1 91 | 92 | self.g_conv_1x1_compress = self._make_grouped_conv1x1( 93 | self.in_channels, 94 | self.bottleneck_channels, 95 | self.first_1x1_groups, 96 | batch_norm=True, 97 | relu=True 98 | ) 99 | 100 | # 3x3 depthwise convolution followed by batch normalization 101 | self.depthwise_conv3x3 = conv3x3( 102 | self.bottleneck_channels, self.bottleneck_channels, 103 | stride=self.depthwise_stride, groups=self.bottleneck_channels) 104 | self.bn_after_depthwise = nn.BatchNorm2d(self.bottleneck_channels) 105 | 106 | # Use 1x1 grouped convolution to expand from 107 | # bottleneck_channels to out_channels 108 | self.g_conv_1x1_expand = self._make_grouped_conv1x1( 109 | self.bottleneck_channels, 110 | self.out_channels, 111 | self.groups, 112 | batch_norm=True, 113 | relu=False 114 | ) 115 | 116 | 117 | @staticmethod 118 | def _add(x, out): 119 | # residual connection 120 | return x + out 121 | 122 | 123 | @staticmethod 124 | def _concat(x, out): 125 | # concatenate along channel axis 126 | return torch.cat((x, out), 1) 127 | 128 | 129 | def _make_grouped_conv1x1(self, in_channels, out_channels, groups, 130 | batch_norm=True, relu=False): 131 | 132 | modules = OrderedDict() 133 | 134 | conv = conv1x1(in_channels, out_channels, groups=groups) 135 | modules['conv1x1'] = conv 136 | 137 | if batch_norm: 138 | modules['batch_norm'] = nn.BatchNorm2d(out_channels) 139 | if relu: 140 | modules['relu'] = nn.ReLU() 141 | if len(modules) > 1: 142 | return nn.Sequential(modules) 143 | else: 144 | return conv 145 | 146 | 147 | def forward(self, x): 148 | # save for combining later with output 149 | residual = x 150 | 151 | if self.combine == 'concat': 152 | residual = F.avg_pool2d(residual, kernel_size=3, 153 | stride=2, padding=1) 154 | 155 | out = self.g_conv_1x1_compress(x) 156 | out = channel_shuffle(out, self.groups) 157 | out = self.depthwise_conv3x3(out) 158 | out = self.bn_after_depthwise(out) 159 | out = 
self.g_conv_1x1_expand(out) 160 | 161 | out = self._combine_func(residual, out) 162 | return F.relu(out) 163 | 164 | 165 | class ShuffleNet(nn.Module): 166 | """ShuffleNet implementation. 167 | """ 168 | 169 | def __init__(self, groups=3, in_channels=3, num_classes=1000): 170 | """ShuffleNet constructor. 171 | 172 | Arguments: 173 | groups (int, optional): number of groups to be used in grouped 174 | 1x1 convolutions in each ShuffleUnit. Default is 3 for best 175 | performance according to original paper. 176 | in_channels (int, optional): number of channels in the input tensor. 177 | Default is 3 for RGB image inputs. 178 | num_classes (int, optional): number of classes to predict. Default 179 | is 1000 for ImageNet. 180 | 181 | """ 182 | super(ShuffleNet, self).__init__() 183 | 184 | self.groups = groups 185 | self.stage_repeats = [3, 7, 3] 186 | self.in_channels = in_channels 187 | self.num_classes = num_classes 188 | 189 | # index 0 is invalid and should never be called. 190 | # only used for indexing convenience. 
191 | if groups == 1: 192 | self.stage_out_channels = [-1, 24, 144, 288, 567] 193 | elif groups == 2: 194 | self.stage_out_channels = [-1, 24, 200, 400, 800] 195 | elif groups == 3: 196 | self.stage_out_channels = [-1, 24, 240, 480, 960] 197 | elif groups == 4: 198 | self.stage_out_channels = [-1, 24, 272, 544, 1088] 199 | elif groups == 8: 200 | self.stage_out_channels = [-1, 24, 384, 768, 1536] 201 | else: 202 | raise ValueError( 203 | """{} groups is not supported for 204 | 1x1 Grouped Convolutions""".format(num_groups)) 205 | 206 | # Stage 1 always has 24 output channels 207 | self.conv1 = conv3x3(self.in_channels, 208 | self.stage_out_channels[1], # stage 1 209 | stride=2) 210 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 211 | 212 | # Stage 2 213 | self.stage2 = self._make_stage(2) 214 | # Stage 3 215 | self.stage3 = self._make_stage(3) 216 | # Stage 4 217 | self.stage4 = self._make_stage(4) 218 | 219 | # Global pooling: 220 | # Undefined as PyTorch's functional API can be used for on-the-fly 221 | # shape inference if input size is not ImageNet's 224x224 222 | 223 | # Fully-connected classification layer 224 | num_inputs = self.stage_out_channels[-1] 225 | self.fc = nn.Linear(num_inputs, self.num_classes) 226 | 227 | 228 | def _make_stage(self, stage): 229 | modules = OrderedDict() 230 | stage_name = "ShuffleUnit_Stage{}".format(stage) 231 | 232 | # First ShuffleUnit in the stage 233 | # 1. non-grouped 1x1 convolution (i.e. pointwise convolution) 234 | # is used in Stage 2. Group convolutions used everywhere else. 235 | grouped_conv = stage > 2 236 | 237 | # 2. concatenation unit is always used. 
238 | first_module = ShuffleUnit( 239 | self.stage_out_channels[stage-1], 240 | self.stage_out_channels[stage], 241 | groups=self.groups, 242 | grouped_conv=grouped_conv, 243 | combine='concat' 244 | ) 245 | modules[stage_name+"_0"] = first_module 246 | 247 | # add more ShuffleUnits depending on pre-defined number of repeats 248 | for i in range(self.stage_repeats[stage-2]): 249 | name = stage_name + "_{}".format(i+1) 250 | module = ShuffleUnit( 251 | self.stage_out_channels[stage], 252 | self.stage_out_channels[stage], 253 | groups=self.groups, 254 | grouped_conv=True, 255 | combine='add' 256 | ) 257 | modules[name] = module 258 | 259 | return nn.Sequential(modules) 260 | 261 | 262 | def forward(self, x): 263 | x = self.conv1(x) 264 | x = self.maxpool(x) 265 | 266 | x = self.stage2(x) 267 | x = self.stage3(x) 268 | x = self.stage4(x) 269 | 270 | # global average pooling layer 271 | x = F.avg_pool2d(x, x.data.size()[-2:]) 272 | 273 | # flatten for input to fully-connected layer 274 | x = x.view(x.size(0), -1) 275 | x = self.fc(x) 276 | 277 | return F.log_softmax(x, dim=1) 278 | 279 | 280 | if __name__ == "__main__": 281 | """Testing 282 | """ 283 | model = ShuffleNet() 284 | -------------------------------------------------------------------------------- /ShuffleNet_1g8_Top1_67.408_Top5_87.258.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericsun99/ShuffleNet-1g8-Pytorch/4b82cd8aacb64ed58cafe6d9cb96956a4215016c/ShuffleNet_1g8_Top1_67.408_Top5_87.258.pth.tar -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.parallel 9 | import torch.backends.cudnn as cudnn 10 | import torch.distributed as dist 11 | import torch.optim 12 | import 
torch.utils.data 13 | import torch.utils.data.distributed 14 | import torchvision.transforms as transforms 15 | import torchvision.datasets as datasets 16 | import torchvision.models as models 17 | from ShuffleNet import ShuffleNet 18 | 19 | model_names = sorted(name for name in models.__dict__ 20 | if name.islower() and not name.startswith("__") 21 | and callable(models.__dict__[name])) 22 | 23 | model_names.append('shufflenet') 24 | 25 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 26 | parser.add_argument('data', metavar='DIR', 27 | help='path to dataset') 28 | parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18', 29 | choices=model_names, 30 | help='model architecture: ' + 31 | ' | '.join(model_names) + 32 | ' (default: resnet18)') 33 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 34 | help='number of data loading workers (default: 4)') 35 | parser.add_argument('--epochs', default=90, type=int, metavar='N', 36 | help='number of total epochs to run') 37 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 38 | help='manual epoch number (useful on restarts)') 39 | parser.add_argument('-b', '--batch-size', default=256, type=int, 40 | metavar='N', help='mini-batch size (default: 256)') 41 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 42 | metavar='LR', help='initial learning rate') 43 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 44 | help='momentum') 45 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, 46 | metavar='W', help='weight decay (default: 1e-4)') 47 | parser.add_argument('--print-freq', '-p', default=10, type=int, 48 | metavar='N', help='print frequency (default: 10)') 49 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 50 | help='path to latest checkpoint (default: none)') 51 | parser.add_argument('--evaluate', default='', type=str, metavar='PATH', 52 | 
help='path to evaluate model (default: none)') 53 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 54 | help='use pre-trained model') 55 | parser.add_argument('--world-size', default=1, type=int, 56 | help='number of distributed processes') 57 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 58 | help='url used to set up distributed training') 59 | parser.add_argument('--dist-backend', default='gloo', type=str, 60 | help='distributed backend') 61 | 62 | 63 | def main(): 64 | global args 65 | args = parser.parse_args() 66 | 67 | args.distributed = args.world_size > 1 68 | 69 | if args.distributed: 70 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 71 | world_size=args.world_size) 72 | 73 | # create model 74 | if args.pretrained: 75 | print("=> using pre-trained model '{}'".format(args.arch)) 76 | model = models.__dict__[args.arch](pretrained=True) 77 | else: 78 | print("=> creating model '{}'".format(args.arch)) 79 | if args.arch.startswith('mobilenetv1'): 80 | model = MobileNetV1() 81 | elif args.arch.startswith('mobilenetv2'): 82 | model = MobileNetV2() 83 | elif args.arch.startswith('shufflenet'): 84 | model = ShuffleNet(groups=8) 85 | else: 86 | model = models.__dict__[args.arch]() 87 | 88 | print(model) 89 | 90 | if not args.distributed: 91 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 92 | model.features = torch.nn.DataParallel(model.features) 93 | model.cuda() 94 | else: 95 | model = torch.nn.DataParallel(model).cuda() 96 | else: 97 | model.cuda() 98 | model = torch.nn.parallel.DistributedDataParallel(model) 99 | 100 | # define loss function (criterion) and optimizer 101 | criterion = nn.CrossEntropyLoss().cuda() 102 | 103 | if args.evaluate: 104 | if os.path.isfile(args.evaluate): 105 | print("=> loading model '{}'".format(args.evaluate)) 106 | model.load_state_dict(torch.load(args.evaluate)) 107 | else: 108 | print("=> no model found at 
'{}'".format(args.evaluate)) 109 | 110 | cudnn.benchmark = True 111 | 112 | # Data loading code 113 | valdir = os.path.join(args.data, 'val') 114 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 115 | std=[0.229, 0.224, 0.225]) 116 | 117 | val_loader = torch.utils.data.DataLoader( 118 | datasets.ImageFolder(valdir, transforms.Compose([ 119 | transforms.Resize(256), 120 | transforms.CenterCrop(224), 121 | transforms.ToTensor(), 122 | normalize, 123 | ])), 124 | batch_size=args.batch_size, shuffle=False, 125 | num_workers=args.workers, pin_memory=True) 126 | 127 | if args.evaluate: 128 | validate(val_loader, model, criterion) 129 | return 130 | 131 | 132 | def validate(val_loader, model, criterion): 133 | batch_time = AverageMeter() 134 | losses = AverageMeter() 135 | top1 = AverageMeter() 136 | top5 = AverageMeter() 137 | 138 | # switch to evaluate mode 139 | model.eval() 140 | 141 | end = time.time() 142 | for i, (input, target) in enumerate(val_loader): 143 | target = target.cuda(async=True) 144 | input_var = torch.autograd.Variable(input, volatile=True) 145 | target_var = torch.autograd.Variable(target, volatile=True) 146 | 147 | # compute output 148 | output = model(input_var) 149 | loss = criterion(output, target_var) 150 | 151 | # measure accuracy and record loss 152 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 153 | losses.update(loss.data[0], input.size(0)) 154 | top1.update(prec1[0], input.size(0)) 155 | top5.update(prec5[0], input.size(0)) 156 | 157 | # measure elapsed time 158 | batch_time.update(time.time() - end) 159 | end = time.time() 160 | 161 | if i % args.print_freq == 0: 162 | print('Test: [{0}/{1}]\t' 163 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 164 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 165 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 166 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 167 | i, len(val_loader), batch_time=batch_time, loss=losses, 168 | top1=top1, top5=top5)) 169 | 170 | print(' * 
Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}' 171 | .format(top1=top1, top5=top5)) 172 | 173 | return top1.avg 174 | 175 | 176 | class AverageMeter(object): 177 | """Computes and stores the average and current value""" 178 | def __init__(self): 179 | self.reset() 180 | 181 | def reset(self): 182 | self.val = 0 183 | self.avg = 0 184 | self.sum = 0 185 | self.count = 0 186 | 187 | def update(self, val, n=1): 188 | self.val = val 189 | self.sum += val * n 190 | self.count += n 191 | self.avg = self.sum / self.count 192 | 193 | 194 | def accuracy(output, target, topk=(1,)): 195 | """Computes the precision@k for the specified values of k""" 196 | maxk = max(topk) 197 | batch_size = target.size(0) 198 | 199 | _, pred = output.topk(maxk, 1, True, True) 200 | pred = pred.t() 201 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 202 | 203 | res = [] 204 | for k in topk: 205 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 206 | res.append(correct_k.mul_(100.0 / batch_size)) 207 | return res 208 | 209 | if __name__ == '__main__': 210 | main() 211 | --------------------------------------------------------------------------------