├── .gitignore
├── LICENSE.md
├── README.md
├── download-cifar.sh
├── residual_layers.lua
├── residual_model.lua
├── residual_trainer.lua
└── run.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | snapshots
 2 | mnist.hdf5
 3 | *.swp
 4 | cache
 5 | *.tar.gz
 6 | cifar-10-batches-py/
 7 | 
 8 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016 Michael Wilber
 2 | 
 3 | This software is provided 'as-is', without any express or implied
 4 | warranty. In no event will the authors be held liable for any damages
 5 | arising from the use of this software.
 6 | 
 7 | Permission is granted to anyone to use this software for any purpose,
 8 | including commercial applications, and to alter it and redistribute it
 9 | freely, subject to the following restrictions:
10 | 
11 | 1. The origin of this software must not be misrepresented; you must not
12 |    claim that you wrote the original software. If you use this software
13 |    in a product, an acknowledgement in the product documentation would be
14 |    appreciated but is not required.
15 | 2. Altered source versions must be plainly marked as such, and must not be
16 |    misrepresented as being the original software.
17 | 3. This notice may not be removed or altered from any source distribution.
18 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Deep Residual Learning for Image Recognition
 2 | 
 3 | This is a pytorch implementation of ["Deep Residual Learning for Image Recognition", Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun](http://arxiv.org/abs/1512.03385), by the winners of the 2015 ILSVRC and COCO challenges.
 4 | 
 5 | It's forked from Michael Wilber's [torch-residual-networks](https://github.com/gcr/torch-residual-networks).
 6 | The data loading and preprocessing have been moved from
 7 | the lua side to the python side, so you can easily modify them
 8 | using the python tools and libraries you're used to.
 9 | 
10 | For the full readme of the original torch-residual-networks library,
11 | please see https://github.com/gcr/torch-residual-networks/network
12 | 
13 | ## How to use
14 | 
15 | - You need at least CUDA 7.0 and CuDNN v4
16 | - Install Torch:
17 | ```
18 | git clone https://github.com/torch/distro.git ~/torch --recursive
19 | cd ~/torch
20 | # install dependencies. To install everything:
21 | bash install-deps
22 | # Or, if you're on ubuntu, you only need the following dependencies:
23 | sudo apt-get update -y
24 | sudo apt-get install -y wget git gcc g++ cmake libffi-dev \
25 |     libblas-dev liblapack-dev libatlas-base-dev gfortran libreadline-dev
26 | # install torch
27 | ./install.sh
28 | ```
29 | - Install the cudnn and nninit torch packages:
30 | ```
31 | luarocks install cudnn
32 | luarocks install nninit
33 | ```
34 | - Set up python (tested on 2.7 for now; 3.4 will follow):
35 | ```
36 | sudo apt-get install python2.7-dev
37 | virtualenv -p python2.7 ~/env27
38 | source ~/env27/bin/activate
39 | pip install docopt
40 | pip install numpy
41 | ```
42 | - Install pytorch:
43 | ```
44 | git clone https://github.com/hughperkins/pytorch ~/pytorch
45 | cd ~/pytorch
46 | source ~/torch/install/bin/torch-activate
47 | ./build.sh
48 | ```
49 | - Clone this repo:
50 | ```
51 | git clone https://github.com/hughperkins/pytorch-residual-networks ~/pytorch-residual-networks
52 | cd ~/pytorch-residual-networks
53 | ```
54 | - Download the cifar dataset: `./download-cifar.sh`
55 | - Run `python run.py` (see the example invocations below)
56 | 
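`run.py` accepts a few options; they are declared in the docopt usage string at the top of
`run.py`, so that string is the authoritative list. Some example invocations:

```
# defaults: batch size 128, 3 residual blocks per layer group
python run.py

# a deeper network, with a smaller batch size
python run.py --numlayergroups 5 --batchsize 64

# quick smoke test: DEVMODE=1 loads only one of the five cifar training batch
# files, and runs just 3 train/test batches per epoch
DEVMODE=1 python run.py
```

`--numlayergroups N` sets the number of residual blocks per group, giving a network with
6N+2 weighted layers; the default of 3 reproduces the 20-layer cifar architecture from the paper.
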
57 | ## Possible issues, and how to deal with them
58 | 
59 | - An error along the lines of `no file './cunn.lua'`:
60 |   - reinstall cunn: `luarocks install cunn`
61 | 
62 | ## Changes
63 | 
64 | 2016 April 12:
65 | - working now :-)
66 | 
67 | 2016 April 11:
68 | - first forked from https://github.com/gcr/torch-residual-networks/network
69 | 
70 | 
--------------------------------------------------------------------------------
/download-cifar.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | CIFARMD5=$(md5sum cifar-10-python.tar.gz 2>/dev/null | awk '{print $1}')  # the file may not exist yet on first run
 4 | if [[ $CIFARMD5 != c58f30108f718f92721af3b95e74349a ]]; then {
 5 |     wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -O cifar-10-python.tar.gz
 6 | } fi
 7 | 
 8 | if [[ ! -d cifar-10-batches-py ]]; then {
 9 |     tar -xf cifar-10-python.tar.gz
10 | } fi
11 | 
--------------------------------------------------------------------------------
/residual_layers.lua:
--------------------------------------------------------------------------------
 1 | --[[
 2 | Copyright (c) 2016 Michael Wilber, Hugh Perkins 2016
 3 | 
 4 | This software is provided 'as-is', without any express or implied
 5 | warranty. In no event will the authors be held liable for any damages
 6 | arising from the use of this software.
 7 | 
 8 | Permission is granted to anyone to use this software for any purpose,
 9 | including commercial applications, and to alter it and redistribute it
10 | freely, subject to the following restrictions:
11 | 
12 | 1. The origin of this software must not be misrepresented; you must not
13 |    claim that you wrote the original software. If you use this software
14 |    in a product, an acknowledgement in the product documentation would be
15 |    appreciated but is not required.
16 | 2. Altered source versions must be plainly marked as such, and must not be
17 |    misrepresented as being the original software.
18 | 3. This notice may not be removed or altered from any source distribution.
19 | 20 | History: 21 | - first created by Michael Wilber 2016 22 | - modified by Hugh Perkins 2016 to have own 'namespace' 23 | --]] 24 | 25 | require 'nn' 26 | require 'nngraph' 27 | require 'cudnn' 28 | require 'cunn' 29 | local nninit = require 'nninit' 30 | 31 | residual_layers = {} 32 | 33 | local L = residual_layers 34 | 35 | function L.addResidualLayer2(input, nChannels, nOutChannels, stride) 36 | --[[ 37 | 38 | Residual layers! Implements option (A) from Section 3.3. The input 39 | is passed through two 3x3 convolution filters. In parallel, if the 40 | number of input and output channels differ or if the stride is not 41 | 1, then the input is downsampled or zero-padded to have the correct 42 | size and number of channels. Finally, the two versions of the input 43 | are added together. 44 | 45 | Input 46 | | 47 | ,-------+-----. 48 | Downsampling 3x3 convolution+dimensionality reduction 49 | | | 50 | v v 51 | Zero-padding 3x3 convolution 52 | | | 53 | `-----( Add )---' 54 | | 55 | Output 56 | --]] 57 | nOutChannels = nOutChannels or nChannels 58 | stride = stride or 1 59 | -- Path 1: Convolution 60 | -- The first layer does the downsampling and the striding 61 | local net = cudnn.SpatialConvolution(nChannels, nOutChannels, 62 | 3,3, stride,stride, 1,1) 63 | :init('weight', nninit.kaiming, {gain = 'relu'}) 64 | :init('bias', nninit.constant, 0)(input) 65 | net = cudnn.SpatialBatchNormalization(nOutChannels) 66 | :init('weight', nninit.normal, 1.0, 0.002) 67 | :init('bias', nninit.constant, 0)(net) 68 | net = cudnn.ReLU(true)(net) 69 | net = cudnn.SpatialConvolution(nOutChannels, nOutChannels, 70 | 3,3, 1,1, 1,1) 71 | :init('weight', nninit.kaiming, {gain = 'relu'}) 72 | :init('bias', nninit.constant, 0)(net) 73 | -- Should we put Batch Normalization here? I think not, because 74 | -- BN would force the output to have unit variance, which breaks the residual 75 | -- property of the network. 76 | -- What about ReLU here? I think maybe not for the same reason. Figure 2 77 | -- implies that they don't use it here 78 | 79 | -- Path 2: Identity / skip connection 80 | local skip = input 81 | if stride > 1 then 82 | -- optional downsampling 83 | skip = nn.SpatialAveragePooling(1, 1, stride,stride)(skip) 84 | end 85 | if nOutChannels > nChannels then 86 | -- optional padding 87 | skip = nn.Padding(1, (nOutChannels - nChannels), 3)(skip) 88 | elseif nOutChannels < nChannels then 89 | -- optional narrow, ugh. 90 | skip = nn.Narrow(2, 1, nOutChannels)(skip) 91 | -- NOTE this BREAKS with non-batch inputs!! 92 | end 93 | 94 | -- Add them together 95 | net = cudnn.SpatialBatchNormalization(nOutChannels)(net) 96 | net = nn.CAddTable(){net, skip} 97 | net = cudnn.ReLU(true)(net) 98 | -- ^ don't put a ReLU here! 
see http://gitxiv.com/comments/7rffyqcPLirEEsmpX 99 | 100 | return net 101 | end 102 | 103 | return L 104 | 105 | -------------------------------------------------------------------------------- /residual_model.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'cudnn' 3 | require 'nngraph' 4 | require 'residual_layers' 5 | local nninit = require 'nninit' 6 | 7 | residual_model = {} 8 | 9 | local L = residual_model 10 | 11 | function L.create(N) 12 | local input = nn.Identity()() 13 | ------> 3, 32,32 14 | local model = cudnn.SpatialConvolution(3, 16, 3,3, 1,1, 1,1) 15 | :init('weight', nninit.kaiming, {gain = 'relu'}) 16 | :init('bias', nninit.constant, 0)(input) 17 | model = cudnn.SpatialBatchNormalization(16)(model) 18 | model = cudnn.ReLU(true)(model) 19 | ------> 16, 32,32 First Group 20 | for i=1,N do model = residual_layers.addResidualLayer2(model, 16) end 21 | ------> 32, 16,16 Second Group 22 | model = residual_layers.addResidualLayer2(model, 16, 32, 2) 23 | for i=1,N-1 do model = residual_layers.addResidualLayer2(model, 32) end 24 | ------> 64, 8,8 Third Group 25 | model = residual_layers.addResidualLayer2(model, 32, 64, 2) 26 | for i=1,N-1 do model = residual_layers.addResidualLayer2(model, 64) end 27 | ------> 10, 8,8 Pooling, Linear, Softmax 28 | model = nn.SpatialAveragePooling(8,8)(model) 29 | model = nn.Reshape(64)(model) 30 | model = nn.Linear(64, 10)(model) 31 | model = nn.LogSoftMax()(model) 32 | 33 | model = nn.gModule({input}, {model}) 34 | return model 35 | end 36 | 37 | return L 38 | 39 | -------------------------------------------------------------------------------- /residual_trainer.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | Copyright (c) 2016 Michael Wilber, Hugh Perkins 2016 3 | 4 | This software is provided 'as-is', without any express or implied 5 | warranty. In no event will the authors be held liable for any damages 6 | arising from the use of this software. 7 | 8 | Permission is granted to anyone to use this software for any purpose, 9 | including commercial applications, and to alter it and redistribute it 10 | freely, subject to the following restrictions: 11 | 12 | 1. The origin of this software must not be misrepresented; you must not 13 | claim that you wrote the original software. If you use this software 14 | in a product, an acknowledgement in the product documentation would be 15 | appreciated but is not required. 16 | 2. Altered source versions must be plainly marked as such, and must not be 17 | misrepresented as being the original software. 18 | 3. This notice may not be removed or altered from any source distribution. 
19 | 
20 | History:
21 | - originally written by Michael Wilber, to run directly from lua/torch
22 | - modified by Hugh Perkins, to run from python, via pytorch
23 |   (basically, ripped out all the data loading, preprocessing, and a bunch of the logic around setting
24 |   learning rate etc; moved it into python side)
25 | --]]
26 | 
27 | require 'os'
28 | require 'nn'
29 | require 'cutorch'
30 | require 'cunn'
31 | require 'cudnn'
32 | require 'optim'
33 | require 'residual_model'
34 | 
35 | local ResidualTrainer = torch.class('ResidualTrainer')
36 | 
37 | function ResidualTrainer.__init(self, num_layer_groups)
38 |     self.num_layer_groups = num_layer_groups
39 |     self.model = residual_model.create(num_layer_groups)
40 |     self.model:cuda()
41 | 
42 |     self.loss = nn.ClassNLLCriterion()
43 |     self.loss:cuda()
44 | 
45 |     self.sgdState = {
46 |         learningRate = "will be set later",
47 |         weightDecay = 1e-4,
48 |         momentum = 0.9,
49 |         dampening = 0,
50 |         nesterov = true
51 |     }
52 |     self.weights, self.gradients = self.model:getParameters()
53 | end
54 | 
55 | function ResidualTrainer.loadFrom(self, filepath)
56 |     print("Loading model from ".. filepath)
57 |     cutorch.setDevice(1)
58 |     self.model = torch.load(filepath)
59 |     print "Done"
60 | 
61 |     local sgdStatePath = string.gsub(filepath, "model", "sgdState")
62 |     print("Trying to load sgdState from "..sgdStatePath)
63 |     collectgarbage(); collectgarbage(); collectgarbage()
64 |     self.sgdState = torch.load(sgdStatePath)
65 |     collectgarbage(); collectgarbage(); collectgarbage()
66 |     print('loaded sgdState')
67 |     -- print("Got", self.sgdState.nSampledImages,"images")
68 | end
69 | 
70 | function ResidualTrainer.trainBatch(self, learningRate, batchInputs, batchLabels)
71 |     self.sgdState.learningRate = learningRate
72 | 
73 |     -- copy data to gpu
74 |     local inputscu = batchInputs:cuda()
75 |     local labelscu = batchLabels:cuda()
76 | 
77 |     collectgarbage(); collectgarbage();
78 |     self.model:training()
79 |     self.gradients:zero()
80 |     local y = self.model:forward(inputscu)
81 |     local loss_val = self.loss:forward(y, labelscu)
82 |     local df_dw = self.loss:backward(y, labelscu)
83 |     self.model:backward(inputscu, df_dw)
84 | 
85 |     optim.sgd(function()
86 |             return loss_val, self.gradients
87 |         end,
88 |         self.weights,
89 |         self.sgdState)
90 |     return loss_val
91 | end
92 | 
93 | function ResidualTrainer.predict(self, batchInputs)
94 |     self.model:evaluate()
95 |     local batchSize = batchInputs:size(1)
96 |     collectgarbage(); collectgarbage();
97 |     local y = self.model:forward(batchInputs:cuda()):float()
98 |     local _, indices = torch.sort(y, 2, true)
99 |     -- indices has shape (batchSize, nClasses); the values are 1-based class indices
100 |     local top1 = indices:select(2, 1):byte()
101 |     local top5 = indices:narrow(2, 1,5):byte()
102 |     return {top1=top1, top5=top5} -- becomes a python dict, containing the tensors
103 | 
104 | 
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Trains a residual network on cifar
 3 | 
 4 | Usage:
 5 |   run.py [options]
 6 | 
 7 | Options:
 8 |   --batchsize BATCHSIZE            batchsize [default: 128]
 9 |   --loadfrom LOADFROM              load from this model file [default: None]
10 |   --numlayergroups NUMLAYERGROUPS  number of layer groups [default: 3]
11 | """
12 | 
13 | from __future__ import print_function, division
14 | import platform
15 | import sys
16 | import os
17 | import random
18 | import time
19 | import readline
20 | from os import path
21 | from os.path import join
22 | from docopt import docopt
23 | import numpy as np
24 | import PyTorchHelpers
25 | pyversion = int(platform.python_version_tuple()[0])
26 | if pyversion == 2:
27 |     import cPickle
28 | else:
29 |     import pickle
30 | 
31 | 
32 | args = docopt(__doc__)
33 | batchSize = int(args['--batchsize'])
34 | loadFrom = args['--loadfrom']
35 | if loadFrom == 'None':
36 |     loadFrom = None
37 | num_layer_groups = int(args['--numlayergroups'])
38 | 
39 | data_dir = 'cifar-10-batches-py'
40 | num_datafiles = 5
41 | devMode = False
42 | if 'DEVMODE' in os.environ and os.environ['DEVMODE'] == '1':
43 |     devMode = True
44 |     num_datafiles = 1  # cos I lack patience during dev :-P
45 | 
46 | inputPlanes = 3
47 | inputWidth = 32
48 | inputHeight = 32
49 | 
50 | def loadPickle(path):
51 |     with open(path, 'rb') as f:
52 |         if pyversion == 2:
53 |             return cPickle.load(f)
54 |         else:
55 |             return {k.decode('utf-8'): v for k, v in pickle.load(f, encoding='bytes').items()}
56 | 
57 | def epochToLearningRate(epoch):
58 |     # From https://github.com/bgshih/cifar.torch/blob/master/train.lua#L119-L128
59 |     if epoch < 80:
60 |         return 0.1
61 |     if epoch < 120:
62 |         return 0.01
63 |     return 0.001
64 | 
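# For reference, if you want to modify the data loading below: each of data_batch_1..data_batch_5
# and test_batch (the "python version" of cifar-10) unpickles to a dict whose relevant entries are:
#   'data'   - uint8 array of shape (10000, 3072); each row is one image, stored as
#              1024 red, then 1024 green, then 1024 blue values, each row-major,
#              which is why the reshape to (N, 3, 32, 32) below works
#   'labels' - list of 10000 ints in the range 0-9
# (the dicts also contain 'batch_label' and 'filenames' entries, which we don't use)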
 65 | def loadData(data_dir, num_datafiles):
 66 |     # load training data
 67 |     trainData = None
 68 |     trainLabels = None
 69 |     NTrain = None
 70 |     for i in range(num_datafiles):
 71 |         d = loadPickle(join(data_dir, 'data_batch_%s' % (i+1)))
 72 |         dataLength = d['data'].shape[0]
 73 |         NTrain = num_datafiles * dataLength
 74 |         if trainData is None:
 75 |             trainData = np.zeros((NTrain, inputPlanes, inputWidth, inputHeight), np.float32)
 76 |             trainLabels = np.zeros(NTrain, np.uint8)
 77 |         data = d['data'].reshape(dataLength, inputPlanes, inputWidth, inputHeight)
 78 |         trainData[i * dataLength:(i+1) * dataLength] = data
 79 |         trainLabels[i * dataLength:(i+1) * dataLength] = np.array(d['labels'], dtype=np.uint8) + 1  # +1: lua's ClassNLLCriterion (and torch.sort in predict) use 1-based class indices
 80 | 
 81 |     # load test data
 82 |     d = loadPickle(join(data_dir, 'test_batch'))
 83 |     NTest = d['data'].shape[0]
 84 |     testData = np.zeros((NTest, inputPlanes, inputWidth, inputHeight), np.float32)
 85 |     testLabels = np.zeros(NTest, np.uint8)
 86 |     data = d['data'].reshape(NTest, inputPlanes, inputWidth, inputHeight)
 87 |     testData[:] = data
 88 |     testLabels[:] = np.array(d['labels'], dtype=np.uint8) + 1  # 1-based, as above
 89 | 
 90 |     return NTrain, trainData, trainLabels, NTest, testData, testLabels
 91 | 
 92 | 
 93 | # load the lua class
 94 | ResidualTrainer = PyTorchHelpers.load_lua_class('residual_trainer.lua', 'ResidualTrainer')
 95 | residualTrainer = ResidualTrainer(num_layer_groups)
 96 | if loadFrom is not None:
 97 |     residualTrainer.loadFrom(loadFrom)
 98 | print('residualTrainer', residualTrainer)
 99 | 
100 | NTrain, trainData, trainLabels, NTest, testData, testLabels = loadData(data_dir, num_datafiles)
101 | 
102 | print('data loaded :-)')
103 | 
104 | # the mean and std are computed over all data together, not per plane or per pixel location
105 | mean = trainData.mean()
106 | std = trainData.std()
107 | 
108 | trainData -= mean
109 | trainData /= std
110 | 
111 | testData -= mean
112 | testData /= std
113 | 
114 | print('data normalized; check new mean/std:')
115 | print(' trainmean=%s trainstd=%s testmean=%s teststd=%s' %
116 |       (trainData.mean(), trainData.std(), testData.mean(), testData.std()))
117 | 
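# An aside on the normalisation above: it uses a single global mean/std over all images,
# channels and pixel positions. If you would rather normalise each colour channel separately,
# as many cifar implementations do, one possible sketch:
#   mean = trainData.mean(axis=(0, 2, 3), keepdims=True)   # shape (1, 3, 1, 1)
#   std = trainData.std(axis=(0, 2, 3), keepdims=True)
#   trainData = (trainData - mean) / std
#   testData = (testData - mean) / std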
118 | # now we just have to call the lua class I think :-)
119 | 
120 | batchesPerEpoch = NTrain // batchSize
121 | if devMode:
122 |     batchesPerEpoch = 3  # impatient developer :-P
123 | epoch = 0
124 | while True:
125 |     # print('epoch', epoch)
126 |     learningRate = epochToLearningRate(epoch)
127 |     epochLoss = 0
128 |     # batchInputs
129 |     last = time.time()
130 |     for b in range(batchesPerEpoch):
131 |         # we have to populate batchInputs and batchLabels :-(
132 |         # seems there is a bunch of preprocessing to do :-P
133 |         # https://github.com/gcr/torch-residual-networks/blob/bc1bafff731091bb382bece58d8252291bfbf206/data/cifar-dataset.lua#L56-L75
134 | 
135 |         # so we have to do:
136 |         # - randomly sample batchSize inputs, with replacement (both between batches, and within batches); see the note after this loop
137 |         # - random translate by up to 4 horiz (+ve/-ve) and vert (+ve/-ve) (in the paper, this is described as
138 |         #   adding 4-padding, then cutting a 32x32 patch)
139 |         # - randomly flip horizontally
140 | 
141 |         # draw samples
142 |         indexes = np.random.randint(NTrain, size=(batchSize))
143 | 
144 |         batchInputs = np.zeros((batchSize, inputPlanes, inputWidth, inputHeight), dtype=np.float32)
145 |         batchLabels = trainLabels[indexes]
146 | 
147 |         # translate (translate directly into batch images; uncovered pixels stay zero)
148 |         for i in range(batchSize):
149 |             srcIdx = indexes[i]
150 |             xoffs, yoffs = random.randint(-4, 4), random.randint(-4, 4)
151 |             batch_y = [max(0, yoffs), min(32, 32 + yoffs)]   # destination rows
152 |             src_y = [max(0, -yoffs), min(32, 32 - yoffs)]    # source rows (same length)
153 |             batch_x = [max(0, xoffs), min(32, 32 + xoffs)]
154 |             src_x = [max(0, -xoffs), min(32, 32 - xoffs)]
155 |             # note: the ranges are 0-based and half-open; destination and source slices always match in size
156 | 
157 |             batchInputs[i][:, batch_y[0]:batch_y[1], batch_x[0]:batch_x[1]] = \
158 |                 trainData[srcIdx][:, src_y[0]:src_y[1], src_x[0]:src_x[1]]
159 | 
160 |         # flip
161 |         for i in range(batchSize):
162 |             if random.randint(0, 1) == 1:
163 |                 batchInputs[i] = np.fliplr(batchInputs[i].transpose(1, 2, 0)).transpose(2, 0, 1)
164 | 
165 |         if devMode:
166 |             now = time.time()
167 |             duration = now - last
168 |             print('preprocess time', duration)
169 |             last = now
170 | 
171 |         loss = residualTrainer.trainBatch(learningRate, batchInputs, batchLabels)
172 |         print(' epoch %s batch %s/%s loss %s' % (epoch, b, batchesPerEpoch, loss))
173 |         epochLoss += loss
174 | 
175 |         if devMode:
176 |             now = time.time()
177 |             duration = now - last
178 |             print('batch time', duration)
179 |             last = now
180 | 
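    # Note on sampling: the loop above draws `indexes` with replacement, as the comment at the
    # top of the loop says, so an "epoch" here is just batchesPerEpoch random batches rather than
    # one pass over the whole training set. If you would rather visit each image exactly once per
    # epoch, a sketch (not what the code above does) would be to draw a permutation up front:
    #   perm = np.random.permutation(NTrain)
    # and then, inside the batch loop:
    #   indexes = perm[b * batchSize:(b + 1) * batchSize]
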
181 |     # evaluate on test data
182 |     numTestBatches = NTest // batchSize
183 |     if devMode:
184 |         numTestBatches = 3  # impatient developer :-P
185 |     testNumTop1Right = 0
186 |     testNumTop5Right = 0
187 |     testNumTotal = numTestBatches * batchSize
188 |     for b in range(numTestBatches):
189 |         batchInputs = testData[b * batchSize:(b+1) * batchSize]
190 |         batchLabels = testLabels[b * batchSize:(b+1) * batchSize]
191 |         res = residualTrainer.predict(batchInputs)
192 |         top1 = res['top1'].asNumpyTensor()
193 |         top5 = res['top5'].asNumpyTensor()
194 |         labelsTiled5 = np.tile(batchLabels.reshape(batchSize, 1), (1, 5))
195 |         top1_correct = (top1 == batchLabels).sum()  # top1/top5 and the labels are both 1-based class indices
196 |         top5_correct = (top5 == labelsTiled5).sum()
197 |         testNumTop1Right += top1_correct
198 |         testNumTop5Right += top5_correct
199 |         # print('correct top1=%s top5=%s', top1_correct, top5_correct)
200 | 
201 |     testtop1acc = testNumTop1Right / testNumTotal * 100
202 |     testtop5acc = testNumTop5Right / testNumTotal * 100
203 |     print('epoch %s trainloss=%s top1acc=%s top5acc=%s' %
204 |           (epoch, epochLoss, testtop1acc, testtop5acc))
205 |     epoch += 1
206 | 
207 | 
--------------------------------------------------------------------------------