├── README.md ├── caffe ├── README.md ├── resnet_caffe_gen.py └── resnet_cifar10_solver.prototxt └── mxnet ├── README.md ├── resnet_cifar10_main.py ├── resnet_cifar10_net.py ├── resnet_cifar10_train.py ├── resnet_ilsvrc12_main.py ├── resnet_ilsvrc12_net.py ├── run_cifar10.sh └── run_ilsvrc12.sh /README.md: -------------------------------------------------------------------------------- 1 | # ResNet examples 2 | This repository contains several implementations of ResNet, proposed by Microsoft Research Asia. 3 | 4 | ## Caffe 5 | Please see the [caffe](caffe) folder for details. 6 | ## Mxnet (recommended) 7 | An **updated mxnet** implementation for both the cifar10 and ilsvrc12 datasets, which is much faster than the caffe version. See the [mxnet](/mxnet) folder for details. 8 | 9 | ##### Fun facts 10 | To the best of my knowledge, the official version of the code might not be released because of the team's internal protocols. Even if it is eventually released, it will take them a significant amount of time to go through the procedures. 11 | 12 | It is said that the ResNet team used a very old version of Caffe which they forked quite a long while ago; given their huge code-base changes, their tool might be a completely different beast that merely bears the name Caffe. 13 | One anecdote about the team is that their supervisor was not willing to provide more funding for additional GPU cards, so they finally ended up buying high-end gaming cards instead. Also, they burned out several cards at the early stage of their project due to negligence. Most probably they don't even use server solutions, but run on several office computers. 14 | -------------------------------------------------------------------------------- /caffe/README.md: -------------------------------------------------------------------------------- 1 | # Caffe ResNet `.prototxt` file generator 2 | Tested using Python 2.7 3 | 4 | #### How to 5 | 1. According to your Linux distribution, install the Protocol Buffers package that includes `protoc`, whose 6 | version number should be `2.x`. The `3.x` release is not tested. 7 | 2. Install the Protocol Buffers Python package (if not shipped with `protoc`) with `pip install protobuf`. 8 | 3. Generate the `caffe_pb2.py` file with `protoc --python_out=./ --proto_path=../../caffe/src/caffe/proto/ ../../caffe/src/caffe/proto/caffe.proto`, check that the file was generated successfully, and modify each file path accordingly. 9 | 4. Run the generator with `python resnet_caffe_gen.py`; several network definition files should be generated. 10 | 5. Check the provided `resnet_cifar10_solver.prototxt` in this directory. Modify the network definition file path in this file if necessary, and `mkdir snapshot` accordingly. 11 | 6. Write a script to call Caffe, for example: 12 | ``` 13 | /path/to/caffe/binary train --solver=resnet_cifar10_solver.prototxt 14 | ``` 15 | 16 | #### Current result 17 | `20 layer cifar10`: 86%. 18 | Almost 10 hours of training on one K40-equivalent GPU with ECC off (a single GPU core of one of several K80 cards). 19 | It is even slower when trained with multiple GPUs using the official Caffe repo. 20 | The NVIDIA fork of Caffe does scale to multiple GPUs in a reasonable way; unfortunately, that fork lags far behind upstream and thus doesn't provide a Batch Normalization layer. One is still encouraged to try it out. 21 | 22 | 23 | `54 layer cifar10`: 89%. 24 | The same hardware, for nearly 16 hours. 25 | 26 | #### Known Issues 27 | 1. `TODO:` generate networks for `ilsvrc 12`; currently the generator only produces files for the `cifar10` dataset, according to the descriptions provided in the paper. 28 | 2.
This network **hasn't** yet achieved the accuracy stipulated in the paper, although several techniques from it were used: 29 | + Padded training images with `4 pixels` on each side. 30 | + Used the Batch Normalization layer. 31 | + The standard deviation of the initialization Gaussian distribution is set to `sqrt(2/(n*n*c))`, where `n` is the kernel size and `c` is the number of input channels. 32 | + **If the learning rate is set to `0.1` as the paper states, the network will overfit.** The only successfully trained network used a learning rate of `0.01`; even `0.05` would not work. 33 | 3. There are 3 versions of the shortcut connection according to the paper; only option `B` is provided here, since option `A` is very ambiguous and no one in my group figured out how to perform the zero padding appropriately. 34 | + `A:` zero padding 35 | + `B:` projection only when the feature map size changes 36 | + `C:` always project, no matter the input/output feature map sizes 37 | -------------------------------------------------------------------------------- /caffe/resnet_caffe_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import caffe_pb2 4 | import math 5 | import google.protobuf as pb 6 | from copy import deepcopy 7 | 8 | data_train_str = ''' 9 | name: "data" 10 | type: "Data" 11 | top: "data" 12 | top: "label" 13 | include { 14 | phase: TRAIN 15 | } 16 | transform_param { 17 | mirror: true 18 | crop_size: 32 19 | mean_file: "all_pad.mean.binaryproto" 20 | } 21 | data_param { 22 | source: "train.lmdb" 23 | batch_size: 256 24 | backend: LMDB 25 | } 26 | ''' 27 | 28 | data_test_str = ''' 29 | name: "data" 30 | type: "Data" 31 | top: "data" 32 | top: "label" 33 | include { 34 | phase: TEST 35 | } 36 | transform_param { 37 | mirror: false 38 | crop_size: 32 39 | mean_file: "all_no_pad.mean.binaryproto" 40 | } 41 | data_param { 42 | source: "test.lmdb" 43 | batch_size: 50 44 | backend: LMDB 45 | } 46 | ''' 47 | 48 | conv_str = ''' 49 | name: "conv_" 50 | type: "Convolution" 51 | bottom: "norm_" 52 | top: "conv_" 53 | param { 54 | lr_mult: 1 55 | decay_mult: 1 56 | } 57 | param { 58 | lr_mult: 1 59 | decay_mult: 1 60 | } 61 | convolution_param { 62 | num_output: 16 63 | kernel_size: 3 64 | pad: 1 65 | stride: 1 66 | weight_filler { 67 | type: "gaussian" 68 | std: 0.01 69 | } 70 | bias_filler { 71 | type: "constant" 72 | value: 0 73 | } 74 | } 75 | ''' 76 | 77 | relu_str = ''' 78 | name: "relu_" 79 | type: "ReLU" 80 | bottom: "conv_" 81 | top: "relu_" 82 | ''' 83 | 84 | norm_str = ''' 85 | name: "norm_" 86 | type: "BatchNorm" 87 | bottom: "relu_" 88 | top: "norm_" 89 | batch_norm_param { 90 | } 91 | ''' 92 | #lrn_param 93 | #batch_norm_param 94 | # 95 | ''' 96 | message BatchNormParameter { 97 | // If false, accumulate global mean/variance values via a moving average. If 98 | // true, use those accumulated values instead of computing mean/variance 99 | // across the batch. 100 | optional bool use_global_stats = 1; 101 | // How much does the moving average decay each iteration? 102 | optional float moving_average_fraction = 2 [default = .999]; 103 | // Small value to add to the variance estimate so that we don't divide by 104 | // zero.
105 | optional float eps = 3 [default = 1e-5]; 106 | } 107 | ''' 108 | 109 | 110 | pool_str = ''' 111 | name: "pool_" 112 | type: "Pooling" 113 | bottom: "relu_" 114 | top: "pool_" 115 | pooling_param { 116 | pool: AVE 117 | kernel_size: 2 118 | stride: 2 119 | } 120 | ''' 121 | 122 | fc_str = ''' 123 | name: "fc_" 124 | type: "InnerProduct" 125 | bottom: "pool_" 126 | top: "fc_" 127 | param { 128 | lr_mult: 1 129 | decay_mult: 1 130 | } 131 | param { 132 | lr_mult: 1 133 | decay_mult: 1 134 | } 135 | inner_product_param { 136 | num_output: 10 137 | weight_filler { 138 | type: "gaussian" 139 | std: 0.01 140 | } 141 | bias_filler { 142 | type: "constant" 143 | value: 0 144 | } 145 | } 146 | ''' 147 | 148 | elem_str = ''' 149 | name: "shortcut_" 150 | type: "Eltwise" 151 | bottom: "relu_" 152 | bottom: "relu_" 153 | top: "elem_" 154 | eltwise_param { operation: SUM } 155 | ''' 156 | 157 | _conv = caffe_pb2.LayerParameter() 158 | pb.text_format.Merge(conv_str, _conv) 159 | _norm = caffe_pb2.LayerParameter() 160 | pb.text_format.Merge(norm_str, _norm) 161 | _relu = caffe_pb2.LayerParameter() 162 | pb.text_format.Merge(relu_str, _relu) 163 | _elem = caffe_pb2.LayerParameter() 164 | pb.text_format.Merge(elem_str, _elem) 165 | _pool = caffe_pb2.LayerParameter() 166 | pb.text_format.Merge(pool_str, _pool) 167 | _fc = caffe_pb2.LayerParameter() 168 | pb.text_format.Merge(fc_str, _fc) 169 | 170 | layers = [] 171 | 172 | data_train = caffe_pb2.LayerParameter() 173 | pb.text_format.Merge(data_train_str, data_train) 174 | data_test = caffe_pb2.LayerParameter() 175 | pb.text_format.Merge(data_test_str, data_test) 176 | 177 | layers.extend([data_train, data_test]) 178 | 179 | layer_idx = 0 180 | layer_str = str(layer_idx) 181 | 182 | conv = deepcopy(_conv) 183 | conv.name = 'conv_' + layer_str 184 | conv.top[0] = 'conv_' + layer_str 185 | conv.bottom[0] = 'data' 186 | conv.convolution_param.weight_filler.std \ 187 | = math.sqrt(2./(3*3*3)) 188 | 189 | norm = deepcopy(_norm) 190 | norm.name = 'norm_' + layer_str 191 | norm.top[0] = 'norm_' + layer_str 192 | norm.bottom[0] = 'conv_' + layer_str 193 | 194 | relu = deepcopy(_relu) 195 | relu.name = 'relu_' + layer_str 196 | relu.top[0] = 'relu_' + layer_str 197 | relu.bottom[0] = 'norm_' + layer_str 198 | 199 | layers.extend([conv, norm, relu]) 200 | 201 | 202 | for n_const in [3, 5, 9]: 203 | for output_size in [16, 32, 64]: 204 | for i in range(n_const): 205 | # 1 206 | layer_idx += 1 207 | layer_str = str(layer_idx) 208 | 209 | conv = deepcopy(_conv) 210 | conv.name = 'conv_' + layer_str 211 | conv.top[0] = 'conv_' + layer_str 212 | conv.bottom[0] = 'relu_' + str(layer_idx-1) 213 | conv.convolution_param.num_output = output_size 214 | for prev_conv_layer in reversed(layers): 215 | if prev_conv_layer.name.startswith('conv_'): 216 | conv.convolution_param.weight_filler.std \ 217 | = math.sqrt(2./(prev_conv_layer.convolution_param.num_output*3*3)) 218 | break 219 | 220 | norm = deepcopy(_norm) 221 | norm.name = 'norm_' + layer_str 222 | norm.top[0] = 'norm_' + layer_str 223 | norm.bottom[0] = 'conv_' + layer_str 224 | 225 | relu = deepcopy(_relu) 226 | relu.name = 'relu_' + layer_str 227 | relu.top[0] = 'relu_' + layer_str 228 | relu.bottom[0] = 'norm_' + layer_str 229 | 230 | layers.extend([conv, norm, relu]) 231 | ################# 232 | # 2 233 | layer_idx += 1 234 | layer_str = str(layer_idx) 235 | 236 | conv = deepcopy(_conv) 237 | conv.name = 'conv_' + layer_str 238 | conv.top[0] = 'conv_' + layer_str 239 | conv.bottom[0] = 'relu_' + str(layer_idx-1) 
240 | conv.convolution_param.num_output = output_size 241 | for prev_conv_layer in reversed(layers): 242 | if prev_conv_layer.name.startswith('conv_'): 243 | conv.convolution_param.weight_filler.std \ 244 | = math.sqrt(2./(prev_conv_layer.convolution_param.num_output*3*3)) 245 | break 246 | 247 | norm = deepcopy(_norm) 248 | norm.name = 'norm_' + layer_str 249 | norm.top[0] = 'norm_' + layer_str 250 | norm.bottom[0] = 'conv_' + layer_str 251 | 252 | ################## 253 | # shortcut 254 | elem = deepcopy(_elem) 255 | elem.name = 'elem_' + layer_str 256 | elem.top[0] = 'elem_' + layer_str 257 | elem.bottom[0] = 'norm_' + layer_str 258 | elem.bottom[1] = 'relu_' + str(layer_idx-2) 259 | 260 | relu = deepcopy(_relu) 261 | relu.name = 'relu_' + layer_str 262 | relu.top[0] = 'relu_' + layer_str 263 | relu.bottom[0] = 'elem_' + layer_str 264 | 265 | layers.extend([conv, norm]) 266 | 267 | # short cut with projection 268 | if layer_idx in [2*n_const+2, 4*n_const+2]: 269 | 270 | conv = deepcopy(_conv) 271 | conv.name = 'proj_' + str(layer_idx-1) 272 | conv.top[0] = 'proj_' + str(layer_idx-1) 273 | conv.bottom[0] = 'relu_' + str(layer_idx-2) 274 | conv.convolution_param.num_output = output_size 275 | conv.convolution_param.kernel_size[0] = 1 276 | conv.convolution_param.stride[0] = 2 277 | conv.convolution_param.pad[0] = 0 278 | 279 | layers.extend([conv]) 280 | 281 | 282 | layers.extend([elem, relu]) 283 | 284 | 285 | prev_layer_idx = layer_idx 286 | 287 | for layer_idx, output_size in zip([2*n_const+1, 4*n_const+1], [32, 64]): 288 | layer_str = str(layer_idx) 289 | for layer in layers: 290 | 291 | if layer.name == 'elem_' + str(layer_idx+1): 292 | layer.bottom[1] = 'proj_' + layer_str 293 | 294 | if layer.name == 'conv_' + str(layer_idx): 295 | layer.convolution_param.stride[0] = 2 296 | 297 | 298 | layer_idx = 1 + prev_layer_idx 299 | layer_str = str(layer_idx) 300 | 301 | pool = deepcopy(_pool) 302 | pool.name = 'pool_' + layer_str 303 | pool.bottom[0] = 'relu_' + str(layer_idx-1) 304 | pool.top[0] = 'pool_' + layer_str 305 | 306 | fc = deepcopy(_fc) 307 | fc.name = 'fc_' + layer_str 308 | fc.bottom[0] = 'pool_' + layer_str 309 | fc.top[0] = 'fc_' + layer_str 310 | 311 | layers.extend([pool, fc]) 312 | 313 | loss_str = ''' 314 | name: "loss" 315 | type: "SoftmaxWithLoss" 316 | bottom: "fc_" 317 | bottom: "label" 318 | top: "loss" 319 | ''' 320 | _loss = caffe_pb2.LayerParameter() 321 | pb.text_format.Merge(loss_str, _loss) 322 | loss = _loss 323 | loss.bottom[0] = 'fc_' + layer_str 324 | 325 | accu_str = ''' 326 | name: "accuracy" 327 | type: "Accuracy" 328 | bottom: "fc_" 329 | bottom: "label" 330 | top: "accuracy" 331 | include { 332 | phase: TEST 333 | } 334 | ''' 335 | _accu = caffe_pb2.LayerParameter() 336 | pb.text_format.Merge(accu_str, _accu) 337 | accu = _accu 338 | accu.bottom[0] = 'fc_' + layer_str 339 | 340 | layers.extend([loss, accu]) 341 | 342 | net = caffe_pb2.NetParameter() 343 | net.name = "resnet_cifar10_" + str(n_const * 6 + 2) 344 | net.layer.extend(layers) 345 | open(net.name+'.prototxt', 'w').write(str(net)) 346 | -------------------------------------------------------------------------------- /caffe/resnet_cifar10_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "./resnet_cifar10_20.prototxt" 2 | # test_iter specifies how many forward passes the test should carry out. 3 | # In the case of CIFAR-10, we have test batch size 50 and 200 test iterations, 4 | # covering the full 10,000 testing images. 
5 | test_iter: 200 6 | test_interval: 2000 7 | 8 | base_lr: 0.01 9 | lr_policy: "multistep" 10 | gamma: 0.1 11 | stepvalue: 32000 12 | stepvalue: 48000 13 | 14 | momentum: 0.9 15 | weight_decay: 0.0001 16 | 17 | display: 100 18 | max_iter: 60000 19 | snapshot: 10000 20 | snapshot_prefix: "snapshot/" 21 | solver_mode: GPU 22 | -------------------------------------------------------------------------------- /mxnet/README.md: -------------------------------------------------------------------------------- 1 | # ResNet for mxnet 2 | 3 | ## Usage guide 4 | 5 | #### 1. Prepare dataset according to mxnet documentation 6 | For both the ilsvrc12 and cifar10 datasets, there exist several scripts to make the required `.rec` files for mxnet to use. Please consult the `example` folder of the [mxnet repo](https://github.com/shuokay/mxnet/tree/master/example) as well as the [documentation](http://mxnet.rtfd.org) to make or download these files, and put them into the `data_ilsvrc12` and `data_cifar10` folders respectively. See the `*.sh` scripts in this folder to get a quick idea of how these files are used. 7 | 8 | #### 2. Train the model 9 | ##### Small network for cifar10 10 | According to the paper, several data augmentation techniques are used, the same as in the [caffe](../caffe) version in this repo. 11 | With padded input, `n = 9`, and a total batch size of 256 across two GPUs, the model can achieve `86%` accuracy, which is quite far from the stated accuracy of `94%`. Help appreciated. 12 | 13 | ##### Large network for ilsvrc12 (ImageNet) 14 | The smallest variant (50 layers) of the large network consumes quite a lot of GPU memory. When trained on 4 GPUs with a total batch size of 128, 5 GB of memory is consumed on each GPU. The training speed for this setting is around 80 images/sec, which means that if we were to train it for 45 epochs (as was done for AlexNet), maybe 10 days would be required to get the final result; a rough estimate is sketched below. 15 | As for the largest network (154 layers), at a batch size of 8 on one GPU, the memory consumption is 3.8 GB and the training speed is 10 images/sec.
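A back-of-the-envelope check of the training-time estimate above, as a minimal Python sketch. The image count is the default `--num-examples` in `resnet_ilsvrc12_main.py`; the 80 images/sec and 45 epochs figures are the ones quoted in this README, so the result is only as reliable as those assumptions:

```
# rough ilsvrc12 training-time estimate from the numbers quoted above
num_images = 1281167      # default --num-examples in resnet_ilsvrc12_main.py
images_per_sec = 80.0     # measured with total batch size 128 on 4 GPUs
num_epochs = 45           # the AlexNet-style schedule mentioned above
total_seconds = num_images / images_per_sec * num_epochs
print('%.1f days' % (total_seconds / 86400.0))
# ~8.3 days of pure compute, i.e. "maybe 10 days" once overhead is included
```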
16 | 17 | -------------------------------------------------------------------------------- /mxnet/resnet_cifar10_main.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import argparse 3 | import os, sys 4 | import resnet_cifar10_train as train_model 5 | 6 | parser = argparse.ArgumentParser(description='train an image classifer on cifar10') 7 | parser.add_argument('--network', type=str, default='resnet', 8 | help = 'the cnn to use') 9 | parser.add_argument('--data-dir', type=str, default='./', 10 | help='the input data directory') 11 | parser.add_argument('--gpus', type=str, default='2,3', 12 | help='the gpus will be used, e.g "0,1,2,3"') 13 | parser.add_argument('--num-examples', type=int, default=50000, 14 | help='the number of training examples') 15 | parser.add_argument('--batch-size', type=int, default=128, 16 | help='the batch size') 17 | parser.add_argument('--lr', type=float, default=.1, 18 | help='the initial learning rate') 19 | parser.add_argument('--lr-factor', type=float, default=0.1, 20 | help='times the lr with a factor for every lr-factor-epoch epoch') 21 | parser.add_argument('--lr-factor-epoch', type=float, default=100, 22 | help='the number of epoch to factor the lr, could be .5') 23 | parser.add_argument('--model-prefix', type=str, 24 | help='the prefix of the model to load/save') 25 | parser.add_argument('--num-epochs', type=int, default=300, 26 | help='the number of training epochs') 27 | parser.add_argument('--load-epoch', type=int, 28 | help="load the model on an epoch using the model-prefix") 29 | parser.add_argument('--kv-store', type=str, default='local', 30 | help='the kvstore type') 31 | args = parser.parse_args() 32 | 33 | # network 34 | import importlib 35 | net = importlib.import_module(args.network + '_cifar10_net').get_symbol(10) 36 | 37 | # data 38 | def get_iterator(args, kv): 39 | data_shape = (3, 32, 32) 40 | train = mx.io.ImageRecordIter( 41 | path_imgrec = args.data_dir + "train.rec", 42 | mean_img = args.data_dir + "mean_pad.bin", 43 | data_shape = data_shape, 44 | batch_size = args.batch_size, 45 | rand_crop = True, 46 | rand_mirror = True, 47 | num_parts = kv.num_workers, 48 | part_index = kv.rank) 49 | 50 | val = mx.io.ImageRecordIter( 51 | path_imgrec = args.data_dir + "test.rec", 52 | mean_img = args.data_dir + "mean_no_pad.bin", 53 | rand_crop = False, 54 | rand_mirror = False, 55 | data_shape = data_shape, 56 | batch_size = args.batch_size, 57 | num_parts = kv.num_workers, 58 | part_index = kv.rank) 59 | 60 | return (train, val) 61 | 62 | # train 63 | train_model.fit(args, net, get_iterator) 64 | -------------------------------------------------------------------------------- /mxnet/resnet_cifar10_net.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | 3 | def Conv_BN_ReLU(data, num_filter, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name=None, suffix=''): 4 | conv = mx.symbol.Convolution(data=data, workspace=512, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) 5 | bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix)) 6 | act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix)) 7 | return act 8 | 9 | def Conv_BN(data, num_filter, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name=None, suffix=''): 10 | conv = mx.symbol.Convolution(data=data, workspace=512, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' 
%(name, suffix)) 11 | bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix)) 12 | return bn 13 | 14 | def Conv(data, num_filter, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name=None, suffix=''): 15 | conv = mx.symbol.Convolution(data=data, workspace=512, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) 16 | return conv 17 | 18 | def Neck(data, num_filter, layer_idx, project=False): 19 | 20 | # first layer 21 | layer_idx += 1 22 | if project: 23 | proj = mx.symbol.Convolution(data=data, workspace=512, num_filter=num_filter, 24 | kernel=(1, 1), stride=(2, 2), pad=(0, 0), name='proj_%d' %layer_idx) 25 | block1_stride = (2, 2) 26 | else: 27 | proj = data 28 | block1_stride = (1, 1) 29 | 30 | block1 = Conv_BN_ReLU(data, num_filter, kernel=(3, 3), stride=block1_stride, pad=(1, 1), name=str(layer_idx)) 31 | 32 | # second layer 33 | layer_idx += 1 34 | block2 = Conv(block1, num_filter, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name=str(layer_idx)) 35 | # block2 = Conv_BN(block1, num_filter, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name=str(layer_idx)) 36 | 37 | esum = mx.symbol.ElementWiseSum(proj, block2) 38 | bn = mx.symbol.BatchNorm(data=esum, name='bn_%d' %layer_idx) 39 | relu = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%d' %layer_idx) 40 | # relu = mx.symbol.Activation(data=esum, act_type='relu', name='relu_%d' %layer_idx) 41 | 42 | return layer_idx, relu 43 | 44 | 45 | def get_symbol(num_classes=10, n_const=9): 46 | 47 | data = mx.symbol.Variable(name='data') 48 | 49 | layer_idx = 0 50 | num_filter = 16 # 32, 64 51 | neck = Conv_BN_ReLU(data, num_filter, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name=str(layer_idx)) 52 | 53 | layer_sizes = [num_filter, 32, 64] 54 | project_enablers = [False, True, True] 55 | 56 | for num_filter, project_enable in zip(layer_sizes, project_enablers): 57 | for n in range(n_const): 58 | if n == 0 and project_enable: 59 | project = True 60 | else: 61 | project = False 62 | 63 | layer_idx, neck = Neck(data=neck, num_filter=num_filter, layer_idx=layer_idx, project=project) 64 | 65 | layer_idx += 1 66 | avg = mx.symbol.Pooling(data=neck, kernel=(2, 2), stride=(1, 1), name='global_pool', pool_type='avg') 67 | flatten = mx.sym.Flatten(data=avg, name="flatten") 68 | fc0 = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc0') 69 | softmax = mx.symbol.SoftmaxOutput(data=fc0, name='softmax') 70 | 71 | print(layer_idx+1) 72 | return softmax 73 | -------------------------------------------------------------------------------- /mxnet/resnet_cifar10_train.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import logging 3 | 4 | def fit(args, network, data_loader): 5 | # kvstore 6 | kv = mx.kvstore.create(args.kv_store) 7 | 8 | # logging 9 | head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' 10 | logging.basicConfig(level=logging.DEBUG, format=head) 11 | logging.info('start with arguments %s', args) 12 | 13 | # load model? 14 | model_prefix = args.model_prefix 15 | if model_prefix is not None: 16 | model_prefix += "-%d" % (kv.rank) 17 | model_args = {} 18 | if args.load_epoch is not None: 19 | assert model_prefix is not None 20 | tmp = mx.model.FeedForward.load(model_prefix, args.load_epoch) 21 | model_args = {'arg_params' : tmp.arg_params, 22 | 'aux_params' : tmp.aux_params, 23 | 'begin_epoch' : args.load_epoch} 24 | # save model? 
25 | checkpoint = None if model_prefix is None else mx.callback.do_checkpoint(model_prefix) 26 | 27 | # data 28 | (train, val) = data_loader(args, kv) 29 | 30 | # train 31 | devs = mx.cpu() if args.gpus is None else [ 32 | mx.gpu(int(i)) for i in args.gpus.split(',')] 33 | 34 | epoch_size = args.num_examples / args.batch_size 35 | 36 | if args.kv_store == 'dist_sync': 37 | epoch_size /= kv.num_workers 38 | model_args['epoch_size'] = epoch_size 39 | 40 | if 'lr_factor' in args and args.lr_factor < 1: 41 | model_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( 42 | step = max(int(epoch_size * args.lr_factor_epoch), 1), 43 | factor = args.lr_factor) 44 | 45 | if 'clip_gradient' in args and args.clip_gradient is not None: 46 | model_args['clip_gradient'] = args.clip_gradient 47 | 48 | # disable kvstore for single device 49 | if 'local' in kv.type and ( 50 | args.gpus is None or len(args.gpus.split(',')) is 1): 51 | kv = None 52 | 53 | model = mx.model.FeedForward( 54 | ctx = devs, 55 | symbol = network, 56 | num_epoch = args.num_epochs, 57 | learning_rate = args.lr, 58 | momentum = 0.9, 59 | wd = 0.0001, 60 | initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), 61 | **model_args) 62 | 63 | model.fit( 64 | X = train, 65 | eval_data = val, 66 | kvstore = kv, 67 | batch_end_callback = mx.callback.Speedometer(args.batch_size, 50), 68 | epoch_end_callback = checkpoint) 69 | -------------------------------------------------------------------------------- /mxnet/resnet_ilsvrc12_main.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import argparse 3 | import logging 4 | 5 | parser = argparse.ArgumentParser(description='train an image classifer on ilsvrc12') 6 | parser.add_argument('--network', type=str, default='resnet', 7 | help = 'the cnn to use') 8 | parser.add_argument('--data-dir', type=str, default='./', 9 | help='the input data directory') 10 | parser.add_argument('--gpus', type=str, default='0,1,2,3', 11 | help='the gpus will be used, e.g "0,1,2,3"') 12 | parser.add_argument('--num-examples', type=int, default=1281167, 13 | help='the number of training examples') 14 | parser.add_argument('--batch-size', type=int, default=64, 15 | help='the batch size') 16 | parser.add_argument('--lr', type=float, default=.1, 17 | help='the initial learning rate') 18 | parser.add_argument('--lr-factor', type=float, default=0.1, 19 | help='times the lr with a factor for every lr-factor-epoch epoch') 20 | parser.add_argument('--lr-factor-epoch', type=float, default=40, 21 | help='the number of epoch to factor the lr, could be .5') 22 | parser.add_argument('--model-prefix', type=str, 23 | help='the prefix of the model to load/save') 24 | parser.add_argument('--num-epochs', type=int, default=120, 25 | help='the number of training epochs') 26 | parser.add_argument('--load-epoch', type=int, 27 | help="load the model on an epoch using the model-prefix") 28 | parser.add_argument('--kv-store', type=str, default='local_allreduce_device', 29 | help='the kvstore type') 30 | args = parser.parse_args() 31 | 32 | # network 33 | import importlib 34 | net = importlib.import_module(args.network + '_ilsvrc12_net').get_symbol(1000, 0) 35 | 36 | # data 37 | def get_iterator(args, kv): 38 | data_shape = (3, 224, 224) 39 | train = mx.io.ImageRecordIter( 40 | path_imgrec = args.data_dir + "train.rec", 41 | mean_r = 123.68, 42 | mean_g = 116.779, 43 | mean_b = 103.939, 44 | data_shape = data_shape, 45 | batch_size = args.batch_size, 46 | rand_crop = True, 
47 | rand_mirror = True, 48 | num_parts = kv.num_workers, 49 | part_index = kv.rank) 50 | 51 | val = mx.io.ImageRecordIter( 52 | path_imgrec = args.data_dir + "val.rec", 53 | mean_r = 123.68, 54 | mean_g = 116.779, 55 | mean_b = 103.939, 56 | rand_crop = False, 57 | rand_mirror = False, 58 | data_shape = data_shape, 59 | batch_size = args.batch_size, 60 | num_parts = kv.num_workers, 61 | part_index = kv.rank) 62 | 63 | return (train, val) 64 | 65 | def train_model(args, network, data_loader): 66 | # kvstore 67 | kv = mx.kvstore.create(args.kv_store) 68 | 69 | # logging 70 | head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' 71 | logging.basicConfig(level=logging.DEBUG, format=head) 72 | logging.info('start with arguments %s', args) 73 | 74 | # load model? 75 | model_prefix = args.model_prefix 76 | if model_prefix is not None: 77 | model_prefix += "-%d" % (kv.rank) 78 | model_args = {} 79 | if args.load_epoch is not None: 80 | assert model_prefix is not None 81 | tmp = mx.model.FeedForward.load(model_prefix, args.load_epoch) 82 | model_args = {'arg_params' : tmp.arg_params, 83 | 'aux_params' : tmp.aux_params, 84 | 'begin_epoch' : args.load_epoch} 85 | # save model? 86 | checkpoint = None if model_prefix is None else mx.callback.do_checkpoint(model_prefix) 87 | 88 | # data 89 | (train, val) = data_loader(args, kv) 90 | 91 | # train 92 | devs = mx.cpu() if args.gpus is None else [ 93 | mx.gpu(int(i)) for i in args.gpus.split(',')] 94 | 95 | epoch_size = args.num_examples / args.batch_size 96 | 97 | if args.kv_store == 'dist_sync': 98 | epoch_size /= kv.num_workers 99 | model_args['epoch_size'] = epoch_size 100 | 101 | if 'lr_factor' in args and args.lr_factor < 1: 102 | model_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( 103 | step = max(int(epoch_size * args.lr_factor_epoch), 1), 104 | factor = args.lr_factor) 105 | 106 | if 'clip_gradient' in args and args.clip_gradient is not None: 107 | model_args['clip_gradient'] = args.clip_gradient 108 | 109 | # disable kvstore for single device 110 | if 'local' in kv.type and ( 111 | args.gpus is None or len(args.gpus.split(',')) is 1): 112 | kv = None 113 | 114 | model = mx.model.FeedForward( 115 | ctx = devs, 116 | symbol = network, 117 | num_epoch = args.num_epochs, 118 | learning_rate = args.lr, 119 | momentum = 0.9, 120 | wd = 0.0001, 121 | initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), 122 | **model_args) 123 | 124 | model.fit( 125 | X = train, 126 | eval_data = val, 127 | kvstore = kv, 128 | batch_end_callback = mx.callback.Speedometer(args.batch_size, 50), 129 | epoch_end_callback = checkpoint) 130 | 131 | 132 | # check the network graph 133 | # g = mx.visualization.plot_network(net) 134 | # g.format = 'png' 135 | # g.render() 136 | 137 | # train 138 | train_model(args, net, get_iterator) 139 | -------------------------------------------------------------------------------- /mxnet/resnet_ilsvrc12_net.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | 3 | 4 | def ConvFactory(data, num_filter, kernel, stride, pad): 5 | conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad) 6 | bn = mx.symbol.BatchNorm(data=conv) 7 | act = mx.symbol.Activation(data=bn, act_type='relu') 8 | return act 9 | 10 | def BottleneckFactory(data, num_filter, layer_idx, project=False): 11 | 12 | # first layer 13 | layer_idx += 1 14 | if project: 15 | proj = mx.symbol.Convolution(data=data, num_filter=num_filter*4, kernel=(1, 1), 
stride=(2, 2), pad=(0, 0)) 16 | layer1_stride = (2, 2) 17 | else: 18 | proj = data 19 | layer1_stride = (1, 1) 20 | data = ConvFactory(data, num_filter, kernel=(1, 1), stride=layer1_stride, pad=(0, 0)) 21 | 22 | # second layer 23 | layer_idx += 1 24 | data = ConvFactory(data, num_filter, kernel=(3, 3), stride=(1, 1), pad=(1, 1)) 25 | 26 | # third layer 27 | layer_idx += 1 28 | data = mx.symbol.Convolution(data, num_filter=num_filter*4, kernel=(1, 1), stride=(1, 1), pad=(0, 0)) 29 | 30 | esum = mx.symbol.ElementWiseSum(proj, data) 31 | bn = mx.symbol.BatchNorm(data=esum) 32 | act = mx.symbol.Activation(data=bn, act_type='relu') 33 | 34 | return layer_idx, act 35 | 36 | 37 | def get_symbol(num_classes=1000, model_idx=2): 38 | 39 | layer_idx = 0 40 | data = mx.symbol.Variable(name='data') 41 | 42 | # stage conv1_x 43 | data = ConvFactory(data=data, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3)) 44 | 45 | # setup model parameters 46 | num_filter = (64, 128, 256, 512) 47 | model_cfgs = [ 48 | (3, 4, 6, 3), 49 | (3, 4, 23, 3), 50 | (3, 8, 36, 3) 51 | ] 52 | model_cfg = model_cfgs[model_idx] 53 | 54 | # stage conv2_x to conv5_x, 4 stages 55 | data = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max') 56 | 57 | # special for conv2_x first block 58 | stage_filter_size = num_filter[0] 59 | proj = mx.symbol.Convolution(data=data, num_filter=stage_filter_size*4, kernel=(1, 1), stride=(1, 1), pad=(0, 0)) 60 | block1 = ConvFactory(data=data, num_filter=stage_filter_size, kernel=(1, 1), stride=(1, 1), pad=(0, 0)) 61 | block2 = ConvFactory(data=block1, num_filter=stage_filter_size, kernel=(3, 3), stride=(1, 1), pad=(1, 1)) 62 | block3 = mx.symbol.Convolution(data=block2, num_filter=stage_filter_size*4, kernel=(1, 1), stride=(1, 1), pad=(0, 0)) 63 | esum = mx.symbol.ElementWiseSum(proj, block3) 64 | bn = mx.symbol.BatchNorm(esum) 65 | act = mx.symbol.Activation(bn, act_type='relu') 66 | necks = act 67 | layer_idx += 3 68 | 69 | for stage_neck_nums, stage_filter_size, stage_idx in zip(model_cfg, num_filter, range(len(model_cfg))): 70 | for neck_idx in range(stage_neck_nums): 71 | if neck_idx == 0 and stage_idx == 0: 72 | pass 73 | else: 74 | if neck_idx == 0 and stage_idx != 0: 75 | project = True 76 | else: 77 | project = False 78 | layer_idx, necks = BottleneckFactory(data=necks, num_filter=stage_filter_size, layer_idx=layer_idx, project=project) 79 | 80 | layer_idx += 1 81 | avg = mx.symbol.Pooling(data=necks, kernel=(7, 7), stride=(1, 1), name='global_pool', pool_type='avg') 82 | flatten = mx.sym.Flatten(data=avg, name="flatten") 83 | fc0 = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc0') 84 | softmax = mx.symbol.SoftmaxOutput(data=fc0, name='softmax') 85 | 86 | print(layer_idx+1) 87 | return softmax 88 | -------------------------------------------------------------------------------- /mxnet/run_cifar10.sh: -------------------------------------------------------------------------------- 1 | time_tag=`date +"%m-%d_%H:%M:%S"` 2 | log_folder=log_cifar10 3 | 4 | for i in `seq 4 4` 5 | do 6 | log_file=$log_folder/output_${i}_$time_tag.log 7 | stat_file=$log_folder/gpustat_${i}_$time_tag.log 8 | cat resnet_cifar10*.py > $log_file 9 | (time python resnet_cifar10_main.py --gpus 1,3,5,7 --data-dir ./data_cifar10/ --batch-size 256) 2>&1 | tee -a $log_file & 10 | sleep 30; nvidia-smi 2>&1 | tee $stat_file 11 | sleep 30; nvidia-smi 2>&1 | tee -a $stat_file 12 | wait 13 | done 14 |
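The `model_cfgs` tuples in `resnet_ilsvrc12_net.py` above correspond to the standard bottleneck ResNet depths. A minimal sketch of the usual counting, assuming 3 convolutions per bottleneck block plus the 7x7 stem convolution and the final fully connected layer (the script's own `print(layer_idx+1)` counter may tally slightly differently):

```
# relate the (blocks per stage) configs to the conventional ResNet depth count
for cfg in [(3, 4, 6, 3), (3, 4, 23, 3), (3, 8, 36, 3)]:
    depth = 3 * sum(cfg) + 2   # 3 convs per bottleneck + stem conv + final fc
    print(cfg, depth)          # -> 50, 101, 152
```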
-------------------------------------------------------------------------------- /mxnet/run_ilsvrc12.sh: -------------------------------------------------------------------------------- 1 | time_tag=`date +"%m-%d_%H:%M:%S"` 2 | log_folder=log_ilsvrc12 3 | 4 | for i in `seq 8 8` 5 | do 6 | log_file=$log_folder/output_${i}_$time_tag.log 7 | stat_file=$log_folder/gpustat_${i}_$time_tag.log 8 | cat resnet_ilsvrc12*.py > $log_file 9 | (time python resnet_ilsvrc12_main.py --gpus 0,1,2,3,4,5,6,7 --batch-size 256 --data-dir ./data_ilsvrc12/) 2>&1 | tee -a $log_file & 10 | sleep 30; nvidia-smi 2>&1 | tee $stat_file 11 | sleep 30; nvidia-smi 2>&1 | tee -a $stat_file 12 | wait 13 | done 14 | --------------------------------------------------------------------------------
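Both `resnet_cifar10_main.py` and `resnet_ilsvrc12_main.py` accept `--model-prefix` and `--load-epoch`, so a run launched by the scripts above can be checkpointed and later resumed (e.g. pass `--model-prefix ./resnet --load-epoch 30` on a subsequent invocation). A minimal sketch for inspecting such a checkpoint offline, assuming a model was saved with `--model-prefix ./resnet` on kvstore rank 0 (the training code appends `-<rank>` to the prefix before saving, and epoch 30 here is only an example):

```
import mxnet as mx

# load the symbol plus arg/aux parameters written by the epoch_end_callback
model = mx.model.FeedForward.load('./resnet-0', 30)
print(len(model.arg_params), 'learned parameter arrays')
print(sorted(model.arg_params.keys())[:5])
```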