├── CNTK
│   ├── AlexNet.config
│   ├── AlexNet.ndl
│   ├── Macros.ndl
│   ├── alexnet_1GPU.sh
│   ├── alexnet_4GPUs.sh
│   ├── createFakeImageNetData.py
│   ├── createLabelMap.py
│   ├── ffn.config
│   ├── ffn_1GPU.sh
│   ├── ffn_2GPUs.sh
│   ├── ffn_4GPUs.sh
│   ├── ffn_orig.config
│   ├── labelmap.1K.txt
│   └── logs
│       ├── alexnet
│       │   ├── 1
│       │   │   └── AlexNet_Train.log
│       │   └── 4
│       │       ├── AlexNet_Train.logrank0
│       │       ├── AlexNet_Train.logrank1
│       │       ├── AlexNet_Train.logrank2
│       │       └── AlexNet_Train.logrank3
│       └── ffn
│           ├── 1
│           │   └── out_Train.log
│           ├── 4
│           │   ├── out_Train.logrank0
│           │   ├── out_Train.logrank1
│           │   ├── out_Train.logrank2
│           │   └── out_Train.logrank3
│           ├── 8
│           │   ├── out_Train.logrank0
│           │   ├── out_Train.logrank1
│           │   ├── out_Train.logrank2
│           │   ├── out_Train.logrank3
│           │   ├── out_Train.logrank4
│           │   ├── out_Train.logrank5
│           │   ├── out_Train.logrank6
│           │   └── out_Train.logrank7
│           └── 16
│               ├── out_Train.logrank0
│               ├── out_Train.logrank1
│               ├── out_Train.logrank10
│               ├── out_Train.logrank11
│               ├── out_Train.logrank12
│               ├── out_Train.logrank13
│               ├── out_Train.logrank14
│               ├── out_Train.logrank15
│               ├── out_Train.logrank2
│               ├── out_Train.logrank3
│               ├── out_Train.logrank4
│               ├── out_Train.logrank5
│               ├── out_Train.logrank6
│               ├── out_Train.logrank7
│               ├── out_Train.logrank8
│               └── out_Train.logrank9
├── GPU-hardware.txt
├── README.md
├── TensorFlow
│   ├── ffn.py
│   ├── ffn_1GPU.log
│   ├── ffn_4GPUs.log
│   ├── ffn_exp.py
│   └── ffn_exp_4GPUs.py
├── Torch
│   ├── alexnet.lua
│   ├── ffn.log
│   └── ffn.lua
├── caffe
│   ├── alexnet.prototxt
│   ├── alexnet_1GPU.log
│   ├── alexnet_1GPU.sh
│   ├── alexnet_4GPUs.log
│   ├── alexnet_4GPUs.prototxt
│   ├── alexnet_4GPUs.sh
│   ├── alexnet_4GPUs_solver.prototxt
│   ├── alexnet_solver.prototxt
│   ├── alexnet_time_1GPU.log
│   ├── alexnet_time_1GPU.sh
│   ├── createFakeData.py
│   ├── createFakeImageNet.py
│   ├── ffn.prototxt
│   ├── ffn_1GPU.log
│   ├── ffn_1GPU.py
│   ├── ffn_1GPU.sh
│   ├── ffn_2GPUs.log
│   ├── ffn_2GPUs.prototxt
│   ├── ffn_2GPUs.sh
│   ├── ffn_2GPUs_solver.prototxt
│   ├── ffn_4GPUs.log
│   ├── ffn_4GPUs.prototxt
│   ├── ffn_4GPUs.sh
│   ├── ffn_4GPUs_solver.prototxt
│   └── ffn_solver.prototxt
├── createData.py
└── keras
    ├── ffn.log
    └── ffn.py
/CNTK/AlexNet.config: --------------------------------------------------------------------------------
1 | WorkDir=.
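# Note: $ConfigName$ is not set anywhere in this file; it is supplied on the
# command line (alexnet_1GPU.sh passes configName=AlexNet), and alexnet_4GPUs.sh
# additionally passes parallelTrain=true under mpiexec so that the
# ParallelTrain/DataParallelSGD block below takes effect.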
2 | ModelDir=$WorkDir$/_out/$ConfigName$ 3 | stderr=$WorkDir$/_out/$ConfigName$ 4 | 5 | ndlMacros=$WorkDir$/Macros.ndl 6 | 7 | precision=float 8 | deviceId=Auto 9 | 10 | command=Train 11 | 12 | makeMode=false 13 | 14 | parallelTrain=false 15 | 16 | prefetch=true 17 | 18 | traceLevel=1 19 | 20 | Train=[ 21 | action=train 22 | modelPath=$ModelDir$/AlexNet 23 | 24 | NDLNetworkBuilder=[ 25 | networkDescription=$WorkDir$/AlexNet.ndl 26 | ] 27 | 28 | SGD=[ 29 | epochSize=8192 30 | minibatchSize=256 31 | learningRatesPerMB=0.01 32 | momentumPerMB=0 33 | maxEpochs=10 34 | gradUpdateType=None 35 | L2RegWeight=0 36 | dropoutRate=0 37 | 38 | ParallelTrain=[ 39 | parallelizationMethod=DataParallelSGD 40 | distributedMBReading=true 41 | parallelizationStartEpoch=1 42 | DataParallelSGD=[ 43 | gradientBits=1 44 | ] 45 | ] 46 | 47 | numMBsToShowResult=8 48 | ] 49 | 50 | reader=[ 51 | readerType=UCIFastReader 52 | file=$WorkDir$/imagenet_data.txt 53 | randomize=None 54 | features=[ 55 | dim=150528 56 | start=1 57 | ] 58 | labels=[ 59 | dim=1 60 | start=0 61 | labelDim=1000 62 | labelMappingFile=$WorkDir$/labelmap.1K.txt 63 | ] 64 | ] 65 | 66 | ] 67 | -------------------------------------------------------------------------------- /CNTK/AlexNet.ndl: -------------------------------------------------------------------------------- 1 | load=ndlMacros 2 | run=DNN 3 | 4 | ndlMacros = [ 5 | ImageW = 224 6 | ImageH = 224 7 | ImageC = 3 8 | LabelDim = 1000 9 | 10 | features = ImageInput(ImageW, ImageH, ImageC, tag = feature) 11 | labels = Input(LabelDim, tag = label) 12 | 13 | conv1WScale = 0.95 14 | conv1BValue = 0 15 | conv2WScale = 2 16 | conv2BValue = 1 17 | conv3WScale = 2.07 18 | conv3BValue = 0 19 | conv4WScale = 2.9 20 | conv4BValue = 1 21 | conv5WScale = 2.4 22 | conv5BValue = 1 23 | fc1WScale = 6.4 24 | fc1BValue = 1 25 | fc2WScale = 3.2 26 | fc2BValue = 1 27 | fc3WScale = 3.2 28 | fc3BValue = 1 29 | ] 30 | 31 | DNN=[ 32 | # conv1 33 | kW1 = 11 34 | kH1 = 11 35 | cMap1 = 96 36 | hStride1 = 4 37 | vStride1 = 4 38 | # weight[cMap1, kW1 * kH1 * ImageC] 39 | conv1_act = ConvReLULayer(features, cMap1, 363, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue) 40 | 41 | # pool1 42 | pool1W = 3 43 | pool1H = 3 44 | pool1hStride = 2 45 | pool1vStride = 2 46 | pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride) 47 | 48 | # conv2 49 | kW2 = 5 50 | kH2 = 5 51 | cMap2 = 256 52 | hStride2 = 1 53 | vStride2 = 1 54 | # weight[cMap2, kW2 * kH2 * cMap1] 55 | conv2_act = ConvReLULayer(pool1, cMap2, 2400, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue) 56 | 57 | # pool2 58 | pool2W = 3 59 | pool2H = 3 60 | pool2hStride = 2 61 | pool2vStride = 2 62 | pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride) 63 | 64 | # conv3 65 | kW3 = 3 66 | kH3 = 3 67 | cMap3 = 384 68 | hStride3 = 1 69 | vStride3 = 1 70 | # weight[cMap3, kW3 * kH3 * cMap2] 71 | conv3_act = ConvReLULayer(pool2, cMap3, 2304, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue) 72 | 73 | # conv4 74 | kW4 = 3 75 | kH4 = 3 76 | cMap4 = 384 77 | hStride4 = 1 78 | vStride4 = 1 79 | # weight[cMap4, kW4 * kH4 * cMap3] 80 | conv4_act = ConvReLULayer(conv3_act, cMap4, 3456, kW4, kH4, hStride4, vStride4, conv4WScale, conv4BValue) 81 | 82 | # conv5 83 | kW5 = 3 84 | kH5 = 3 85 | cMap5 = 256 86 | hStride5 = 1 87 | vStride5 = 1 88 | # weight[cMap5, kW5 * kH5 * cMap4] 89 | conv5_act = ConvReLULayer(conv4_act, cMap5, 3456, kW5, kH5, hStride5, vStride5, conv5WScale, conv5BValue) 90 | 91 | # pool3 92 | pool3W = 3 93 | pool3H 
= 3
94 |     pool3hStride = 2
95 |     pool3vStride = 2
96 |     pool3 = MaxPooling(conv5_act, pool3W, pool3H, pool3hStride, pool3vStride)
97 | 
98 |     hiddenDim = 4096
99 |     h1 = DNNReLULayer(9216, hiddenDim, pool3, fc1WScale, fc1BValue)
100 |     h1_d = Dropout(h1)
101 |     h2 = DNNReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
102 |     h2_d = Dropout(h2)
103 |     ol = DNNLastLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
104 | 
105 |     CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
106 |     Err = ErrorPrediction(labels, ol, tag = Eval)
107 |     OutputNodes = ol
108 | ]
109 | 
-------------------------------------------------------------------------------- /CNTK/Macros.ndl: --------------------------------------------------------------------------------
1 | ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
2 | {
3 |     convW = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
4 |     conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
5 |     convB = Parameter(outMap, 1, init = fixedValue, value = bValue)
6 |     convPlusB = Plus(conv, convB);
7 |     act = RectifiedLinear(convPlusB);
8 | }
9 | 
10 | DNNReLULayer(inDim, outDim, x, wScale, bValue)
11 | {
12 |     W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
13 |     b = Parameter(outDim, init = fixedValue, value = bValue)
14 |     t = Times(W, x)
15 |     z = Plus(t, b)
16 |     y = RectifiedLinear(z)
17 | }
18 | 
19 | DNNLastLayer(hiddenDim, labelDim, x, wScale, bValue)
20 | {
21 |     W = Parameter(labelDim, hiddenDim, init = Gaussian, initValueScale = wScale)
22 |     b = Parameter(labelDim, init = fixedValue, value = bValue)
23 |     t = Times(W, x)
24 |     z = Plus(t, b)
25 | }
26 | 
-------------------------------------------------------------------------------- /CNTK/alexnet_1GPU.sh: --------------------------------------------------------------------------------
1 | ~/cntk/bin/cntk configFile=AlexNet.config configName=AlexNet
2 | 
3 | 
-------------------------------------------------------------------------------- /CNTK/alexnet_4GPUs.sh: --------------------------------------------------------------------------------
1 | mpiexec -n 4 ~/cntk/bin/cntk configFile=AlexNet.config configName=AlexNet parallelTrain=true
2 | 
3 | 
-------------------------------------------------------------------------------- /CNTK/createFakeImageNetData.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | featDim = 224 * 224 * 3
4 | labDim = 1000
5 | totalCount = 256 * 32
6 | 
7 | def createFakeData(count):
8 |     features = np.random.randn(count, featDim)
9 |     labels = np.random.randint(0, labDim, size=(count, 1))
10 |     return features, labels
11 | 
12 | f, l = createFakeData(totalCount)
13 | 
14 | np.savetxt(r'./imagenet_data.txt', np.hstack((l, f)), fmt='%d' + ' %.4f' * featDim)
15 | 
16 | 
-------------------------------------------------------------------------------- /CNTK/createLabelMap.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | dim = 1000
3 | a = range(0, dim)
4 | np.savetxt('./labelmap.txt', np.reshape(a, (dim, 1)), fmt='%d')
-------------------------------------------------------------------------------- /CNTK/ffn.config: --------------------------------------------------------------------------------
1 | WorkDir=.
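# Note: configName=ffn comes from the launcher scripts (ffn_1GPU.sh, ffn_2GPUs.sh,
# ffn_4GPUs.sh); the multi-GPU scripts start one MPI rank per GPU and pass
# parallelTrain=true, which activates the DataParallelSGD block below with
# 1-bit gradient quantization (gradientBits=1).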
2 | ModelDir=$WorkDir$/models/$ConfigName$ 3 | stderr=$WorkDir$/logs/$ConfigName$/out 4 | precision=float 5 | deviceId=Auto 6 | 7 | makeMode=false 8 | 9 | command=Train 10 | 11 | featureDim = 512 12 | labelDim = 10000 13 | hiddenDim = 2048 14 | 15 | parallelTrain=false 16 | prefetch=true 17 | 18 | Train=[ 19 | action=train 20 | modelPath=$ModelDir$/cntk 21 | deviceId=Auto 22 | traceLevel=1 23 | 24 | SimpleNetworkBuilder=[ 25 | layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ 26 | trainingCriterion=CrossEntropyWithSoftmax 27 | evalCriterion=ErrorPrediction 28 | layerTypes=Sigmoid 29 | applyMeanVarNorm=false 30 | initValueScale=1.0 31 | uniformInit=true 32 | needPrior=false 33 | ] 34 | 35 | SGD=[ 36 | epochSize=262144 37 | minibatchSize=8192 38 | learningRatesPerMB=0.01 39 | numMBsToShowResult=4 40 | momentumPerSample=0 41 | dropoutRate=0.0 42 | maxEpochs=40 43 | 44 | ParallelTrain=[ 45 | parallelizationMethod=DataParallelSGD 46 | distributedMBReading=true 47 | parallelizationStartEpoch=1 48 | DataParallelSGD=[ 49 | gradientBits=1 50 | ] 51 | ] 52 | 53 | gradUpdateType=None 54 | normWithAveMultiplier=true 55 | clippingThresholdPerSample=1#INF 56 | ] 57 | ] 58 | 59 | reader=[ 60 | readerType=UCIFastReader 61 | file=$WorkDir$/../data.txt 62 | features=[ 63 | dim=$featureDim$ 64 | start=1 65 | ] 66 | labels=[ 67 | dim=1 68 | start=0 69 | labelDim=$labelDim$ 70 | labelMappingFile=$WorkDir$/labelmap.txt 71 | ] 72 | ] 73 | -------------------------------------------------------------------------------- /CNTK/ffn_1GPU.sh: -------------------------------------------------------------------------------- 1 | ../../cntkbin/bin/cntk configFile=ffn.config configName=ffn 2 | 3 | -------------------------------------------------------------------------------- /CNTK/ffn_2GPUs.sh: -------------------------------------------------------------------------------- 1 | mpiexec -n 2 ../../cntkbin/bin/cntk configFile=ffn.config configName=ffn parallelTrain=true 2 | -------------------------------------------------------------------------------- /CNTK/ffn_4GPUs.sh: -------------------------------------------------------------------------------- 1 | mpiexec -n 4 ../../cntkbin/bin/cntk configFile=ffn.config configName=ffn parallelTrain=true 2 | -------------------------------------------------------------------------------- /CNTK/ffn_orig.config: -------------------------------------------------------------------------------- 1 | WorkDir=. 
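# Note: ffn_orig.config appears to be the configuration the ffn benchmark was
# derived from: featureDim=957, labelDim=5976, five hidden layers, and full
# 32-bit gradient exchange (gradientBits=32), reading data_orig.txt and
# labelmap_orig.txt instead of the synthetic benchmark data.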
2 | ModelDir=$WorkDir$/models/$ConfigName$ 3 | stderr=$WorkDir$/logs/$ConfigName$/out 4 | precision=float 5 | deviceId=Auto 6 | 7 | command=Train 8 | 9 | featureDim = 957 10 | labelDim = 5976 11 | hiddenDim = 2048 12 | 13 | parallelTrain=false 14 | 15 | Train=[ 16 | action=train 17 | modelPath=$ModelDir$/cntk 18 | deviceId=Auto 19 | traceLevel=1 20 | 21 | SimpleNetworkBuilder=[ 22 | layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ 23 | trainingCriterion=CrossEntropyWithSoftmax 24 | evalCriterion=ErrorPrediction 25 | layerTypes=Sigmoid 26 | applyMeanVarNorm=false 27 | initValueScale=1.0 28 | uniformInit=true 29 | needPrior=false 30 | ] 31 | 32 | SGD=[ 33 | epochSize=65536 34 | minibatchSize=512 35 | learningRatesPerMB=0.1 36 | numMBsToShowResult=10 37 | momentumPerSample=0.999589 38 | dropoutRate=0.0 39 | maxEpochs=2 40 | 41 | ParallelTrain=[ 42 | parallelizationMethod=DataParallelSGD 43 | distributedMBReading=true 44 | parallelizationStartEpoch=1 45 | DataParallelSGD=[ 46 | gradientBits=32 47 | ] 48 | ] 49 | 50 | gradUpdateType=None 51 | normWithAveMultiplier=true 52 | clippingThresholdPerSample=1#INF 53 | ] 54 | ] 55 | 56 | reader=[ 57 | readerType=UCIFastReader 58 | file=$WorkDir$/../data_orig.txt 59 | features=[ 60 | dim=$featureDim$ 61 | start=1 62 | ] 63 | labels=[ 64 | dim=1 65 | start=0 66 | labelDim=$labelDim$ 67 | labelMappingFile=$WorkDir$/labelmap_orig.txt 68 | ] 69 | ] 70 | -------------------------------------------------------------------------------- /CNTK/labelmap.1K.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | 10 12 | 11 13 | 12 14 | 13 15 | 14 16 | 15 17 | 16 18 | 17 19 | 18 20 | 19 21 | 20 22 | 21 23 | 22 24 | 23 25 | 24 26 | 25 27 | 26 28 | 27 29 | 28 30 | 29 31 | 30 32 | 31 33 | 32 34 | 33 35 | 34 36 | 35 37 | 36 38 | 37 39 | 38 40 | 39 41 | 40 42 | 41 43 | 42 44 | 43 45 | 44 46 | 45 47 | 46 48 | 47 49 | 48 50 | 49 51 | 50 52 | 51 53 | 52 54 | 53 55 | 54 56 | 55 57 | 56 58 | 57 59 | 58 60 | 59 61 | 60 62 | 61 63 | 62 64 | 63 65 | 64 66 | 65 67 | 66 68 | 67 69 | 68 70 | 69 71 | 70 72 | 71 73 | 72 74 | 73 75 | 74 76 | 75 77 | 76 78 | 77 79 | 78 80 | 79 81 | 80 82 | 81 83 | 82 84 | 83 85 | 84 86 | 85 87 | 86 88 | 87 89 | 88 90 | 89 91 | 90 92 | 91 93 | 92 94 | 93 95 | 94 96 | 95 97 | 96 98 | 97 99 | 98 100 | 99 101 | 100 102 | 101 103 | 102 104 | 103 105 | 104 106 | 105 107 | 106 108 | 107 109 | 108 110 | 109 111 | 110 112 | 111 113 | 112 114 | 113 115 | 114 116 | 115 117 | 116 118 | 117 119 | 118 120 | 119 121 | 120 122 | 121 123 | 122 124 | 123 125 | 124 126 | 125 127 | 126 128 | 127 129 | 128 130 | 129 131 | 130 132 | 131 133 | 132 134 | 133 135 | 134 136 | 135 137 | 136 138 | 137 139 | 138 140 | 139 141 | 140 142 | 141 143 | 142 144 | 143 145 | 144 146 | 145 147 | 146 148 | 147 149 | 148 150 | 149 151 | 150 152 | 151 153 | 152 154 | 153 155 | 154 156 | 155 157 | 156 158 | 157 159 | 158 160 | 159 161 | 160 162 | 161 163 | 162 164 | 163 165 | 164 166 | 165 167 | 166 168 | 167 169 | 168 170 | 169 171 | 170 172 | 171 173 | 172 174 | 173 175 | 174 176 | 175 177 | 176 178 | 177 179 | 178 180 | 179 181 | 180 182 | 181 183 | 182 184 | 183 185 | 184 186 | 185 187 | 186 188 | 187 189 | 188 190 | 189 191 | 190 192 | 191 193 | 192 194 | 193 195 | 194 196 | 195 197 | 196 198 | 197 199 | 198 200 | 199 201 | 200 202 | 201 203 | 202 204 | 203 205 | 204 206 | 205 207 | 206 208 | 207 209 | 208 210 | 209 211 | 210 212 | 211 213 | 212 
214 | 213 215 | 214 216 | 215 217 | 216 218 | 217 219 | 218 220 | 219 221 | 220 222 | 221 223 | 222 224 | 223 225 | 224 226 | 225 227 | 226 228 | 227 229 | 228 230 | 229 231 | 230 232 | 231 233 | 232 234 | 233 235 | 234 236 | 235 237 | 236 238 | 237 239 | 238 240 | 239 241 | 240 242 | 241 243 | 242 244 | 243 245 | 244 246 | 245 247 | 246 248 | 247 249 | 248 250 | 249 251 | 250 252 | 251 253 | 252 254 | 253 255 | 254 256 | 255 257 | 256 258 | 257 259 | 258 260 | 259 261 | 260 262 | 261 263 | 262 264 | 263 265 | 264 266 | 265 267 | 266 268 | 267 269 | 268 270 | 269 271 | 270 272 | 271 273 | 272 274 | 273 275 | 274 276 | 275 277 | 276 278 | 277 279 | 278 280 | 279 281 | 280 282 | 281 283 | 282 284 | 283 285 | 284 286 | 285 287 | 286 288 | 287 289 | 288 290 | 289 291 | 290 292 | 291 293 | 292 294 | 293 295 | 294 296 | 295 297 | 296 298 | 297 299 | 298 300 | 299 301 | 300 302 | 301 303 | 302 304 | 303 305 | 304 306 | 305 307 | 306 308 | 307 309 | 308 310 | 309 311 | 310 312 | 311 313 | 312 314 | 313 315 | 314 316 | 315 317 | 316 318 | 317 319 | 318 320 | 319 321 | 320 322 | 321 323 | 322 324 | 323 325 | 324 326 | 325 327 | 326 328 | 327 329 | 328 330 | 329 331 | 330 332 | 331 333 | 332 334 | 333 335 | 334 336 | 335 337 | 336 338 | 337 339 | 338 340 | 339 341 | 340 342 | 341 343 | 342 344 | 343 345 | 344 346 | 345 347 | 346 348 | 347 349 | 348 350 | 349 351 | 350 352 | 351 353 | 352 354 | 353 355 | 354 356 | 355 357 | 356 358 | 357 359 | 358 360 | 359 361 | 360 362 | 361 363 | 362 364 | 363 365 | 364 366 | 365 367 | 366 368 | 367 369 | 368 370 | 369 371 | 370 372 | 371 373 | 372 374 | 373 375 | 374 376 | 375 377 | 376 378 | 377 379 | 378 380 | 379 381 | 380 382 | 381 383 | 382 384 | 383 385 | 384 386 | 385 387 | 386 388 | 387 389 | 388 390 | 389 391 | 390 392 | 391 393 | 392 394 | 393 395 | 394 396 | 395 397 | 396 398 | 397 399 | 398 400 | 399 401 | 400 402 | 401 403 | 402 404 | 403 405 | 404 406 | 405 407 | 406 408 | 407 409 | 408 410 | 409 411 | 410 412 | 411 413 | 412 414 | 413 415 | 414 416 | 415 417 | 416 418 | 417 419 | 418 420 | 419 421 | 420 422 | 421 423 | 422 424 | 423 425 | 424 426 | 425 427 | 426 428 | 427 429 | 428 430 | 429 431 | 430 432 | 431 433 | 432 434 | 433 435 | 434 436 | 435 437 | 436 438 | 437 439 | 438 440 | 439 441 | 440 442 | 441 443 | 442 444 | 443 445 | 444 446 | 445 447 | 446 448 | 447 449 | 448 450 | 449 451 | 450 452 | 451 453 | 452 454 | 453 455 | 454 456 | 455 457 | 456 458 | 457 459 | 458 460 | 459 461 | 460 462 | 461 463 | 462 464 | 463 465 | 464 466 | 465 467 | 466 468 | 467 469 | 468 470 | 469 471 | 470 472 | 471 473 | 472 474 | 473 475 | 474 476 | 475 477 | 476 478 | 477 479 | 478 480 | 479 481 | 480 482 | 481 483 | 482 484 | 483 485 | 484 486 | 485 487 | 486 488 | 487 489 | 488 490 | 489 491 | 490 492 | 491 493 | 492 494 | 493 495 | 494 496 | 495 497 | 496 498 | 497 499 | 498 500 | 499 501 | 500 502 | 501 503 | 502 504 | 503 505 | 504 506 | 505 507 | 506 508 | 507 509 | 508 510 | 509 511 | 510 512 | 511 513 | 512 514 | 513 515 | 514 516 | 515 517 | 516 518 | 517 519 | 518 520 | 519 521 | 520 522 | 521 523 | 522 524 | 523 525 | 524 526 | 525 527 | 526 528 | 527 529 | 528 530 | 529 531 | 530 532 | 531 533 | 532 534 | 533 535 | 534 536 | 535 537 | 536 538 | 537 539 | 538 540 | 539 541 | 540 542 | 541 543 | 542 544 | 543 545 | 544 546 | 545 547 | 546 548 | 547 549 | 548 550 | 549 551 | 550 552 | 551 553 | 552 554 | 553 555 | 554 556 | 555 557 | 556 558 | 557 559 | 558 560 | 559 561 | 560 562 | 561 563 | 562 564 | 563 565 | 564 566 | 565 567 | 566 568 | 567 569 
| 568 570 | 569 571 | 570 572 | 571 573 | 572 574 | 573 575 | 574 576 | 575 577 | 576 578 | 577 579 | 578 580 | 579 581 | 580 582 | 581 583 | 582 584 | 583 585 | 584 586 | 585 587 | 586 588 | 587 589 | 588 590 | 589 591 | 590 592 | 591 593 | 592 594 | 593 595 | 594 596 | 595 597 | 596 598 | 597 599 | 598 600 | 599 601 | 600 602 | 601 603 | 602 604 | 603 605 | 604 606 | 605 607 | 606 608 | 607 609 | 608 610 | 609 611 | 610 612 | 611 613 | 612 614 | 613 615 | 614 616 | 615 617 | 616 618 | 617 619 | 618 620 | 619 621 | 620 622 | 621 623 | 622 624 | 623 625 | 624 626 | 625 627 | 626 628 | 627 629 | 628 630 | 629 631 | 630 632 | 631 633 | 632 634 | 633 635 | 634 636 | 635 637 | 636 638 | 637 639 | 638 640 | 639 641 | 640 642 | 641 643 | 642 644 | 643 645 | 644 646 | 645 647 | 646 648 | 647 649 | 648 650 | 649 651 | 650 652 | 651 653 | 652 654 | 653 655 | 654 656 | 655 657 | 656 658 | 657 659 | 658 660 | 659 661 | 660 662 | 661 663 | 662 664 | 663 665 | 664 666 | 665 667 | 666 668 | 667 669 | 668 670 | 669 671 | 670 672 | 671 673 | 672 674 | 673 675 | 674 676 | 675 677 | 676 678 | 677 679 | 678 680 | 679 681 | 680 682 | 681 683 | 682 684 | 683 685 | 684 686 | 685 687 | 686 688 | 687 689 | 688 690 | 689 691 | 690 692 | 691 693 | 692 694 | 693 695 | 694 696 | 695 697 | 696 698 | 697 699 | 698 700 | 699 701 | 700 702 | 701 703 | 702 704 | 703 705 | 704 706 | 705 707 | 706 708 | 707 709 | 708 710 | 709 711 | 710 712 | 711 713 | 712 714 | 713 715 | 714 716 | 715 717 | 716 718 | 717 719 | 718 720 | 719 721 | 720 722 | 721 723 | 722 724 | 723 725 | 724 726 | 725 727 | 726 728 | 727 729 | 728 730 | 729 731 | 730 732 | 731 733 | 732 734 | 733 735 | 734 736 | 735 737 | 736 738 | 737 739 | 738 740 | 739 741 | 740 742 | 741 743 | 742 744 | 743 745 | 744 746 | 745 747 | 746 748 | 747 749 | 748 750 | 749 751 | 750 752 | 751 753 | 752 754 | 753 755 | 754 756 | 755 757 | 756 758 | 757 759 | 758 760 | 759 761 | 760 762 | 761 763 | 762 764 | 763 765 | 764 766 | 765 767 | 766 768 | 767 769 | 768 770 | 769 771 | 770 772 | 771 773 | 772 774 | 773 775 | 774 776 | 775 777 | 776 778 | 777 779 | 778 780 | 779 781 | 780 782 | 781 783 | 782 784 | 783 785 | 784 786 | 785 787 | 786 788 | 787 789 | 788 790 | 789 791 | 790 792 | 791 793 | 792 794 | 793 795 | 794 796 | 795 797 | 796 798 | 797 799 | 798 800 | 799 801 | 800 802 | 801 803 | 802 804 | 803 805 | 804 806 | 805 807 | 806 808 | 807 809 | 808 810 | 809 811 | 810 812 | 811 813 | 812 814 | 813 815 | 814 816 | 815 817 | 816 818 | 817 819 | 818 820 | 819 821 | 820 822 | 821 823 | 822 824 | 823 825 | 824 826 | 825 827 | 826 828 | 827 829 | 828 830 | 829 831 | 830 832 | 831 833 | 832 834 | 833 835 | 834 836 | 835 837 | 836 838 | 837 839 | 838 840 | 839 841 | 840 842 | 841 843 | 842 844 | 843 845 | 844 846 | 845 847 | 846 848 | 847 849 | 848 850 | 849 851 | 850 852 | 851 853 | 852 854 | 853 855 | 854 856 | 855 857 | 856 858 | 857 859 | 858 860 | 859 861 | 860 862 | 861 863 | 862 864 | 863 865 | 864 866 | 865 867 | 866 868 | 867 869 | 868 870 | 869 871 | 870 872 | 871 873 | 872 874 | 873 875 | 874 876 | 875 877 | 876 878 | 877 879 | 878 880 | 879 881 | 880 882 | 881 883 | 882 884 | 883 885 | 884 886 | 885 887 | 886 888 | 887 889 | 888 890 | 889 891 | 890 892 | 891 893 | 892 894 | 893 895 | 894 896 | 895 897 | 896 898 | 897 899 | 898 900 | 899 901 | 900 902 | 901 903 | 902 904 | 903 905 | 904 906 | 905 907 | 906 908 | 907 909 | 908 910 | 909 911 | 910 912 | 911 913 | 912 914 | 913 915 | 914 916 | 915 917 | 916 918 | 917 919 | 918 920 | 919 921 | 920 922 | 921 923 | 922 924 | 
923 925 | 924 926 | 925 927 | 926 928 | 927 929 | 928 930 | 929 931 | 930 932 | 931 933 | 932 934 | 933 935 | 934 936 | 935 937 | 936 938 | 937 939 | 938 940 | 939 941 | 940 942 | 941 943 | 942 944 | 943 945 | 944 946 | 945 947 | 946 948 | 947 949 | 948 950 | 949 951 | 950 952 | 951 953 | 952 954 | 953 955 | 954 956 | 955 957 | 956 958 | 957 959 | 958 960 | 959 961 | 960 962 | 961 963 | 962 964 | 963 965 | 964 966 | 965 967 | 966 968 | 967 969 | 968 970 | 969 971 | 970 972 | 971 973 | 972 974 | 973 975 | 974 976 | 975 977 | 976 978 | 977 979 | 978 980 | 979 981 | 980 982 | 981 983 | 982 984 | 983 985 | 984 986 | 985 987 | 986 988 | 987 989 | 988 990 | 989 991 | 990 992 | 991 993 | 992 994 | 993 995 | 994 996 | 995 997 | 996 998 | 997 999 | 998 1000 | 999 1001 | -------------------------------------------------------------------------------- /GPU-hardware.txt: -------------------------------------------------------------------------------- 1 | 2 | ==============NVSMI LOG============== 3 | 4 | Timestamp : Tue Dec 8 06:02:49 2015 5 | Driver Version : 346.72 6 | 7 | Attached GPUs : 4 8 | GPU 0000:0A:00.0 9 | Product Name : Tesla K40m 10 | Product Brand : Tesla 11 | Display Mode : Disabled 12 | Display Active : Disabled 13 | Persistence Mode : Disabled 14 | Accounting Mode : Disabled 15 | Accounting Mode Buffer Size : 128 16 | Driver Model 17 | Current : N/A 18 | Pending : N/A 19 | Serial Number : 0322815022851 20 | GPU UUID : GPU-9d74c40c-8145-27f7-a65d-0c8268d892cd 21 | Minor Number : 1 22 | VBIOS Version : 80.80.3E.00.0F 23 | MultiGPU Board : No 24 | Board ID : 0xa00 25 | Inforom Version 26 | Image Version : 2081.0202.01.04 27 | OEM Object : 1.1 28 | ECC Object : 3.0 29 | Power Management Object : N/A 30 | GPU Operation Mode 31 | Current : N/A 32 | Pending : N/A 33 | PCI 34 | Bus : 0x0A 35 | Device : 0x00 36 | Domain : 0x0000 37 | Device Id : 0x102310DE 38 | Bus Id : 0000:0A:00.0 39 | Sub System Id : 0x097E10DE 40 | GPU Link Info 41 | PCIe Generation 42 | Max : 3 43 | Current : 3 44 | Link Width 45 | Max : 16x 46 | Current : 16x 47 | Bridge Chip 48 | Type : N/A 49 | Firmware : N/A 50 | Replays since reset : 0 51 | Tx Throughput : N/A 52 | Rx Throughput : N/A 53 | Fan Speed : N/A 54 | Performance State : P0 55 | Clocks Throttle Reasons 56 | Idle : Not Active 57 | Applications Clocks Setting : Active 58 | SW Power Cap : Not Active 59 | HW Slowdown : Not Active 60 | Unknown : Not Active 61 | FB Memory Usage 62 | Total : 11519 MiB 63 | Used : 55 MiB 64 | Free : 11464 MiB 65 | BAR1 Memory Usage 66 | Total : 16384 MiB 67 | Used : 2 MiB 68 | Free : 16382 MiB 69 | Compute Mode : Default 70 | Utilization 71 | Gpu : 0 % 72 | Memory : 0 % 73 | Encoder : 0 % 74 | Decoder : 0 % 75 | Ecc Mode 76 | Current : Enabled 77 | Pending : Enabled 78 | ECC Errors 79 | Volatile 80 | Single Bit 81 | Device Memory : 0 82 | Register File : 0 83 | L1 Cache : 0 84 | L2 Cache : 0 85 | Texture Memory : 0 86 | Total : 0 87 | Double Bit 88 | Device Memory : 0 89 | Register File : 0 90 | L1 Cache : 0 91 | L2 Cache : 0 92 | Texture Memory : 0 93 | Total : 0 94 | Aggregate 95 | Single Bit 96 | Device Memory : 0 97 | Register File : 0 98 | L1 Cache : 0 99 | L2 Cache : 0 100 | Texture Memory : 0 101 | Total : 0 102 | Double Bit 103 | Device Memory : 0 104 | Register File : 0 105 | L1 Cache : 0 106 | L2 Cache : 0 107 | Texture Memory : 0 108 | Total : 0 109 | Retired Pages 110 | Single Bit ECC : 0 111 | Double Bit ECC : 0 112 | Pending : No 113 | Temperature 114 | GPU Current Temp : 38 C 115 | GPU Shutdown Temp : 95 C 116 | GPU 
Slowdown Temp : 90 C 117 | Power Readings 118 | Power Management : Supported 119 | Power Draw : 62.50 W 120 | Power Limit : 235.00 W 121 | Default Power Limit : 235.00 W 122 | Enforced Power Limit : 235.00 W 123 | Min Power Limit : 180.00 W 124 | Max Power Limit : 235.00 W 125 | Clocks 126 | Graphics : 745 MHz 127 | SM : 745 MHz 128 | Memory : 3004 MHz 129 | Applications Clocks 130 | Graphics : 745 MHz 131 | Memory : 3004 MHz 132 | Default Applications Clocks 133 | Graphics : 745 MHz 134 | Memory : 3004 MHz 135 | Max Clocks 136 | Graphics : 875 MHz 137 | SM : 875 MHz 138 | Memory : 3004 MHz 139 | Clock Policy 140 | Auto Boost : N/A 141 | Auto Boost Default : N/A 142 | Processes : None 143 | 144 | GPU 0000:0D:00.0 145 | Product Name : Tesla K40m 146 | Product Brand : Tesla 147 | Display Mode : Disabled 148 | Display Active : Disabled 149 | Persistence Mode : Disabled 150 | Accounting Mode : Disabled 151 | Accounting Mode Buffer Size : 128 152 | Driver Model 153 | Current : N/A 154 | Pending : N/A 155 | Serial Number : 0323315059424 156 | GPU UUID : GPU-1c892c8f-e42d-2261-f9a3-a9dbf6d1d2dc 157 | Minor Number : 0 158 | VBIOS Version : 80.80.3E.00.0F 159 | MultiGPU Board : No 160 | Board ID : 0xd00 161 | Inforom Version 162 | Image Version : 2081.0202.01.04 163 | OEM Object : 1.1 164 | ECC Object : 3.0 165 | Power Management Object : N/A 166 | GPU Operation Mode 167 | Current : N/A 168 | Pending : N/A 169 | PCI 170 | Bus : 0x0D 171 | Device : 0x00 172 | Domain : 0x0000 173 | Device Id : 0x102310DE 174 | Bus Id : 0000:0D:00.0 175 | Sub System Id : 0x097E10DE 176 | GPU Link Info 177 | PCIe Generation 178 | Max : 3 179 | Current : 3 180 | Link Width 181 | Max : 16x 182 | Current : 16x 183 | Bridge Chip 184 | Type : N/A 185 | Firmware : N/A 186 | Replays since reset : 0 187 | Tx Throughput : N/A 188 | Rx Throughput : N/A 189 | Fan Speed : N/A 190 | Performance State : P0 191 | Clocks Throttle Reasons 192 | Idle : Not Active 193 | Applications Clocks Setting : Active 194 | SW Power Cap : Not Active 195 | HW Slowdown : Not Active 196 | Unknown : Not Active 197 | FB Memory Usage 198 | Total : 11519 MiB 199 | Used : 55 MiB 200 | Free : 11464 MiB 201 | BAR1 Memory Usage 202 | Total : 16384 MiB 203 | Used : 2 MiB 204 | Free : 16382 MiB 205 | Compute Mode : Default 206 | Utilization 207 | Gpu : 0 % 208 | Memory : 0 % 209 | Encoder : 0 % 210 | Decoder : 0 % 211 | Ecc Mode 212 | Current : Enabled 213 | Pending : Enabled 214 | ECC Errors 215 | Volatile 216 | Single Bit 217 | Device Memory : 0 218 | Register File : 0 219 | L1 Cache : 0 220 | L2 Cache : 0 221 | Texture Memory : 0 222 | Total : 0 223 | Double Bit 224 | Device Memory : 0 225 | Register File : 0 226 | L1 Cache : 0 227 | L2 Cache : 0 228 | Texture Memory : 0 229 | Total : 0 230 | Aggregate 231 | Single Bit 232 | Device Memory : 0 233 | Register File : 0 234 | L1 Cache : 0 235 | L2 Cache : 0 236 | Texture Memory : 0 237 | Total : 0 238 | Double Bit 239 | Device Memory : 0 240 | Register File : 0 241 | L1 Cache : 0 242 | L2 Cache : 0 243 | Texture Memory : 0 244 | Total : 0 245 | Retired Pages 246 | Single Bit ECC : 0 247 | Double Bit ECC : 0 248 | Pending : No 249 | Temperature 250 | GPU Current Temp : 40 C 251 | GPU Shutdown Temp : 95 C 252 | GPU Slowdown Temp : 90 C 253 | Power Readings 254 | Power Management : Supported 255 | Power Draw : 64.53 W 256 | Power Limit : 235.00 W 257 | Default Power Limit : 235.00 W 258 | Enforced Power Limit : 235.00 W 259 | Min Power Limit : 180.00 W 260 | Max Power Limit : 235.00 W 261 | Clocks 262 | Graphics : 
745 MHz 263 | SM : 745 MHz 264 | Memory : 3004 MHz 265 | Applications Clocks 266 | Graphics : 745 MHz 267 | Memory : 3004 MHz 268 | Default Applications Clocks 269 | Graphics : 745 MHz 270 | Memory : 3004 MHz 271 | Max Clocks 272 | Graphics : 875 MHz 273 | SM : 875 MHz 274 | Memory : 3004 MHz 275 | Clock Policy 276 | Auto Boost : N/A 277 | Auto Boost Default : N/A 278 | Processes : None 279 | 280 | GPU 0000:2B:00.0 281 | Product Name : Tesla K40m 282 | Product Brand : Tesla 283 | Display Mode : Disabled 284 | Display Active : Disabled 285 | Persistence Mode : Disabled 286 | Accounting Mode : Disabled 287 | Accounting Mode Buffer Size : 128 288 | Driver Model 289 | Current : N/A 290 | Pending : N/A 291 | Serial Number : 0323315058830 292 | GPU UUID : GPU-69643319-f398-0e93-e5a0-c1c019b5f866 293 | Minor Number : 2 294 | VBIOS Version : 80.80.3E.00.0F 295 | MultiGPU Board : No 296 | Board ID : 0x2b00 297 | Inforom Version 298 | Image Version : 2081.0202.01.04 299 | OEM Object : 1.1 300 | ECC Object : 3.0 301 | Power Management Object : N/A 302 | GPU Operation Mode 303 | Current : N/A 304 | Pending : N/A 305 | PCI 306 | Bus : 0x2B 307 | Device : 0x00 308 | Domain : 0x0000 309 | Device Id : 0x102310DE 310 | Bus Id : 0000:2B:00.0 311 | Sub System Id : 0x097E10DE 312 | GPU Link Info 313 | PCIe Generation 314 | Max : 3 315 | Current : 3 316 | Link Width 317 | Max : 16x 318 | Current : 16x 319 | Bridge Chip 320 | Type : N/A 321 | Firmware : N/A 322 | Replays since reset : 0 323 | Tx Throughput : N/A 324 | Rx Throughput : N/A 325 | Fan Speed : N/A 326 | Performance State : P0 327 | Clocks Throttle Reasons 328 | Idle : Not Active 329 | Applications Clocks Setting : Active 330 | SW Power Cap : Not Active 331 | HW Slowdown : Not Active 332 | Unknown : Not Active 333 | FB Memory Usage 334 | Total : 11519 MiB 335 | Used : 55 MiB 336 | Free : 11464 MiB 337 | BAR1 Memory Usage 338 | Total : 16384 MiB 339 | Used : 2 MiB 340 | Free : 16382 MiB 341 | Compute Mode : Default 342 | Utilization 343 | Gpu : 0 % 344 | Memory : 0 % 345 | Encoder : 0 % 346 | Decoder : 0 % 347 | Ecc Mode 348 | Current : Enabled 349 | Pending : Enabled 350 | ECC Errors 351 | Volatile 352 | Single Bit 353 | Device Memory : 0 354 | Register File : 0 355 | L1 Cache : 0 356 | L2 Cache : 0 357 | Texture Memory : 0 358 | Total : 0 359 | Double Bit 360 | Device Memory : 0 361 | Register File : 0 362 | L1 Cache : 0 363 | L2 Cache : 0 364 | Texture Memory : 0 365 | Total : 0 366 | Aggregate 367 | Single Bit 368 | Device Memory : 0 369 | Register File : 0 370 | L1 Cache : 0 371 | L2 Cache : 0 372 | Texture Memory : 0 373 | Total : 0 374 | Double Bit 375 | Device Memory : 0 376 | Register File : 0 377 | L1 Cache : 0 378 | L2 Cache : 0 379 | Texture Memory : 0 380 | Total : 0 381 | Retired Pages 382 | Single Bit ECC : 0 383 | Double Bit ECC : 0 384 | Pending : No 385 | Temperature 386 | GPU Current Temp : 43 C 387 | GPU Shutdown Temp : 95 C 388 | GPU Slowdown Temp : 90 C 389 | Power Readings 390 | Power Management : Supported 391 | Power Draw : 63.97 W 392 | Power Limit : 235.00 W 393 | Default Power Limit : 235.00 W 394 | Enforced Power Limit : 235.00 W 395 | Min Power Limit : 180.00 W 396 | Max Power Limit : 235.00 W 397 | Clocks 398 | Graphics : 745 MHz 399 | SM : 745 MHz 400 | Memory : 3004 MHz 401 | Applications Clocks 402 | Graphics : 745 MHz 403 | Memory : 3004 MHz 404 | Default Applications Clocks 405 | Graphics : 745 MHz 406 | Memory : 3004 MHz 407 | Max Clocks 408 | Graphics : 875 MHz 409 | SM : 875 MHz 410 | Memory : 3004 MHz 411 | 
Clock Policy 412 | Auto Boost : N/A 413 | Auto Boost Default : N/A 414 | Processes : None 415 | 416 | GPU 0000:30:00.0 417 | Product Name : Tesla K40m 418 | Product Brand : Tesla 419 | Display Mode : Disabled 420 | Display Active : Disabled 421 | Persistence Mode : Disabled 422 | Accounting Mode : Disabled 423 | Accounting Mode Buffer Size : 128 424 | Driver Model 425 | Current : N/A 426 | Pending : N/A 427 | Serial Number : 0323315059276 428 | GPU UUID : GPU-ea4ed96a-36b3-cb17-1b13-79d47b84b5e7 429 | Minor Number : 3 430 | VBIOS Version : 80.80.3E.00.0F 431 | MultiGPU Board : No 432 | Board ID : 0x3000 433 | Inforom Version 434 | Image Version : 2081.0202.01.04 435 | OEM Object : 1.1 436 | ECC Object : 3.0 437 | Power Management Object : N/A 438 | GPU Operation Mode 439 | Current : N/A 440 | Pending : N/A 441 | PCI 442 | Bus : 0x30 443 | Device : 0x00 444 | Domain : 0x0000 445 | Device Id : 0x102310DE 446 | Bus Id : 0000:30:00.0 447 | Sub System Id : 0x097E10DE 448 | GPU Link Info 449 | PCIe Generation 450 | Max : 3 451 | Current : 3 452 | Link Width 453 | Max : 16x 454 | Current : 16x 455 | Bridge Chip 456 | Type : N/A 457 | Firmware : N/A 458 | Replays since reset : 0 459 | Tx Throughput : N/A 460 | Rx Throughput : N/A 461 | Fan Speed : N/A 462 | Performance State : P0 463 | Clocks Throttle Reasons 464 | Idle : Not Active 465 | Applications Clocks Setting : Active 466 | SW Power Cap : Not Active 467 | HW Slowdown : Not Active 468 | Unknown : Not Active 469 | FB Memory Usage 470 | Total : 11519 MiB 471 | Used : 55 MiB 472 | Free : 11464 MiB 473 | BAR1 Memory Usage 474 | Total : 16384 MiB 475 | Used : 2 MiB 476 | Free : 16382 MiB 477 | Compute Mode : Default 478 | Utilization 479 | Gpu : 96 % 480 | Memory : 4 % 481 | Encoder : 0 % 482 | Decoder : 0 % 483 | Ecc Mode 484 | Current : Enabled 485 | Pending : Enabled 486 | ECC Errors 487 | Volatile 488 | Single Bit 489 | Device Memory : 0 490 | Register File : 0 491 | L1 Cache : 0 492 | L2 Cache : 0 493 | Texture Memory : 0 494 | Total : 0 495 | Double Bit 496 | Device Memory : 0 497 | Register File : 0 498 | L1 Cache : 0 499 | L2 Cache : 0 500 | Texture Memory : 0 501 | Total : 0 502 | Aggregate 503 | Single Bit 504 | Device Memory : 0 505 | Register File : 0 506 | L1 Cache : 0 507 | L2 Cache : 0 508 | Texture Memory : 0 509 | Total : 0 510 | Double Bit 511 | Device Memory : 0 512 | Register File : 0 513 | L1 Cache : 0 514 | L2 Cache : 0 515 | Texture Memory : 0 516 | Total : 0 517 | Retired Pages 518 | Single Bit ECC : 0 519 | Double Bit ECC : 0 520 | Pending : No 521 | Temperature 522 | GPU Current Temp : 42 C 523 | GPU Shutdown Temp : 95 C 524 | GPU Slowdown Temp : 90 C 525 | Power Readings 526 | Power Management : Supported 527 | Power Draw : 65.95 W 528 | Power Limit : 235.00 W 529 | Default Power Limit : 235.00 W 530 | Enforced Power Limit : 235.00 W 531 | Min Power Limit : 180.00 W 532 | Max Power Limit : 235.00 W 533 | Clocks 534 | Graphics : 745 MHz 535 | SM : 745 MHz 536 | Memory : 3004 MHz 537 | Applications Clocks 538 | Graphics : 745 MHz 539 | Memory : 3004 MHz 540 | Default Applications Clocks 541 | Graphics : 745 MHz 542 | Memory : 3004 MHz 543 | Max Clocks 544 | Graphics : 875 MHz 545 | SM : 875 MHz 546 | Memory : 3004 MHz 547 | Clock Policy 548 | Auto Boost : N/A 549 | Auto Boost Default : N/A 550 | Processes : None 551 | 552 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 
Benchmarks for CNTK and other toolkits.
3 | 
4 | Disclaimer: I'm a Microsoft employee; however, this is my personal GitHub account, and the information/code shared here does not represent the opinions or views of Microsoft in any way.
5 | 
-------------------------------------------------------------------------------- /TensorFlow/ffn.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | 
4 | featureDim = 512
5 | labelDim = 10000
6 | hiddenLayDim = 2048
7 | numMinibatches = 150
8 | 
9 | FLAGS = tf.app.flags.FLAGS
10 | tf.app.flags.DEFINE_boolean('logDevicePlacement', False,
11 |                             """Whether to log device placement.""")
12 | tf.app.flags.DEFINE_boolean('noInputFeed', False,
13 |                             """Whether to skip feeding new features/labels data for each minibatch.""")
14 | 
15 | data = np.loadtxt('../data.txt')
16 | features = data[:,1:]
17 | labels = data[:,0]
18 | 
19 | # Get parameters randomly initialized with a uniform distribution between -0.5 and 0.5
20 | def getParameters(name, shape):
21 |     return tf.get_variable(name, shape, initializer=tf.random_uniform_initializer(-0.5, 0.5))
22 | 
23 | def sigmoidDNNLayer(layerIdx, input, inputDim, outputDim):
24 |     W = getParameters("W" + str(layerIdx), [inputDim, outputDim])
25 |     B = getParameters("B" + str(layerIdx), [outputDim])
26 |     return tf.nn.sigmoid(tf.nn.xw_plus_b(input, W, B))
27 | 
28 | def getFakeMinibatch(minibatchSize):
29 |     #feat = np.random.randn(minibatchSize, featureDim)
30 |     #lab = np.zeros((minibatchSize, labelDim))
31 |     #for row in lab:
32 |     #    row[np.random.randint(0, labelDim)] = 1
33 |     feat = features[:minibatchSize]
34 |     l = labels[:minibatchSize]
35 |     lab = np.zeros((minibatchSize, labelDim))
36 |     for i in range(lab.shape[0]):
37 |         lab[i][int(l[i])] = 1
38 |     return feat, lab
39 |     #fakeFeatures = [[0.0 for _ in xrange(featureDim)] for _ in xrange(minibatchSize)]
40 |     #fakeLabels = [[0.0 for _ in xrange(labelDim)] for _ in xrange(minibatchSize)]
41 |     #for sampleIdx in xrange(minibatchSize):
42 |     #    fakeLabels[sampleIdx][np.random.randint(0, labelDim - 1)] = 1.0
43 |     #    for featureIdx in xrange(featureDim):
44 |     #        fakeFeatures[sampleIdx][featureIdx] = np.random.randn()
45 |     #
46 |     #return fakeFeatures, fakeLabels
47 | 
48 | 
49 | def getLossAndAccuracyForSubBatch(features, labels):
50 | 
51 |     HL0 = sigmoidDNNLayer(0, features, featureDim, hiddenLayDim)
52 |     HL1 = sigmoidDNNLayer(1, HL0, hiddenLayDim, hiddenLayDim)
53 |     HL2 = sigmoidDNNLayer(2, HL1, hiddenLayDim, hiddenLayDim)
54 |     HL3 = sigmoidDNNLayer(3, HL2, hiddenLayDim, hiddenLayDim)
55 | 
56 |     outputLayerW = getParameters("W5", [hiddenLayDim, labelDim])
57 |     outputLayerB = getParameters("B5", [labelDim])
58 |     outputLayer = tf.nn.softmax(tf.nn.xw_plus_b(HL3, outputLayerW, outputLayerB))
59 | 
60 |     crossEntropy = -tf.reduce_mean(labels * tf.log(outputLayer))
61 |     predictionCorrectness = tf.equal(tf.argmax(outputLayer, 1), tf.argmax(labels, 1))
62 |     accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))
63 | 
64 |     return crossEntropy, accuracy
65 | 
66 | def printTrainingStats(numGPUs, minibatchSize, perMinibatchTime):
67 |     meanTimePerMinibatch = np.mean(perMinibatchTime)
68 |     medianTimePerMinibatch = np.median(perMinibatchTime)
69 |     minTimePerMinibatch = np.min(perMinibatchTime)
70 | 
71 |     def samplesPerSec(minibatchSize, processingTime):
72 |         return minibatchSize/processingTime
73 | 
74 |     print('*****************************Training on %d GPUs***************************************' % numGPUs)
75 |     print('MinibatchSize=%d, 
NumMinibatches=%d.' % (minibatchSize, numMinibatches)) 76 | print('Training speed (samples/sec): Average=%d, Median=%d, Max=%d' % (samplesPerSec(minibatchSize, meanTimePerMinibatch), 77 | samplesPerSec(minibatchSize, medianTimePerMinibatch), 78 | samplesPerSec(minibatchSize, minTimePerMinibatch))) 79 | print('*************************************************************************************') 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /TensorFlow/ffn_1GPU.log: -------------------------------------------------------------------------------- 1 | I tensorflow/core/common_runtime/local_device.cc:25] Local device intra op parallelism threads: 20 2 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 0 with properties: 3 | name: Tesla K40m 4 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 5 | pciBusID 0000:0d:00.0 6 | Total memory: 11.25GiB 7 | Free memory: 11.12GiB 8 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 1 with properties: 9 | name: Tesla K40m 10 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 11 | pciBusID 0000:0a:00.0 12 | Total memory: 11.25GiB 13 | Free memory: 11.12GiB 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 2 with properties: 15 | name: Tesla K40m 16 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 17 | pciBusID 0000:2b:00.0 18 | Total memory: 11.25GiB 19 | Free memory: 11.12GiB 20 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 3 with properties: 21 | name: Tesla K40m 22 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 23 | pciBusID 0000:30:00.0 24 | Total memory: 11.25GiB 25 | Free memory: 11.12GiB 26 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 0 to device ordinal 2 27 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 0 to device ordinal 3 28 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 1 to device ordinal 2 29 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 1 to device ordinal 3 30 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 2 to device ordinal 0 31 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 2 to device ordinal 1 32 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 3 to device ordinal 0 33 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 3 to device ordinal 1 34 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:112] DMA: 0 1 2 3 35 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 0: Y Y N N 36 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 1: Y Y N N 37 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 2: N N Y Y 38 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 3: N N Y Y 39 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bus id: 0000:0d:00.0) 40 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K40m, pci bus id: 0000:0a:00.0) 41 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K40m, pci bus id: 0000:2b:00.0) 42 | I 
tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K40m, pci bus id: 0000:30:00.0) 43 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344381133 44 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344373351 45 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344381133 46 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344373351 47 | I tensorflow/core/common_runtime/local_session.cc:45] Local session inter op parallelism threads: 20 48 | *****************************Training on 1 GPUs*************************************** 49 | MinibatchSize=8192, NumMinibatches=150. 50 | Training speed (samples/sec): Average=8141, Median=8183, Max=8414 51 | ************************************************************************************* 52 | -------------------------------------------------------------------------------- /TensorFlow/ffn_4GPUs.log: -------------------------------------------------------------------------------- 1 | I tensorflow/core/common_runtime/local_device.cc:25] Local device intra op parallelism threads: 20 2 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 0 with properties: 3 | name: Tesla K40m 4 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 5 | pciBusID 0000:0d:00.0 6 | Total memory: 11.25GiB 7 | Free memory: 11.12GiB 8 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 1 with properties: 9 | name: Tesla K40m 10 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 11 | pciBusID 0000:0a:00.0 12 | Total memory: 11.25GiB 13 | Free memory: 11.12GiB 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 2 with properties: 15 | name: Tesla K40m 16 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 17 | pciBusID 0000:2b:00.0 18 | Total memory: 11.25GiB 19 | Free memory: 11.12GiB 20 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 3 with properties: 21 | name: Tesla K40m 22 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 23 | pciBusID 0000:30:00.0 24 | Total memory: 11.25GiB 25 | Free memory: 11.12GiB 26 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 0 to device ordinal 2 27 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 0 to device ordinal 3 28 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 1 to device ordinal 2 29 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 1 to device ordinal 3 30 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 2 to device ordinal 0 31 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 2 to device ordinal 1 32 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 3 to device ordinal 0 33 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 3 to device ordinal 1 34 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:112] DMA: 0 1 2 3 35 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 0: Y Y N N 36 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 1: Y Y N N 37 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 2: N N Y Y 38 | I 
tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 3: N N Y Y
39 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bus id: 0000:0d:00.0)
40 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K40m, pci bus id: 0000:0a:00.0)
41 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K40m, pci bus id: 0000:2b:00.0)
42 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K40m, pci bus id: 0000:30:00.0)
43 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344381133
44 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344373351
45 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344381133
46 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344373351
47 | I tensorflow/core/common_runtime/local_session.cc:45] Local session inter op parallelism threads: 20
48 | *****************************Training on 4 GPUs***************************************
49 | MinibatchSize=8192, NumMinibatches=150.
50 | Training speed (samples/sec): Average=11255, Median=11381, Max=12359
51 | *************************************************************************************
52 | 
-------------------------------------------------------------------------------- /TensorFlow/ffn_exp.py: --------------------------------------------------------------------------------
1 | # A feed-forward DNN with 4 hidden layers using sigmoid activations.
2 | 
3 | import time
4 | import tensorflow as tf
5 | import ffn
6 | 
7 | from ffn import *
8 | 
9 | minibatchSize = 8192
10 | 
11 | # Create the model
12 | if (FLAGS.noInputFeed):
13 |     features, labels = getFakeMinibatch(minibatchSize)
14 | else:
15 |     features = tf.placeholder("float", [None, featureDim])
16 |     labels = tf.placeholder("float", [None, labelDim])
17 | 
18 | crossEntropy, accuracy = getLossAndAccuracyForSubBatch(features, labels)
19 | trainStep = tf.train.GradientDescentOptimizer(0.01).minimize(crossEntropy)
20 | 
21 | # Train
22 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.logDevicePlacement))
23 | init = tf.initialize_all_variables()
24 | sess.run(init)
25 | 
26 | perMinibatchTime = []
27 | for i in range(numMinibatches):
28 |     if (FLAGS.noInputFeed == False):
29 |         minibatchFeatures, minibatchLabels = getFakeMinibatch(minibatchSize)
30 | 
31 |     startTime = time.time()
32 |     if (FLAGS.noInputFeed):
33 |         sess.run([trainStep, accuracy])
34 |     else:
35 |         sess.run([trainStep, accuracy], feed_dict={features: minibatchFeatures, labels: minibatchLabels})
36 | 
37 |     currMinibatchDuration = time.time() - startTime
38 |     perMinibatchTime.append(currMinibatchDuration)
39 | 
40 | printTrainingStats(1, minibatchSize, perMinibatchTime)
41 | 
42 | 
-------------------------------------------------------------------------------- /TensorFlow/ffn_exp_4GPUs.py: --------------------------------------------------------------------------------
1 | # A feed-forward DNN with 4 hidden layers using sigmoid activations.
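# Data-parallel layout (a summary of this script, not additional functionality):
# each of the FLAGS.numGPUs replicas computes gradients on its own
# subMinibatchSize=2048 sub-minibatch, aggregateGradients() sums them across
# replicas, and one SGD step is applied per aggregate minibatch of
# minibatchSize = numGPUs * subMinibatchSize = 4 * 2048 = 8192 samples.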
2 | # Uses dataparallel SGD with multiple GPUs 3 | 4 | import time 5 | import tensorflow as tf 6 | import ffn 7 | 8 | from ffn import * 9 | 10 | tf.app.flags.DEFINE_integer('numGPUs', 4, 11 | """How many GPUs to use.""") 12 | 13 | subMinibatchSize = 2048 14 | minibatchSize = FLAGS.numGPUs * subMinibatchSize 15 | 16 | def aggregateGradients(subMinibatchGradients): 17 | aggGrads = [] 18 | for gradAndVars in zip(*subMinibatchGradients): 19 | # Note that each gradAndVars looks like the following: 20 | # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) 21 | grads = [] 22 | for g, _ in gradAndVars: 23 | # Add 0 dimension to the gradients to represent the replica. 24 | expanded_g = tf.expand_dims(g, 0) 25 | 26 | # Append on a 'replica' dimension which we will sum over below. 27 | grads.append(expanded_g) 28 | 29 | # Sum over the 'replica' dimension. 30 | grad = tf.concat(0, grads) 31 | grad = tf.reduce_sum(grad, 0) 32 | 33 | # Keep in mind that the Variables are redundant because they are shared 34 | # across replicas. So .. we will just return the first replica's pointer to 35 | # the Variable. 36 | v = gradAndVars[0][1] 37 | gradAndVar = (grad, v) 38 | aggGrads.append(gradAndVar) 39 | return aggGrads 40 | 41 | if (FLAGS.noInputFeed): 42 | features, labels = getFakeMinibatch(subMinibatchSize) 43 | else: 44 | # HACK: Using the same subMinibatch across all GPUs 45 | features = tf.placeholder("float", [None, featureDim]) 46 | labels = tf.placeholder("float", [None, labelDim]) 47 | 48 | optimizer = tf.train.GradientDescentOptimizer(0.01) 49 | 50 | # Calculate the gradients for each subBatch on a different GPU 51 | subMinibatchGradients = [] 52 | subMinibatchAccuracies = [] 53 | for i in xrange(FLAGS.numGPUs): 54 | with tf.device('/gpu:%d' % i): 55 | with tf.name_scope('%s_%d' % ("replica", i)) as scope: 56 | # Calculate the loss for one subBatch. This function 57 | # constructs the entire model but shares the variables across 58 | # all replicas. 59 | loss, accuracy = getLossAndAccuracyForSubBatch(features, labels) 60 | 61 | # Reuse variables for the next replica. 62 | tf.get_variable_scope().reuse_variables() 63 | 64 | # Calculate the gradients for this subBatch on this GPU 65 | grads = optimizer.compute_gradients(loss) 66 | 67 | # Keep track of the gradients across all replicas. 68 | subMinibatchGradients.append(grads) 69 | subMinibatchAccuracies.append(accuracy) 70 | 71 | # We must calculate the sum of each gradient. Note that this is the 72 | # synchronization point across all towers. 73 | grads = aggregateGradients(subMinibatchGradients) 74 | accuracy = tf.reduce_sum(tf.pack(subMinibatchAccuracies)) 75 | 76 | # Apply the gradients to adjust the shared variables. 77 | applyGradientOp = optimizer.apply_gradients(grads) 78 | 79 | # Start running operations on the Graph. allow_soft_placement must be set to 80 | # True to build replicas on GPU, as some of the ops do not have GPU implementations. 81 | sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=FLAGS.logDevicePlacement)) 82 | init = tf.initialize_all_variables() 83 | sess.run(init) 84 | 85 | # Start the queue runners. 
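# (No input queues are defined in this graph; inputs arrive via feed_dict or the
# constant fake minibatch, so there is nothing for the queue runners to start.)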
86 | tf.train.start_queue_runners(sess=sess)
87 | 
88 | perMinibatchTime = []
89 | for step in xrange(numMinibatches):
90 |     if (FLAGS.noInputFeed == False):
91 |         subMinibatchFeatures, subMinibatchLabels = getFakeMinibatch(subMinibatchSize)
92 | 
93 |     startTime = time.time()
94 |     if (FLAGS.noInputFeed):
95 |         sess.run([applyGradientOp, accuracy])
96 |     else:
97 |         sess.run([applyGradientOp, accuracy], feed_dict={features: subMinibatchFeatures, labels: subMinibatchLabels})
98 | 
99 |     currMinibatchDuration = time.time() - startTime
100 |     perMinibatchTime.append(currMinibatchDuration)
101 | 
102 | printTrainingStats(FLAGS.numGPUs, minibatchSize, perMinibatchTime)
103 | 
104 | 
-------------------------------------------------------------------------------- /Torch/alexnet.lua: --------------------------------------------------------------------------------
1 | require 'sys';
2 | require 'bit';
3 | require 'cunn';
4 | require 'cudnn';
5 | cudnn.benchmark = true;
6 | cudnn.verbose = true;
7 | require 'optim';
8 | torch.setdefaulttensortype('torch.FloatTensor')
9 | 
10 | local steps = 1 -- number of runs
11 | 
12 | local Linear = nn.Linear
13 | local Transfer = cudnn.ReLU
14 | local hsize = 4096
15 | local osize = 1000
16 | 
17 | -- Network definition
18 | local cnn = nn.Sequential()
19 | cnn:add(cudnn.SpatialConvolution(3,96,11,11,4,4,2,2)):add(Transfer(true))
20 | cnn:add(cudnn.SpatialMaxPooling(3,3,2,2))
21 | cnn:add(cudnn.SpatialConvolution(96,256,5,5,1,1,2,2)):add(Transfer(true))
22 | cnn:add(cudnn.SpatialMaxPooling(3,3,2,2))
23 | cnn:add(cudnn.SpatialConvolution(256,384,3,3,1,1,1,1)):add(Transfer(true))
24 | cnn:add(cudnn.SpatialConvolution(384,384,3,3,1,1,1,1)):add(Transfer(true))
25 | cnn:add(cudnn.SpatialConvolution(384,256,3,3,1,1,1,1)):add(Transfer(true))
26 | cnn:add(cudnn.SpatialMaxPooling(3,3,2,2))
27 | cnn:add(nn.View(256*6*6)) -- flatten the conv output; nn.Linear expects a 1D/2D input
28 | cnn:add(Linear(256*6*6,hsize)):add(Transfer(true)) -- hidden layer 1
29 | cnn:add(nn.Dropout(0.5))
30 | cnn:add(Linear(hsize,hsize)):add(Transfer(true)) -- hidden layer 2
31 | cnn:add(nn.Dropout(0.5))
32 | cnn:add(Linear(hsize,osize)):add(cudnn.LogSoftMax()) -- output layer
33 | 
34 | -- Fake data
35 | local bsize = 256
36 | local inputCPU = torch.randn(torch.LongStorage({bsize,3,224,224}))
37 | local input = torch.CudaTensor(inputCPU:size())
38 | local target = torch.IntTensor(bsize):random(1,osize):cuda()
39 | 
40 | for k=0,2 do
41 |     nGPU = bit.lshift(1,k)
42 | 
43 |     local model = nil
44 |     if nGPU > 1 then
45 |         model = nn.DataParallelTable(1)
46 |         for i=1,nGPU do
47 |             cutorch.setDevice(i)
48 |             model:add(cnn:clone():cuda(), i)
49 |         end
50 |         cutorch.setDevice(1)
51 |     else
52 |         model = cnn:cuda()
53 |     end
54 | 
55 |     -- optimizer declarations
56 |     local criterion = nn.ClassNLLCriterion():cuda()
57 |     local parameters, gradParameters = model:getParameters()
58 |     local optimState = { learningRate = 0.01 }
59 | 
60 |     collectgarbage()
61 |     sys.tic()
62 |     for t = 1, steps do
63 |         input:copy(inputCPU) -- transfer data to GPU memory
64 |         feval = function(x)
65 |             model:zeroGradParameters()
66 |             local output = model:forward(input)
67 |             local err = criterion:forward(output, target)
68 |             local gradOutput = criterion:backward(output, target)
69 |             local gradInput = model:backward(input, gradOutput)
70 |             return err, gradParameters
71 |         end
72 |         optim.sgd(feval, parameters, optimState)
73 | 
74 |         -- DataParallelTable's syncParameters
75 |         model:apply(function(m) if m.syncParameters then m:syncParameters() end end)
76 |         cutorch.synchronize()
77 |     end
78 |     local elapsed = sys.toc()
79 | 
80 |     print(string.format("%d GPUs: %0.0f samples per sec", nGPU, steps * bsize / elapsed))
GPUs: %0.0f samples per sec", nGPU, steps * bsize / elapsed)) 81 | end 82 | 83 | -------------------------------------------------------------------------------- /Torch/ffn.log: -------------------------------------------------------------------------------- 1 | 1 GPUs: 12522 samples per sec 2 | 2 GPUs: 19751 samples per sec 3 | 4 GPUs: 23076 samples per sec 4 | -------------------------------------------------------------------------------- /Torch/ffn.lua: -------------------------------------------------------------------------------- 1 | require 'sys'; 2 | require 'bit'; 3 | require 'cunn'; 4 | require 'cudnn'; 5 | require 'optim'; 6 | torch.setdefaulttensortype('torch.FloatTensor') 7 | 8 | local steps = 100 -- number of runs 9 | 10 | local Linear = nn.Linear 11 | local Transfer = nn.Sigmoid 12 | local isize = 512 13 | local hsize = 2048 14 | local osize = 10000 15 | 16 | -- Network definition 17 | local mlp = nn.Sequential() 18 | mlp:add(Linear(isize,hsize)):add(Transfer(true)) -- hidden layer 1 19 | mlp:add(Linear(hsize,hsize)):add(Transfer(true)) -- hidden layer 2 20 | mlp:add(Linear(hsize,hsize)):add(Transfer(true)) -- hidden layer 3 21 | mlp:add(Linear(hsize,hsize)):add(Transfer(true)) -- hidden layer 4 22 | mlp:add(Linear(hsize,osize)):add(cudnn.LogSoftMax()) -- output layer 23 | 24 | -- Fake data 25 | local bsize = 8192 26 | local inputCPU = torch.randn(bsize,isize) 27 | local input = torch.CudaTensor(inputCPU:size()) 28 | local target = torch.IntTensor(bsize):random(1,bsize):cuda() 29 | 30 | for k=0,2 do 31 | nGPU = bit.lshift(1,k) 32 | 33 | local model = nil 34 | if nGPU > 1 then 35 | model = nn.DataParallelTable(1) 36 | for i=1,nGPU do 37 | cutorch.setDevice(i) 38 | model:add(mlp:clone():cuda(), i) 39 | end 40 | cutorch.setDevice(1) 41 | else 42 | model = mlp:cuda() 43 | end 44 | 45 | -- optimizer declarations 46 | local criterion = nn.ClassNLLCriterion():cuda() 47 | local parameters, gradParameters = model:getParameters() 48 | local optimState = { learningRate = 0.01 } 49 | 50 | collectgarbage() 51 | sys.tic() 52 | for t = 1, steps do 53 | input:copy(inputCPU) -- transfer data to GPU memory 54 | feval = function(x) 55 | model:zeroGradParameters() 56 | local output = model:forward(input) 57 | local err = criterion:forward(output, target) 58 | local gradOutput = criterion:backward(output, target) 59 | local gradInput = model:backward(input, gradOutput) 60 | return err, gradParameters 61 | end 62 | optim.sgd(feval, parameters, optimState) 63 | 64 | -- DataParallelTable's syncParameters 65 | model:apply(function(m) if m.syncParameters then m:syncParameters() end end) 66 | cutorch.synchronize() 67 | end 68 | local elapsed = sys.toc() 69 | 70 | print(string.format("%d GPUs: %0.0f samples per sec", nGPU, steps * bsize / elapsed)) 71 | end 72 | 73 | -------------------------------------------------------------------------------- /caffe/alexnet.prototxt: -------------------------------------------------------------------------------- 1 | name: "AlexNet" 2 | layer { 3 | name: "data" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | data_param { 11 | source: "./fake_image_net.lmdb" 12 | batch_size: 256 13 | backend: LMDB 14 | } 15 | } 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | param { 22 | lr_mult: 1 23 | decay_mult: 1 24 | } 25 | param { 26 | lr_mult: 2 27 | decay_mult: 0 28 | } 29 | convolution_param { 30 | num_output: 96 31 | kernel_size: 11 32 | stride: 4 33 | weight_filler { 34 | 
type: "gaussian" 35 | std: 0.01 36 | } 37 | bias_filler { 38 | type: "constant" 39 | value: 0 40 | } 41 | } 42 | } 43 | layer { 44 | name: "relu1" 45 | type: "ReLU" 46 | bottom: "conv1" 47 | top: "conv1" 48 | } 49 | layer { 50 | name: "pool1" 51 | type: "Pooling" 52 | bottom: "conv1" 53 | top: "pool1" 54 | pooling_param { 55 | pool: MAX 56 | kernel_size: 3 57 | stride: 2 58 | } 59 | } 60 | layer { 61 | name: "conv2" 62 | type: "Convolution" 63 | bottom: "pool1" 64 | top: "conv2" 65 | param { 66 | lr_mult: 1 67 | decay_mult: 1 68 | } 69 | param { 70 | lr_mult: 2 71 | decay_mult: 0 72 | } 73 | convolution_param { 74 | num_output: 256 75 | pad: 2 76 | kernel_size: 5 77 | weight_filler { 78 | type: "gaussian" 79 | std: 0.01 80 | } 81 | bias_filler { 82 | type: "constant" 83 | value: 0.1 84 | } 85 | } 86 | } 87 | layer { 88 | name: "relu2" 89 | type: "ReLU" 90 | bottom: "conv2" 91 | top: "conv2" 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "conv2" 97 | top: "pool2" 98 | pooling_param { 99 | pool: MAX 100 | kernel_size: 3 101 | stride: 2 102 | } 103 | } 104 | layer { 105 | name: "conv3" 106 | type: "Convolution" 107 | bottom: "pool2" 108 | top: "conv3" 109 | param { 110 | lr_mult: 1 111 | decay_mult: 1 112 | } 113 | param { 114 | lr_mult: 2 115 | decay_mult: 0 116 | } 117 | convolution_param { 118 | num_output: 384 119 | pad: 1 120 | kernel_size: 3 121 | weight_filler { 122 | type: "gaussian" 123 | std: 0.01 124 | } 125 | bias_filler { 126 | type: "constant" 127 | value: 0 128 | } 129 | } 130 | } 131 | layer { 132 | name: "relu3" 133 | type: "ReLU" 134 | bottom: "conv3" 135 | top: "conv3" 136 | } 137 | layer { 138 | name: "conv4" 139 | type: "Convolution" 140 | bottom: "conv3" 141 | top: "conv4" 142 | param { 143 | lr_mult: 1 144 | decay_mult: 1 145 | } 146 | param { 147 | lr_mult: 2 148 | decay_mult: 0 149 | } 150 | convolution_param { 151 | num_output: 384 152 | pad: 1 153 | kernel_size: 3 154 | weight_filler { 155 | type: "gaussian" 156 | std: 0.01 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0.1 161 | } 162 | } 163 | } 164 | layer { 165 | name: "relu4" 166 | type: "ReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | layer { 171 | name: "conv5" 172 | type: "Convolution" 173 | bottom: "conv4" 174 | top: "conv5" 175 | param { 176 | lr_mult: 1 177 | decay_mult: 1 178 | } 179 | param { 180 | lr_mult: 2 181 | decay_mult: 0 182 | } 183 | convolution_param { 184 | num_output: 256 185 | pad: 1 186 | kernel_size: 3 187 | weight_filler { 188 | type: "gaussian" 189 | std: 0.01 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0.1 194 | } 195 | } 196 | } 197 | layer { 198 | name: "relu5" 199 | type: "ReLU" 200 | bottom: "conv5" 201 | top: "conv5" 202 | } 203 | layer { 204 | name: "pool5" 205 | type: "Pooling" 206 | bottom: "conv5" 207 | top: "pool5" 208 | pooling_param { 209 | pool: MAX 210 | kernel_size: 3 211 | stride: 2 212 | } 213 | } 214 | layer { 215 | name: "fc6" 216 | type: "InnerProduct" 217 | bottom: "pool5" 218 | top: "fc6" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 0 226 | } 227 | inner_product_param { 228 | num_output: 4096 229 | weight_filler { 230 | type: "gaussian" 231 | std: 0.005 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0.1 236 | } 237 | } 238 | } 239 | layer { 240 | name: "relu6" 241 | type: "ReLU" 242 | bottom: "fc6" 243 | top: "fc6" 244 | } 245 | layer { 246 | name: "drop6" 247 | type: "Dropout" 248 | bottom: "fc6" 249 | top: 
"fc6" 250 | dropout_param { 251 | dropout_ratio: 0.5 252 | } 253 | } 254 | layer { 255 | name: "fc7" 256 | type: "InnerProduct" 257 | bottom: "fc6" 258 | top: "fc7" 259 | param { 260 | lr_mult: 1 261 | decay_mult: 1 262 | } 263 | param { 264 | lr_mult: 2 265 | decay_mult: 0 266 | } 267 | inner_product_param { 268 | num_output: 4096 269 | weight_filler { 270 | type: "gaussian" 271 | std: 0.005 272 | } 273 | bias_filler { 274 | type: "constant" 275 | value: 0.1 276 | } 277 | } 278 | } 279 | layer { 280 | name: "relu7" 281 | type: "ReLU" 282 | bottom: "fc7" 283 | top: "fc7" 284 | } 285 | layer { 286 | name: "drop7" 287 | type: "Dropout" 288 | bottom: "fc7" 289 | top: "fc7" 290 | dropout_param { 291 | dropout_ratio: 0.5 292 | } 293 | } 294 | layer { 295 | name: "fc8" 296 | type: "InnerProduct" 297 | bottom: "fc7" 298 | top: "fc8" 299 | param { 300 | lr_mult: 1 301 | decay_mult: 1 302 | } 303 | param { 304 | lr_mult: 2 305 | decay_mult: 0 306 | } 307 | inner_product_param { 308 | num_output: 1000 309 | weight_filler { 310 | type: "gaussian" 311 | std: 0.01 312 | } 313 | bias_filler { 314 | type: "constant" 315 | value: 0 316 | } 317 | } 318 | } 319 | layer { 320 | name: "accuracy" 321 | type: "Accuracy" 322 | bottom: "fc8" 323 | bottom: "label" 324 | top: "accuracy" 325 | include { 326 | phase: TEST 327 | } 328 | } 329 | layer { 330 | name: "loss" 331 | type: "SoftmaxWithLoss" 332 | bottom: "fc8" 333 | bottom: "label" 334 | top: "loss" 335 | } 336 | 337 | -------------------------------------------------------------------------------- /caffe/alexnet_1GPU.log: -------------------------------------------------------------------------------- 1 | I1209 22:09:55.896893 63236 caffe.cpp:184] Using GPUs 0 2 | I1209 22:10:05.413523 63236 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.01 4 | max_iter: 50 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./alexnet.prototxt" 9 | I1209 22:10:05.413588 63236 solver.cpp:91] Creating training net from net file: ./alexnet.prototxt 10 | I1209 22:10:05.414489 63236 net.cpp:322] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy 11 | I1209 22:10:05.414664 63236 net.cpp:49] Initializing net from parameters: 12 | name: "AlexNet" 13 | state { 14 | phase: TRAIN 15 | } 16 | layer { 17 | name: "data" 18 | type: "Data" 19 | top: "data" 20 | top: "label" 21 | include { 22 | phase: TRAIN 23 | } 24 | data_param { 25 | source: "./fake_image_net.lmdb" 26 | batch_size: 256 27 | backend: LMDB 28 | } 29 | } 30 | layer { 31 | name: "conv1" 32 | type: "Convolution" 33 | bottom: "data" 34 | top: "conv1" 35 | param { 36 | lr_mult: 1 37 | decay_mult: 1 38 | } 39 | param { 40 | lr_mult: 2 41 | decay_mult: 0 42 | } 43 | convolution_param { 44 | num_output: 96 45 | kernel_size: 11 46 | stride: 4 47 | weight_filler { 48 | type: "gaussian" 49 | std: 0.01 50 | } 51 | bias_filler { 52 | type: "constant" 53 | value: 0 54 | } 55 | } 56 | } 57 | layer { 58 | name: "relu1" 59 | type: "ReLU" 60 | bottom: "conv1" 61 | top: "conv1" 62 | } 63 | layer { 64 | name: "pool1" 65 | type: "Pooling" 66 | bottom: "conv1" 67 | top: "pool1" 68 | pooling_param { 69 | pool: MAX 70 | kernel_size: 3 71 | stride: 2 72 | } 73 | } 74 | layer { 75 | name: "conv2" 76 | type: "Convolution" 77 | bottom: "pool1" 78 | top: "conv2" 79 | param { 80 | lr_mult: 1 81 | decay_mult: 1 82 | } 83 | param { 84 | lr_mult: 2 85 | decay_mult: 0 86 | } 87 | convolution_param { 88 | num_output: 256 89 | pad: 2 90 | kernel_size: 5 91 | weight_filler { 92 | 
type: "gaussian" 93 | std: 0.01 94 | } 95 | bias_filler { 96 | type: "constant" 97 | value: 0.1 98 | } 99 | } 100 | } 101 | layer { 102 | name: "relu2" 103 | type: "ReLU" 104 | bottom: "conv2" 105 | top: "conv2" 106 | } 107 | layer { 108 | name: "pool2" 109 | type: "Pooling" 110 | bottom: "conv2" 111 | top: "pool2" 112 | pooling_param { 113 | pool: MAX 114 | kernel_size: 3 115 | stride: 2 116 | } 117 | } 118 | layer { 119 | name: "conv3" 120 | type: "Convolution" 121 | bottom: "pool2" 122 | top: "conv3" 123 | param { 124 | lr_mult: 1 125 | decay_mult: 1 126 | } 127 | param { 128 | lr_mult: 2 129 | decay_mult: 0 130 | } 131 | convolution_param { 132 | num_output: 384 133 | pad: 1 134 | kernel_size: 3 135 | weight_filler { 136 | type: "gaussian" 137 | std: 0.01 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | layer { 146 | name: "relu3" 147 | type: "ReLU" 148 | bottom: "conv3" 149 | top: "conv3" 150 | } 151 | layer { 152 | name: "conv4" 153 | type: "Convolution" 154 | bottom: "conv3" 155 | top: "conv4" 156 | param { 157 | lr_mult: 1 158 | decay_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | decay_mult: 0 163 | } 164 | convolution_param { 165 | num_output: 384 166 | pad: 1 167 | kernel_size: 3 168 | weight_filler { 169 | type: "gaussian" 170 | std: 0.01 171 | } 172 | bias_filler { 173 | type: "constant" 174 | value: 0.1 175 | } 176 | } 177 | } 178 | layer { 179 | name: "relu4" 180 | type: "ReLU" 181 | bottom: "conv4" 182 | top: "conv4" 183 | } 184 | layer { 185 | name: "conv5" 186 | type: "Convolution" 187 | bottom: "conv4" 188 | top: "conv5" 189 | param { 190 | lr_mult: 1 191 | decay_mult: 1 192 | } 193 | param { 194 | lr_mult: 2 195 | decay_mult: 0 196 | } 197 | convolution_param { 198 | num_output: 256 199 | pad: 1 200 | kernel_size: 3 201 | weight_filler { 202 | type: "gaussian" 203 | std: 0.01 204 | } 205 | bias_filler { 206 | type: "constant" 207 | value: 0.1 208 | } 209 | } 210 | } 211 | layer { 212 | name: "relu5" 213 | type: "ReLU" 214 | bottom: "conv5" 215 | top: "conv5" 216 | } 217 | layer { 218 | name: "pool5" 219 | type: "Pooling" 220 | bottom: "conv5" 221 | top: "pool5" 222 | pooling_param { 223 | pool: MAX 224 | kernel_size: 3 225 | stride: 2 226 | } 227 | } 228 | layer { 229 | name: "fc6" 230 | type: "InnerProduct" 231 | bottom: "pool5" 232 | top: "fc6" 233 | param { 234 | lr_mult: 1 235 | decay_mult: 1 236 | } 237 | param { 238 | lr_mult: 2 239 | decay_mult: 0 240 | } 241 | inner_product_param { 242 | num_output: 4096 243 | weight_filler { 244 | type: "gaussian" 245 | std: 0.005 246 | } 247 | bias_filler { 248 | type: "constant" 249 | value: 0.1 250 | } 251 | } 252 | } 253 | layer { 254 | name: "relu6" 255 | type: "ReLU" 256 | bottom: "fc6" 257 | top: "fc6" 258 | } 259 | layer { 260 | name: "drop6" 261 | type: "Dropout" 262 | bottom: "fc6" 263 | top: "fc6" 264 | dropout_param { 265 | dropout_ratio: 0.5 266 | } 267 | } 268 | layer { 269 | name: "fc7" 270 | type: "InnerProduct" 271 | bottom: "fc6" 272 | top: "fc7" 273 | param { 274 | lr_mult: 1 275 | decay_mult: 1 276 | } 277 | param { 278 | lr_mult: 2 279 | decay_mult: 0 280 | } 281 | inner_product_param { 282 | num_output: 4096 283 | weight_filler { 284 | type: "gaussian" 285 | std: 0.005 286 | } 287 | bias_filler { 288 | type: "constant" 289 | value: 0.1 290 | } 291 | } 292 | } 293 | layer { 294 | name: "relu7" 295 | type: "ReLU" 296 | bottom: "fc7" 297 | top: "fc7" 298 | } 299 | layer { 300 | name: "drop7" 301 | type: "Dropout" 302 | bottom: "fc7" 303 | top: "fc7" 304 | 
dropout_param { 305 | dropout_ratio: 0.5 306 | } 307 | } 308 | layer { 309 | name: "fc8" 310 | type: "InnerProduct" 311 | bottom: "fc7" 312 | top: "fc8" 313 | param { 314 | lr_mult: 1 315 | decay_mult: 1 316 | } 317 | param { 318 | lr_mult: 2 319 | decay_mult: 0 320 | } 321 | inner_product_param { 322 | num_output: 1000 323 | weight_filler { 324 | type: "gaussian" 325 | std: 0.01 326 | } 327 | bias_filler { 328 | type: "constant" 329 | value: 0 330 | } 331 | } 332 | } 333 | layer { 334 | name: "loss" 335 | type: "SoftmaxWithLoss" 336 | bottom: "fc8" 337 | bottom: "label" 338 | top: "loss" 339 | } 340 | I1209 22:10:05.414804 63236 layer_factory.hpp:77] Creating layer data 341 | I1209 22:10:05.415355 63236 net.cpp:106] Creating Layer data 342 | I1209 22:10:05.415367 63236 net.cpp:411] data -> data 343 | I1209 22:10:05.415393 63236 net.cpp:411] data -> label 344 | I1209 22:10:05.417349 63238 db_lmdb.cpp:38] Opened lmdb ./fake_image_net.lmdb 345 | I1209 22:10:05.433789 63236 data_layer.cpp:41] output data size: 256,3,224,224 346 | I1209 22:10:05.806865 63236 net.cpp:150] Setting up data 347 | I1209 22:10:05.806946 63236 net.cpp:157] Top shape: 256 3 224 224 (38535168) 348 | I1209 22:10:05.806953 63236 net.cpp:157] Top shape: 256 (256) 349 | I1209 22:10:05.806957 63236 net.cpp:165] Memory required for data: 154141696 350 | I1209 22:10:05.806968 63236 layer_factory.hpp:77] Creating layer conv1 351 | I1209 22:10:05.806993 63236 net.cpp:106] Creating Layer conv1 352 | I1209 22:10:05.807001 63236 net.cpp:454] conv1 <- data 353 | I1209 22:10:05.807014 63236 net.cpp:411] conv1 -> conv1 354 | I1209 22:10:05.955500 63236 net.cpp:150] Setting up conv1 355 | I1209 22:10:05.955545 63236 net.cpp:157] Top shape: 256 96 54 54 (71663616) 356 | I1209 22:10:05.955550 63236 net.cpp:165] Memory required for data: 440796160 357 | I1209 22:10:05.955574 63236 layer_factory.hpp:77] Creating layer relu1 358 | I1209 22:10:05.955590 63236 net.cpp:106] Creating Layer relu1 359 | I1209 22:10:05.955596 63236 net.cpp:454] relu1 <- conv1 360 | I1209 22:10:05.955605 63236 net.cpp:397] relu1 -> conv1 (in-place) 361 | I1209 22:10:05.955847 63236 net.cpp:150] Setting up relu1 362 | I1209 22:10:05.955858 63236 net.cpp:157] Top shape: 256 96 54 54 (71663616) 363 | I1209 22:10:05.955862 63236 net.cpp:165] Memory required for data: 727450624 364 | I1209 22:10:05.955867 63236 layer_factory.hpp:77] Creating layer pool1 365 | I1209 22:10:05.955876 63236 net.cpp:106] Creating Layer pool1 366 | I1209 22:10:05.955881 63236 net.cpp:454] pool1 <- conv1 367 | I1209 22:10:05.955888 63236 net.cpp:411] pool1 -> pool1 368 | I1209 22:10:05.956173 63236 net.cpp:150] Setting up pool1 369 | I1209 22:10:05.956202 63236 net.cpp:157] Top shape: 256 96 27 27 (17915904) 370 | I1209 22:10:05.956207 63236 net.cpp:165] Memory required for data: 799114240 371 | I1209 22:10:05.956210 63236 layer_factory.hpp:77] Creating layer conv2 372 | I1209 22:10:05.956226 63236 net.cpp:106] Creating Layer conv2 373 | I1209 22:10:05.956233 63236 net.cpp:454] conv2 <- pool1 374 | I1209 22:10:05.956238 63236 net.cpp:411] conv2 -> conv2 375 | I1209 22:10:05.973860 63236 net.cpp:150] Setting up conv2 376 | I1209 22:10:05.973872 63236 net.cpp:157] Top shape: 256 256 27 27 (47775744) 377 | I1209 22:10:05.973876 63236 net.cpp:165] Memory required for data: 990217216 378 | I1209 22:10:05.973886 63236 layer_factory.hpp:77] Creating layer relu2 379 | I1209 22:10:05.973894 63236 net.cpp:106] Creating Layer relu2 380 | I1209 22:10:05.973898 63236 net.cpp:454] relu2 <- conv2 381 | 
I1209 22:10:05.973904 63236 net.cpp:397] relu2 -> conv2 (in-place) 382 | I1209 22:10:05.974156 63236 net.cpp:150] Setting up relu2 383 | I1209 22:10:05.974167 63236 net.cpp:157] Top shape: 256 256 27 27 (47775744) 384 | I1209 22:10:05.974171 63236 net.cpp:165] Memory required for data: 1181320192 385 | I1209 22:10:05.974175 63236 layer_factory.hpp:77] Creating layer pool2 386 | I1209 22:10:05.974190 63236 net.cpp:106] Creating Layer pool2 387 | I1209 22:10:05.974195 63236 net.cpp:454] pool2 <- conv2 388 | I1209 22:10:05.974200 63236 net.cpp:411] pool2 -> pool2 389 | I1209 22:10:05.974372 63236 net.cpp:150] Setting up pool2 390 | I1209 22:10:05.974403 63236 net.cpp:157] Top shape: 256 256 13 13 (11075584) 391 | I1209 22:10:05.974407 63236 net.cpp:165] Memory required for data: 1225622528 392 | I1209 22:10:05.974411 63236 layer_factory.hpp:77] Creating layer conv3 393 | I1209 22:10:05.974421 63236 net.cpp:106] Creating Layer conv3 394 | I1209 22:10:05.974424 63236 net.cpp:454] conv3 <- pool2 395 | I1209 22:10:05.974431 63236 net.cpp:411] conv3 -> conv3 396 | I1209 22:10:05.998834 63236 net.cpp:150] Setting up conv3 397 | I1209 22:10:05.998847 63236 net.cpp:157] Top shape: 256 384 13 13 (16613376) 398 | I1209 22:10:05.998850 63236 net.cpp:165] Memory required for data: 1292076032 399 | I1209 22:10:05.998859 63236 layer_factory.hpp:77] Creating layer relu3 400 | I1209 22:10:05.998868 63236 net.cpp:106] Creating Layer relu3 401 | I1209 22:10:05.998873 63236 net.cpp:454] relu3 <- conv3 402 | I1209 22:10:05.998878 63236 net.cpp:397] relu3 -> conv3 (in-place) 403 | I1209 22:10:05.999124 63236 net.cpp:150] Setting up relu3 404 | I1209 22:10:05.999135 63236 net.cpp:157] Top shape: 256 384 13 13 (16613376) 405 | I1209 22:10:05.999137 63236 net.cpp:165] Memory required for data: 1358529536 406 | I1209 22:10:05.999141 63236 layer_factory.hpp:77] Creating layer conv4 407 | I1209 22:10:05.999153 63236 net.cpp:106] Creating Layer conv4 408 | I1209 22:10:05.999157 63236 net.cpp:454] conv4 <- conv3 409 | I1209 22:10:05.999162 63236 net.cpp:411] conv4 -> conv4 410 | I1209 22:10:06.000876 63239 blocking_queue.cpp:50] Waiting for data 411 | I1209 22:10:06.034972 63236 net.cpp:150] Setting up conv4 412 | I1209 22:10:06.034986 63236 net.cpp:157] Top shape: 256 384 13 13 (16613376) 413 | I1209 22:10:06.034989 63236 net.cpp:165] Memory required for data: 1424983040 414 | I1209 22:10:06.034996 63236 layer_factory.hpp:77] Creating layer relu4 415 | I1209 22:10:06.035006 63236 net.cpp:106] Creating Layer relu4 416 | I1209 22:10:06.035009 63236 net.cpp:454] relu4 <- conv4 417 | I1209 22:10:06.035014 63236 net.cpp:397] relu4 -> conv4 (in-place) 418 | I1209 22:10:06.035151 63236 net.cpp:150] Setting up relu4 419 | I1209 22:10:06.035159 63236 net.cpp:157] Top shape: 256 384 13 13 (16613376) 420 | I1209 22:10:06.035163 63236 net.cpp:165] Memory required for data: 1491436544 421 | I1209 22:10:06.035167 63236 layer_factory.hpp:77] Creating layer conv5 422 | I1209 22:10:06.035181 63236 net.cpp:106] Creating Layer conv5 423 | I1209 22:10:06.035187 63236 net.cpp:454] conv5 <- conv4 424 | I1209 22:10:06.035192 63236 net.cpp:411] conv5 -> conv5 425 | I1209 22:10:06.059911 63236 net.cpp:150] Setting up conv5 426 | I1209 22:10:06.059923 63236 net.cpp:157] Top shape: 256 256 13 13 (11075584) 427 | I1209 22:10:06.059927 63236 net.cpp:165] Memory required for data: 1535738880 428 | I1209 22:10:06.059937 63236 layer_factory.hpp:77] Creating layer relu5 429 | I1209 22:10:06.059944 63236 net.cpp:106] Creating Layer relu5 430 | I1209 
22:10:06.059948 63236 net.cpp:454] relu5 <- conv5 431 | I1209 22:10:06.059954 63236 net.cpp:397] relu5 -> conv5 (in-place) 432 | I1209 22:10:06.060103 63236 net.cpp:150] Setting up relu5 433 | I1209 22:10:06.060113 63236 net.cpp:157] Top shape: 256 256 13 13 (11075584) 434 | I1209 22:10:06.060117 63236 net.cpp:165] Memory required for data: 1580041216 435 | I1209 22:10:06.060120 63236 layer_factory.hpp:77] Creating layer pool5 436 | I1209 22:10:06.060127 63236 net.cpp:106] Creating Layer pool5 437 | I1209 22:10:06.060130 63236 net.cpp:454] pool5 <- conv5 438 | I1209 22:10:06.060137 63236 net.cpp:411] pool5 -> pool5 439 | I1209 22:10:06.060431 63236 net.cpp:150] Setting up pool5 440 | I1209 22:10:06.060441 63236 net.cpp:157] Top shape: 256 256 6 6 (2359296) 441 | I1209 22:10:06.060444 63236 net.cpp:165] Memory required for data: 1589478400 442 | I1209 22:10:06.060448 63236 layer_factory.hpp:77] Creating layer fc6 443 | I1209 22:10:06.060457 63236 net.cpp:106] Creating Layer fc6 444 | I1209 22:10:06.060461 63236 net.cpp:454] fc6 <- pool5 445 | I1209 22:10:06.060469 63236 net.cpp:411] fc6 -> fc6 446 | I1209 22:10:07.117044 63236 net.cpp:150] Setting up fc6 447 | I1209 22:10:07.117090 63236 net.cpp:157] Top shape: 256 4096 (1048576) 448 | I1209 22:10:07.117095 63236 net.cpp:165] Memory required for data: 1593672704 449 | I1209 22:10:07.117107 63236 layer_factory.hpp:77] Creating layer relu6 450 | I1209 22:10:07.117120 63236 net.cpp:106] Creating Layer relu6 451 | I1209 22:10:07.117132 63236 net.cpp:454] relu6 <- fc6 452 | I1209 22:10:07.117180 63236 net.cpp:397] relu6 -> fc6 (in-place) 453 | I1209 22:10:07.117430 63236 net.cpp:150] Setting up relu6 454 | I1209 22:10:07.117440 63236 net.cpp:157] Top shape: 256 4096 (1048576) 455 | I1209 22:10:07.117444 63236 net.cpp:165] Memory required for data: 1597867008 456 | I1209 22:10:07.117449 63236 layer_factory.hpp:77] Creating layer drop6 457 | I1209 22:10:07.117480 63236 net.cpp:106] Creating Layer drop6 458 | I1209 22:10:07.117483 63236 net.cpp:454] drop6 <- fc6 459 | I1209 22:10:07.117491 63236 net.cpp:397] drop6 -> fc6 (in-place) 460 | I1209 22:10:07.117530 63236 net.cpp:150] Setting up drop6 461 | I1209 22:10:07.117537 63236 net.cpp:157] Top shape: 256 4096 (1048576) 462 | I1209 22:10:07.117542 63236 net.cpp:165] Memory required for data: 1602061312 463 | I1209 22:10:07.117545 63236 layer_factory.hpp:77] Creating layer fc7 464 | I1209 22:10:07.117558 63236 net.cpp:106] Creating Layer fc7 465 | I1209 22:10:07.117561 63236 net.cpp:454] fc7 <- fc6 466 | I1209 22:10:07.117568 63236 net.cpp:411] fc7 -> fc7 467 | I1209 22:10:07.591801 63236 net.cpp:150] Setting up fc7 468 | I1209 22:10:07.591850 63236 net.cpp:157] Top shape: 256 4096 (1048576) 469 | I1209 22:10:07.591856 63236 net.cpp:165] Memory required for data: 1606255616 470 | I1209 22:10:07.591868 63236 layer_factory.hpp:77] Creating layer relu7 471 | I1209 22:10:07.591886 63236 net.cpp:106] Creating Layer relu7 472 | I1209 22:10:07.591892 63236 net.cpp:454] relu7 <- fc7 473 | I1209 22:10:07.591900 63236 net.cpp:397] relu7 -> fc7 (in-place) 474 | I1209 22:10:07.592543 63236 net.cpp:150] Setting up relu7 475 | I1209 22:10:07.592555 63236 net.cpp:157] Top shape: 256 4096 (1048576) 476 | I1209 22:10:07.592558 63236 net.cpp:165] Memory required for data: 1610449920 477 | I1209 22:10:07.592563 63236 layer_factory.hpp:77] Creating layer drop7 478 | I1209 22:10:07.592572 63236 net.cpp:106] Creating Layer drop7 479 | I1209 22:10:07.592577 63236 net.cpp:454] drop7 <- fc7 480 | I1209 22:10:07.592586 63236 
net.cpp:397] drop7 -> fc7 (in-place) 481 | I1209 22:10:07.592610 63236 net.cpp:150] Setting up drop7 482 | I1209 22:10:07.592617 63236 net.cpp:157] Top shape: 256 4096 (1048576) 483 | I1209 22:10:07.592620 63236 net.cpp:165] Memory required for data: 1614644224 484 | I1209 22:10:07.592624 63236 layer_factory.hpp:77] Creating layer fc8 485 | I1209 22:10:07.592638 63236 net.cpp:106] Creating Layer fc8 486 | I1209 22:10:07.592641 63236 net.cpp:454] fc8 <- fc7 487 | I1209 22:10:07.592650 63236 net.cpp:411] fc8 -> fc8 488 | I1209 22:10:07.704557 63236 net.cpp:150] Setting up fc8 489 | I1209 22:10:07.704577 63236 net.cpp:157] Top shape: 256 1000 (256000) 490 | I1209 22:10:07.704581 63236 net.cpp:165] Memory required for data: 1615668224 491 | I1209 22:10:07.704589 63236 layer_factory.hpp:77] Creating layer loss 492 | I1209 22:10:07.704597 63236 net.cpp:106] Creating Layer loss 493 | I1209 22:10:07.704602 63236 net.cpp:454] loss <- fc8 494 | I1209 22:10:07.704607 63236 net.cpp:454] loss <- label 495 | I1209 22:10:07.704617 63236 net.cpp:411] loss -> loss 496 | I1209 22:10:07.704629 63236 layer_factory.hpp:77] Creating layer loss 497 | I1209 22:10:07.705862 63236 net.cpp:150] Setting up loss 498 | I1209 22:10:07.705873 63236 net.cpp:157] Top shape: (1) 499 | I1209 22:10:07.705876 63236 net.cpp:160] with loss weight 1 500 | I1209 22:10:07.705900 63236 net.cpp:165] Memory required for data: 1615668228 501 | I1209 22:10:07.705905 63236 net.cpp:226] loss needs backward computation. 502 | I1209 22:10:07.705909 63236 net.cpp:226] fc8 needs backward computation. 503 | I1209 22:10:07.705914 63236 net.cpp:226] drop7 needs backward computation. 504 | I1209 22:10:07.705917 63236 net.cpp:226] relu7 needs backward computation. 505 | I1209 22:10:07.705921 63236 net.cpp:226] fc7 needs backward computation. 506 | I1209 22:10:07.705925 63236 net.cpp:226] drop6 needs backward computation. 507 | I1209 22:10:07.705930 63236 net.cpp:226] relu6 needs backward computation. 508 | I1209 22:10:07.705934 63236 net.cpp:226] fc6 needs backward computation. 509 | I1209 22:10:07.705938 63236 net.cpp:226] pool5 needs backward computation. 510 | I1209 22:10:07.705942 63236 net.cpp:226] relu5 needs backward computation. 511 | I1209 22:10:07.705946 63236 net.cpp:226] conv5 needs backward computation. 512 | I1209 22:10:07.705951 63236 net.cpp:226] relu4 needs backward computation. 513 | I1209 22:10:07.705962 63236 net.cpp:226] conv4 needs backward computation. 514 | I1209 22:10:07.706001 63236 net.cpp:226] relu3 needs backward computation. 515 | I1209 22:10:07.706004 63236 net.cpp:226] conv3 needs backward computation. 516 | I1209 22:10:07.706009 63236 net.cpp:226] pool2 needs backward computation. 517 | I1209 22:10:07.706017 63236 net.cpp:226] relu2 needs backward computation. 518 | I1209 22:10:07.706020 63236 net.cpp:226] conv2 needs backward computation. 519 | I1209 22:10:07.706025 63236 net.cpp:226] pool1 needs backward computation. 520 | I1209 22:10:07.706028 63236 net.cpp:226] relu1 needs backward computation. 521 | I1209 22:10:07.706032 63236 net.cpp:226] conv1 needs backward computation. 522 | I1209 22:10:07.706037 63236 net.cpp:228] data does not need backward computation. 523 | I1209 22:10:07.706042 63236 net.cpp:270] This network produces output loss 524 | I1209 22:10:07.706055 63236 net.cpp:283] Network initialization done. 525 | I1209 22:10:07.706135 63236 solver.cpp:60] Solver scaffolding done. 
526 | I1209 22:10:07.706593 63236 caffe.cpp:212] Starting Optimization 527 | I1209 22:10:07.706603 63236 solver.cpp:288] Solving AlexNet 528 | I1209 22:10:07.706606 63236 solver.cpp:289] Learning Rate Policy: fixed 529 | I1209 22:11:05.112869 63236 solver.cpp:459] Snapshotting to binary proto file _iter_50.caffemodel 530 | I1209 22:11:08.473160 63236 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_50.solverstate 531 | I1209 22:11:10.449729 63236 solver.cpp:326] Optimization Done. 532 | I1209 22:11:10.449767 63236 caffe.cpp:215] Optimization Done. 533 | -------------------------------------------------------------------------------- /caffe/alexnet_1GPU.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe train -solver=./alexnet_solver.prototxt -gpu=0 >alexnet_1GPU.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/alexnet_4GPUs.log: -------------------------------------------------------------------------------- 1 | I1210 17:39:15.308357 1423 caffe.cpp:184] Using GPUs 0, 1, 2, 3 2 | I1210 17:39:24.950314 1423 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.01 4 | max_iter: 50 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./alexnet_4GPUs.prototxt" 9 | I1210 17:39:24.950376 1423 solver.cpp:91] Creating training net from net file: ./alexnet_4GPUs.prototxt 10 | I1210 17:39:24.951216 1423 net.cpp:322] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy 11 | I1210 17:39:24.951390 1423 net.cpp:49] Initializing net from parameters: 12 | name: "AlexNet" 13 | state { 14 | phase: TRAIN 15 | } 16 | layer { 17 | name: "data" 18 | type: "Data" 19 | top: "data" 20 | top: "label" 21 | include { 22 | phase: TRAIN 23 | } 24 | data_param { 25 | source: "./fake_image_net.lmdb" 26 | batch_size: 64 27 | backend: LMDB 28 | } 29 | } 30 | layer { 31 | name: "conv1" 32 | type: "Convolution" 33 | bottom: "data" 34 | top: "conv1" 35 | param { 36 | lr_mult: 1 37 | decay_mult: 1 38 | } 39 | param { 40 | lr_mult: 2 41 | decay_mult: 0 42 | } 43 | convolution_param { 44 | num_output: 96 45 | kernel_size: 11 46 | stride: 4 47 | weight_filler { 48 | type: "gaussian" 49 | std: 0.01 50 | } 51 | bias_filler { 52 | type: "constant" 53 | value: 0 54 | } 55 | } 56 | } 57 | layer { 58 | name: "relu1" 59 | type: "ReLU" 60 | bottom: "conv1" 61 | top: "conv1" 62 | } 63 | layer { 64 | name: "pool1" 65 | type: "Pooling" 66 | bottom: "conv1" 67 | top: "pool1" 68 | pooling_param { 69 | pool: MAX 70 | kernel_size: 3 71 | stride: 2 72 | } 73 | } 74 | layer { 75 | name: "conv2" 76 | type: "Convolution" 77 | bottom: "pool1" 78 | top: "conv2" 79 | param { 80 | lr_mult: 1 81 | decay_mult: 1 82 | } 83 | param { 84 | lr_mult: 2 85 | decay_mult: 0 86 | } 87 | convolution_param { 88 | num_output: 256 89 | pad: 2 90 | kernel_size: 5 91 | weight_filler { 92 | type: "gaussian" 93 | std: 0.01 94 | } 95 | bias_filler { 96 | type: "constant" 97 | value: 0.1 98 | } 99 | } 100 | } 101 | layer { 102 | name: "relu2" 103 | type: "ReLU" 104 | bottom: "conv2" 105 | top: "conv2" 106 | } 107 | layer { 108 | name: "pool2" 109 | type: "Pooling" 110 | bottom: "conv2" 111 | top: "pool2" 112 | pooling_param { 113 | pool: MAX 114 | kernel_size: 3 115 | stride: 2 116 | } 117 | } 118 | layer { 119 | name: "conv3" 120 | type: "Convolution" 121 | bottom: "pool2" 122 | top: "conv3" 123 | param { 124 | lr_mult: 1 125 | decay_mult: 1 126 | } 127 | param 
{ 128 | lr_mult: 2 129 | decay_mult: 0 130 | } 131 | convolution_param { 132 | num_output: 384 133 | pad: 1 134 | kernel_size: 3 135 | weight_filler { 136 | type: "gaussian" 137 | std: 0.01 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | layer { 146 | name: "relu3" 147 | type: "ReLU" 148 | bottom: "conv3" 149 | top: "conv3" 150 | } 151 | layer { 152 | name: "conv4" 153 | type: "Convolution" 154 | bottom: "conv3" 155 | top: "conv4" 156 | param { 157 | lr_mult: 1 158 | decay_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | decay_mult: 0 163 | } 164 | convolution_param { 165 | num_output: 384 166 | pad: 1 167 | kernel_size: 3 168 | weight_filler { 169 | type: "gaussian" 170 | std: 0.01 171 | } 172 | bias_filler { 173 | type: "constant" 174 | value: 0.1 175 | } 176 | } 177 | } 178 | layer { 179 | name: "relu4" 180 | type: "ReLU" 181 | bottom: "conv4" 182 | top: "conv4" 183 | } 184 | layer { 185 | name: "conv5" 186 | type: "Convolution" 187 | bottom: "conv4" 188 | top: "conv5" 189 | param { 190 | lr_mult: 1 191 | decay_mult: 1 192 | } 193 | param { 194 | lr_mult: 2 195 | decay_mult: 0 196 | } 197 | convolution_param { 198 | num_output: 256 199 | pad: 1 200 | kernel_size: 3 201 | weight_filler { 202 | type: "gaussian" 203 | std: 0.01 204 | } 205 | bias_filler { 206 | type: "constant" 207 | value: 0.1 208 | } 209 | } 210 | } 211 | layer { 212 | name: "relu5" 213 | type: "ReLU" 214 | bottom: "conv5" 215 | top: "conv5" 216 | } 217 | layer { 218 | name: "pool5" 219 | type: "Pooling" 220 | bottom: "conv5" 221 | top: "pool5" 222 | pooling_param { 223 | pool: MAX 224 | kernel_size: 3 225 | stride: 2 226 | } 227 | } 228 | layer { 229 | name: "fc6" 230 | type: "InnerProduct" 231 | bottom: "pool5" 232 | top: "fc6" 233 | param { 234 | lr_mult: 1 235 | decay_mult: 1 236 | } 237 | param { 238 | lr_mult: 2 239 | decay_mult: 0 240 | } 241 | inner_product_param { 242 | num_output: 4096 243 | weight_filler { 244 | type: "gaussian" 245 | std: 0.005 246 | } 247 | bias_filler { 248 | type: "constant" 249 | value: 0.1 250 | } 251 | } 252 | } 253 | layer { 254 | name: "relu6" 255 | type: "ReLU" 256 | bottom: "fc6" 257 | top: "fc6" 258 | } 259 | layer { 260 | name: "drop6" 261 | type: "Dropout" 262 | bottom: "fc6" 263 | top: "fc6" 264 | dropout_param { 265 | dropout_ratio: 0.5 266 | } 267 | } 268 | layer { 269 | name: "fc7" 270 | type: "InnerProduct" 271 | bottom: "fc6" 272 | top: "fc7" 273 | param { 274 | lr_mult: 1 275 | decay_mult: 1 276 | } 277 | param { 278 | lr_mult: 2 279 | decay_mult: 0 280 | } 281 | inner_product_param { 282 | num_output: 4096 283 | weight_filler { 284 | type: "gaussian" 285 | std: 0.005 286 | } 287 | bias_filler { 288 | type: "constant" 289 | value: 0.1 290 | } 291 | } 292 | } 293 | layer { 294 | name: "relu7" 295 | type: "ReLU" 296 | bottom: "fc7" 297 | top: "fc7" 298 | } 299 | layer { 300 | name: "drop7" 301 | type: "Dropout" 302 | bottom: "fc7" 303 | top: "fc7" 304 | dropout_param { 305 | dropout_ratio: 0.5 306 | } 307 | } 308 | layer { 309 | name: "fc8" 310 | type: "InnerProduct" 311 | bottom: "fc7" 312 | top: "fc8" 313 | param { 314 | lr_mult: 1 315 | decay_mult: 1 316 | } 317 | param { 318 | lr_mult: 2 319 | decay_mult: 0 320 | } 321 | inner_product_param { 322 | num_output: 1000 323 | weight_filler { 324 | type: "gaussian" 325 | std: 0.01 326 | } 327 | bias_filler { 328 | type: "constant" 329 | value: 0 330 | } 331 | } 332 | } 333 | layer { 334 | name: "loss" 335 | type: "SoftmaxWithLoss" 336 | bottom: "fc8" 337 | bottom: "label" 338 
| top: "loss" 339 | } 340 | I1210 17:39:24.951545 1423 layer_factory.hpp:77] Creating layer data 341 | I1210 17:39:24.952008 1423 net.cpp:106] Creating Layer data 342 | I1210 17:39:24.952018 1423 net.cpp:411] data -> data 343 | I1210 17:39:24.952041 1423 net.cpp:411] data -> label 344 | I1210 17:39:24.954187 1425 db_lmdb.cpp:38] Opened lmdb ./fake_image_net.lmdb 345 | I1210 17:39:24.967650 1423 data_layer.cpp:41] output data size: 64,3,224,224 346 | I1210 17:39:25.048879 1423 net.cpp:150] Setting up data 347 | I1210 17:39:25.048928 1423 net.cpp:157] Top shape: 64 3 224 224 (9633792) 348 | I1210 17:39:25.048936 1423 net.cpp:157] Top shape: 64 (64) 349 | I1210 17:39:25.048940 1423 net.cpp:165] Memory required for data: 38535424 350 | I1210 17:39:25.048950 1423 layer_factory.hpp:77] Creating layer conv1 351 | I1210 17:39:25.048975 1423 net.cpp:106] Creating Layer conv1 352 | I1210 17:39:25.048982 1423 net.cpp:454] conv1 <- data 353 | I1210 17:39:25.048998 1423 net.cpp:411] conv1 -> conv1 354 | I1210 17:39:25.054342 1426 blocking_queue.cpp:50] Waiting for data 355 | I1210 17:39:25.152736 1423 net.cpp:150] Setting up conv1 356 | I1210 17:39:25.152753 1423 net.cpp:157] Top shape: 64 96 54 54 (17915904) 357 | I1210 17:39:25.152758 1423 net.cpp:165] Memory required for data: 110199040 358 | I1210 17:39:25.152773 1423 layer_factory.hpp:77] Creating layer relu1 359 | I1210 17:39:25.152782 1423 net.cpp:106] Creating Layer relu1 360 | I1210 17:39:25.152787 1423 net.cpp:454] relu1 <- conv1 361 | I1210 17:39:25.152793 1423 net.cpp:397] relu1 -> conv1 (in-place) 362 | I1210 17:39:25.153028 1423 net.cpp:150] Setting up relu1 363 | I1210 17:39:25.153039 1423 net.cpp:157] Top shape: 64 96 54 54 (17915904) 364 | I1210 17:39:25.153043 1423 net.cpp:165] Memory required for data: 181862656 365 | I1210 17:39:25.153048 1423 layer_factory.hpp:77] Creating layer pool1 366 | I1210 17:39:25.153056 1423 net.cpp:106] Creating Layer pool1 367 | I1210 17:39:25.153060 1423 net.cpp:454] pool1 <- conv1 368 | I1210 17:39:25.153066 1423 net.cpp:411] pool1 -> pool1 369 | I1210 17:39:25.153345 1423 net.cpp:150] Setting up pool1 370 | I1210 17:39:25.153357 1423 net.cpp:157] Top shape: 64 96 27 27 (4478976) 371 | I1210 17:39:25.153360 1423 net.cpp:165] Memory required for data: 199778560 372 | I1210 17:39:25.153365 1423 layer_factory.hpp:77] Creating layer conv2 373 | I1210 17:39:25.153383 1423 net.cpp:106] Creating Layer conv2 374 | I1210 17:39:25.153388 1423 net.cpp:454] conv2 <- pool1 375 | I1210 17:39:25.153393 1423 net.cpp:411] conv2 -> conv2 376 | I1210 17:39:25.171576 1423 net.cpp:150] Setting up conv2 377 | I1210 17:39:25.171588 1423 net.cpp:157] Top shape: 64 256 27 27 (11943936) 378 | I1210 17:39:25.171592 1423 net.cpp:165] Memory required for data: 247554304 379 | I1210 17:39:25.171602 1423 layer_factory.hpp:77] Creating layer relu2 380 | I1210 17:39:25.171609 1423 net.cpp:106] Creating Layer relu2 381 | I1210 17:39:25.171613 1423 net.cpp:454] relu2 <- conv2 382 | I1210 17:39:25.171622 1423 net.cpp:397] relu2 -> conv2 (in-place) 383 | I1210 17:39:25.171888 1423 net.cpp:150] Setting up relu2 384 | I1210 17:39:25.171898 1423 net.cpp:157] Top shape: 64 256 27 27 (11943936) 385 | I1210 17:39:25.171902 1423 net.cpp:165] Memory required for data: 295330048 386 | I1210 17:39:25.171906 1423 layer_factory.hpp:77] Creating layer pool2 387 | I1210 17:39:25.171914 1423 net.cpp:106] Creating Layer pool2 388 | I1210 17:39:25.171918 1423 net.cpp:454] pool2 <- conv2 389 | I1210 17:39:25.171933 1423 net.cpp:411] pool2 -> pool2 390 | 
I1210 17:39:25.172139 1423 net.cpp:150] Setting up pool2 391 | I1210 17:39:25.172149 1423 net.cpp:157] Top shape: 64 256 13 13 (2768896) 392 | I1210 17:39:25.172153 1423 net.cpp:165] Memory required for data: 306405632 393 | I1210 17:39:25.172158 1423 layer_factory.hpp:77] Creating layer conv3 394 | I1210 17:39:25.172168 1423 net.cpp:106] Creating Layer conv3 395 | I1210 17:39:25.172173 1423 net.cpp:454] conv3 <- pool2 396 | I1210 17:39:25.172186 1423 net.cpp:411] conv3 -> conv3 397 | I1210 17:39:25.197000 1423 net.cpp:150] Setting up conv3 398 | I1210 17:39:25.197011 1423 net.cpp:157] Top shape: 64 384 13 13 (4153344) 399 | I1210 17:39:25.197016 1423 net.cpp:165] Memory required for data: 323019008 400 | I1210 17:39:25.197026 1423 layer_factory.hpp:77] Creating layer relu3 401 | I1210 17:39:25.197036 1423 net.cpp:106] Creating Layer relu3 402 | I1210 17:39:25.197041 1423 net.cpp:454] relu3 <- conv3 403 | I1210 17:39:25.197046 1423 net.cpp:397] relu3 -> conv3 (in-place) 404 | I1210 17:39:25.197306 1423 net.cpp:150] Setting up relu3 405 | I1210 17:39:25.197317 1423 net.cpp:157] Top shape: 64 384 13 13 (4153344) 406 | I1210 17:39:25.197321 1423 net.cpp:165] Memory required for data: 339632384 407 | I1210 17:39:25.197325 1423 layer_factory.hpp:77] Creating layer conv4 408 | I1210 17:39:25.197337 1423 net.cpp:106] Creating Layer conv4 409 | I1210 17:39:25.197341 1423 net.cpp:454] conv4 <- conv3 410 | I1210 17:39:25.197350 1423 net.cpp:411] conv4 -> conv4 411 | I1210 17:39:25.235277 1423 net.cpp:150] Setting up conv4 412 | I1210 17:39:25.235290 1423 net.cpp:157] Top shape: 64 384 13 13 (4153344) 413 | I1210 17:39:25.235293 1423 net.cpp:165] Memory required for data: 356245760 414 | I1210 17:39:25.235301 1423 layer_factory.hpp:77] Creating layer relu4 415 | I1210 17:39:25.235311 1423 net.cpp:106] Creating Layer relu4 416 | I1210 17:39:25.235316 1423 net.cpp:454] relu4 <- conv4 417 | I1210 17:39:25.235321 1423 net.cpp:397] relu4 -> conv4 (in-place) 418 | I1210 17:39:25.235462 1423 net.cpp:150] Setting up relu4 419 | I1210 17:39:25.235474 1423 net.cpp:157] Top shape: 64 384 13 13 (4153344) 420 | I1210 17:39:25.235478 1423 net.cpp:165] Memory required for data: 372859136 421 | I1210 17:39:25.235482 1423 layer_factory.hpp:77] Creating layer conv5 422 | I1210 17:39:25.235491 1423 net.cpp:106] Creating Layer conv5 423 | I1210 17:39:25.235496 1423 net.cpp:454] conv5 <- conv4 424 | I1210 17:39:25.235503 1423 net.cpp:411] conv5 -> conv5 425 | I1210 17:39:25.260288 1423 net.cpp:150] Setting up conv5 426 | I1210 17:39:25.260300 1423 net.cpp:157] Top shape: 64 256 13 13 (2768896) 427 | I1210 17:39:25.260304 1423 net.cpp:165] Memory required for data: 383934720 428 | I1210 17:39:25.260316 1423 layer_factory.hpp:77] Creating layer relu5 429 | I1210 17:39:25.260324 1423 net.cpp:106] Creating Layer relu5 430 | I1210 17:39:25.260329 1423 net.cpp:454] relu5 <- conv5 431 | I1210 17:39:25.260334 1423 net.cpp:397] relu5 -> conv5 (in-place) 432 | I1210 17:39:25.260481 1423 net.cpp:150] Setting up relu5 433 | I1210 17:39:25.260490 1423 net.cpp:157] Top shape: 64 256 13 13 (2768896) 434 | I1210 17:39:25.260494 1423 net.cpp:165] Memory required for data: 395010304 435 | I1210 17:39:25.260499 1423 layer_factory.hpp:77] Creating layer pool5 436 | I1210 17:39:25.260505 1423 net.cpp:106] Creating Layer pool5 437 | I1210 17:39:25.260509 1423 net.cpp:454] pool5 <- conv5 438 | I1210 17:39:25.260517 1423 net.cpp:411] pool5 -> pool5 439 | I1210 17:39:25.260802 1423 net.cpp:150] Setting up pool5 440 | I1210 17:39:25.260812 1423 
net.cpp:157] Top shape: 64 256 6 6 (589824) 441 | I1210 17:39:25.260817 1423 net.cpp:165] Memory required for data: 397369600 442 | I1210 17:39:25.260820 1423 layer_factory.hpp:77] Creating layer fc6 443 | I1210 17:39:25.260833 1423 net.cpp:106] Creating Layer fc6 444 | I1210 17:39:25.260838 1423 net.cpp:454] fc6 <- pool5 445 | I1210 17:39:25.260846 1423 net.cpp:411] fc6 -> fc6 446 | I1210 17:39:26.329960 1423 net.cpp:150] Setting up fc6 447 | I1210 17:39:26.330013 1423 net.cpp:157] Top shape: 64 4096 (262144) 448 | I1210 17:39:26.330018 1423 net.cpp:165] Memory required for data: 398418176 449 | I1210 17:39:26.330035 1423 layer_factory.hpp:77] Creating layer relu6 450 | I1210 17:39:26.330051 1423 net.cpp:106] Creating Layer relu6 451 | I1210 17:39:26.330067 1423 net.cpp:454] relu6 <- fc6 452 | I1210 17:39:26.330078 1423 net.cpp:397] relu6 -> fc6 (in-place) 453 | I1210 17:39:26.330458 1423 net.cpp:150] Setting up relu6 454 | I1210 17:39:26.330468 1423 net.cpp:157] Top shape: 64 4096 (262144) 455 | I1210 17:39:26.330471 1423 net.cpp:165] Memory required for data: 399466752 456 | I1210 17:39:26.330476 1423 layer_factory.hpp:77] Creating layer drop6 457 | I1210 17:39:26.330504 1423 net.cpp:106] Creating Layer drop6 458 | I1210 17:39:26.330509 1423 net.cpp:454] drop6 <- fc6 459 | I1210 17:39:26.330514 1423 net.cpp:397] drop6 -> fc6 (in-place) 460 | I1210 17:39:26.330551 1423 net.cpp:150] Setting up drop6 461 | I1210 17:39:26.330559 1423 net.cpp:157] Top shape: 64 4096 (262144) 462 | I1210 17:39:26.330561 1423 net.cpp:165] Memory required for data: 400515328 463 | I1210 17:39:26.330565 1423 layer_factory.hpp:77] Creating layer fc7 464 | I1210 17:39:26.330585 1423 net.cpp:106] Creating Layer fc7 465 | I1210 17:39:26.330590 1423 net.cpp:454] fc7 <- fc6 466 | I1210 17:39:26.330596 1423 net.cpp:411] fc7 -> fc7 467 | I1210 17:39:26.805878 1423 net.cpp:150] Setting up fc7 468 | I1210 17:39:26.805927 1423 net.cpp:157] Top shape: 64 4096 (262144) 469 | I1210 17:39:26.805932 1423 net.cpp:165] Memory required for data: 401563904 470 | I1210 17:39:26.805943 1423 layer_factory.hpp:77] Creating layer relu7 471 | I1210 17:39:26.805958 1423 net.cpp:106] Creating Layer relu7 472 | I1210 17:39:26.805963 1423 net.cpp:454] relu7 <- fc7 473 | I1210 17:39:26.805973 1423 net.cpp:397] relu7 -> fc7 (in-place) 474 | I1210 17:39:26.806596 1423 net.cpp:150] Setting up relu7 475 | I1210 17:39:26.806605 1423 net.cpp:157] Top shape: 64 4096 (262144) 476 | I1210 17:39:26.806609 1423 net.cpp:165] Memory required for data: 402612480 477 | I1210 17:39:26.806614 1423 layer_factory.hpp:77] Creating layer drop7 478 | I1210 17:39:26.806622 1423 net.cpp:106] Creating Layer drop7 479 | I1210 17:39:26.806627 1423 net.cpp:454] drop7 <- fc7 480 | I1210 17:39:26.806637 1423 net.cpp:397] drop7 -> fc7 (in-place) 481 | I1210 17:39:26.806674 1423 net.cpp:150] Setting up drop7 482 | I1210 17:39:26.806684 1423 net.cpp:157] Top shape: 64 4096 (262144) 483 | I1210 17:39:26.806689 1423 net.cpp:165] Memory required for data: 403661056 484 | I1210 17:39:26.806692 1423 layer_factory.hpp:77] Creating layer fc8 485 | I1210 17:39:26.806704 1423 net.cpp:106] Creating Layer fc8 486 | I1210 17:39:26.806709 1423 net.cpp:454] fc8 <- fc7 487 | I1210 17:39:26.806717 1423 net.cpp:411] fc8 -> fc8 488 | I1210 17:39:26.918463 1423 net.cpp:150] Setting up fc8 489 | I1210 17:39:26.918480 1423 net.cpp:157] Top shape: 64 1000 (64000) 490 | I1210 17:39:26.918484 1423 net.cpp:165] Memory required for data: 403917056 491 | I1210 17:39:26.918493 1423 
layer_factory.hpp:77] Creating layer loss 492 | I1210 17:39:26.918500 1423 net.cpp:106] Creating Layer loss 493 | I1210 17:39:26.918504 1423 net.cpp:454] loss <- fc8 494 | I1210 17:39:26.918510 1423 net.cpp:454] loss <- label 495 | I1210 17:39:26.918519 1423 net.cpp:411] loss -> loss 496 | I1210 17:39:26.918534 1423 layer_factory.hpp:77] Creating layer loss 497 | I1210 17:39:26.919003 1423 net.cpp:150] Setting up loss 498 | I1210 17:39:26.919014 1423 net.cpp:157] Top shape: (1) 499 | I1210 17:39:26.919018 1423 net.cpp:160] with loss weight 1 500 | I1210 17:39:26.919049 1423 net.cpp:165] Memory required for data: 403917060 501 | I1210 17:39:26.919054 1423 net.cpp:226] loss needs backward computation. 502 | I1210 17:39:26.919057 1423 net.cpp:226] fc8 needs backward computation. 503 | I1210 17:39:26.919061 1423 net.cpp:226] drop7 needs backward computation. 504 | I1210 17:39:26.919064 1423 net.cpp:226] relu7 needs backward computation. 505 | I1210 17:39:26.919069 1423 net.cpp:226] fc7 needs backward computation. 506 | I1210 17:39:26.919072 1423 net.cpp:226] drop6 needs backward computation. 507 | I1210 17:39:26.919076 1423 net.cpp:226] relu6 needs backward computation. 508 | I1210 17:39:26.919080 1423 net.cpp:226] fc6 needs backward computation. 509 | I1210 17:39:26.919085 1423 net.cpp:226] pool5 needs backward computation. 510 | I1210 17:39:26.919088 1423 net.cpp:226] relu5 needs backward computation. 511 | I1210 17:39:26.919092 1423 net.cpp:226] conv5 needs backward computation. 512 | I1210 17:39:26.919096 1423 net.cpp:226] relu4 needs backward computation. 513 | I1210 17:39:26.919100 1423 net.cpp:226] conv4 needs backward computation. 514 | I1210 17:39:26.919112 1423 net.cpp:226] relu3 needs backward computation. 515 | I1210 17:39:26.919149 1423 net.cpp:226] conv3 needs backward computation. 516 | I1210 17:39:26.919154 1423 net.cpp:226] pool2 needs backward computation. 517 | I1210 17:39:26.919162 1423 net.cpp:226] relu2 needs backward computation. 518 | I1210 17:39:26.919167 1423 net.cpp:226] conv2 needs backward computation. 519 | I1210 17:39:26.919172 1423 net.cpp:226] pool1 needs backward computation. 520 | I1210 17:39:26.919179 1423 net.cpp:226] relu1 needs backward computation. 521 | I1210 17:39:26.919184 1423 net.cpp:226] conv1 needs backward computation. 522 | I1210 17:39:26.919189 1423 net.cpp:228] data does not need backward computation. 523 | I1210 17:39:26.919193 1423 net.cpp:270] This network produces output loss 524 | I1210 17:39:26.919209 1423 net.cpp:283] Network initialization done. 525 | I1210 17:39:26.919297 1423 solver.cpp:60] Solver scaffolding done. 526 | I1210 17:39:26.956542 1423 parallel.cpp:391] GPUs pairs 0:1, 2:3, 0:2 527 | I1210 17:39:27.190856 1423 data_layer.cpp:41] output data size: 64,3,224,224 528 | I1210 17:39:29.479032 1423 data_layer.cpp:41] output data size: 64,3,224,224 529 | I1210 17:39:31.457525 1423 parallel.cpp:234] GPU 2 does not have p2p access to GPU 0 530 | I1210 17:39:31.698340 1423 data_layer.cpp:41] output data size: 64,3,224,224 531 | I1210 17:39:33.793845 1423 parallel.cpp:419] Starting Optimization 532 | I1210 17:39:33.794456 1423 solver.cpp:288] Solving AlexNet 533 | I1210 17:39:33.794497 1423 solver.cpp:289] Learning Rate Policy: fixed 534 | I1210 17:40:02.691781 1423 solver.cpp:459] Snapshotting to binary proto file _iter_50.caffemodel 535 | I1210 17:40:05.501304 1423 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_50.solverstate 536 | I1210 17:40:07.500490 1423 solver.cpp:326] Optimization Done. 
537 | I1210 17:40:07.728231 1423 caffe.cpp:215] Optimization Done. 538 | -------------------------------------------------------------------------------- /caffe/alexnet_4GPUs.prototxt: -------------------------------------------------------------------------------- 1 | name: "AlexNet" 2 | layer { 3 | name: "data" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | data_param { 11 | source: "./fake_image_net.lmdb" 12 | batch_size: 64 13 | backend: LMDB 14 | } 15 | } 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | param { 22 | lr_mult: 1 23 | decay_mult: 1 24 | } 25 | param { 26 | lr_mult: 2 27 | decay_mult: 0 28 | } 29 | convolution_param { 30 | num_output: 96 31 | kernel_size: 11 32 | stride: 4 33 | weight_filler { 34 | type: "gaussian" 35 | std: 0.01 36 | } 37 | bias_filler { 38 | type: "constant" 39 | value: 0 40 | } 41 | } 42 | } 43 | layer { 44 | name: "relu1" 45 | type: "ReLU" 46 | bottom: "conv1" 47 | top: "conv1" 48 | } 49 | layer { 50 | name: "pool1" 51 | type: "Pooling" 52 | bottom: "conv1" 53 | top: "pool1" 54 | pooling_param { 55 | pool: MAX 56 | kernel_size: 3 57 | stride: 2 58 | } 59 | } 60 | layer { 61 | name: "conv2" 62 | type: "Convolution" 63 | bottom: "pool1" 64 | top: "conv2" 65 | param { 66 | lr_mult: 1 67 | decay_mult: 1 68 | } 69 | param { 70 | lr_mult: 2 71 | decay_mult: 0 72 | } 73 | convolution_param { 74 | num_output: 256 75 | pad: 2 76 | kernel_size: 5 77 | weight_filler { 78 | type: "gaussian" 79 | std: 0.01 80 | } 81 | bias_filler { 82 | type: "constant" 83 | value: 0.1 84 | } 85 | } 86 | } 87 | layer { 88 | name: "relu2" 89 | type: "ReLU" 90 | bottom: "conv2" 91 | top: "conv2" 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "conv2" 97 | top: "pool2" 98 | pooling_param { 99 | pool: MAX 100 | kernel_size: 3 101 | stride: 2 102 | } 103 | } 104 | layer { 105 | name: "conv3" 106 | type: "Convolution" 107 | bottom: "pool2" 108 | top: "conv3" 109 | param { 110 | lr_mult: 1 111 | decay_mult: 1 112 | } 113 | param { 114 | lr_mult: 2 115 | decay_mult: 0 116 | } 117 | convolution_param { 118 | num_output: 384 119 | pad: 1 120 | kernel_size: 3 121 | weight_filler { 122 | type: "gaussian" 123 | std: 0.01 124 | } 125 | bias_filler { 126 | type: "constant" 127 | value: 0 128 | } 129 | } 130 | } 131 | layer { 132 | name: "relu3" 133 | type: "ReLU" 134 | bottom: "conv3" 135 | top: "conv3" 136 | } 137 | layer { 138 | name: "conv4" 139 | type: "Convolution" 140 | bottom: "conv3" 141 | top: "conv4" 142 | param { 143 | lr_mult: 1 144 | decay_mult: 1 145 | } 146 | param { 147 | lr_mult: 2 148 | decay_mult: 0 149 | } 150 | convolution_param { 151 | num_output: 384 152 | pad: 1 153 | kernel_size: 3 154 | weight_filler { 155 | type: "gaussian" 156 | std: 0.01 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0.1 161 | } 162 | } 163 | } 164 | layer { 165 | name: "relu4" 166 | type: "ReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | layer { 171 | name: "conv5" 172 | type: "Convolution" 173 | bottom: "conv4" 174 | top: "conv5" 175 | param { 176 | lr_mult: 1 177 | decay_mult: 1 178 | } 179 | param { 180 | lr_mult: 2 181 | decay_mult: 0 182 | } 183 | convolution_param { 184 | num_output: 256 185 | pad: 1 186 | kernel_size: 3 187 | weight_filler { 188 | type: "gaussian" 189 | std: 0.01 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0.1 194 | } 195 | } 196 | } 197 | layer { 198 | name: "relu5" 199 | type: "ReLU" 200 | bottom: "conv5" 
201 | top: "conv5" 202 | } 203 | layer { 204 | name: "pool5" 205 | type: "Pooling" 206 | bottom: "conv5" 207 | top: "pool5" 208 | pooling_param { 209 | pool: MAX 210 | kernel_size: 3 211 | stride: 2 212 | } 213 | } 214 | layer { 215 | name: "fc6" 216 | type: "InnerProduct" 217 | bottom: "pool5" 218 | top: "fc6" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 0 226 | } 227 | inner_product_param { 228 | num_output: 4096 229 | weight_filler { 230 | type: "gaussian" 231 | std: 0.005 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0.1 236 | } 237 | } 238 | } 239 | layer { 240 | name: "relu6" 241 | type: "ReLU" 242 | bottom: "fc6" 243 | top: "fc6" 244 | } 245 | layer { 246 | name: "drop6" 247 | type: "Dropout" 248 | bottom: "fc6" 249 | top: "fc6" 250 | dropout_param { 251 | dropout_ratio: 0.5 252 | } 253 | } 254 | layer { 255 | name: "fc7" 256 | type: "InnerProduct" 257 | bottom: "fc6" 258 | top: "fc7" 259 | param { 260 | lr_mult: 1 261 | decay_mult: 1 262 | } 263 | param { 264 | lr_mult: 2 265 | decay_mult: 0 266 | } 267 | inner_product_param { 268 | num_output: 4096 269 | weight_filler { 270 | type: "gaussian" 271 | std: 0.005 272 | } 273 | bias_filler { 274 | type: "constant" 275 | value: 0.1 276 | } 277 | } 278 | } 279 | layer { 280 | name: "relu7" 281 | type: "ReLU" 282 | bottom: "fc7" 283 | top: "fc7" 284 | } 285 | layer { 286 | name: "drop7" 287 | type: "Dropout" 288 | bottom: "fc7" 289 | top: "fc7" 290 | dropout_param { 291 | dropout_ratio: 0.5 292 | } 293 | } 294 | layer { 295 | name: "fc8" 296 | type: "InnerProduct" 297 | bottom: "fc7" 298 | top: "fc8" 299 | param { 300 | lr_mult: 1 301 | decay_mult: 1 302 | } 303 | param { 304 | lr_mult: 2 305 | decay_mult: 0 306 | } 307 | inner_product_param { 308 | num_output: 1000 309 | weight_filler { 310 | type: "gaussian" 311 | std: 0.01 312 | } 313 | bias_filler { 314 | type: "constant" 315 | value: 0 316 | } 317 | } 318 | } 319 | layer { 320 | name: "accuracy" 321 | type: "Accuracy" 322 | bottom: "fc8" 323 | bottom: "label" 324 | top: "accuracy" 325 | include { 326 | phase: TEST 327 | } 328 | } 329 | layer { 330 | name: "loss" 331 | type: "SoftmaxWithLoss" 332 | bottom: "fc8" 333 | bottom: "label" 334 | top: "loss" 335 | } 336 | 337 | -------------------------------------------------------------------------------- /caffe/alexnet_4GPUs.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe train -solver=./alexnet_4GPUs_solver.prototxt -gpu=0,1,2,3 >alexnet_4GPUs.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/alexnet_4GPUs_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "./alexnet_4GPUs.prototxt" 2 | max_iter: 50 3 | base_lr: 0.01 4 | lr_policy: "fixed" 5 | solver_mode: GPU 6 | -------------------------------------------------------------------------------- /caffe/alexnet_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "./alexnet.prototxt" 2 | max_iter: 50 3 | base_lr: 0.01 4 | lr_policy: "fixed" 5 | solver_mode: GPU 6 | -------------------------------------------------------------------------------- /caffe/alexnet_time_1GPU.log: -------------------------------------------------------------------------------- 1 | I1212 01:16:31.289032 40332 caffe.cpp:297] Use GPU with device ID 0 2 | I1212 01:16:41.500707 40332 net.cpp:322] The 
NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy 3 | I1212 01:16:41.500973 40332 net.cpp:49] Initializing net from parameters: 4 | name: "AlexNet" 5 | state { 6 | phase: TRAIN 7 | } 8 | layer { 9 | name: "data" 10 | type: "Data" 11 | top: "data" 12 | top: "label" 13 | include { 14 | phase: TRAIN 15 | } 16 | data_param { 17 | source: "./fake_image_net.lmdb" 18 | batch_size: 256 19 | backend: LMDB 20 | } 21 | } 22 | layer { 23 | name: "conv1" 24 | type: "Convolution" 25 | bottom: "data" 26 | top: "conv1" 27 | param { 28 | lr_mult: 1 29 | decay_mult: 1 30 | } 31 | param { 32 | lr_mult: 2 33 | decay_mult: 0 34 | } 35 | convolution_param { 36 | num_output: 96 37 | kernel_size: 11 38 | stride: 4 39 | weight_filler { 40 | type: "gaussian" 41 | std: 0.01 42 | } 43 | bias_filler { 44 | type: "constant" 45 | value: 0 46 | } 47 | } 48 | } 49 | layer { 50 | name: "relu1" 51 | type: "ReLU" 52 | bottom: "conv1" 53 | top: "conv1" 54 | } 55 | layer { 56 | name: "pool1" 57 | type: "Pooling" 58 | bottom: "conv1" 59 | top: "pool1" 60 | pooling_param { 61 | pool: MAX 62 | kernel_size: 3 63 | stride: 2 64 | } 65 | } 66 | layer { 67 | name: "conv2" 68 | type: "Convolution" 69 | bottom: "pool1" 70 | top: "conv2" 71 | param { 72 | lr_mult: 1 73 | decay_mult: 1 74 | } 75 | param { 76 | lr_mult: 2 77 | decay_mult: 0 78 | } 79 | convolution_param { 80 | num_output: 256 81 | pad: 2 82 | kernel_size: 5 83 | weight_filler { 84 | type: "gaussian" 85 | std: 0.01 86 | } 87 | bias_filler { 88 | type: "constant" 89 | value: 0.1 90 | } 91 | } 92 | } 93 | layer { 94 | name: "relu2" 95 | type: "ReLU" 96 | bottom: "conv2" 97 | top: "conv2" 98 | } 99 | layer { 100 | name: "pool2" 101 | type: "Pooling" 102 | bottom: "conv2" 103 | top: "pool2" 104 | pooling_param { 105 | pool: MAX 106 | kernel_size: 3 107 | stride: 2 108 | } 109 | } 110 | layer { 111 | name: "conv3" 112 | type: "Convolution" 113 | bottom: "pool2" 114 | top: "conv3" 115 | param { 116 | lr_mult: 1 117 | decay_mult: 1 118 | } 119 | param { 120 | lr_mult: 2 121 | decay_mult: 0 122 | } 123 | convolution_param { 124 | num_output: 384 125 | pad: 1 126 | kernel_size: 3 127 | weight_filler { 128 | type: "gaussian" 129 | std: 0.01 130 | } 131 | bias_filler { 132 | type: "constant" 133 | value: 0 134 | } 135 | } 136 | } 137 | layer { 138 | name: "relu3" 139 | type: "ReLU" 140 | bottom: "conv3" 141 | top: "conv3" 142 | } 143 | layer { 144 | name: "conv4" 145 | type: "Convolution" 146 | bottom: "conv3" 147 | top: "conv4" 148 | param { 149 | lr_mult: 1 150 | decay_mult: 1 151 | } 152 | param { 153 | lr_mult: 2 154 | decay_mult: 0 155 | } 156 | convolution_param { 157 | num_output: 384 158 | pad: 1 159 | kernel_size: 3 160 | weight_filler { 161 | type: "gaussian" 162 | std: 0.01 163 | } 164 | bias_filler { 165 | type: "constant" 166 | value: 0.1 167 | } 168 | } 169 | } 170 | layer { 171 | name: "relu4" 172 | type: "ReLU" 173 | bottom: "conv4" 174 | top: "conv4" 175 | } 176 | layer { 177 | name: "conv5" 178 | type: "Convolution" 179 | bottom: "conv4" 180 | top: "conv5" 181 | param { 182 | lr_mult: 1 183 | decay_mult: 1 184 | } 185 | param { 186 | lr_mult: 2 187 | decay_mult: 0 188 | } 189 | convolution_param { 190 | num_output: 256 191 | pad: 1 192 | kernel_size: 3 193 | weight_filler { 194 | type: "gaussian" 195 | std: 0.01 196 | } 197 | bias_filler { 198 | type: "constant" 199 | value: 0.1 200 | } 201 | } 202 | } 203 | layer { 204 | name: "relu5" 205 | type: "ReLU" 206 | bottom: "conv5" 207 | top: "conv5" 208 | } 209 | layer { 210 | name: 
"pool5" 211 | type: "Pooling" 212 | bottom: "conv5" 213 | top: "pool5" 214 | pooling_param { 215 | pool: MAX 216 | kernel_size: 3 217 | stride: 2 218 | } 219 | } 220 | layer { 221 | name: "fc6" 222 | type: "InnerProduct" 223 | bottom: "pool5" 224 | top: "fc6" 225 | param { 226 | lr_mult: 1 227 | decay_mult: 1 228 | } 229 | param { 230 | lr_mult: 2 231 | decay_mult: 0 232 | } 233 | inner_product_param { 234 | num_output: 4096 235 | weight_filler { 236 | type: "gaussian" 237 | std: 0.005 238 | } 239 | bias_filler { 240 | type: "constant" 241 | value: 0.1 242 | } 243 | } 244 | } 245 | layer { 246 | name: "relu6" 247 | type: "ReLU" 248 | bottom: "fc6" 249 | top: "fc6" 250 | } 251 | layer { 252 | name: "drop6" 253 | type: "Dropout" 254 | bottom: "fc6" 255 | top: "fc6" 256 | dropout_param { 257 | dropout_ratio: 0.5 258 | } 259 | } 260 | layer { 261 | name: "fc7" 262 | type: "InnerProduct" 263 | bottom: "fc6" 264 | top: "fc7" 265 | param { 266 | lr_mult: 1 267 | decay_mult: 1 268 | } 269 | param { 270 | lr_mult: 2 271 | decay_mult: 0 272 | } 273 | inner_product_param { 274 | num_output: 4096 275 | weight_filler { 276 | type: "gaussian" 277 | std: 0.005 278 | } 279 | bias_filler { 280 | type: "constant" 281 | value: 0.1 282 | } 283 | } 284 | } 285 | layer { 286 | name: "relu7" 287 | type: "ReLU" 288 | bottom: "fc7" 289 | top: "fc7" 290 | } 291 | layer { 292 | name: "drop7" 293 | type: "Dropout" 294 | bottom: "fc7" 295 | top: "fc7" 296 | dropout_param { 297 | dropout_ratio: 0.5 298 | } 299 | } 300 | layer { 301 | name: "fc8" 302 | type: "InnerProduct" 303 | bottom: "fc7" 304 | top: "fc8" 305 | param { 306 | lr_mult: 1 307 | decay_mult: 1 308 | } 309 | param { 310 | lr_mult: 2 311 | decay_mult: 0 312 | } 313 | inner_product_param { 314 | num_output: 1000 315 | weight_filler { 316 | type: "gaussian" 317 | std: 0.01 318 | } 319 | bias_filler { 320 | type: "constant" 321 | value: 0 322 | } 323 | } 324 | } 325 | layer { 326 | name: "loss" 327 | type: "SoftmaxWithLoss" 328 | bottom: "fc8" 329 | bottom: "label" 330 | top: "loss" 331 | } 332 | I1212 01:16:41.501101 40332 layer_factory.hpp:77] Creating layer data 333 | I1212 01:16:41.501677 40332 net.cpp:106] Creating Layer data 334 | I1212 01:16:41.501689 40332 net.cpp:411] data -> data 335 | I1212 01:16:41.501711 40332 net.cpp:411] data -> label 336 | I1212 01:16:41.503720 40334 db_lmdb.cpp:38] Opened lmdb ./fake_image_net.lmdb 337 | I1212 01:16:41.518365 40332 data_layer.cpp:41] output data size: 256,3,224,224 338 | I1212 01:16:41.777848 40332 net.cpp:150] Setting up data 339 | I1212 01:16:41.777902 40332 net.cpp:157] Top shape: 256 3 224 224 (38535168) 340 | I1212 01:16:41.777909 40332 net.cpp:157] Top shape: 256 (256) 341 | I1212 01:16:41.777914 40332 net.cpp:165] Memory required for data: 154141696 342 | I1212 01:16:41.777923 40332 layer_factory.hpp:77] Creating layer conv1 343 | I1212 01:16:41.777945 40332 net.cpp:106] Creating Layer conv1 344 | I1212 01:16:41.777951 40332 net.cpp:454] conv1 <- data 345 | I1212 01:16:41.777966 40332 net.cpp:411] conv1 -> conv1 346 | I1212 01:16:41.926776 40332 net.cpp:150] Setting up conv1 347 | I1212 01:16:41.926822 40332 net.cpp:157] Top shape: 256 96 54 54 (71663616) 348 | I1212 01:16:41.926828 40332 net.cpp:165] Memory required for data: 440796160 349 | I1212 01:16:41.926854 40332 layer_factory.hpp:77] Creating layer relu1 350 | I1212 01:16:41.926873 40332 net.cpp:106] Creating Layer relu1 351 | I1212 01:16:41.926879 40332 net.cpp:454] relu1 <- conv1 352 | I1212 01:16:41.926888 40332 net.cpp:397] relu1 -> conv1 
(in-place) 353 | I1212 01:16:41.927146 40332 net.cpp:150] Setting up relu1 354 | I1212 01:16:41.927160 40332 net.cpp:157] Top shape: 256 96 54 54 (71663616) 355 | I1212 01:16:41.927163 40332 net.cpp:165] Memory required for data: 727450624 356 | I1212 01:16:41.927168 40332 layer_factory.hpp:77] Creating layer pool1 357 | I1212 01:16:41.927184 40332 net.cpp:106] Creating Layer pool1 358 | I1212 01:16:41.927189 40332 net.cpp:454] pool1 <- conv1 359 | I1212 01:16:41.927197 40332 net.cpp:411] pool1 -> pool1 360 | I1212 01:16:41.927513 40332 net.cpp:150] Setting up pool1 361 | I1212 01:16:41.927525 40332 net.cpp:157] Top shape: 256 96 27 27 (17915904) 362 | I1212 01:16:41.927530 40332 net.cpp:165] Memory required for data: 799114240 363 | I1212 01:16:41.927534 40332 layer_factory.hpp:77] Creating layer conv2 364 | I1212 01:16:41.927551 40332 net.cpp:106] Creating Layer conv2 365 | I1212 01:16:41.927556 40332 net.cpp:454] conv2 <- pool1 366 | I1212 01:16:41.927563 40332 net.cpp:411] conv2 -> conv2 367 | I1212 01:16:41.946357 40332 net.cpp:150] Setting up conv2 368 | I1212 01:16:41.946400 40332 net.cpp:157] Top shape: 256 256 27 27 (47775744) 369 | I1212 01:16:41.946406 40332 net.cpp:165] Memory required for data: 990217216 370 | I1212 01:16:41.946424 40332 layer_factory.hpp:77] Creating layer relu2 371 | I1212 01:16:41.946437 40332 net.cpp:106] Creating Layer relu2 372 | I1212 01:16:41.946444 40332 net.cpp:454] relu2 <- conv2 373 | I1212 01:16:41.946452 40332 net.cpp:397] relu2 -> conv2 (in-place) 374 | I1212 01:16:41.946724 40332 net.cpp:150] Setting up relu2 375 | I1212 01:16:41.946737 40332 net.cpp:157] Top shape: 256 256 27 27 (47775744) 376 | I1212 01:16:41.946740 40332 net.cpp:165] Memory required for data: 1181320192 377 | I1212 01:16:41.946745 40332 layer_factory.hpp:77] Creating layer pool2 378 | I1212 01:16:41.946753 40332 net.cpp:106] Creating Layer pool2 379 | I1212 01:16:41.946758 40332 net.cpp:454] pool2 <- conv2 380 | I1212 01:16:41.946766 40332 net.cpp:411] pool2 -> pool2 381 | I1212 01:16:41.946939 40332 net.cpp:150] Setting up pool2 382 | I1212 01:16:41.946949 40332 net.cpp:157] Top shape: 256 256 13 13 (11075584) 383 | I1212 01:16:41.946954 40332 net.cpp:165] Memory required for data: 1225622528 384 | I1212 01:16:41.946959 40332 layer_factory.hpp:77] Creating layer conv3 385 | I1212 01:16:41.946976 40332 net.cpp:106] Creating Layer conv3 386 | I1212 01:16:41.947024 40332 net.cpp:454] conv3 <- pool2 387 | I1212 01:16:41.947033 40332 net.cpp:411] conv3 -> conv3 388 | I1212 01:16:41.973142 40332 net.cpp:150] Setting up conv3 389 | I1212 01:16:41.973193 40332 net.cpp:157] Top shape: 256 384 13 13 (16613376) 390 | I1212 01:16:41.973198 40332 net.cpp:165] Memory required for data: 1292076032 391 | I1212 01:16:41.973217 40332 layer_factory.hpp:77] Creating layer relu3 392 | I1212 01:16:41.973234 40332 net.cpp:106] Creating Layer relu3 393 | I1212 01:16:41.973240 40332 net.cpp:454] relu3 <- conv3 394 | I1212 01:16:41.973250 40332 net.cpp:397] relu3 -> conv3 (in-place) 395 | I1212 01:16:41.973515 40332 net.cpp:150] Setting up relu3 396 | I1212 01:16:41.973526 40332 net.cpp:157] Top shape: 256 384 13 13 (16613376) 397 | I1212 01:16:41.973531 40332 net.cpp:165] Memory required for data: 1358529536 398 | I1212 01:16:41.973536 40332 layer_factory.hpp:77] Creating layer conv4 399 | I1212 01:16:41.973552 40332 net.cpp:106] Creating Layer conv4 400 | I1212 01:16:41.973557 40332 net.cpp:454] conv4 <- conv3 401 | I1212 01:16:41.973567 40332 net.cpp:411] conv4 -> conv4 402 | I1212 
01:16:42.002802 40335 blocking_queue.cpp:50] Waiting for data 403 | I1212 01:16:42.011420 40332 net.cpp:150] Setting up conv4 404 | I1212 01:16:42.011440 40332 net.cpp:157] Top shape: 256 384 13 13 (16613376) 405 | I1212 01:16:42.011445 40332 net.cpp:165] Memory required for data: 1424983040 406 | I1212 01:16:42.011456 40332 layer_factory.hpp:77] Creating layer relu4 407 | I1212 01:16:42.011471 40332 net.cpp:106] Creating Layer relu4 408 | I1212 01:16:42.011476 40332 net.cpp:454] relu4 <- conv4 409 | I1212 01:16:42.011492 40332 net.cpp:397] relu4 -> conv4 (in-place) 410 | I1212 01:16:42.011638 40332 net.cpp:150] Setting up relu4 411 | I1212 01:16:42.011647 40332 net.cpp:157] Top shape: 256 384 13 13 (16613376) 412 | I1212 01:16:42.011651 40332 net.cpp:165] Memory required for data: 1491436544 413 | I1212 01:16:42.011656 40332 layer_factory.hpp:77] Creating layer conv5 414 | I1212 01:16:42.011669 40332 net.cpp:106] Creating Layer conv5 415 | I1212 01:16:42.011673 40332 net.cpp:454] conv5 <- conv4 416 | I1212 01:16:42.011692 40332 net.cpp:411] conv5 -> conv5 417 | I1212 01:16:42.050737 40332 net.cpp:150] Setting up conv5 418 | I1212 01:16:42.050758 40332 net.cpp:157] Top shape: 256 256 13 13 (11075584) 419 | I1212 01:16:42.050763 40332 net.cpp:165] Memory required for data: 1535738880 420 | I1212 01:16:42.050776 40332 layer_factory.hpp:77] Creating layer relu5 421 | I1212 01:16:42.050786 40332 net.cpp:106] Creating Layer relu5 422 | I1212 01:16:42.050791 40332 net.cpp:454] relu5 <- conv5 423 | I1212 01:16:42.050798 40332 net.cpp:397] relu5 -> conv5 (in-place) 424 | I1212 01:16:42.050946 40332 net.cpp:150] Setting up relu5 425 | I1212 01:16:42.050956 40332 net.cpp:157] Top shape: 256 256 13 13 (11075584) 426 | I1212 01:16:42.050961 40332 net.cpp:165] Memory required for data: 1580041216 427 | I1212 01:16:42.050964 40332 layer_factory.hpp:77] Creating layer pool5 428 | I1212 01:16:42.050976 40332 net.cpp:106] Creating Layer pool5 429 | I1212 01:16:42.050979 40332 net.cpp:454] pool5 <- conv5 430 | I1212 01:16:42.050997 40332 net.cpp:411] pool5 -> pool5 431 | I1212 01:16:42.051321 40332 net.cpp:150] Setting up pool5 432 | I1212 01:16:42.051331 40332 net.cpp:157] Top shape: 256 256 6 6 (2359296) 433 | I1212 01:16:42.051336 40332 net.cpp:165] Memory required for data: 1589478400 434 | I1212 01:16:42.051340 40332 layer_factory.hpp:77] Creating layer fc6 435 | I1212 01:16:42.051352 40332 net.cpp:106] Creating Layer fc6 436 | I1212 01:16:42.051357 40332 net.cpp:454] fc6 <- pool5 437 | I1212 01:16:42.051363 40332 net.cpp:411] fc6 -> fc6 438 | I1212 01:16:43.094733 40332 net.cpp:150] Setting up fc6 439 | I1212 01:16:43.094801 40332 net.cpp:157] Top shape: 256 4096 (1048576) 440 | I1212 01:16:43.094806 40332 net.cpp:165] Memory required for data: 1593672704 441 | I1212 01:16:43.094828 40332 layer_factory.hpp:77] Creating layer relu6 442 | I1212 01:16:43.094853 40332 net.cpp:106] Creating Layer relu6 443 | I1212 01:16:43.094863 40332 net.cpp:454] relu6 <- fc6 444 | I1212 01:16:43.094873 40332 net.cpp:397] relu6 -> fc6 (in-place) 445 | I1212 01:16:43.095288 40332 net.cpp:150] Setting up relu6 446 | I1212 01:16:43.095300 40332 net.cpp:157] Top shape: 256 4096 (1048576) 447 | I1212 01:16:43.095309 40332 net.cpp:165] Memory required for data: 1597867008 448 | I1212 01:16:43.095340 40332 layer_factory.hpp:77] Creating layer drop6 449 | I1212 01:16:43.095379 40332 net.cpp:106] Creating Layer drop6 450 | I1212 01:16:43.095384 40332 net.cpp:454] drop6 <- fc6 451 | I1212 01:16:43.095391 40332 net.cpp:397] drop6 
-> fc6 (in-place) 452 | I1212 01:16:43.095423 40332 net.cpp:150] Setting up drop6 453 | I1212 01:16:43.095430 40332 net.cpp:157] Top shape: 256 4096 (1048576) 454 | I1212 01:16:43.095434 40332 net.cpp:165] Memory required for data: 1602061312 455 | I1212 01:16:43.095438 40332 layer_factory.hpp:77] Creating layer fc7 456 | I1212 01:16:43.095456 40332 net.cpp:106] Creating Layer fc7 457 | I1212 01:16:43.095461 40332 net.cpp:454] fc7 <- fc6 458 | I1212 01:16:43.095466 40332 net.cpp:411] fc7 -> fc7 459 | I1212 01:16:43.556849 40332 net.cpp:150] Setting up fc7 460 | I1212 01:16:43.556907 40332 net.cpp:157] Top shape: 256 4096 (1048576) 461 | I1212 01:16:43.556912 40332 net.cpp:165] Memory required for data: 1606255616 462 | I1212 01:16:43.556933 40332 layer_factory.hpp:77] Creating layer relu7 463 | I1212 01:16:43.556972 40332 net.cpp:106] Creating Layer relu7 464 | I1212 01:16:43.556982 40332 net.cpp:454] relu7 <- fc7 465 | I1212 01:16:43.556994 40332 net.cpp:397] relu7 -> fc7 (in-place) 466 | I1212 01:16:43.557718 40332 net.cpp:150] Setting up relu7 467 | I1212 01:16:43.557731 40332 net.cpp:157] Top shape: 256 4096 (1048576) 468 | I1212 01:16:43.557735 40332 net.cpp:165] Memory required for data: 1610449920 469 | I1212 01:16:43.557740 40332 layer_factory.hpp:77] Creating layer drop7 470 | I1212 01:16:43.557759 40332 net.cpp:106] Creating Layer drop7 471 | I1212 01:16:43.557764 40332 net.cpp:454] drop7 <- fc7 472 | I1212 01:16:43.557772 40332 net.cpp:397] drop7 -> fc7 (in-place) 473 | I1212 01:16:43.557796 40332 net.cpp:150] Setting up drop7 474 | I1212 01:16:43.557803 40332 net.cpp:157] Top shape: 256 4096 (1048576) 475 | I1212 01:16:43.557807 40332 net.cpp:165] Memory required for data: 1614644224 476 | I1212 01:16:43.557812 40332 layer_factory.hpp:77] Creating layer fc8 477 | I1212 01:16:43.557826 40332 net.cpp:106] Creating Layer fc8 478 | I1212 01:16:43.557832 40332 net.cpp:454] fc8 <- fc7 479 | I1212 01:16:43.557839 40332 net.cpp:411] fc8 -> fc8 480 | I1212 01:16:43.667419 40332 net.cpp:150] Setting up fc8 481 | I1212 01:16:43.667448 40332 net.cpp:157] Top shape: 256 1000 (256000) 482 | I1212 01:16:43.667454 40332 net.cpp:165] Memory required for data: 1615668224 483 | I1212 01:16:43.667466 40332 layer_factory.hpp:77] Creating layer loss 484 | I1212 01:16:43.667481 40332 net.cpp:106] Creating Layer loss 485 | I1212 01:16:43.667486 40332 net.cpp:454] loss <- fc8 486 | I1212 01:16:43.667492 40332 net.cpp:454] loss <- label 487 | I1212 01:16:43.667506 40332 net.cpp:411] loss -> loss 488 | I1212 01:16:43.667521 40332 layer_factory.hpp:77] Creating layer loss 489 | I1212 01:16:43.668579 40332 net.cpp:150] Setting up loss 490 | I1212 01:16:43.668591 40332 net.cpp:157] Top shape: (1) 491 | I1212 01:16:43.668594 40332 net.cpp:160] with loss weight 1 492 | I1212 01:16:43.668633 40332 net.cpp:165] Memory required for data: 1615668228 493 | I1212 01:16:43.668637 40332 net.cpp:226] loss needs backward computation. 494 | I1212 01:16:43.668642 40332 net.cpp:226] fc8 needs backward computation. 495 | I1212 01:16:43.668647 40332 net.cpp:226] drop7 needs backward computation. 496 | I1212 01:16:43.668649 40332 net.cpp:226] relu7 needs backward computation. 497 | I1212 01:16:43.668653 40332 net.cpp:226] fc7 needs backward computation. 498 | I1212 01:16:43.668658 40332 net.cpp:226] drop6 needs backward computation. 499 | I1212 01:16:43.668661 40332 net.cpp:226] relu6 needs backward computation. 500 | I1212 01:16:43.668664 40332 net.cpp:226] fc6 needs backward computation. 
501 | I1212 01:16:43.668670 40332 net.cpp:226] pool5 needs backward computation. 502 | I1212 01:16:43.668674 40332 net.cpp:226] relu5 needs backward computation. 503 | I1212 01:16:43.668679 40332 net.cpp:226] conv5 needs backward computation. 504 | I1212 01:16:43.668684 40332 net.cpp:226] relu4 needs backward computation. 505 | I1212 01:16:43.668689 40332 net.cpp:226] conv4 needs backward computation. 506 | I1212 01:16:43.668692 40332 net.cpp:226] relu3 needs backward computation. 507 | I1212 01:16:43.668697 40332 net.cpp:226] conv3 needs backward computation. 508 | I1212 01:16:43.668702 40332 net.cpp:226] pool2 needs backward computation. 509 | I1212 01:16:43.668711 40332 net.cpp:226] relu2 needs backward computation. 510 | I1212 01:16:43.668748 40332 net.cpp:226] conv2 needs backward computation. 511 | I1212 01:16:43.668753 40332 net.cpp:226] pool1 needs backward computation. 512 | I1212 01:16:43.668757 40332 net.cpp:226] relu1 needs backward computation. 513 | I1212 01:16:43.668761 40332 net.cpp:226] conv1 needs backward computation. 514 | I1212 01:16:43.668767 40332 net.cpp:228] data does not need backward computation. 515 | I1212 01:16:43.668771 40332 net.cpp:270] This network produces output loss 516 | I1212 01:16:43.668787 40332 net.cpp:283] Network initialization done. 517 | I1212 01:16:43.668884 40332 caffe.cpp:309] Performing Forward 518 | I1212 01:16:44.038889 40332 caffe.cpp:314] Initial loss: 6.93382 519 | I1212 01:16:44.038939 40332 caffe.cpp:315] Performing Backward 520 | I1212 01:16:44.043067 40332 caffe.cpp:323] *** Benchmark begins *** 521 | I1212 01:16:44.043078 40332 caffe.cpp:324] Testing for 10 iterations. 522 | I1212 01:16:46.011155 40332 caffe.cpp:352] Iteration: 1 forward-backward time: 1140 ms. 523 | I1212 01:16:47.155781 40332 caffe.cpp:352] Iteration: 2 forward-backward time: 1144.47 ms. 524 | I1212 01:16:48.294250 40332 caffe.cpp:352] Iteration: 3 forward-backward time: 1138.34 ms. 525 | I1212 01:16:49.432665 40332 caffe.cpp:352] Iteration: 4 forward-backward time: 1138.29 ms. 526 | I1212 01:16:50.570600 40332 caffe.cpp:352] Iteration: 5 forward-backward time: 1137.81 ms. 527 | I1212 01:16:51.709350 40332 caffe.cpp:352] Iteration: 6 forward-backward time: 1138.65 ms. 528 | I1212 01:16:52.846112 40332 caffe.cpp:352] Iteration: 7 forward-backward time: 1136.65 ms. 529 | I1212 01:16:53.984618 40332 caffe.cpp:352] Iteration: 8 forward-backward time: 1138.38 ms. 530 | I1212 01:16:55.122740 40332 caffe.cpp:352] Iteration: 9 forward-backward time: 1138.02 ms. 531 | I1212 01:16:56.258673 40332 caffe.cpp:352] Iteration: 10 forward-backward time: 1135.83 ms. 532 | I1212 01:16:56.258728 40332 caffe.cpp:355] Average time per layer: 533 | I1212 01:16:56.258733 40332 caffe.cpp:358] data forward: 1.70459 ms. 534 | I1212 01:16:56.258739 40332 caffe.cpp:361] data backward: 0.0043392 ms. 535 | I1212 01:16:56.258744 40332 caffe.cpp:358] conv1 forward: 38.4802 ms. 536 | I1212 01:16:56.258750 40332 caffe.cpp:361] conv1 backward: 48.4646 ms. 537 | I1212 01:16:56.258754 40332 caffe.cpp:358] relu1 forward: 3.21308 ms. 538 | I1212 01:16:56.258759 40332 caffe.cpp:361] relu1 backward: 4.87886 ms. 539 | I1212 01:16:56.258764 40332 caffe.cpp:358] pool1 forward: 4.21732 ms. 540 | I1212 01:16:56.258767 40332 caffe.cpp:361] pool1 backward: 19.1974 ms. 541 | I1212 01:16:56.258772 40332 caffe.cpp:358] conv2 forward: 105.522 ms. 542 | I1212 01:16:56.258777 40332 caffe.cpp:361] conv2 backward: 287.43 ms. 543 | I1212 01:16:56.258783 40332 caffe.cpp:358] relu2 forward: 2.18696 ms. 
544 | I1212 01:16:56.258787 40332 caffe.cpp:361] relu2 backward: 3.26851 ms. 545 | I1212 01:16:56.258792 40332 caffe.cpp:358] pool2 forward: 2.58741 ms. 546 | I1212 01:16:56.258796 40332 caffe.cpp:361] pool2 backward: 10.4666 ms. 547 | I1212 01:16:56.258801 40332 caffe.cpp:358] conv3 forward: 41.0432 ms. 548 | I1212 01:16:56.258806 40332 caffe.cpp:361] conv3 backward: 117.781 ms. 549 | I1212 01:16:56.258811 40332 caffe.cpp:358] relu3 forward: 0.88577 ms. 550 | I1212 01:16:56.258816 40332 caffe.cpp:361] relu3 backward: 1.39212 ms. 551 | I1212 01:16:56.258821 40332 caffe.cpp:358] conv4 forward: 60.3497 ms. 552 | I1212 01:16:56.258826 40332 caffe.cpp:361] conv4 backward: 177.144 ms. 553 | I1212 01:16:56.258831 40332 caffe.cpp:358] relu4 forward: 0.872438 ms. 554 | I1212 01:16:56.258836 40332 caffe.cpp:361] relu4 backward: 1.37873 ms. 555 | I1212 01:16:56.258841 40332 caffe.cpp:358] conv5 forward: 36.2849 ms. 556 | I1212 01:16:56.258846 40332 caffe.cpp:361] conv5 backward: 117.303 ms. 557 | I1212 01:16:56.258852 40332 caffe.cpp:358] relu5 forward: 0.586986 ms. 558 | I1212 01:16:56.258855 40332 caffe.cpp:361] relu5 backward: 0.916598 ms. 559 | I1212 01:16:56.258860 40332 caffe.cpp:358] pool5 forward: 0.630026 ms. 560 | I1212 01:16:56.258865 40332 caffe.cpp:361] pool5 backward: 2.59878 ms. 561 | I1212 01:16:56.258870 40332 caffe.cpp:358] fc6 forward: 10.3887 ms. 562 | I1212 01:16:56.258884 40332 caffe.cpp:361] fc6 backward: 17.0964 ms. 563 | I1212 01:16:56.258925 40332 caffe.cpp:358] relu6 forward: 0.068016 ms. 564 | I1212 01:16:56.258932 40332 caffe.cpp:361] relu6 backward: 0.0939072 ms. 565 | I1212 01:16:56.258937 40332 caffe.cpp:358] drop6 forward: 0.171498 ms. 566 | I1212 01:16:56.258941 40332 caffe.cpp:361] drop6 backward: 0.0838112 ms. 567 | I1212 01:16:56.258946 40332 caffe.cpp:358] fc7 forward: 5.36998 ms. 568 | I1212 01:16:56.258950 40332 caffe.cpp:361] fc7 backward: 8.89202 ms. 569 | I1212 01:16:56.258955 40332 caffe.cpp:358] relu7 forward: 0.0690176 ms. 570 | I1212 01:16:56.258960 40332 caffe.cpp:361] relu7 backward: 0.103798 ms. 571 | I1212 01:16:56.258965 40332 caffe.cpp:358] drop7 forward: 0.145693 ms. 572 | I1212 01:16:56.258970 40332 caffe.cpp:361] drop7 backward: 0.0841472 ms. 573 | I1212 01:16:56.258975 40332 caffe.cpp:358] fc8 forward: 1.58494 ms. 574 | I1212 01:16:56.258978 40332 caffe.cpp:361] fc8 backward: 2.457 ms. 575 | I1212 01:16:56.258983 40332 caffe.cpp:358] loss forward: 0.234429 ms. 576 | I1212 01:16:56.258987 40332 caffe.cpp:361] loss backward: 0.0610048 ms. 577 | I1212 01:16:56.259012 40332 caffe.cpp:366] Average Forward pass: 317.065 ms. 578 | I1212 01:16:56.259021 40332 caffe.cpp:368] Average Backward pass: 821.549 ms. 579 | I1212 01:16:56.259030 40332 caffe.cpp:370] Average Forward-Backward: 1138.77 ms. 580 | I1212 01:16:56.259039 40332 caffe.cpp:372] Total Time: 11387.7 ms. 
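For reference, the timing summary above converts directly into a throughput figure. A minimal sketch (the batch size of 256 is taken from the data layer echoed at the top of this log, and the 1138.77 ms value from the "Average Forward-Backward" line; this is an illustration, not part of the original scripts):

# Throughput implied by the `caffe time` summary above.
batch_size = 256              # per-iteration batch from the data layer
avg_fwd_bwd_ms = 1138.77      # "Average Forward-Backward" reported above
print("AlexNet: %.1f images/sec" % (batch_size / (avg_fwd_bwd_ms / 1000.0)))
# -> AlexNet: 224.8 images/sec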
581 | I1212 01:16:56.259044 40332 caffe.cpp:373] *** Benchmark ends *** 582 | -------------------------------------------------------------------------------- /caffe/alexnet_time_1GPU.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe time --model=./alexnet.prototxt --iterations=10 -gpu=0 >alexnet_time_1GPU.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/createFakeData.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lmdb 3 | import caffe as c 4 | 5 | featDim = 512 6 | labDim = 10000 7 | mbSize = 8192 8 | totalCount = mbSize * 16 9 | 10 | features = np.random.randn(totalCount, 1, 1, featDim) 11 | labels = np.random.randint(0, labDim, size=(totalCount,)) 12 | 13 | db = lmdb.open('./fake_data.lmdb', map_size=features.nbytes * 10) 14 | 15 | with db.begin(write = True) as txn: 16 | for i in range(totalCount): 17 | d = c.proto.caffe_pb2.Datum() 18 | d.channels = features.shape[1] 19 | d.height = features.shape[2] 20 | d.width = features.shape[3] 21 | d.data = features[i].tostring() 22 | d.label = labels[i] 23 | txn.put('{:08}'.format(i), d.SerializeToString()) 24 | 25 | -------------------------------------------------------------------------------- /caffe/createFakeImageNet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lmdb 3 | import caffe as c 4 | 5 | mbSize = 256 6 | totalCount = mbSize * 16 7 | 8 | features = np.random.randn(totalCount, 3, 224, 224) 9 | labels = np.random.randint(0, 1000, size=(totalCount,)) 10 | 11 | db = lmdb.open('./fake_image_net.lmdb', map_size=features.nbytes * 10) 12 | 13 | with db.begin(write = True) as txn: 14 | for i in range(totalCount): 15 | d = c.proto.caffe_pb2.Datum() 16 | d.channels = features.shape[1] 17 | d.height = features.shape[2] 18 | d.width = features.shape[3] 19 | d.data = features[i].tostring() 20 | d.label = labels[i] 21 | txn.put('{:08}'.format(i), d.SerializeToString()) 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /caffe/ffn.prototxt: -------------------------------------------------------------------------------- 1 | name: "FFN" 2 | layer { 3 | name: "data" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | data_param: { 11 | batch_size: 8192 12 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 13 | backend: LMDB 14 | } 15 | } 16 | 17 | layer { 18 | name: "H1" 19 | type: "InnerProduct" 20 | bottom: "data" 21 | top: "H1" 22 | inner_product_param { 23 | num_output: 2048 24 | } 25 | } 26 | 27 | layer { 28 | name: "H1_A" 29 | type: "Sigmoid" 30 | bottom: "H1" 31 | top: "H1" 32 | } 33 | 34 | layer { 35 | name: "H2" 36 | type: "InnerProduct" 37 | bottom: "H1" 38 | top: "H2" 39 | inner_product_param { 40 | num_output: 2048 41 | } 42 | } 43 | 44 | layer { 45 | name: "H2_A" 46 | type: "Sigmoid" 47 | bottom: "H2" 48 | top: "H2" 49 | } 50 | 51 | layer { 52 | name: "H3" 53 | type: "InnerProduct" 54 | bottom: "H2" 55 | top: "H3" 56 | inner_product_param { 57 | num_output: 2048 58 | } 59 | } 60 | 61 | layer { 62 | name: "H3_A" 63 | type: "Sigmoid" 64 | bottom: "H3" 65 | top: "H3" 66 | } 67 | 68 | layer { 69 | name: "H4" 70 | type: "InnerProduct" 71 | bottom: "H3" 72 | top: "H4" 73 | inner_product_param { 74 | num_output: 2048 75 | } 76 
| } 77 | 78 | layer { 79 | name: "H4_A" 80 | type: "Sigmoid" 81 | bottom: "H4" 82 | top: "H4" 83 | } 84 | 85 | layer { 86 | name: "L" 87 | type: "InnerProduct" 88 | bottom: "H4" 89 | top: "L" 90 | inner_product_param { 91 | num_output: 10000 92 | } 93 | } 94 | 95 | layer { 96 | name: "loss" 97 | type: "SoftmaxWithLoss" 98 | bottom: "L" 99 | bottom: "label" 100 | top: "loss" 101 | } 102 | 103 | -------------------------------------------------------------------------------- /caffe/ffn_1GPU.log: -------------------------------------------------------------------------------- 1 | I1208 05:38:49.425312 49471 caffe.cpp:184] Using GPUs 0 2 | I1208 05:38:59.337173 49471 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.001 4 | max_iter: 100 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./ffn.prototxt" 9 | I1208 05:38:59.337251 49471 solver.cpp:91] Creating training net from net file: ./ffn.prototxt 10 | I1208 05:38:59.337708 49471 net.cpp:49] Initializing net from parameters: 11 | name: "FFN" 12 | state { 13 | phase: TRAIN 14 | } 15 | layer { 16 | name: "data" 17 | type: "Data" 18 | top: "data" 19 | top: "label" 20 | include { 21 | phase: TRAIN 22 | } 23 | data_param { 24 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 25 | batch_size: 8192 26 | backend: LMDB 27 | } 28 | } 29 | layer { 30 | name: "H1" 31 | type: "InnerProduct" 32 | bottom: "data" 33 | top: "H1" 34 | inner_product_param { 35 | num_output: 2048 36 | } 37 | } 38 | layer { 39 | name: "H1_A" 40 | type: "Sigmoid" 41 | bottom: "H1" 42 | top: "H1" 43 | } 44 | layer { 45 | name: "H2" 46 | type: "InnerProduct" 47 | bottom: "H1" 48 | top: "H2" 49 | inner_product_param { 50 | num_output: 2048 51 | } 52 | } 53 | layer { 54 | name: "H2_A" 55 | type: "Sigmoid" 56 | bottom: "H2" 57 | top: "H2" 58 | } 59 | layer { 60 | name: "H3" 61 | type: "InnerProduct" 62 | bottom: "H2" 63 | top: "H3" 64 | inner_product_param { 65 | num_output: 2048 66 | } 67 | } 68 | layer { 69 | name: "H3_A" 70 | type: "Sigmoid" 71 | bottom: "H3" 72 | top: "H3" 73 | } 74 | layer { 75 | name: "H4" 76 | type: "InnerProduct" 77 | bottom: "H3" 78 | top: "H4" 79 | inner_product_param { 80 | num_output: 2048 81 | } 82 | } 83 | layer { 84 | name: "H4_A" 85 | type: "Sigmoid" 86 | bottom: "H4" 87 | top: "H4" 88 | } 89 | layer { 90 | name: "L" 91 | type: "InnerProduct" 92 | bottom: "H4" 93 | top: "L" 94 | inner_product_param { 95 | num_output: 10000 96 | } 97 | } 98 | layer { 99 | name: "loss" 100 | type: "SoftmaxWithLoss" 101 | bottom: "L" 102 | bottom: "label" 103 | top: "loss" 104 | } 105 | I1208 05:38:59.337765 49471 layer_factory.hpp:77] Creating layer data 106 | I1208 05:38:59.342125 49471 net.cpp:106] Creating Layer data 107 | I1208 05:38:59.342136 49471 net.cpp:411] data -> data 108 | I1208 05:38:59.342159 49471 net.cpp:411] data -> label 109 | I1208 05:38:59.344041 49474 db_lmdb.cpp:38] Opened lmdb /var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb 110 | I1208 05:38:59.358345 49471 data_layer.cpp:41] output data size: 8192,1,1,512 111 | I1208 05:38:59.387707 49471 net.cpp:150] Setting up data 112 | I1208 05:38:59.387778 49471 net.cpp:157] Top shape: 8192 1 1 512 (4194304) 113 | I1208 05:38:59.387785 49471 net.cpp:157] Top shape: 8192 (8192) 114 | I1208 05:38:59.387789 49471 net.cpp:165] Memory required for data: 16809984 115 | I1208 05:38:59.387799 49471 layer_factory.hpp:77] Creating layer H1 116 | I1208 05:38:59.387814 49471 
net.cpp:106] Creating Layer H1 117 | I1208 05:38:59.387820 49471 net.cpp:454] H1 <- data 118 | I1208 05:38:59.387833 49471 net.cpp:411] H1 -> H1 119 | I1208 05:38:59.390786 49471 net.cpp:150] Setting up H1 120 | I1208 05:38:59.390800 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 121 | I1208 05:38:59.390805 49471 net.cpp:165] Memory required for data: 83918848 122 | I1208 05:38:59.390818 49471 layer_factory.hpp:77] Creating layer H1_A 123 | I1208 05:38:59.390830 49471 net.cpp:106] Creating Layer H1_A 124 | I1208 05:38:59.390836 49471 net.cpp:454] H1_A <- H1 125 | I1208 05:38:59.390841 49471 net.cpp:397] H1_A -> H1 (in-place) 126 | I1208 05:38:59.412019 49475 blocking_queue.cpp:50] Waiting for data 127 | I1208 05:38:59.438570 49475 blocking_queue.cpp:50] Waiting for data 128 | I1208 05:38:59.451354 49475 blocking_queue.cpp:50] Waiting for data 129 | I1208 05:38:59.464041 49475 blocking_queue.cpp:50] Waiting for data 130 | I1208 05:38:59.485954 49475 blocking_queue.cpp:50] Waiting for data 131 | I1208 05:38:59.498752 49475 blocking_queue.cpp:50] Waiting for data 132 | I1208 05:38:59.511708 49475 blocking_queue.cpp:50] Waiting for data 133 | I1208 05:38:59.513068 49471 net.cpp:150] Setting up H1_A 134 | I1208 05:38:59.513108 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 135 | I1208 05:38:59.513113 49471 net.cpp:165] Memory required for data: 151027712 136 | I1208 05:38:59.513119 49471 layer_factory.hpp:77] Creating layer H2 137 | I1208 05:38:59.513131 49471 net.cpp:106] Creating Layer H2 138 | I1208 05:38:59.513137 49471 net.cpp:454] H2 <- H1 139 | I1208 05:38:59.513147 49471 net.cpp:411] H2 -> H2 140 | I1208 05:38:59.521333 49471 net.cpp:150] Setting up H2 141 | I1208 05:38:59.521363 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 142 | I1208 05:38:59.521409 49471 net.cpp:165] Memory required for data: 218136576 143 | I1208 05:38:59.521428 49471 layer_factory.hpp:77] Creating layer H2_A 144 | I1208 05:38:59.521445 49471 net.cpp:106] Creating Layer H2_A 145 | I1208 05:38:59.521450 49471 net.cpp:454] H2_A <- H2 146 | I1208 05:38:59.521458 49471 net.cpp:397] H2_A -> H2 (in-place) 147 | I1208 05:38:59.521807 49471 net.cpp:150] Setting up H2_A 148 | I1208 05:38:59.521818 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 149 | I1208 05:38:59.521826 49471 net.cpp:165] Memory required for data: 285245440 150 | I1208 05:38:59.521831 49471 layer_factory.hpp:77] Creating layer H3 151 | I1208 05:38:59.521841 49471 net.cpp:106] Creating Layer H3 152 | I1208 05:38:59.521844 49471 net.cpp:454] H3 <- H2 153 | I1208 05:38:59.521850 49471 net.cpp:411] H3 -> H3 154 | I1208 05:38:59.530151 49471 net.cpp:150] Setting up H3 155 | I1208 05:38:59.530181 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 156 | I1208 05:38:59.530185 49471 net.cpp:165] Memory required for data: 352354304 157 | I1208 05:38:59.530199 49471 layer_factory.hpp:77] Creating layer H3_A 158 | I1208 05:38:59.530210 49471 net.cpp:106] Creating Layer H3_A 159 | I1208 05:38:59.530220 49471 net.cpp:454] H3_A <- H3 160 | I1208 05:38:59.530227 49471 net.cpp:397] H3_A -> H3 (in-place) 161 | I1208 05:38:59.530416 49471 net.cpp:150] Setting up H3_A 162 | I1208 05:38:59.530426 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 163 | I1208 05:38:59.530431 49471 net.cpp:165] Memory required for data: 419463168 164 | I1208 05:38:59.530434 49471 layer_factory.hpp:77] Creating layer H4 165 | I1208 05:38:59.530444 49471 net.cpp:106] Creating Layer H4 166 | I1208 05:38:59.530448 49471 net.cpp:454] H4 <- H3 167 | I1208 05:38:59.530455 49471 
net.cpp:411] H4 -> H4 168 | I1208 05:38:59.538918 49475 blocking_queue.cpp:50] Waiting for data 169 | I1208 05:38:59.539352 49471 net.cpp:150] Setting up H4 170 | I1208 05:38:59.539376 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 171 | I1208 05:38:59.539381 49471 net.cpp:165] Memory required for data: 486572032 172 | I1208 05:38:59.539389 49471 layer_factory.hpp:77] Creating layer H4_A 173 | I1208 05:38:59.539404 49471 net.cpp:106] Creating Layer H4_A 174 | I1208 05:38:59.539412 49471 net.cpp:454] H4_A <- H4 175 | I1208 05:38:59.539419 49471 net.cpp:397] H4_A -> H4 (in-place) 176 | I1208 05:38:59.539780 49471 net.cpp:150] Setting up H4_A 177 | I1208 05:38:59.539791 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 178 | I1208 05:38:59.539798 49471 net.cpp:165] Memory required for data: 553680896 179 | I1208 05:38:59.539803 49471 layer_factory.hpp:77] Creating layer L 180 | I1208 05:38:59.539810 49471 net.cpp:106] Creating Layer L 181 | I1208 05:38:59.539814 49471 net.cpp:454] L <- H4 182 | I1208 05:38:59.539822 49471 net.cpp:411] L -> L 183 | I1208 05:38:59.583822 49471 net.cpp:150] Setting up L 184 | I1208 05:38:59.583858 49471 net.cpp:157] Top shape: 8192 10000 (81920000) 185 | I1208 05:38:59.583863 49471 net.cpp:165] Memory required for data: 881360896 186 | I1208 05:38:59.583878 49471 layer_factory.hpp:77] Creating layer loss 187 | I1208 05:38:59.583896 49471 net.cpp:106] Creating Layer loss 188 | I1208 05:38:59.583902 49471 net.cpp:454] loss <- L 189 | I1208 05:38:59.583910 49471 net.cpp:454] loss <- label 190 | I1208 05:38:59.583922 49471 net.cpp:411] loss -> loss 191 | I1208 05:38:59.583940 49471 layer_factory.hpp:77] Creating layer loss 192 | I1208 05:38:59.765748 49471 net.cpp:150] Setting up loss 193 | I1208 05:38:59.765797 49471 net.cpp:157] Top shape: (1) 194 | I1208 05:38:59.765804 49471 net.cpp:160] with loss weight 1 195 | I1208 05:38:59.765825 49471 net.cpp:165] Memory required for data: 881360900 196 | I1208 05:38:59.765832 49471 net.cpp:226] loss needs backward computation. 197 | I1208 05:38:59.765840 49471 net.cpp:226] L needs backward computation. 198 | I1208 05:38:59.765844 49471 net.cpp:226] H4_A needs backward computation. 199 | I1208 05:38:59.765849 49471 net.cpp:226] H4 needs backward computation. 200 | I1208 05:38:59.765854 49471 net.cpp:226] H3_A needs backward computation. 201 | I1208 05:38:59.765859 49471 net.cpp:226] H3 needs backward computation. 202 | I1208 05:38:59.765864 49471 net.cpp:226] H2_A needs backward computation. 203 | I1208 05:38:59.765869 49471 net.cpp:226] H2 needs backward computation. 204 | I1208 05:38:59.765874 49471 net.cpp:226] H1_A needs backward computation. 205 | I1208 05:38:59.765887 49471 net.cpp:226] H1 needs backward computation. 206 | I1208 05:38:59.765934 49471 net.cpp:228] data does not need backward computation. 207 | I1208 05:38:59.765940 49471 net.cpp:270] This network produces output loss 208 | I1208 05:38:59.765955 49471 net.cpp:283] Network initialization done. 209 | I1208 05:38:59.765995 49471 solver.cpp:60] Solver scaffolding done. 
210 | I1208 05:38:59.766335 49471 caffe.cpp:212] Starting Optimization 211 | I1208 05:38:59.766345 49471 solver.cpp:288] Solving FFN 212 | I1208 05:38:59.766347 49471 solver.cpp:289] Learning Rate Policy: fixed 213 | I1208 05:40:03.220875 49471 solver.cpp:459] Snapshotting to binary proto file _iter_100.caffemodel 214 | I1208 05:40:05.152649 49471 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_100.solverstate 215 | I1208 05:40:06.284504 49471 solver.cpp:326] Optimization Done. 216 | I1208 05:40:06.284602 49471 caffe.cpp:215] Optimization Done. 217 | -------------------------------------------------------------------------------- /caffe/ffn_1GPU.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import caffe as c 3 | import time 4 | 5 | mbSize = 512 6 | count = mbSize * 16 7 | 8 | feat = np.random.randn(count, 1, 1, 512).astype(np.float32) 9 | lab = np.random.randint(0, 10000, size=(count, 1, 1, 1)).astype(np.float32) 10 | 11 | def createSolver(solverFile): 12 | c.set_mode_gpu() 13 | solver = c.SGDSolver(solverFile) 14 | solver.net.set_input_arrays(feat, lab) 15 | return solver 16 | 17 | def samplesPerSec(minibatchSize, processingTime): 18 | return minibatchSize / processingTime 19 | 20 | def runBenchmark(solver, iter): 21 | startTime = time.time() 22 | solver.step(iter) 23 | stepTime = time.time() - startTime 24 | print "Samples per sec = %d." % samplesPerSec(mbSize * iter, stepTime) 25 | 26 | s = createSolver('./ffn_solver_md.prototxt') 27 | #runBenchmark(s, 10) 28 | 29 | -------------------------------------------------------------------------------- /caffe/ffn_1GPU.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe train -solver=./ffn_solver.prototxt -gpu=0 >ffn_1GPU.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/ffn_2GPUs.log: -------------------------------------------------------------------------------- 1 | I1208 05:42:01.665724 49481 caffe.cpp:184] Using GPUs 0, 1 2 | I1208 05:42:11.086211 49481 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.001 4 | max_iter: 100 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./ffn_2GPUs.prototxt" 9 | I1208 05:42:11.086272 49481 solver.cpp:91] Creating training net from net file: ./ffn_2GPUs.prototxt 10 | I1208 05:42:11.086724 49481 net.cpp:49] Initializing net from parameters: 11 | name: "FFN" 12 | state { 13 | phase: TRAIN 14 | } 15 | layer { 16 | name: "data" 17 | type: "Data" 18 | top: "data" 19 | top: "label" 20 | include { 21 | phase: TRAIN 22 | } 23 | data_param { 24 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 25 | batch_size: 4096 26 | backend: LMDB 27 | } 28 | } 29 | layer { 30 | name: "H1" 31 | type: "InnerProduct" 32 | bottom: "data" 33 | top: "H1" 34 | inner_product_param { 35 | num_output: 2048 36 | } 37 | } 38 | layer { 39 | name: "H1_A" 40 | type: "Sigmoid" 41 | bottom: "H1" 42 | top: "H1" 43 | } 44 | layer { 45 | name: "H2" 46 | type: "InnerProduct" 47 | bottom: "H1" 48 | top: "H2" 49 | inner_product_param { 50 | num_output: 2048 51 | } 52 | } 53 | layer { 54 | name: "H2_A" 55 | type: "Sigmoid" 56 | bottom: "H2" 57 | top: "H2" 58 | } 59 | layer { 60 | name: "H3" 61 | type: "InnerProduct" 62 | bottom: "H2" 63 | top: "H3" 64 | inner_product_param { 65 | num_output: 2048 66 | } 67 | } 68 | layer { 69 | name: "H3_A" 70 
| type: "Sigmoid" 71 | bottom: "H3" 72 | top: "H3" 73 | } 74 | layer { 75 | name: "H4" 76 | type: "InnerProduct" 77 | bottom: "H3" 78 | top: "H4" 79 | inner_product_param { 80 | num_output: 2048 81 | } 82 | } 83 | layer { 84 | name: "H4_A" 85 | type: "Sigmoid" 86 | bottom: "H4" 87 | top: "H4" 88 | } 89 | layer { 90 | name: "L" 91 | type: "InnerProduct" 92 | bottom: "H4" 93 | top: "L" 94 | inner_product_param { 95 | num_output: 10000 96 | } 97 | } 98 | layer { 99 | name: "loss" 100 | type: "SoftmaxWithLoss" 101 | bottom: "L" 102 | bottom: "label" 103 | top: "loss" 104 | } 105 | I1208 05:42:11.086781 49481 layer_factory.hpp:77] Creating layer data 106 | I1208 05:42:11.089282 49481 net.cpp:106] Creating Layer data 107 | I1208 05:42:11.089293 49481 net.cpp:411] data -> data 108 | I1208 05:42:11.089318 49481 net.cpp:411] data -> label 109 | I1208 05:42:11.091265 49484 db_lmdb.cpp:38] Opened lmdb /var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb 110 | I1208 05:42:11.102735 49481 data_layer.cpp:41] output data size: 4096,1,1,512 111 | I1208 05:42:11.117261 49481 net.cpp:150] Setting up data 112 | I1208 05:42:11.117303 49481 net.cpp:157] Top shape: 4096 1 1 512 (2097152) 113 | I1208 05:42:11.117310 49481 net.cpp:157] Top shape: 4096 (4096) 114 | I1208 05:42:11.117319 49481 net.cpp:165] Memory required for data: 8404992 115 | I1208 05:42:11.117331 49481 layer_factory.hpp:77] Creating layer H1 116 | I1208 05:42:11.117346 49481 net.cpp:106] Creating Layer H1 117 | I1208 05:42:11.117352 49481 net.cpp:454] H1 <- data 118 | I1208 05:42:11.117363 49481 net.cpp:411] H1 -> H1 119 | I1208 05:42:11.120373 49485 blocking_queue.cpp:50] Waiting for data 120 | I1208 05:42:11.120420 49481 net.cpp:150] Setting up H1 121 | I1208 05:42:11.120434 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 122 | I1208 05:42:11.120437 49481 net.cpp:165] Memory required for data: 41959424 123 | I1208 05:42:11.120451 49481 layer_factory.hpp:77] Creating layer H1_A 124 | I1208 05:42:11.120462 49481 net.cpp:106] Creating Layer H1_A 125 | I1208 05:42:11.120466 49481 net.cpp:454] H1_A <- H1 126 | I1208 05:42:11.120472 49481 net.cpp:397] H1_A -> H1 (in-place) 127 | I1208 05:42:11.213544 49481 net.cpp:150] Setting up H1_A 128 | I1208 05:42:11.213587 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 129 | I1208 05:42:11.213593 49481 net.cpp:165] Memory required for data: 75513856 130 | I1208 05:42:11.213599 49481 layer_factory.hpp:77] Creating layer H2 131 | I1208 05:42:11.213611 49481 net.cpp:106] Creating Layer H2 132 | I1208 05:42:11.213616 49481 net.cpp:454] H2 <- H1 133 | I1208 05:42:11.213626 49481 net.cpp:411] H2 -> H2 134 | I1208 05:42:11.222342 49481 net.cpp:150] Setting up H2 135 | I1208 05:42:11.222368 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 136 | I1208 05:42:11.222371 49481 net.cpp:165] Memory required for data: 109068288 137 | I1208 05:42:11.222383 49481 layer_factory.hpp:77] Creating layer H2_A 138 | I1208 05:42:11.222394 49481 net.cpp:106] Creating Layer H2_A 139 | I1208 05:42:11.222400 49481 net.cpp:454] H2_A <- H2 140 | I1208 05:42:11.222406 49481 net.cpp:397] H2_A -> H2 (in-place) 141 | I1208 05:42:11.222707 49481 net.cpp:150] Setting up H2_A 142 | I1208 05:42:11.222725 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 143 | I1208 05:42:11.222767 49481 net.cpp:165] Memory required for data: 142622720 144 | I1208 05:42:11.222772 49481 layer_factory.hpp:77] Creating layer H3 145 | I1208 05:42:11.222779 49481 net.cpp:106] Creating Layer H3 146 | I1208 
05:42:11.222784 49481 net.cpp:454] H3 <- H2 147 | I1208 05:42:11.222790 49481 net.cpp:411] H3 -> H3 148 | I1208 05:42:11.231565 49481 net.cpp:150] Setting up H3 149 | I1208 05:42:11.231588 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 150 | I1208 05:42:11.231592 49481 net.cpp:165] Memory required for data: 176177152 151 | I1208 05:42:11.231603 49481 layer_factory.hpp:77] Creating layer H3_A 152 | I1208 05:42:11.231611 49481 net.cpp:106] Creating Layer H3_A 153 | I1208 05:42:11.231616 49481 net.cpp:454] H3_A <- H3 154 | I1208 05:42:11.231622 49481 net.cpp:397] H3_A -> H3 (in-place) 155 | I1208 05:42:11.231778 49481 net.cpp:150] Setting up H3_A 156 | I1208 05:42:11.231788 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 157 | I1208 05:42:11.231792 49481 net.cpp:165] Memory required for data: 209731584 158 | I1208 05:42:11.231796 49481 layer_factory.hpp:77] Creating layer H4 159 | I1208 05:42:11.231803 49481 net.cpp:106] Creating Layer H4 160 | I1208 05:42:11.231807 49481 net.cpp:454] H4 <- H3 161 | I1208 05:42:11.231813 49481 net.cpp:411] H4 -> H4 162 | I1208 05:42:11.240564 49481 net.cpp:150] Setting up H4 163 | I1208 05:42:11.240587 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 164 | I1208 05:42:11.240592 49481 net.cpp:165] Memory required for data: 243286016 165 | I1208 05:42:11.240599 49481 layer_factory.hpp:77] Creating layer H4_A 166 | I1208 05:42:11.240609 49481 net.cpp:106] Creating Layer H4_A 167 | I1208 05:42:11.240614 49481 net.cpp:454] H4_A <- H4 168 | I1208 05:42:11.240620 49481 net.cpp:397] H4_A -> H4 (in-place) 169 | I1208 05:42:11.240926 49481 net.cpp:150] Setting up H4_A 170 | I1208 05:42:11.240936 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 171 | I1208 05:42:11.240941 49481 net.cpp:165] Memory required for data: 276840448 172 | I1208 05:42:11.240944 49481 layer_factory.hpp:77] Creating layer L 173 | I1208 05:42:11.240952 49481 net.cpp:106] Creating Layer L 174 | I1208 05:42:11.240957 49481 net.cpp:454] L <- H4 175 | I1208 05:42:11.240963 49481 net.cpp:411] L -> L 176 | I1208 05:42:11.288460 49481 net.cpp:150] Setting up L 177 | I1208 05:42:11.288503 49481 net.cpp:157] Top shape: 4096 10000 (40960000) 178 | I1208 05:42:11.288508 49481 net.cpp:165] Memory required for data: 440680448 179 | I1208 05:42:11.288525 49481 layer_factory.hpp:77] Creating layer loss 180 | I1208 05:42:11.288537 49481 net.cpp:106] Creating Layer loss 181 | I1208 05:42:11.288543 49481 net.cpp:454] loss <- L 182 | I1208 05:42:11.288550 49481 net.cpp:454] loss <- label 183 | I1208 05:42:11.288573 49481 net.cpp:411] loss -> loss 184 | I1208 05:42:11.288590 49481 layer_factory.hpp:77] Creating layer loss 185 | I1208 05:42:11.382125 49481 net.cpp:150] Setting up loss 186 | I1208 05:42:11.382170 49481 net.cpp:157] Top shape: (1) 187 | I1208 05:42:11.382175 49481 net.cpp:160] with loss weight 1 188 | I1208 05:42:11.382203 49481 net.cpp:165] Memory required for data: 440680452 189 | I1208 05:42:11.382210 49481 net.cpp:226] loss needs backward computation. 190 | I1208 05:42:11.382215 49481 net.cpp:226] L needs backward computation. 191 | I1208 05:42:11.382220 49481 net.cpp:226] H4_A needs backward computation. 192 | I1208 05:42:11.382225 49481 net.cpp:226] H4 needs backward computation. 193 | I1208 05:42:11.382230 49481 net.cpp:226] H3_A needs backward computation. 194 | I1208 05:42:11.382233 49481 net.cpp:226] H3 needs backward computation. 195 | I1208 05:42:11.382237 49481 net.cpp:226] H2_A needs backward computation. 196 | I1208 05:42:11.382253 49481 net.cpp:226] H2 needs backward computation. 
197 | I1208 05:42:11.382257 49481 net.cpp:226] H1_A needs backward computation. 198 | I1208 05:42:11.382261 49481 net.cpp:226] H1 needs backward computation. 199 | I1208 05:42:11.382266 49481 net.cpp:228] data does not need backward computation. 200 | I1208 05:42:11.382269 49481 net.cpp:270] This network produces output loss 201 | I1208 05:42:11.382282 49481 net.cpp:283] Network initialization done. 202 | I1208 05:42:11.382318 49481 solver.cpp:60] Solver scaffolding done. 203 | I1208 05:42:11.397910 49481 parallel.cpp:391] GPUs pairs 0:1 204 | I1208 05:42:11.596046 49481 data_layer.cpp:41] output data size: 4096,1,1,512 205 | I1208 05:42:11.620034 49485 blocking_queue.cpp:50] Waiting for data 206 | I1208 05:42:11.635661 49485 blocking_queue.cpp:50] Waiting for data 207 | I1208 05:42:11.649173 49485 blocking_queue.cpp:50] Waiting for data 208 | I1208 05:42:11.664270 49485 blocking_queue.cpp:50] Waiting for data 209 | I1208 05:42:11.681982 49485 blocking_queue.cpp:50] Waiting for data 210 | I1208 05:42:11.697202 49485 blocking_queue.cpp:50] Waiting for data 211 | I1208 05:42:11.711992 49485 blocking_queue.cpp:50] Waiting for data 212 | I1208 05:42:11.728345 49485 blocking_queue.cpp:50] Waiting for data 213 | I1208 05:42:11.743849 49485 blocking_queue.cpp:50] Waiting for data 214 | I1208 05:42:11.758334 49485 blocking_queue.cpp:50] Waiting for data 215 | I1208 05:42:11.775403 49485 blocking_queue.cpp:50] Waiting for data 216 | I1208 05:42:12.035506 49481 parallel.cpp:419] Starting Optimization 217 | I1208 05:42:12.035604 49481 solver.cpp:288] Solving FFN 218 | I1208 05:42:12.035619 49481 solver.cpp:289] Learning Rate Policy: fixed 219 | I1208 05:42:48.475808 49481 solver.cpp:459] Snapshotting to binary proto file _iter_100.caffemodel 220 | I1208 05:42:49.984336 49481 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_100.solverstate 221 | I1208 05:42:50.939898 49481 solver.cpp:326] Optimization Done. 222 | I1208 05:42:50.990241 49481 caffe.cpp:215] Optimization Done. 
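The wall-clock timestamps in this log give a rough samples-per-second figure for the 2-GPU run. A small sketch, assuming Caffe's usual multi-GPU semantics in which each GPU consumes its own batch_size (4096 here) per iteration, so the effective batch is 8192; using the snapshot time as the end marker slightly overstates the training time:

# Rough throughput for the 2-GPU FFN run, from the timestamps above.
iters = 100                 # max_iter in ffn_2GPUs_solver.prototxt
effective_batch = 4096 * 2  # per-GPU batch_size times number of GPUs
seconds = 36.44             # 05:42:12.035 -> 05:42:48.475, from the log above
print("FFN, 2 GPUs: ~%.0f samples/sec" % (iters * effective_batch / seconds))
# -> ~22481 samples/sec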
223 | -------------------------------------------------------------------------------- /caffe/ffn_2GPUs.prototxt: -------------------------------------------------------------------------------- 1 | name: "FFN" 2 | layer { 3 | name: "data" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | data_param: { 11 | batch_size: 4096 12 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 13 | backend: LMDB 14 | } 15 | } 16 | 17 | layer { 18 | name: "H1" 19 | type: "InnerProduct" 20 | bottom: "data" 21 | top: "H1" 22 | inner_product_param { 23 | num_output: 2048 24 | } 25 | } 26 | 27 | layer { 28 | name: "H1_A" 29 | type: "Sigmoid" 30 | bottom: "H1" 31 | top: "H1" 32 | } 33 | 34 | layer { 35 | name: "H2" 36 | type: "InnerProduct" 37 | bottom: "H1" 38 | top: "H2" 39 | inner_product_param { 40 | num_output: 2048 41 | } 42 | } 43 | 44 | layer { 45 | name: "H2_A" 46 | type: "Sigmoid" 47 | bottom: "H2" 48 | top: "H2" 49 | } 50 | 51 | layer { 52 | name: "H3" 53 | type: "InnerProduct" 54 | bottom: "H2" 55 | top: "H3" 56 | inner_product_param { 57 | num_output: 2048 58 | } 59 | } 60 | 61 | layer { 62 | name: "H3_A" 63 | type: "Sigmoid" 64 | bottom: "H3" 65 | top: "H3" 66 | } 67 | 68 | layer { 69 | name: "H4" 70 | type: "InnerProduct" 71 | bottom: "H3" 72 | top: "H4" 73 | inner_product_param { 74 | num_output: 2048 75 | } 76 | } 77 | 78 | layer { 79 | name: "H4_A" 80 | type: "Sigmoid" 81 | bottom: "H4" 82 | top: "H4" 83 | } 84 | 85 | layer { 86 | name: "L" 87 | type: "InnerProduct" 88 | bottom: "H4" 89 | top: "L" 90 | inner_product_param { 91 | num_output: 10000 92 | } 93 | } 94 | 95 | layer { 96 | name: "loss" 97 | type: "SoftmaxWithLoss" 98 | bottom: "L" 99 | bottom: "label" 100 | top: "loss" 101 | } 102 | 103 | -------------------------------------------------------------------------------- /caffe/ffn_2GPUs.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe train -solver=./ffn_2GPUs_solver.prototxt -gpu=0,1 >ffn_2GPUs.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/ffn_2GPUs_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "./ffn_2GPUs.prototxt" 2 | max_iter: 100 3 | base_lr: 0.001 4 | lr_policy: "fixed" 5 | solver_mode: GPU 6 | -------------------------------------------------------------------------------- /caffe/ffn_4GPUs.log: -------------------------------------------------------------------------------- 1 | I1208 05:45:00.539070 49494 caffe.cpp:184] Using GPUs 0, 1, 2, 3 2 | I1208 05:45:10.508654 49494 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.001 4 | max_iter: 100 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./ffn_4GPUs.prototxt" 9 | I1208 05:45:10.508736 49494 solver.cpp:91] Creating training net from net file: ./ffn_4GPUs.prototxt 10 | I1208 05:45:10.509438 49494 net.cpp:49] Initializing net from parameters: 11 | name: "FFN" 12 | state { 13 | phase: TRAIN 14 | } 15 | layer { 16 | name: "data" 17 | type: "Data" 18 | top: "data" 19 | top: "label" 20 | include { 21 | phase: TRAIN 22 | } 23 | data_param { 24 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 25 | batch_size: 2048 26 | backend: LMDB 27 | } 28 | } 29 | layer { 30 | name: "H1" 31 | type: "InnerProduct" 32 | bottom: "data" 33 | top: "H1" 34 | 
inner_product_param { 35 | num_output: 2048 36 | } 37 | } 38 | layer { 39 | name: "H1_A" 40 | type: "Sigmoid" 41 | bottom: "H1" 42 | top: "H1" 43 | } 44 | layer { 45 | name: "H2" 46 | type: "InnerProduct" 47 | bottom: "H1" 48 | top: "H2" 49 | inner_product_param { 50 | num_output: 2048 51 | } 52 | } 53 | layer { 54 | name: "H2_A" 55 | type: "Sigmoid" 56 | bottom: "H2" 57 | top: "H2" 58 | } 59 | layer { 60 | name: "H3" 61 | type: "InnerProduct" 62 | bottom: "H2" 63 | top: "H3" 64 | inner_product_param { 65 | num_output: 2048 66 | } 67 | } 68 | layer { 69 | name: "H3_A" 70 | type: "Sigmoid" 71 | bottom: "H3" 72 | top: "H3" 73 | } 74 | layer { 75 | name: "H4" 76 | type: "InnerProduct" 77 | bottom: "H3" 78 | top: "H4" 79 | inner_product_param { 80 | num_output: 2048 81 | } 82 | } 83 | layer { 84 | name: "H4_A" 85 | type: "Sigmoid" 86 | bottom: "H4" 87 | top: "H4" 88 | } 89 | layer { 90 | name: "L" 91 | type: "InnerProduct" 92 | bottom: "H4" 93 | top: "L" 94 | inner_product_param { 95 | num_output: 10000 96 | } 97 | } 98 | layer { 99 | name: "loss" 100 | type: "SoftmaxWithLoss" 101 | bottom: "L" 102 | bottom: "label" 103 | top: "loss" 104 | } 105 | I1208 05:45:10.509542 49494 layer_factory.hpp:77] Creating layer data 106 | I1208 05:45:10.511235 49494 net.cpp:106] Creating Layer data 107 | I1208 05:45:10.511248 49494 net.cpp:411] data -> data 108 | I1208 05:45:10.511288 49494 net.cpp:411] data -> label 109 | I1208 05:45:10.513157 49496 db_lmdb.cpp:38] Opened lmdb /var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb 110 | I1208 05:45:10.525254 49494 data_layer.cpp:41] output data size: 2048,1,1,512 111 | I1208 05:45:10.532680 49494 net.cpp:150] Setting up data 112 | I1208 05:45:10.532718 49494 net.cpp:157] Top shape: 2048 1 1 512 (1048576) 113 | I1208 05:45:10.532724 49494 net.cpp:157] Top shape: 2048 (2048) 114 | I1208 05:45:10.532728 49494 net.cpp:165] Memory required for data: 4202496 115 | I1208 05:45:10.532737 49494 layer_factory.hpp:77] Creating layer H1 116 | I1208 05:45:10.532749 49494 net.cpp:106] Creating Layer H1 117 | I1208 05:45:10.532754 49494 net.cpp:454] H1 <- data 118 | I1208 05:45:10.532766 49494 net.cpp:411] H1 -> H1 119 | I1208 05:45:10.534867 49494 net.cpp:150] Setting up H1 120 | I1208 05:45:10.534879 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 121 | I1208 05:45:10.534884 49494 net.cpp:165] Memory required for data: 20979712 122 | I1208 05:45:10.534898 49494 layer_factory.hpp:77] Creating layer H1_A 123 | I1208 05:45:10.534907 49494 net.cpp:106] Creating Layer H1_A 124 | I1208 05:45:10.534911 49494 net.cpp:454] H1_A <- H1 125 | I1208 05:45:10.534919 49494 net.cpp:397] H1_A -> H1 (in-place) 126 | I1208 05:45:10.535902 49497 blocking_queue.cpp:50] Waiting for data 127 | I1208 05:45:10.626925 49494 net.cpp:150] Setting up H1_A 128 | I1208 05:45:10.626981 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 129 | I1208 05:45:10.626986 49494 net.cpp:165] Memory required for data: 37756928 130 | I1208 05:45:10.626993 49494 layer_factory.hpp:77] Creating layer H2 131 | I1208 05:45:10.627008 49494 net.cpp:106] Creating Layer H2 132 | I1208 05:45:10.627013 49494 net.cpp:454] H2 <- H1 133 | I1208 05:45:10.627024 49494 net.cpp:411] H2 -> H2 134 | I1208 05:45:10.635015 49494 net.cpp:150] Setting up H2 135 | I1208 05:45:10.635040 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 136 | I1208 05:45:10.635043 49494 net.cpp:165] Memory required for data: 54534144 137 | I1208 05:45:10.635058 49494 layer_factory.hpp:77] Creating layer H2_A 138 | 
I1208 05:45:10.635071 49494 net.cpp:106] Creating Layer H2_A 139 | I1208 05:45:10.635076 49494 net.cpp:454] H2_A <- H2 140 | I1208 05:45:10.635082 49494 net.cpp:397] H2_A -> H2 (in-place) 141 | I1208 05:45:10.635387 49494 net.cpp:150] Setting up H2_A 142 | I1208 05:45:10.635404 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 143 | I1208 05:45:10.635445 49494 net.cpp:165] Memory required for data: 71311360 144 | I1208 05:45:10.635450 49494 layer_factory.hpp:77] Creating layer H3 145 | I1208 05:45:10.635457 49494 net.cpp:106] Creating Layer H3 146 | I1208 05:45:10.635462 49494 net.cpp:454] H3 <- H2 147 | I1208 05:45:10.635468 49494 net.cpp:411] H3 -> H3 148 | I1208 05:45:10.643630 49494 net.cpp:150] Setting up H3 149 | I1208 05:45:10.643656 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 150 | I1208 05:45:10.643661 49494 net.cpp:165] Memory required for data: 88088576 151 | I1208 05:45:10.643671 49494 layer_factory.hpp:77] Creating layer H3_A 152 | I1208 05:45:10.643681 49494 net.cpp:106] Creating Layer H3_A 153 | I1208 05:45:10.643685 49494 net.cpp:454] H3_A <- H3 154 | I1208 05:45:10.643692 49494 net.cpp:397] H3_A -> H3 (in-place) 155 | I1208 05:45:10.643851 49494 net.cpp:150] Setting up H3_A 156 | I1208 05:45:10.643860 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 157 | I1208 05:45:10.643864 49494 net.cpp:165] Memory required for data: 104865792 158 | I1208 05:45:10.643869 49494 layer_factory.hpp:77] Creating layer H4 159 | I1208 05:45:10.643877 49494 net.cpp:106] Creating Layer H4 160 | I1208 05:45:10.643880 49494 net.cpp:454] H4 <- H3 161 | I1208 05:45:10.643887 49494 net.cpp:411] H4 -> H4 162 | I1208 05:45:10.651945 49494 net.cpp:150] Setting up H4 163 | I1208 05:45:10.651967 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 164 | I1208 05:45:10.651971 49494 net.cpp:165] Memory required for data: 121643008 165 | I1208 05:45:10.651979 49494 layer_factory.hpp:77] Creating layer H4_A 166 | I1208 05:45:10.651993 49494 net.cpp:106] Creating Layer H4_A 167 | I1208 05:45:10.651998 49494 net.cpp:454] H4_A <- H4 168 | I1208 05:45:10.652004 49494 net.cpp:397] H4_A -> H4 (in-place) 169 | I1208 05:45:10.652295 49494 net.cpp:150] Setting up H4_A 170 | I1208 05:45:10.652307 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 171 | I1208 05:45:10.652310 49494 net.cpp:165] Memory required for data: 138420224 172 | I1208 05:45:10.652314 49494 layer_factory.hpp:77] Creating layer L 173 | I1208 05:45:10.652323 49494 net.cpp:106] Creating Layer L 174 | I1208 05:45:10.652326 49494 net.cpp:454] L <- H4 175 | I1208 05:45:10.652333 49494 net.cpp:411] L -> L 176 | I1208 05:45:10.695912 49494 net.cpp:150] Setting up L 177 | I1208 05:45:10.695956 49494 net.cpp:157] Top shape: 2048 10000 (20480000) 178 | I1208 05:45:10.695961 49494 net.cpp:165] Memory required for data: 220340224 179 | I1208 05:45:10.695978 49494 layer_factory.hpp:77] Creating layer loss 180 | I1208 05:45:10.695992 49494 net.cpp:106] Creating Layer loss 181 | I1208 05:45:10.695997 49494 net.cpp:454] loss <- L 182 | I1208 05:45:10.696004 49494 net.cpp:454] loss <- label 183 | I1208 05:45:10.696014 49494 net.cpp:411] loss -> loss 184 | I1208 05:45:10.696033 49494 layer_factory.hpp:77] Creating layer loss 185 | I1208 05:45:10.739253 49494 net.cpp:150] Setting up loss 186 | I1208 05:45:10.739298 49494 net.cpp:157] Top shape: (1) 187 | I1208 05:45:10.739303 49494 net.cpp:160] with loss weight 1 188 | I1208 05:45:10.739328 49494 net.cpp:165] Memory required for data: 220340228 189 | I1208 05:45:10.739336 49494 net.cpp:226] loss needs backward 
I1208 05:45:10.739342 49494 net.cpp:226] L needs backward computation.
I1208 05:45:10.739347 49494 net.cpp:226] H4_A needs backward computation.
I1208 05:45:10.739351 49494 net.cpp:226] H4 needs backward computation.
I1208 05:45:10.739356 49494 net.cpp:226] H3_A needs backward computation.
I1208 05:45:10.739359 49494 net.cpp:226] H3 needs backward computation.
I1208 05:45:10.739363 49494 net.cpp:226] H2_A needs backward computation.
I1208 05:45:10.739367 49494 net.cpp:226] H2 needs backward computation.
I1208 05:45:10.739372 49494 net.cpp:226] H1_A needs backward computation.
I1208 05:45:10.739374 49494 net.cpp:226] H1 needs backward computation.
I1208 05:45:10.739380 49494 net.cpp:228] data does not need backward computation.
I1208 05:45:10.739384 49494 net.cpp:270] This network produces output loss
I1208 05:45:10.739399 49494 net.cpp:283] Network initialization done.
I1208 05:45:10.739444 49494 solver.cpp:60] Solver scaffolding done.
I1208 05:45:10.766202 49494 parallel.cpp:391] GPUs pairs 0:1, 2:3, 0:2
I1208 05:45:10.960978 49494 data_layer.cpp:41] output data size: 2048,1,1,512
I1208 05:45:11.484361 49494 data_layer.cpp:41] output data size: 2048,1,1,512
I1208 05:45:11.806061 49494 parallel.cpp:234] GPU 2 does not have p2p access to GPU 0
I1208 05:45:12.012104 49494 data_layer.cpp:41] output data size: 2048,1,1,512
I1208 05:45:12.026998 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.038635 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.050448 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.061439 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.072552 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.082746 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.095509 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.106873 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.119169 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.130293 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.141988 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.153851 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.166868 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.178115 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.189239 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.200922 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.211683 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.222765 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.420398 49494 parallel.cpp:419] Starting Optimization
I1208 05:45:12.420925 49494 solver.cpp:288] Solving FFN
I1208 05:45:12.420943 49494 solver.cpp:289] Learning Rate Policy: fixed
I1208 05:45:46.138041 49494 solver.cpp:459] Snapshotting to binary proto file _iter_100.caffemodel
I1208 05:45:47.682664 49494 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_100.solverstate
I1208 05:45:48.712003 49494 solver.cpp:326] Optimization Done.
I1208 05:45:48.851454 49494 caffe.cpp:215] Optimization Done.
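Taken together, the log above contains everything needed to estimate effective throughput: the solver runs max_iter: 100 iterations at batch_size: 2048, Caffe's multi-GPU mode scales the effective batch by the number of GPUs (4 here), and the wall time is bracketed by the "Starting Optimization" and first "Optimization Done." lines. A minimal parsing sketch follows; it is illustrative only, the function names are made up, and the per-GPU batch scaling is an assumption about Caffe's data-parallel training:

import re
from datetime import datetime

# Matches glog's "ImmDD hh:mm:ss.ffffff" prefix, e.g. "I1208 05:45:12.420398".
STAMP = re.compile(r'^I(\d{4} \d{2}:\d{2}:\d{2}\.\d{6})')

def glog_time(line):
    # glog omits the year; month/day/time suffice for an elapsed-time delta.
    return datetime.strptime(STAMP.match(line).group(1), '%m%d %H:%M:%S.%f')

def samples_per_sec(log_path, iters=100, batch=2048, ngpus=4):
    start = end = None
    for line in open(log_path):
        if 'Starting Optimization' in line:
            start = glog_time(line)
        elif end is None and 'Optimization Done' in line:
            end = glog_time(line)
    # Assumption: effective batch = per-GPU batch * number of GPUs.
    return iters * batch * ngpus / (end - start).total_seconds()

print(samples_per_sec('ffn_4GPUs.log'))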
--------------------------------------------------------------------------------
/caffe/ffn_4GPUs.prototxt:
--------------------------------------------------------------------------------
name: "FFN"
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  data_param {
    batch_size: 2048
    source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb"
    backend: LMDB
  }
}

layer {
  name: "H1"
  type: "InnerProduct"
  bottom: "data"
  top: "H1"
  inner_product_param {
    num_output: 2048
  }
}

layer {
  name: "H1_A"
  type: "Sigmoid"
  bottom: "H1"
  top: "H1"
}

layer {
  name: "H2"
  type: "InnerProduct"
  bottom: "H1"
  top: "H2"
  inner_product_param {
    num_output: 2048
  }
}

layer {
  name: "H2_A"
  type: "Sigmoid"
  bottom: "H2"
  top: "H2"
}

layer {
  name: "H3"
  type: "InnerProduct"
  bottom: "H2"
  top: "H3"
  inner_product_param {
    num_output: 2048
  }
}

layer {
  name: "H3_A"
  type: "Sigmoid"
  bottom: "H3"
  top: "H3"
}

layer {
  name: "H4"
  type: "InnerProduct"
  bottom: "H3"
  top: "H4"
  inner_product_param {
    num_output: 2048
  }
}

layer {
  name: "H4_A"
  type: "Sigmoid"
  bottom: "H4"
  top: "H4"
}

layer {
  name: "L"
  type: "InnerProduct"
  bottom: "H4"
  top: "L"
  inner_product_param {
    num_output: 10000
  }
}

layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "L"
  bottom: "label"
  top: "loss"
}

--------------------------------------------------------------------------------
/caffe/ffn_4GPUs.sh:
--------------------------------------------------------------------------------
../../caffe/build/tools/caffe train -solver=./ffn_4GPUs_solver.prototxt -gpu=0,1,2,3 >ffn_4GPUs.log 2>&1

--------------------------------------------------------------------------------
/caffe/ffn_4GPUs_solver.prototxt:
--------------------------------------------------------------------------------
net: "./ffn_4GPUs.prototxt"
max_iter: 100
base_lr: 0.001
lr_policy: "fixed"
solver_mode: GPU
--------------------------------------------------------------------------------
/caffe/ffn_solver.prototxt:
--------------------------------------------------------------------------------
net: "./ffn.prototxt"
max_iter: 100
base_lr: 0.001
lr_policy: "fixed"
solver_mode: GPU
--------------------------------------------------------------------------------
/createData.py:
--------------------------------------------------------------------------------
import numpy as np

featDim = 512
labDim = 10000
totalCount = 256 * 1024

def createFakeData(count):
    features = np.random.randn(count, featDim)
    labels = np.random.randint(0, labDim, size=(count, 1))
    return features, labels

f, l = createFakeData(totalCount)

# Each row: an integer label followed by featDim four-decimal float features.
np.savetxt(r'./data.txt', np.hstack((l, f)), fmt='%d' + ' %.4f' * featDim)

--------------------------------------------------------------------------------
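A quick way to sanity-check the generated data.txt is to reload a few rows and confirm the layout createData.py writes: an integer label in column 0 followed by featDim feature values. A small illustrative sketch, not part of the repo (note that loadtxt's max_rows argument assumes a reasonably recent NumPy):

import numpy as np

featDim = 512
rows = np.loadtxt('./data.txt', max_rows=16)  # max_rows requires NumPy >= 1.16
assert rows.shape == (16, 1 + featDim)
labels = rows[:, 0].astype(int)   # column 0: class index in [0, labDim)
features = rows[:, 1:]            # remaining columns: the feature vector
print(labels[:5], features.mean())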
/keras/ffn.log:
--------------------------------------------------------------------------------
Using gpu device 0: Tesla K40m
Using Theano backend.
1 GPU: 4585.43059456 samples per sec

--------------------------------------------------------------------------------
/keras/ffn.py:
--------------------------------------------------------------------------------
import os
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=gpu,floatX=float32"

import theano
import time
import numpy as np

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD

nruns = 5
bsize = 8192
isize = 512
hsize = 2048
osize = 10000

# fake data: random features and one-hot labels drawn uniformly over osize classes
X = np.random.rand(bsize, isize).astype(np.float32)
y = np.zeros((bsize, osize), dtype=bool)
ind = np.random.randint(0, osize, bsize)
y[np.arange(bsize), ind] = True

# model definition: four sigmoid hidden layers of width hsize, softmax output
model = Sequential()
model.add(Dense(hsize, input_dim=isize))
model.add(Activation('sigmoid'))  # hidden layer 1
model.add(Dense(hsize))
model.add(Activation('sigmoid'))  # hidden layer 2
model.add(Dense(hsize))
model.add(Activation('sigmoid'))  # hidden layer 3
model.add(Dense(hsize))
model.add(Activation('sigmoid'))  # hidden layer 4
model.add(Dense(osize))
model.add(Activation('softmax'))  # output layer
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.1))

# start training and measuring
start = time.time()
for i in range(nruns):
    model.train_on_batch(X, y)
end = time.time()
print('1 GPU: {0} samples per sec'.format(nruns * bsize / (end - start)))

--------------------------------------------------------------------------------
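One caveat when reading ffn.log: with the Theano backend the training function is compiled lazily, so the first train_on_batch call inside the timed loop in ffn.py also pays a one-time graph-compilation cost. A possible refinement, reusing the model, X, y, nruns, and bsize defined in ffn.py above (a sketch, not how the logged number was produced), is to run one warm-up batch before starting the clock:

model.train_on_batch(X, y)  # warm-up: triggers Theano graph compilation

start = time.time()
for i in range(nruns):
    model.train_on_batch(X, y)
end = time.time()
print('1 GPU (steady state): {0} samples per sec'.format(nruns * bsize / (end - start)))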