├── CNTK
│   ├── AlexNet.config
│   ├── AlexNet.ndl
│   ├── Macros.ndl
│   ├── alexnet_1GPU.sh
│   ├── alexnet_4GPUs.sh
│   ├── createFakeImageNetData.py
│   ├── createLabelMap.py
│   ├── ffn.config
│   ├── ffn_1GPU.sh
│   ├── ffn_2GPUs.sh
│   ├── ffn_4GPUs.sh
│   ├── ffn_orig.config
│   ├── labelmap.1K.txt
│   └── logs
│       ├── alexnet
│       │   ├── 1
│       │   │   └── AlexNet_Train.log
│       │   └── 4
│       │       ├── AlexNet_Train.logrank0
│       │       ├── AlexNet_Train.logrank1
│       │       ├── AlexNet_Train.logrank2
│       │       └── AlexNet_Train.logrank3
│       └── ffn
│           ├── 1
│           │   └── out_Train.log
│           ├── 4
│           │   ├── out_Train.logrank0
│           │   ├── out_Train.logrank1
│           │   ├── out_Train.logrank2
│           │   └── out_Train.logrank3
│           ├── 8
│           │   ├── out_Train.logrank0
│           │   ├── out_Train.logrank1
│           │   ├── out_Train.logrank2
│           │   ├── out_Train.logrank3
│           │   ├── out_Train.logrank4
│           │   ├── out_Train.logrank5
│           │   ├── out_Train.logrank6
│           │   └── out_Train.logrank7
│           └── 16
│               ├── out_Train.logrank0
│               ├── out_Train.logrank1
│               ├── out_Train.logrank10
│               ├── out_Train.logrank11
│               ├── out_Train.logrank12
│               ├── out_Train.logrank13
│               ├── out_Train.logrank14
│               ├── out_Train.logrank15
│               ├── out_Train.logrank2
│               ├── out_Train.logrank3
│               ├── out_Train.logrank4
│               ├── out_Train.logrank5
│               ├── out_Train.logrank6
│               ├── out_Train.logrank7
│               ├── out_Train.logrank8
│               └── out_Train.logrank9
├── GPU-hardware.txt
├── README.md
├── TensorFlow
│   ├── ffn.py
│   ├── ffn_1GPU.log
│   ├── ffn_4GPUs.log
│   ├── ffn_exp.py
│   └── ffn_exp_4GPUs.py
├── Torch
│   ├── alexnet.lua
│   ├── ffn.log
│   └── ffn.lua
├── caffe
│   ├── alexnet.prototxt
│   ├── alexnet_1GPU.log
│   ├── alexnet_1GPU.sh
│   ├── alexnet_4GPUs.log
│   ├── alexnet_4GPUs.prototxt
│   ├── alexnet_4GPUs.sh
│   ├── alexnet_4GPUs_solver.prototxt
│   ├── alexnet_solver.prototxt
│   ├── alexnet_time_1GPU.log
│   ├── alexnet_time_1GPU.sh
│   ├── createFakeData.py
│   ├── createFakeImageNet.py
│   ├── ffn.prototxt
│   ├── ffn_1GPU.log
│   ├── ffn_1GPU.py
│   ├── ffn_1GPU.sh
│   ├── ffn_2GPUs.log
│   ├── ffn_2GPUs.prototxt
│   ├── ffn_2GPUs.sh
│   ├── ffn_2GPUs_solver.prototxt
│   ├── ffn_4GPUs.log
│   ├── ffn_4GPUs.prototxt
│   ├── ffn_4GPUs.sh
│   ├── ffn_4GPUs_solver.prototxt
│   └── ffn_solver.prototxt
├── createData.py
└── keras
    ├── ffn.log
    └── ffn.py
/CNTK/AlexNet.config: --------------------------------------------------------------------------------
1 | WorkDir=.
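# Note: $ConfigName$ is not set anywhere in this file; it is supplied on the
# command line (alexnet_1GPU.sh passes configName=AlexNet), and alexnet_4GPUs.sh
# additionally passes parallelTrain=true under mpiexec so that the
# ParallelTrain/DataParallelSGD block below takes effect.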
2 | ModelDir=$WorkDir$/_out/$ConfigName$ 3 | stderr=$WorkDir$/_out/$ConfigName$ 4 | 5 | ndlMacros=$WorkDir$/Macros.ndl 6 | 7 | precision=float 8 | deviceId=Auto 9 | 10 | command=Train 11 | 12 | makeMode=false 13 | 14 | parallelTrain=false 15 | 16 | prefetch=true 17 | 18 | traceLevel=1 19 | 20 | Train=[ 21 | action=train 22 | modelPath=$ModelDir$/AlexNet 23 | 24 | NDLNetworkBuilder=[ 25 | networkDescription=$WorkDir$/AlexNet.ndl 26 | ] 27 | 28 | SGD=[ 29 | epochSize=8192 30 | minibatchSize=256 31 | learningRatesPerMB=0.01 32 | momentumPerMB=0 33 | maxEpochs=10 34 | gradUpdateType=None 35 | L2RegWeight=0 36 | dropoutRate=0 37 | 38 | ParallelTrain=[ 39 | parallelizationMethod=DataParallelSGD 40 | distributedMBReading=true 41 | parallelizationStartEpoch=1 42 | DataParallelSGD=[ 43 | gradientBits=1 44 | ] 45 | ] 46 | 47 | numMBsToShowResult=8 48 | ] 49 | 50 | reader=[ 51 | readerType=UCIFastReader 52 | file=$WorkDir$/imagenet_data.txt 53 | randomize=None 54 | features=[ 55 | dim=150528 56 | start=1 57 | ] 58 | labels=[ 59 | dim=1 60 | start=0 61 | labelDim=1000 62 | labelMappingFile=$WorkDir$/labelmap.1K.txt 63 | ] 64 | ] 65 | 66 | ] 67 | -------------------------------------------------------------------------------- /CNTK/AlexNet.ndl: -------------------------------------------------------------------------------- 1 | load=ndlMacros 2 | run=DNN 3 | 4 | ndlMacros = [ 5 | ImageW = 224 6 | ImageH = 224 7 | ImageC = 3 8 | LabelDim = 1000 9 | 10 | features = ImageInput(ImageW, ImageH, ImageC, tag = feature) 11 | labels = Input(LabelDim, tag = label) 12 | 13 | conv1WScale = 0.95 14 | conv1BValue = 0 15 | conv2WScale = 2 16 | conv2BValue = 1 17 | conv3WScale = 2.07 18 | conv3BValue = 0 19 | conv4WScale = 2.9 20 | conv4BValue = 1 21 | conv5WScale = 2.4 22 | conv5BValue = 1 23 | fc1WScale = 6.4 24 | fc1BValue = 1 25 | fc2WScale = 3.2 26 | fc2BValue = 1 27 | fc3WScale = 3.2 28 | fc3BValue = 1 29 | ] 30 | 31 | DNN=[ 32 | # conv1 33 | kW1 = 11 34 | kH1 = 11 35 | cMap1 = 96 36 | hStride1 = 4 37 | vStride1 = 4 38 | # weight[cMap1, kW1 * kH1 * ImageC] 39 | conv1_act = ConvReLULayer(features, cMap1, 363, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue) 40 | 41 | # pool1 42 | pool1W = 3 43 | pool1H = 3 44 | pool1hStride = 2 45 | pool1vStride = 2 46 | pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride) 47 | 48 | # conv2 49 | kW2 = 5 50 | kH2 = 5 51 | cMap2 = 256 52 | hStride2 = 1 53 | vStride2 = 1 54 | # weight[cMap2, kW2 * kH2 * cMap1] 55 | conv2_act = ConvReLULayer(pool1, cMap2, 2400, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue) 56 | 57 | # pool2 58 | pool2W = 3 59 | pool2H = 3 60 | pool2hStride = 2 61 | pool2vStride = 2 62 | pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride) 63 | 64 | # conv3 65 | kW3 = 3 66 | kH3 = 3 67 | cMap3 = 384 68 | hStride3 = 1 69 | vStride3 = 1 70 | # weight[cMap3, kW3 * kH3 * cMap2] 71 | conv3_act = ConvReLULayer(pool2, cMap3, 2304, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue) 72 | 73 | # conv4 74 | kW4 = 3 75 | kH4 = 3 76 | cMap4 = 384 77 | hStride4 = 1 78 | vStride4 = 1 79 | # weight[cMap4, kW4 * kH4 * cMap3] 80 | conv4_act = ConvReLULayer(conv3_act, cMap4, 3456, kW4, kH4, hStride4, vStride4, conv4WScale, conv4BValue) 81 | 82 | # conv5 83 | kW5 = 3 84 | kH5 = 3 85 | cMap5 = 256 86 | hStride5 = 1 87 | vStride5 = 1 88 | # weight[cMap5, kW5 * kH5 * cMap4] 89 | conv5_act = ConvReLULayer(conv4_act, cMap5, 3456, kW5, kH5, hStride5, vStride5, conv5WScale, conv5BValue) 90 | 91 | # pool3 92 | pool3W = 3 93 | pool3H 
= 3
94 |     pool3hStride = 2
95 |     pool3vStride = 2
96 |     pool3 = MaxPooling(conv5_act, pool3W, pool3H, pool3hStride, pool3vStride)
97 | 
98 |     hiddenDim = 4096
99 |     h1 = DNNReLULayer(9216, hiddenDim, pool3, fc1WScale, fc1BValue)
100 |     h1_d = Dropout(h1)
101 |     h2 = DNNReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
102 |     h2_d = Dropout(h2)
103 |     ol = DNNLastLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
104 | 
105 |     CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
106 |     Err = ErrorPrediction(labels, ol, tag = Eval)
107 |     OutputNodes = ol
108 | ]
109 | 
-------------------------------------------------------------------------------- /CNTK/Macros.ndl: --------------------------------------------------------------------------------
1 | ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
2 | {
3 |     convW = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
4 |     conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
5 |     convB = Parameter(outMap, 1, init = fixedValue, value = bValue)
6 |     convPlusB = Plus(conv, convB);
7 |     act = RectifiedLinear(convPlusB);
8 | }
9 | 
10 | DNNReLULayer(inDim, outDim, x, wScale, bValue)
11 | {
12 |     W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
13 |     b = Parameter(outDim, init = fixedValue, value = bValue)
14 |     t = Times(W, x)
15 |     z = Plus(t, b)
16 |     y = RectifiedLinear(z)
17 | }
18 | 
19 | DNNLastLayer(hiddenDim, labelDim, x, wScale, bValue)
20 | {
21 |     W = Parameter(labelDim, hiddenDim, init = Gaussian, initValueScale = wScale)
22 |     b = Parameter(labelDim, init = fixedValue, value = bValue)
23 |     t = Times(W, x)
24 |     z = Plus(t, b)
25 | }
26 | 
-------------------------------------------------------------------------------- /CNTK/alexnet_1GPU.sh: --------------------------------------------------------------------------------
1 | ~/cntk/bin/cntk configFile=AlexNet.config configName=AlexNet
2 | 
3 | 
-------------------------------------------------------------------------------- /CNTK/alexnet_4GPUs.sh: --------------------------------------------------------------------------------
1 | mpiexec -n 4 ~/cntk/bin/cntk configFile=AlexNet.config configName=AlexNet parallelTrain=true
2 | 
3 | 
-------------------------------------------------------------------------------- /CNTK/createFakeImageNetData.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | featDim = 224 * 224 * 3
4 | labDim = 1000
5 | totalCount = 256 * 32
6 | 
7 | def createFakeData(count):
8 |     features = np.random.randn(count, featDim)
9 |     labels = np.random.randint(0, labDim, size=(count, 1))
10 |     return features, labels
11 | 
12 | f, l = createFakeData(totalCount)
13 | 
14 | np.savetxt(r'./imagenet_data.txt', np.hstack((l, f)), fmt='%d' + ' %.4f' * featDim)
15 | 
16 | 
-------------------------------------------------------------------------------- /CNTK/createLabelMap.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | dim = 1000
3 | a = range(0, dim)
4 | np.savetxt('./labelmap.txt', np.reshape(a, (dim, 1)), fmt='%d')
-------------------------------------------------------------------------------- /CNTK/ffn.config: --------------------------------------------------------------------------------
1 | WorkDir=.
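# Note: configName=ffn comes from the launcher scripts (ffn_1GPU.sh, ffn_2GPUs.sh,
# ffn_4GPUs.sh); the multi-GPU scripts start one MPI rank per GPU and pass
# parallelTrain=true, which activates the DataParallelSGD block below with
# 1-bit gradient quantization (gradientBits=1).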
2 | ModelDir=$WorkDir$/models/$ConfigName$ 3 | stderr=$WorkDir$/logs/$ConfigName$/out 4 | precision=float 5 | deviceId=Auto 6 | 7 | makeMode=false 8 | 9 | command=Train 10 | 11 | featureDim = 512 12 | labelDim = 10000 13 | hiddenDim = 2048 14 | 15 | parallelTrain=false 16 | prefetch=true 17 | 18 | Train=[ 19 | action=train 20 | modelPath=$ModelDir$/cntk 21 | deviceId=Auto 22 | traceLevel=1 23 | 24 | SimpleNetworkBuilder=[ 25 | layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ 26 | trainingCriterion=CrossEntropyWithSoftmax 27 | evalCriterion=ErrorPrediction 28 | layerTypes=Sigmoid 29 | applyMeanVarNorm=false 30 | initValueScale=1.0 31 | uniformInit=true 32 | needPrior=false 33 | ] 34 | 35 | SGD=[ 36 | epochSize=262144 37 | minibatchSize=8192 38 | learningRatesPerMB=0.01 39 | numMBsToShowResult=4 40 | momentumPerSample=0 41 | dropoutRate=0.0 42 | maxEpochs=40 43 | 44 | ParallelTrain=[ 45 | parallelizationMethod=DataParallelSGD 46 | distributedMBReading=true 47 | parallelizationStartEpoch=1 48 | DataParallelSGD=[ 49 | gradientBits=1 50 | ] 51 | ] 52 | 53 | gradUpdateType=None 54 | normWithAveMultiplier=true 55 | clippingThresholdPerSample=1#INF 56 | ] 57 | ] 58 | 59 | reader=[ 60 | readerType=UCIFastReader 61 | file=$WorkDir$/../data.txt 62 | features=[ 63 | dim=$featureDim$ 64 | start=1 65 | ] 66 | labels=[ 67 | dim=1 68 | start=0 69 | labelDim=$labelDim$ 70 | labelMappingFile=$WorkDir$/labelmap.txt 71 | ] 72 | ] 73 | -------------------------------------------------------------------------------- /CNTK/ffn_1GPU.sh: -------------------------------------------------------------------------------- 1 | ../../cntkbin/bin/cntk configFile=ffn.config configName=ffn 2 | 3 | -------------------------------------------------------------------------------- /CNTK/ffn_2GPUs.sh: -------------------------------------------------------------------------------- 1 | mpiexec -n 2 ../../cntkbin/bin/cntk configFile=ffn.config configName=ffn parallelTrain=true 2 | -------------------------------------------------------------------------------- /CNTK/ffn_4GPUs.sh: -------------------------------------------------------------------------------- 1 | mpiexec -n 4 ../../cntkbin/bin/cntk configFile=ffn.config configName=ffn parallelTrain=true 2 | -------------------------------------------------------------------------------- /CNTK/ffn_orig.config: -------------------------------------------------------------------------------- 1 | WorkDir=. 
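# Note: ffn_orig.config appears to be the configuration the ffn benchmark was
# derived from: featureDim=957, labelDim=5976, five hidden layers, and full
# 32-bit gradient exchange (gradientBits=32), reading data_orig.txt and
# labelmap_orig.txt instead of the synthetic benchmark data.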
2 | ModelDir=$WorkDir$/models/$ConfigName$ 3 | stderr=$WorkDir$/logs/$ConfigName$/out 4 | precision=float 5 | deviceId=Auto 6 | 7 | command=Train 8 | 9 | featureDim = 957 10 | labelDim = 5976 11 | hiddenDim = 2048 12 | 13 | parallelTrain=false 14 | 15 | Train=[ 16 | action=train 17 | modelPath=$ModelDir$/cntk 18 | deviceId=Auto 19 | traceLevel=1 20 | 21 | SimpleNetworkBuilder=[ 22 | layerSizes=$featureDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$hiddenDim$:$labelDim$ 23 | trainingCriterion=CrossEntropyWithSoftmax 24 | evalCriterion=ErrorPrediction 25 | layerTypes=Sigmoid 26 | applyMeanVarNorm=false 27 | initValueScale=1.0 28 | uniformInit=true 29 | needPrior=false 30 | ] 31 | 32 | SGD=[ 33 | epochSize=65536 34 | minibatchSize=512 35 | learningRatesPerMB=0.1 36 | numMBsToShowResult=10 37 | momentumPerSample=0.999589 38 | dropoutRate=0.0 39 | maxEpochs=2 40 | 41 | ParallelTrain=[ 42 | parallelizationMethod=DataParallelSGD 43 | distributedMBReading=true 44 | parallelizationStartEpoch=1 45 | DataParallelSGD=[ 46 | gradientBits=32 47 | ] 48 | ] 49 | 50 | gradUpdateType=None 51 | normWithAveMultiplier=true 52 | clippingThresholdPerSample=1#INF 53 | ] 54 | ] 55 | 56 | reader=[ 57 | readerType=UCIFastReader 58 | file=$WorkDir$/../data_orig.txt 59 | features=[ 60 | dim=$featureDim$ 61 | start=1 62 | ] 63 | labels=[ 64 | dim=1 65 | start=0 66 | labelDim=$labelDim$ 67 | labelMappingFile=$WorkDir$/labelmap_orig.txt 68 | ] 69 | ] 70 | -------------------------------------------------------------------------------- /CNTK/labelmap.1K.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | 10 12 | 11 13 | 12 14 | 13 15 | 14 16 | 15 17 | 16 18 | 17 19 | 18 20 | 19 21 | 20 22 | 21 23 | 22 24 | 23 25 | 24 26 | 25 27 | 26 28 | 27 29 | 28 30 | 29 31 | 30 32 | 31 33 | 32 34 | 33 35 | 34 36 | 35 37 | 36 38 | 37 39 | 38 40 | 39 41 | 40 42 | 41 43 | 42 44 | 43 45 | 44 46 | 45 47 | 46 48 | 47 49 | 48 50 | 49 51 | 50 52 | 51 53 | 52 54 | 53 55 | 54 56 | 55 57 | 56 58 | 57 59 | 58 60 | 59 61 | 60 62 | 61 63 | 62 64 | 63 65 | 64 66 | 65 67 | 66 68 | 67 69 | 68 70 | 69 71 | 70 72 | 71 73 | 72 74 | 73 75 | 74 76 | 75 77 | 76 78 | 77 79 | 78 80 | 79 81 | 80 82 | 81 83 | 82 84 | 83 85 | 84 86 | 85 87 | 86 88 | 87 89 | 88 90 | 89 91 | 90 92 | 91 93 | 92 94 | 93 95 | 94 96 | 95 97 | 96 98 | 97 99 | 98 100 | 99 101 | 100 102 | 101 103 | 102 104 | 103 105 | 104 106 | 105 107 | 106 108 | 107 109 | 108 110 | 109 111 | 110 112 | 111 113 | 112 114 | 113 115 | 114 116 | 115 117 | 116 118 | 117 119 | 118 120 | 119 121 | 120 122 | 121 123 | 122 124 | 123 125 | 124 126 | 125 127 | 126 128 | 127 129 | 128 130 | 129 131 | 130 132 | 131 133 | 132 134 | 133 135 | 134 136 | 135 137 | 136 138 | 137 139 | 138 140 | 139 141 | 140 142 | 141 143 | 142 144 | 143 145 | 144 146 | 145 147 | 146 148 | 147 149 | 148 150 | 149 151 | 150 152 | 151 153 | 152 154 | 153 155 | 154 156 | 155 157 | 156 158 | 157 159 | 158 160 | 159 161 | 160 162 | 161 163 | 162 164 | 163 165 | 164 166 | 165 167 | 166 168 | 167 169 | 168 170 | 169 171 | 170 172 | 171 173 | 172 174 | 173 175 | 174 176 | 175 177 | 176 178 | 177 179 | 178 180 | 179 181 | 180 182 | 181 183 | 182 184 | 183 185 | 184 186 | 185 187 | 186 188 | 187 189 | 188 190 | 189 191 | 190 192 | 191 193 | 192 194 | 193 195 | 194 196 | 195 197 | 196 198 | 197 199 | 198 200 | 199 201 | 200 202 | 201 203 | 202 204 | 203 205 | 204 206 | 205 207 | 206 208 | 207 209 | 208 210 | 209 211 | 210 212 | 211 213 | 212 
214 | 213 215 | 214 216 | 215 217 | 216 218 | 217 219 | 218 220 | 219 221 | 220 222 | 221 223 | 222 224 | 223 225 | 224 226 | 225 227 | 226 228 | 227 229 | 228 230 | 229 231 | 230 232 | 231 233 | 232 234 | 233 235 | 234 236 | 235 237 | 236 238 | 237 239 | 238 240 | 239 241 | 240 242 | 241 243 | 242 244 | 243 245 | 244 246 | 245 247 | 246 248 | 247 249 | 248 250 | 249 251 | 250 252 | 251 253 | 252 254 | 253 255 | 254 256 | 255 257 | 256 258 | 257 259 | 258 260 | 259 261 | 260 262 | 261 263 | 262 264 | 263 265 | 264 266 | 265 267 | 266 268 | 267 269 | 268 270 | 269 271 | 270 272 | 271 273 | 272 274 | 273 275 | 274 276 | 275 277 | 276 278 | 277 279 | 278 280 | 279 281 | 280 282 | 281 283 | 282 284 | 283 285 | 284 286 | 285 287 | 286 288 | 287 289 | 288 290 | 289 291 | 290 292 | 291 293 | 292 294 | 293 295 | 294 296 | 295 297 | 296 298 | 297 299 | 298 300 | 299 301 | 300 302 | 301 303 | 302 304 | 303 305 | 304 306 | 305 307 | 306 308 | 307 309 | 308 310 | 309 311 | 310 312 | 311 313 | 312 314 | 313 315 | 314 316 | 315 317 | 316 318 | 317 319 | 318 320 | 319 321 | 320 322 | 321 323 | 322 324 | 323 325 | 324 326 | 325 327 | 326 328 | 327 329 | 328 330 | 329 331 | 330 332 | 331 333 | 332 334 | 333 335 | 334 336 | 335 337 | 336 338 | 337 339 | 338 340 | 339 341 | 340 342 | 341 343 | 342 344 | 343 345 | 344 346 | 345 347 | 346 348 | 347 349 | 348 350 | 349 351 | 350 352 | 351 353 | 352 354 | 353 355 | 354 356 | 355 357 | 356 358 | 357 359 | 358 360 | 359 361 | 360 362 | 361 363 | 362 364 | 363 365 | 364 366 | 365 367 | 366 368 | 367 369 | 368 370 | 369 371 | 370 372 | 371 373 | 372 374 | 373 375 | 374 376 | 375 377 | 376 378 | 377 379 | 378 380 | 379 381 | 380 382 | 381 383 | 382 384 | 383 385 | 384 386 | 385 387 | 386 388 | 387 389 | 388 390 | 389 391 | 390 392 | 391 393 | 392 394 | 393 395 | 394 396 | 395 397 | 396 398 | 397 399 | 398 400 | 399 401 | 400 402 | 401 403 | 402 404 | 403 405 | 404 406 | 405 407 | 406 408 | 407 409 | 408 410 | 409 411 | 410 412 | 411 413 | 412 414 | 413 415 | 414 416 | 415 417 | 416 418 | 417 419 | 418 420 | 419 421 | 420 422 | 421 423 | 422 424 | 423 425 | 424 426 | 425 427 | 426 428 | 427 429 | 428 430 | 429 431 | 430 432 | 431 433 | 432 434 | 433 435 | 434 436 | 435 437 | 436 438 | 437 439 | 438 440 | 439 441 | 440 442 | 441 443 | 442 444 | 443 445 | 444 446 | 445 447 | 446 448 | 447 449 | 448 450 | 449 451 | 450 452 | 451 453 | 452 454 | 453 455 | 454 456 | 455 457 | 456 458 | 457 459 | 458 460 | 459 461 | 460 462 | 461 463 | 462 464 | 463 465 | 464 466 | 465 467 | 466 468 | 467 469 | 468 470 | 469 471 | 470 472 | 471 473 | 472 474 | 473 475 | 474 476 | 475 477 | 476 478 | 477 479 | 478 480 | 479 481 | 480 482 | 481 483 | 482 484 | 483 485 | 484 486 | 485 487 | 486 488 | 487 489 | 488 490 | 489 491 | 490 492 | 491 493 | 492 494 | 493 495 | 494 496 | 495 497 | 496 498 | 497 499 | 498 500 | 499 501 | 500 502 | 501 503 | 502 504 | 503 505 | 504 506 | 505 507 | 506 508 | 507 509 | 508 510 | 509 511 | 510 512 | 511 513 | 512 514 | 513 515 | 514 516 | 515 517 | 516 518 | 517 519 | 518 520 | 519 521 | 520 522 | 521 523 | 522 524 | 523 525 | 524 526 | 525 527 | 526 528 | 527 529 | 528 530 | 529 531 | 530 532 | 531 533 | 532 534 | 533 535 | 534 536 | 535 537 | 536 538 | 537 539 | 538 540 | 539 541 | 540 542 | 541 543 | 542 544 | 543 545 | 544 546 | 545 547 | 546 548 | 547 549 | 548 550 | 549 551 | 550 552 | 551 553 | 552 554 | 553 555 | 554 556 | 555 557 | 556 558 | 557 559 | 558 560 | 559 561 | 560 562 | 561 563 | 562 564 | 563 565 | 564 566 | 565 567 | 566 568 | 567 569 
| 568 570 | 569 571 | 570 572 | 571 573 | 572 574 | 573 575 | 574 576 | 575 577 | 576 578 | 577 579 | 578 580 | 579 581 | 580 582 | 581 583 | 582 584 | 583 585 | 584 586 | 585 587 | 586 588 | 587 589 | 588 590 | 589 591 | 590 592 | 591 593 | 592 594 | 593 595 | 594 596 | 595 597 | 596 598 | 597 599 | 598 600 | 599 601 | 600 602 | 601 603 | 602 604 | 603 605 | 604 606 | 605 607 | 606 608 | 607 609 | 608 610 | 609 611 | 610 612 | 611 613 | 612 614 | 613 615 | 614 616 | 615 617 | 616 618 | 617 619 | 618 620 | 619 621 | 620 622 | 621 623 | 622 624 | 623 625 | 624 626 | 625 627 | 626 628 | 627 629 | 628 630 | 629 631 | 630 632 | 631 633 | 632 634 | 633 635 | 634 636 | 635 637 | 636 638 | 637 639 | 638 640 | 639 641 | 640 642 | 641 643 | 642 644 | 643 645 | 644 646 | 645 647 | 646 648 | 647 649 | 648 650 | 649 651 | 650 652 | 651 653 | 652 654 | 653 655 | 654 656 | 655 657 | 656 658 | 657 659 | 658 660 | 659 661 | 660 662 | 661 663 | 662 664 | 663 665 | 664 666 | 665 667 | 666 668 | 667 669 | 668 670 | 669 671 | 670 672 | 671 673 | 672 674 | 673 675 | 674 676 | 675 677 | 676 678 | 677 679 | 678 680 | 679 681 | 680 682 | 681 683 | 682 684 | 683 685 | 684 686 | 685 687 | 686 688 | 687 689 | 688 690 | 689 691 | 690 692 | 691 693 | 692 694 | 693 695 | 694 696 | 695 697 | 696 698 | 697 699 | 698 700 | 699 701 | 700 702 | 701 703 | 702 704 | 703 705 | 704 706 | 705 707 | 706 708 | 707 709 | 708 710 | 709 711 | 710 712 | 711 713 | 712 714 | 713 715 | 714 716 | 715 717 | 716 718 | 717 719 | 718 720 | 719 721 | 720 722 | 721 723 | 722 724 | 723 725 | 724 726 | 725 727 | 726 728 | 727 729 | 728 730 | 729 731 | 730 732 | 731 733 | 732 734 | 733 735 | 734 736 | 735 737 | 736 738 | 737 739 | 738 740 | 739 741 | 740 742 | 741 743 | 742 744 | 743 745 | 744 746 | 745 747 | 746 748 | 747 749 | 748 750 | 749 751 | 750 752 | 751 753 | 752 754 | 753 755 | 754 756 | 755 757 | 756 758 | 757 759 | 758 760 | 759 761 | 760 762 | 761 763 | 762 764 | 763 765 | 764 766 | 765 767 | 766 768 | 767 769 | 768 770 | 769 771 | 770 772 | 771 773 | 772 774 | 773 775 | 774 776 | 775 777 | 776 778 | 777 779 | 778 780 | 779 781 | 780 782 | 781 783 | 782 784 | 783 785 | 784 786 | 785 787 | 786 788 | 787 789 | 788 790 | 789 791 | 790 792 | 791 793 | 792 794 | 793 795 | 794 796 | 795 797 | 796 798 | 797 799 | 798 800 | 799 801 | 800 802 | 801 803 | 802 804 | 803 805 | 804 806 | 805 807 | 806 808 | 807 809 | 808 810 | 809 811 | 810 812 | 811 813 | 812 814 | 813 815 | 814 816 | 815 817 | 816 818 | 817 819 | 818 820 | 819 821 | 820 822 | 821 823 | 822 824 | 823 825 | 824 826 | 825 827 | 826 828 | 827 829 | 828 830 | 829 831 | 830 832 | 831 833 | 832 834 | 833 835 | 834 836 | 835 837 | 836 838 | 837 839 | 838 840 | 839 841 | 840 842 | 841 843 | 842 844 | 843 845 | 844 846 | 845 847 | 846 848 | 847 849 | 848 850 | 849 851 | 850 852 | 851 853 | 852 854 | 853 855 | 854 856 | 855 857 | 856 858 | 857 859 | 858 860 | 859 861 | 860 862 | 861 863 | 862 864 | 863 865 | 864 866 | 865 867 | 866 868 | 867 869 | 868 870 | 869 871 | 870 872 | 871 873 | 872 874 | 873 875 | 874 876 | 875 877 | 876 878 | 877 879 | 878 880 | 879 881 | 880 882 | 881 883 | 882 884 | 883 885 | 884 886 | 885 887 | 886 888 | 887 889 | 888 890 | 889 891 | 890 892 | 891 893 | 892 894 | 893 895 | 894 896 | 895 897 | 896 898 | 897 899 | 898 900 | 899 901 | 900 902 | 901 903 | 902 904 | 903 905 | 904 906 | 905 907 | 906 908 | 907 909 | 908 910 | 909 911 | 910 912 | 911 913 | 912 914 | 913 915 | 914 916 | 915 917 | 916 918 | 917 919 | 918 920 | 919 921 | 920 922 | 921 923 | 922 924 | 
923 925 | 924 926 | 925 927 | 926 928 | 927 929 | 928 930 | 929 931 | 930 932 | 931 933 | 932 934 | 933 935 | 934 936 | 935 937 | 936 938 | 937 939 | 938 940 | 939 941 | 940 942 | 941 943 | 942 944 | 943 945 | 944 946 | 945 947 | 946 948 | 947 949 | 948 950 | 949 951 | 950 952 | 951 953 | 952 954 | 953 955 | 954 956 | 955 957 | 956 958 | 957 959 | 958 960 | 959 961 | 960 962 | 961 963 | 962 964 | 963 965 | 964 966 | 965 967 | 966 968 | 967 969 | 968 970 | 969 971 | 970 972 | 971 973 | 972 974 | 973 975 | 974 976 | 975 977 | 976 978 | 977 979 | 978 980 | 979 981 | 980 982 | 981 983 | 982 984 | 983 985 | 984 986 | 985 987 | 986 988 | 987 989 | 988 990 | 989 991 | 990 992 | 991 993 | 992 994 | 993 995 | 994 996 | 995 997 | 996 998 | 997 999 | 998 1000 | 999 1001 | -------------------------------------------------------------------------------- /GPU-hardware.txt: -------------------------------------------------------------------------------- 1 | 2 | ==============NVSMI LOG============== 3 | 4 | Timestamp : Tue Dec 8 06:02:49 2015 5 | Driver Version : 346.72 6 | 7 | Attached GPUs : 4 8 | GPU 0000:0A:00.0 9 | Product Name : Tesla K40m 10 | Product Brand : Tesla 11 | Display Mode : Disabled 12 | Display Active : Disabled 13 | Persistence Mode : Disabled 14 | Accounting Mode : Disabled 15 | Accounting Mode Buffer Size : 128 16 | Driver Model 17 | Current : N/A 18 | Pending : N/A 19 | Serial Number : 0322815022851 20 | GPU UUID : GPU-9d74c40c-8145-27f7-a65d-0c8268d892cd 21 | Minor Number : 1 22 | VBIOS Version : 80.80.3E.00.0F 23 | MultiGPU Board : No 24 | Board ID : 0xa00 25 | Inforom Version 26 | Image Version : 2081.0202.01.04 27 | OEM Object : 1.1 28 | ECC Object : 3.0 29 | Power Management Object : N/A 30 | GPU Operation Mode 31 | Current : N/A 32 | Pending : N/A 33 | PCI 34 | Bus : 0x0A 35 | Device : 0x00 36 | Domain : 0x0000 37 | Device Id : 0x102310DE 38 | Bus Id : 0000:0A:00.0 39 | Sub System Id : 0x097E10DE 40 | GPU Link Info 41 | PCIe Generation 42 | Max : 3 43 | Current : 3 44 | Link Width 45 | Max : 16x 46 | Current : 16x 47 | Bridge Chip 48 | Type : N/A 49 | Firmware : N/A 50 | Replays since reset : 0 51 | Tx Throughput : N/A 52 | Rx Throughput : N/A 53 | Fan Speed : N/A 54 | Performance State : P0 55 | Clocks Throttle Reasons 56 | Idle : Not Active 57 | Applications Clocks Setting : Active 58 | SW Power Cap : Not Active 59 | HW Slowdown : Not Active 60 | Unknown : Not Active 61 | FB Memory Usage 62 | Total : 11519 MiB 63 | Used : 55 MiB 64 | Free : 11464 MiB 65 | BAR1 Memory Usage 66 | Total : 16384 MiB 67 | Used : 2 MiB 68 | Free : 16382 MiB 69 | Compute Mode : Default 70 | Utilization 71 | Gpu : 0 % 72 | Memory : 0 % 73 | Encoder : 0 % 74 | Decoder : 0 % 75 | Ecc Mode 76 | Current : Enabled 77 | Pending : Enabled 78 | ECC Errors 79 | Volatile 80 | Single Bit 81 | Device Memory : 0 82 | Register File : 0 83 | L1 Cache : 0 84 | L2 Cache : 0 85 | Texture Memory : 0 86 | Total : 0 87 | Double Bit 88 | Device Memory : 0 89 | Register File : 0 90 | L1 Cache : 0 91 | L2 Cache : 0 92 | Texture Memory : 0 93 | Total : 0 94 | Aggregate 95 | Single Bit 96 | Device Memory : 0 97 | Register File : 0 98 | L1 Cache : 0 99 | L2 Cache : 0 100 | Texture Memory : 0 101 | Total : 0 102 | Double Bit 103 | Device Memory : 0 104 | Register File : 0 105 | L1 Cache : 0 106 | L2 Cache : 0 107 | Texture Memory : 0 108 | Total : 0 109 | Retired Pages 110 | Single Bit ECC : 0 111 | Double Bit ECC : 0 112 | Pending : No 113 | Temperature 114 | GPU Current Temp : 38 C 115 | GPU Shutdown Temp : 95 C 116 | GPU 
Slowdown Temp : 90 C 117 | Power Readings 118 | Power Management : Supported 119 | Power Draw : 62.50 W 120 | Power Limit : 235.00 W 121 | Default Power Limit : 235.00 W 122 | Enforced Power Limit : 235.00 W 123 | Min Power Limit : 180.00 W 124 | Max Power Limit : 235.00 W 125 | Clocks 126 | Graphics : 745 MHz 127 | SM : 745 MHz 128 | Memory : 3004 MHz 129 | Applications Clocks 130 | Graphics : 745 MHz 131 | Memory : 3004 MHz 132 | Default Applications Clocks 133 | Graphics : 745 MHz 134 | Memory : 3004 MHz 135 | Max Clocks 136 | Graphics : 875 MHz 137 | SM : 875 MHz 138 | Memory : 3004 MHz 139 | Clock Policy 140 | Auto Boost : N/A 141 | Auto Boost Default : N/A 142 | Processes : None 143 | 144 | GPU 0000:0D:00.0 145 | Product Name : Tesla K40m 146 | Product Brand : Tesla 147 | Display Mode : Disabled 148 | Display Active : Disabled 149 | Persistence Mode : Disabled 150 | Accounting Mode : Disabled 151 | Accounting Mode Buffer Size : 128 152 | Driver Model 153 | Current : N/A 154 | Pending : N/A 155 | Serial Number : 0323315059424 156 | GPU UUID : GPU-1c892c8f-e42d-2261-f9a3-a9dbf6d1d2dc 157 | Minor Number : 0 158 | VBIOS Version : 80.80.3E.00.0F 159 | MultiGPU Board : No 160 | Board ID : 0xd00 161 | Inforom Version 162 | Image Version : 2081.0202.01.04 163 | OEM Object : 1.1 164 | ECC Object : 3.0 165 | Power Management Object : N/A 166 | GPU Operation Mode 167 | Current : N/A 168 | Pending : N/A 169 | PCI 170 | Bus : 0x0D 171 | Device : 0x00 172 | Domain : 0x0000 173 | Device Id : 0x102310DE 174 | Bus Id : 0000:0D:00.0 175 | Sub System Id : 0x097E10DE 176 | GPU Link Info 177 | PCIe Generation 178 | Max : 3 179 | Current : 3 180 | Link Width 181 | Max : 16x 182 | Current : 16x 183 | Bridge Chip 184 | Type : N/A 185 | Firmware : N/A 186 | Replays since reset : 0 187 | Tx Throughput : N/A 188 | Rx Throughput : N/A 189 | Fan Speed : N/A 190 | Performance State : P0 191 | Clocks Throttle Reasons 192 | Idle : Not Active 193 | Applications Clocks Setting : Active 194 | SW Power Cap : Not Active 195 | HW Slowdown : Not Active 196 | Unknown : Not Active 197 | FB Memory Usage 198 | Total : 11519 MiB 199 | Used : 55 MiB 200 | Free : 11464 MiB 201 | BAR1 Memory Usage 202 | Total : 16384 MiB 203 | Used : 2 MiB 204 | Free : 16382 MiB 205 | Compute Mode : Default 206 | Utilization 207 | Gpu : 0 % 208 | Memory : 0 % 209 | Encoder : 0 % 210 | Decoder : 0 % 211 | Ecc Mode 212 | Current : Enabled 213 | Pending : Enabled 214 | ECC Errors 215 | Volatile 216 | Single Bit 217 | Device Memory : 0 218 | Register File : 0 219 | L1 Cache : 0 220 | L2 Cache : 0 221 | Texture Memory : 0 222 | Total : 0 223 | Double Bit 224 | Device Memory : 0 225 | Register File : 0 226 | L1 Cache : 0 227 | L2 Cache : 0 228 | Texture Memory : 0 229 | Total : 0 230 | Aggregate 231 | Single Bit 232 | Device Memory : 0 233 | Register File : 0 234 | L1 Cache : 0 235 | L2 Cache : 0 236 | Texture Memory : 0 237 | Total : 0 238 | Double Bit 239 | Device Memory : 0 240 | Register File : 0 241 | L1 Cache : 0 242 | L2 Cache : 0 243 | Texture Memory : 0 244 | Total : 0 245 | Retired Pages 246 | Single Bit ECC : 0 247 | Double Bit ECC : 0 248 | Pending : No 249 | Temperature 250 | GPU Current Temp : 40 C 251 | GPU Shutdown Temp : 95 C 252 | GPU Slowdown Temp : 90 C 253 | Power Readings 254 | Power Management : Supported 255 | Power Draw : 64.53 W 256 | Power Limit : 235.00 W 257 | Default Power Limit : 235.00 W 258 | Enforced Power Limit : 235.00 W 259 | Min Power Limit : 180.00 W 260 | Max Power Limit : 235.00 W 261 | Clocks 262 | Graphics : 
745 MHz 263 | SM : 745 MHz 264 | Memory : 3004 MHz 265 | Applications Clocks 266 | Graphics : 745 MHz 267 | Memory : 3004 MHz 268 | Default Applications Clocks 269 | Graphics : 745 MHz 270 | Memory : 3004 MHz 271 | Max Clocks 272 | Graphics : 875 MHz 273 | SM : 875 MHz 274 | Memory : 3004 MHz 275 | Clock Policy 276 | Auto Boost : N/A 277 | Auto Boost Default : N/A 278 | Processes : None 279 | 280 | GPU 0000:2B:00.0 281 | Product Name : Tesla K40m 282 | Product Brand : Tesla 283 | Display Mode : Disabled 284 | Display Active : Disabled 285 | Persistence Mode : Disabled 286 | Accounting Mode : Disabled 287 | Accounting Mode Buffer Size : 128 288 | Driver Model 289 | Current : N/A 290 | Pending : N/A 291 | Serial Number : 0323315058830 292 | GPU UUID : GPU-69643319-f398-0e93-e5a0-c1c019b5f866 293 | Minor Number : 2 294 | VBIOS Version : 80.80.3E.00.0F 295 | MultiGPU Board : No 296 | Board ID : 0x2b00 297 | Inforom Version 298 | Image Version : 2081.0202.01.04 299 | OEM Object : 1.1 300 | ECC Object : 3.0 301 | Power Management Object : N/A 302 | GPU Operation Mode 303 | Current : N/A 304 | Pending : N/A 305 | PCI 306 | Bus : 0x2B 307 | Device : 0x00 308 | Domain : 0x0000 309 | Device Id : 0x102310DE 310 | Bus Id : 0000:2B:00.0 311 | Sub System Id : 0x097E10DE 312 | GPU Link Info 313 | PCIe Generation 314 | Max : 3 315 | Current : 3 316 | Link Width 317 | Max : 16x 318 | Current : 16x 319 | Bridge Chip 320 | Type : N/A 321 | Firmware : N/A 322 | Replays since reset : 0 323 | Tx Throughput : N/A 324 | Rx Throughput : N/A 325 | Fan Speed : N/A 326 | Performance State : P0 327 | Clocks Throttle Reasons 328 | Idle : Not Active 329 | Applications Clocks Setting : Active 330 | SW Power Cap : Not Active 331 | HW Slowdown : Not Active 332 | Unknown : Not Active 333 | FB Memory Usage 334 | Total : 11519 MiB 335 | Used : 55 MiB 336 | Free : 11464 MiB 337 | BAR1 Memory Usage 338 | Total : 16384 MiB 339 | Used : 2 MiB 340 | Free : 16382 MiB 341 | Compute Mode : Default 342 | Utilization 343 | Gpu : 0 % 344 | Memory : 0 % 345 | Encoder : 0 % 346 | Decoder : 0 % 347 | Ecc Mode 348 | Current : Enabled 349 | Pending : Enabled 350 | ECC Errors 351 | Volatile 352 | Single Bit 353 | Device Memory : 0 354 | Register File : 0 355 | L1 Cache : 0 356 | L2 Cache : 0 357 | Texture Memory : 0 358 | Total : 0 359 | Double Bit 360 | Device Memory : 0 361 | Register File : 0 362 | L1 Cache : 0 363 | L2 Cache : 0 364 | Texture Memory : 0 365 | Total : 0 366 | Aggregate 367 | Single Bit 368 | Device Memory : 0 369 | Register File : 0 370 | L1 Cache : 0 371 | L2 Cache : 0 372 | Texture Memory : 0 373 | Total : 0 374 | Double Bit 375 | Device Memory : 0 376 | Register File : 0 377 | L1 Cache : 0 378 | L2 Cache : 0 379 | Texture Memory : 0 380 | Total : 0 381 | Retired Pages 382 | Single Bit ECC : 0 383 | Double Bit ECC : 0 384 | Pending : No 385 | Temperature 386 | GPU Current Temp : 43 C 387 | GPU Shutdown Temp : 95 C 388 | GPU Slowdown Temp : 90 C 389 | Power Readings 390 | Power Management : Supported 391 | Power Draw : 63.97 W 392 | Power Limit : 235.00 W 393 | Default Power Limit : 235.00 W 394 | Enforced Power Limit : 235.00 W 395 | Min Power Limit : 180.00 W 396 | Max Power Limit : 235.00 W 397 | Clocks 398 | Graphics : 745 MHz 399 | SM : 745 MHz 400 | Memory : 3004 MHz 401 | Applications Clocks 402 | Graphics : 745 MHz 403 | Memory : 3004 MHz 404 | Default Applications Clocks 405 | Graphics : 745 MHz 406 | Memory : 3004 MHz 407 | Max Clocks 408 | Graphics : 875 MHz 409 | SM : 875 MHz 410 | Memory : 3004 MHz 411 | 
Clock Policy 412 | Auto Boost : N/A 413 | Auto Boost Default : N/A 414 | Processes : None 415 | 416 | GPU 0000:30:00.0 417 | Product Name : Tesla K40m 418 | Product Brand : Tesla 419 | Display Mode : Disabled 420 | Display Active : Disabled 421 | Persistence Mode : Disabled 422 | Accounting Mode : Disabled 423 | Accounting Mode Buffer Size : 128 424 | Driver Model 425 | Current : N/A 426 | Pending : N/A 427 | Serial Number : 0323315059276 428 | GPU UUID : GPU-ea4ed96a-36b3-cb17-1b13-79d47b84b5e7 429 | Minor Number : 3 430 | VBIOS Version : 80.80.3E.00.0F 431 | MultiGPU Board : No 432 | Board ID : 0x3000 433 | Inforom Version 434 | Image Version : 2081.0202.01.04 435 | OEM Object : 1.1 436 | ECC Object : 3.0 437 | Power Management Object : N/A 438 | GPU Operation Mode 439 | Current : N/A 440 | Pending : N/A 441 | PCI 442 | Bus : 0x30 443 | Device : 0x00 444 | Domain : 0x0000 445 | Device Id : 0x102310DE 446 | Bus Id : 0000:30:00.0 447 | Sub System Id : 0x097E10DE 448 | GPU Link Info 449 | PCIe Generation 450 | Max : 3 451 | Current : 3 452 | Link Width 453 | Max : 16x 454 | Current : 16x 455 | Bridge Chip 456 | Type : N/A 457 | Firmware : N/A 458 | Replays since reset : 0 459 | Tx Throughput : N/A 460 | Rx Throughput : N/A 461 | Fan Speed : N/A 462 | Performance State : P0 463 | Clocks Throttle Reasons 464 | Idle : Not Active 465 | Applications Clocks Setting : Active 466 | SW Power Cap : Not Active 467 | HW Slowdown : Not Active 468 | Unknown : Not Active 469 | FB Memory Usage 470 | Total : 11519 MiB 471 | Used : 55 MiB 472 | Free : 11464 MiB 473 | BAR1 Memory Usage 474 | Total : 16384 MiB 475 | Used : 2 MiB 476 | Free : 16382 MiB 477 | Compute Mode : Default 478 | Utilization 479 | Gpu : 96 % 480 | Memory : 4 % 481 | Encoder : 0 % 482 | Decoder : 0 % 483 | Ecc Mode 484 | Current : Enabled 485 | Pending : Enabled 486 | ECC Errors 487 | Volatile 488 | Single Bit 489 | Device Memory : 0 490 | Register File : 0 491 | L1 Cache : 0 492 | L2 Cache : 0 493 | Texture Memory : 0 494 | Total : 0 495 | Double Bit 496 | Device Memory : 0 497 | Register File : 0 498 | L1 Cache : 0 499 | L2 Cache : 0 500 | Texture Memory : 0 501 | Total : 0 502 | Aggregate 503 | Single Bit 504 | Device Memory : 0 505 | Register File : 0 506 | L1 Cache : 0 507 | L2 Cache : 0 508 | Texture Memory : 0 509 | Total : 0 510 | Double Bit 511 | Device Memory : 0 512 | Register File : 0 513 | L1 Cache : 0 514 | L2 Cache : 0 515 | Texture Memory : 0 516 | Total : 0 517 | Retired Pages 518 | Single Bit ECC : 0 519 | Double Bit ECC : 0 520 | Pending : No 521 | Temperature 522 | GPU Current Temp : 42 C 523 | GPU Shutdown Temp : 95 C 524 | GPU Slowdown Temp : 90 C 525 | Power Readings 526 | Power Management : Supported 527 | Power Draw : 65.95 W 528 | Power Limit : 235.00 W 529 | Default Power Limit : 235.00 W 530 | Enforced Power Limit : 235.00 W 531 | Min Power Limit : 180.00 W 532 | Max Power Limit : 235.00 W 533 | Clocks 534 | Graphics : 745 MHz 535 | SM : 745 MHz 536 | Memory : 3004 MHz 537 | Applications Clocks 538 | Graphics : 745 MHz 539 | Memory : 3004 MHz 540 | Default Applications Clocks 541 | Graphics : 745 MHz 542 | Memory : 3004 MHz 543 | Max Clocks 544 | Graphics : 875 MHz 545 | SM : 875 MHz 546 | Memory : 3004 MHz 547 | Clock Policy 548 | Auto Boost : N/A 549 | Auto Boost Default : N/A 550 | Processes : None 551 | 552 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 
Benchmarks for CNTK and other toolkits.
3 | 
4 | Disclaimer: I'm a Microsoft employee; however, this is my personal GitHub account, and the information/code shared here does not represent the opinions or views of Microsoft in any way.
5 | 
-------------------------------------------------------------------------------- /TensorFlow/ffn.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | 
4 | featureDim = 512
5 | labelDim = 10000
6 | hiddenLayDim = 2048
7 | numMinibatches = 150
8 | 
9 | FLAGS = tf.app.flags.FLAGS
10 | tf.app.flags.DEFINE_boolean('logDevicePlacement', False,
11 |                             """Whether to log device placement.""")
12 | tf.app.flags.DEFINE_boolean('noInputFeed', False,
13 |                             """Whether to skip feeding new features/labels data for each minibatch.""")
14 | 
15 | data = np.loadtxt('../data.txt')
16 | features = data[:,1:]
17 | labels = data[:,0]
18 | 
19 | # Get parameters randomly initialized with a uniform distribution between -0.5 and 0.5
20 | def getParameters(name, shape):
21 |     return tf.get_variable(name, shape, initializer=tf.random_uniform_initializer(-0.5, 0.5))
22 | 
23 | def sigmoidDNNLayer(layerIdx, input, inputDim, outputDim):
24 |     W = getParameters("W" + str(layerIdx), [inputDim, outputDim])
25 |     B = getParameters("B" + str(layerIdx), [outputDim])
26 |     return tf.nn.sigmoid(tf.nn.xw_plus_b(input, W, B))
27 | 
28 | def getFakeMinibatch(minibatchSize):
29 |     #feat = np.random.randn(minibatchSize, featureDim)
30 |     #lab = np.zeros((minibatchSize, labelDim))
31 |     #for row in lab:
32 |     #    row[np.random.randint(0, labelDim)] = 1
33 |     feat = features[:minibatchSize]
34 |     l = labels[:minibatchSize]
35 |     lab = np.zeros((minibatchSize, labelDim))
36 |     for i in range(lab.shape[0]):
37 |         lab[i][int(l[i])] = 1
38 |     return feat, lab
39 |     #fakeFeatures = [[0.0 for _ in xrange(featureDim)] for _ in xrange(minibatchSize)]
40 |     #fakeLabels = [[0.0 for _ in xrange(labelDim)] for _ in xrange(minibatchSize)]
41 |     #for sampleIdx in xrange(minibatchSize):
42 |     #    fakeLabels[sampleIdx][np.random.randint(0, labelDim - 1)] = 1.0
43 |     #    for featureIdx in xrange(featureDim):
44 |     #        fakeFeatures[sampleIdx][featureIdx] = np.random.randn()
45 |     #
46 |     #return fakeFeatures, fakeLabels
47 | 
48 | 
49 | def getLossAndAccuracyForSubBatch(features, labels):
50 | 
51 |     HL0 = sigmoidDNNLayer(0, features, featureDim, hiddenLayDim)
52 |     HL1 = sigmoidDNNLayer(1, HL0, hiddenLayDim, hiddenLayDim)
53 |     HL2 = sigmoidDNNLayer(2, HL1, hiddenLayDim, hiddenLayDim)
54 |     HL3 = sigmoidDNNLayer(3, HL2, hiddenLayDim, hiddenLayDim)
55 | 
56 |     outputLayerW = getParameters("W5", [hiddenLayDim, labelDim])
57 |     outputLayerB = getParameters("B5", [labelDim])
58 |     outputLayer = tf.nn.softmax(tf.nn.xw_plus_b(HL3, outputLayerW, outputLayerB))
59 | 
60 |     crossEntropy = -tf.reduce_mean(labels * tf.log(outputLayer))
61 |     predictionCorrectness = tf.equal(tf.argmax(outputLayer, 1), tf.argmax(labels, 1))
62 |     accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))
63 | 
64 |     return crossEntropy, accuracy
65 | 
66 | def printTrainingStats(numGPUs, minibatchSize, perMinibatchTime):
67 |     meanTimePerMinibatch = np.mean(perMinibatchTime)
68 |     medianTimePerMinibatch = np.median(perMinibatchTime)
69 |     minTimePerMinibatch = np.min(perMinibatchTime)
70 | 
71 |     def samplesPerSec(minibatchSize, processingTime):
72 |         return minibatchSize/processingTime
73 | 
74 |     print('*****************************Training on %d GPUs***************************************' % numGPUs)
75 |     print('MinibatchSize=%d, 
NumMinibatches=%d.' % (minibatchSize, numMinibatches)) 76 | print('Training speed (samples/sec): Average=%d, Median=%d, Max=%d' % (samplesPerSec(minibatchSize, meanTimePerMinibatch), 77 | samplesPerSec(minibatchSize, medianTimePerMinibatch), 78 | samplesPerSec(minibatchSize, minTimePerMinibatch))) 79 | print('*************************************************************************************') 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /TensorFlow/ffn_1GPU.log: -------------------------------------------------------------------------------- 1 | I tensorflow/core/common_runtime/local_device.cc:25] Local device intra op parallelism threads: 20 2 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 0 with properties: 3 | name: Tesla K40m 4 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 5 | pciBusID 0000:0d:00.0 6 | Total memory: 11.25GiB 7 | Free memory: 11.12GiB 8 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 1 with properties: 9 | name: Tesla K40m 10 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 11 | pciBusID 0000:0a:00.0 12 | Total memory: 11.25GiB 13 | Free memory: 11.12GiB 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 2 with properties: 15 | name: Tesla K40m 16 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 17 | pciBusID 0000:2b:00.0 18 | Total memory: 11.25GiB 19 | Free memory: 11.12GiB 20 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 3 with properties: 21 | name: Tesla K40m 22 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 23 | pciBusID 0000:30:00.0 24 | Total memory: 11.25GiB 25 | Free memory: 11.12GiB 26 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 0 to device ordinal 2 27 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 0 to device ordinal 3 28 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 1 to device ordinal 2 29 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 1 to device ordinal 3 30 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 2 to device ordinal 0 31 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 2 to device ordinal 1 32 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 3 to device ordinal 0 33 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 3 to device ordinal 1 34 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:112] DMA: 0 1 2 3 35 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 0: Y Y N N 36 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 1: Y Y N N 37 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 2: N N Y Y 38 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 3: N N Y Y 39 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bus id: 0000:0d:00.0) 40 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K40m, pci bus id: 0000:0a:00.0) 41 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K40m, pci bus id: 0000:2b:00.0) 42 | I 
tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K40m, pci bus id: 0000:30:00.0) 43 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344381133 44 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344373351 45 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344381133 46 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344373351 47 | I tensorflow/core/common_runtime/local_session.cc:45] Local session inter op parallelism threads: 20 48 | *****************************Training on 1 GPUs*************************************** 49 | MinibatchSize=8192, NumMinibatches=150. 50 | Training speed (samples/sec): Average=8141, Median=8183, Max=8414 51 | ************************************************************************************* 52 | -------------------------------------------------------------------------------- /TensorFlow/ffn_4GPUs.log: -------------------------------------------------------------------------------- 1 | I tensorflow/core/common_runtime/local_device.cc:25] Local device intra op parallelism threads: 20 2 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 0 with properties: 3 | name: Tesla K40m 4 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 5 | pciBusID 0000:0d:00.0 6 | Total memory: 11.25GiB 7 | Free memory: 11.12GiB 8 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 1 with properties: 9 | name: Tesla K40m 10 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 11 | pciBusID 0000:0a:00.0 12 | Total memory: 11.25GiB 13 | Free memory: 11.12GiB 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 2 with properties: 15 | name: Tesla K40m 16 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 17 | pciBusID 0000:2b:00.0 18 | Total memory: 11.25GiB 19 | Free memory: 11.12GiB 20 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:88] Found device 3 with properties: 21 | name: Tesla K40m 22 | major: 3 minor: 5 memoryClockRate (GHz) 0.745 23 | pciBusID 0000:30:00.0 24 | Total memory: 11.25GiB 25 | Free memory: 11.12GiB 26 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 0 to device ordinal 2 27 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 0 to device ordinal 3 28 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 1 to device ordinal 2 29 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 1 to device ordinal 3 30 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 2 to device ordinal 0 31 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 2 to device ordinal 1 32 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 3 to device ordinal 0 33 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:45] cannot enable peer access from device ordinal 3 to device ordinal 1 34 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:112] DMA: 0 1 2 3 35 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 0: Y Y N N 36 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 1: Y Y N N 37 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 2: N N Y Y 38 | I 
tensorflow/core/common_runtime/gpu/gpu_init.cc:122] 3: N N Y Y
39 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bus id: 0000:0d:00.0)
40 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K40m, pci bus id: 0000:0a:00.0)
41 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K40m, pci bus id: 0000:2b:00.0)
42 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:643] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K40m, pci bus id: 0000:30:00.0)
43 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344381133
44 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344373351
45 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344381133
46 | I tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc:47] Setting region size to 11344373351
47 | I tensorflow/core/common_runtime/local_session.cc:45] Local session inter op parallelism threads: 20
48 | *****************************Training on 4 GPUs***************************************
49 | MinibatchSize=8192, NumMinibatches=150.
50 | Training speed (samples/sec): Average=11255, Median=11381, Max=12359
51 | *************************************************************************************
52 | 
-------------------------------------------------------------------------------- /TensorFlow/ffn_exp.py: --------------------------------------------------------------------------------
1 | # A feed-forward DNN with 4 hidden layers using sigmoid activations.
2 | 
3 | import time
4 | import tensorflow as tf
5 | import ffn
6 | 
7 | from ffn import *
8 | 
9 | minibatchSize = 8192
10 | 
11 | # Create the model
12 | if (FLAGS.noInputFeed):
13 |     features, labels = getFakeMinibatch(minibatchSize)
14 | else:
15 |     features = tf.placeholder("float", [None, featureDim])
16 |     labels = tf.placeholder("float", [None, labelDim])
17 | 
18 | crossEntropy, accuracy = getLossAndAccuracyForSubBatch(features, labels)
19 | trainStep = tf.train.GradientDescentOptimizer(0.01).minimize(crossEntropy)
20 | 
21 | # Train
22 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.logDevicePlacement))
23 | init = tf.initialize_all_variables()
24 | sess.run(init)
25 | 
26 | perMinibatchTime = []
27 | for i in range(numMinibatches):
28 |     if (FLAGS.noInputFeed == False):
29 |         minibatchFeatures, minibatchLabels = getFakeMinibatch(minibatchSize)
30 | 
31 |     startTime = time.time()
32 |     if (FLAGS.noInputFeed):
33 |         sess.run([trainStep, accuracy])
34 |     else:
35 |         sess.run([trainStep, accuracy], feed_dict={features: minibatchFeatures, labels: minibatchLabels})
36 | 
37 |     currMinibatchDuration = time.time() - startTime
38 |     perMinibatchTime.append(currMinibatchDuration)
39 | 
40 | printTrainingStats(1, minibatchSize, perMinibatchTime)
41 | 
42 | 
-------------------------------------------------------------------------------- /TensorFlow/ffn_exp_4GPUs.py: --------------------------------------------------------------------------------
1 | # A feed-forward DNN with 4 hidden layers using sigmoid activations.
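# Data-parallel layout (a summary of this script, not additional functionality):
# each of the FLAGS.numGPUs replicas computes gradients on its own
# subMinibatchSize=2048 sub-minibatch, aggregateGradients() sums them across
# replicas, and one SGD step is applied per aggregate minibatch of
# minibatchSize = numGPUs * subMinibatchSize = 4 * 2048 = 8192 samples.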
2 | # Uses dataparallel SGD with multiple GPUs 3 | 4 | import time 5 | import tensorflow as tf 6 | import ffn 7 | 8 | from ffn import * 9 | 10 | tf.app.flags.DEFINE_integer('numGPUs', 4, 11 | """How many GPUs to use.""") 12 | 13 | subMinibatchSize = 2048 14 | minibatchSize = FLAGS.numGPUs * subMinibatchSize 15 | 16 | def aggregateGradients(subMinibatchGradients): 17 | aggGrads = [] 18 | for gradAndVars in zip(*subMinibatchGradients): 19 | # Note that each gradAndVars looks like the following: 20 | # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) 21 | grads = [] 22 | for g, _ in gradAndVars: 23 | # Add 0 dimension to the gradients to represent the replica. 24 | expanded_g = tf.expand_dims(g, 0) 25 | 26 | # Append on a 'replica' dimension which we will sum over below. 27 | grads.append(expanded_g) 28 | 29 | # Sum over the 'replica' dimension. 30 | grad = tf.concat(0, grads) 31 | grad = tf.reduce_sum(grad, 0) 32 | 33 | # Keep in mind that the Variables are redundant because they are shared 34 | # across replicas. So .. we will just return the first replica's pointer to 35 | # the Variable. 36 | v = gradAndVars[0][1] 37 | gradAndVar = (grad, v) 38 | aggGrads.append(gradAndVar) 39 | return aggGrads 40 | 41 | if (FLAGS.noInputFeed): 42 | features, labels = getFakeMinibatch(subMinibatchSize) 43 | else: 44 | # HACK: Using the same subMinibatch across all GPUs 45 | features = tf.placeholder("float", [None, featureDim]) 46 | labels = tf.placeholder("float", [None, labelDim]) 47 | 48 | optimizer = tf.train.GradientDescentOptimizer(0.01) 49 | 50 | # Calculate the gradients for each subBatch on a different GPU 51 | subMinibatchGradients = [] 52 | subMinibatchAccuracies = [] 53 | for i in xrange(FLAGS.numGPUs): 54 | with tf.device('/gpu:%d' % i): 55 | with tf.name_scope('%s_%d' % ("replica", i)) as scope: 56 | # Calculate the loss for one subBatch. This function 57 | # constructs the entire model but shares the variables across 58 | # all replicas. 59 | loss, accuracy = getLossAndAccuracyForSubBatch(features, labels) 60 | 61 | # Reuse variables for the next replica. 62 | tf.get_variable_scope().reuse_variables() 63 | 64 | # Calculate the gradients for this subBatch on this GPU 65 | grads = optimizer.compute_gradients(loss) 66 | 67 | # Keep track of the gradients across all replicas. 68 | subMinibatchGradients.append(grads) 69 | subMinibatchAccuracies.append(accuracy) 70 | 71 | # We must calculate the sum of each gradient. Note that this is the 72 | # synchronization point across all towers. 73 | grads = aggregateGradients(subMinibatchGradients) 74 | accuracy = tf.reduce_sum(tf.pack(subMinibatchAccuracies)) 75 | 76 | # Apply the gradients to adjust the shared variables. 77 | applyGradientOp = optimizer.apply_gradients(grads) 78 | 79 | # Start running operations on the Graph. allow_soft_placement must be set to 80 | # True to build replicas on GPU, as some of the ops do not have GPU implementations. 81 | sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=FLAGS.logDevicePlacement)) 82 | init = tf.initialize_all_variables() 83 | sess.run(init) 84 | 85 | # Start the queue runners. 
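# (No input queues are defined in this graph; inputs arrive via feed_dict or the
# constant fake minibatch, so there is nothing for the queue runners to start.)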
86 | tf.train.start_queue_runners(sess=sess)
87 | 
88 | perMinibatchTime = []
89 | for step in xrange(numMinibatches):
90 |     if (FLAGS.noInputFeed == False):
91 |         subMinibatchFeatures, subMinibatchLabels = getFakeMinibatch(subMinibatchSize)
92 | 
93 |     startTime = time.time()
94 |     if (FLAGS.noInputFeed):
95 |         sess.run([applyGradientOp, accuracy])
96 |     else:
97 |         sess.run([applyGradientOp, accuracy], feed_dict={features: subMinibatchFeatures, labels: subMinibatchLabels})
98 | 
99 |     currMinibatchDuration = time.time() - startTime
100 |     perMinibatchTime.append(currMinibatchDuration)
101 | 
102 | printTrainingStats(FLAGS.numGPUs, minibatchSize, perMinibatchTime)
103 | 
104 | 
-------------------------------------------------------------------------------- /Torch/alexnet.lua: --------------------------------------------------------------------------------
1 | require 'sys';
2 | require 'bit';
3 | require 'cunn';
4 | require 'cudnn';
5 | cudnn.benchmark = true;
6 | cudnn.verbose = true;
7 | require 'optim';
8 | torch.setdefaulttensortype('torch.FloatTensor')
9 | 
10 | local steps = 1 -- number of runs
11 | 
12 | local Linear = nn.Linear
13 | local Transfer = cudnn.ReLU
14 | local hsize = 4096
15 | local osize = 1000
16 | 
17 | -- Network definition
18 | local cnn = nn.Sequential()
19 | cnn:add(cudnn.SpatialConvolution(3,96,11,11,4,4,2,2)):add(Transfer(true))
20 | cnn:add(cudnn.SpatialMaxPooling(3,3,2,2))
21 | cnn:add(cudnn.SpatialConvolution(96,256,5,5,1,1,2,2)):add(Transfer(true))
22 | cnn:add(cudnn.SpatialMaxPooling(3,3,2,2))
23 | cnn:add(cudnn.SpatialConvolution(256,384,3,3,1,1,1,1)):add(Transfer(true))
24 | cnn:add(cudnn.SpatialConvolution(384,384,3,3,1,1,1,1)):add(Transfer(true))
25 | cnn:add(cudnn.SpatialConvolution(384,256,3,3,1,1,1,1)):add(Transfer(true))
26 | cnn:add(cudnn.SpatialMaxPooling(3,3,2,2))
27 | cnn:add(nn.View(256*6*6)) -- flatten the conv output; nn.Linear expects a 1D/2D input
28 | cnn:add(Linear(256*6*6,hsize)):add(Transfer(true)) -- hidden layer 1
29 | cnn:add(nn.Dropout(0.5))
30 | cnn:add(Linear(hsize,hsize)):add(Transfer(true)) -- hidden layer 2
31 | cnn:add(nn.Dropout(0.5))
32 | cnn:add(Linear(hsize,osize)):add(cudnn.LogSoftMax()) -- output layer
33 | 
34 | -- Fake data
35 | local bsize = 256
36 | local inputCPU = torch.randn(torch.LongStorage({bsize,3,224,224}))
37 | local input = torch.CudaTensor(inputCPU:size())
38 | local target = torch.IntTensor(bsize):random(1,osize):cuda()
39 | 
40 | for k=0,2 do
41 |     nGPU = bit.lshift(1,k)
42 | 
43 |     local model = nil
44 |     if nGPU > 1 then
45 |         model = nn.DataParallelTable(1)
46 |         for i=1,nGPU do
47 |             cutorch.setDevice(i)
48 |             model:add(cnn:clone():cuda(), i)
49 |         end
50 |         cutorch.setDevice(1)
51 |     else
52 |         model = cnn:cuda()
53 |     end
54 | 
55 |     -- optimizer declarations
56 |     local criterion = nn.ClassNLLCriterion():cuda()
57 |     local parameters, gradParameters = model:getParameters()
58 |     local optimState = { learningRate = 0.01 }
59 | 
60 |     collectgarbage()
61 |     sys.tic()
62 |     for t = 1, steps do
63 |         input:copy(inputCPU) -- transfer data to GPU memory
64 |         feval = function(x)
65 |             model:zeroGradParameters()
66 |             local output = model:forward(input)
67 |             local err = criterion:forward(output, target)
68 |             local gradOutput = criterion:backward(output, target)
69 |             local gradInput = model:backward(input, gradOutput)
70 |             return err, gradParameters
71 |         end
72 |         optim.sgd(feval, parameters, optimState)
73 | 
74 |         -- DataParallelTable's syncParameters
75 |         model:apply(function(m) if m.syncParameters then m:syncParameters() end end)
76 |         cutorch.synchronize()
77 |     end
78 |     local elapsed = sys.toc()
79 | 
80 |     print(string.format("%d GPUs: %0.0f samples per sec", nGPU, steps * bsize / elapsed))
GPUs: %0.0f samples per sec", nGPU, steps * bsize / elapsed)) 81 | end 82 | 83 | -------------------------------------------------------------------------------- /Torch/ffn.log: -------------------------------------------------------------------------------- 1 | 1 GPUs: 12522 samples per sec 2 | 2 GPUs: 19751 samples per sec 3 | 4 GPUs: 23076 samples per sec 4 | -------------------------------------------------------------------------------- /Torch/ffn.lua: -------------------------------------------------------------------------------- 1 | require 'sys'; 2 | require 'bit'; 3 | require 'cunn'; 4 | require 'cudnn'; 5 | require 'optim'; 6 | torch.setdefaulttensortype('torch.FloatTensor') 7 | 8 | local steps = 100 -- number of runs 9 | 10 | local Linear = nn.Linear 11 | local Transfer = nn.Sigmoid 12 | local isize = 512 13 | local hsize = 2048 14 | local osize = 10000 15 | 16 | -- Network definition 17 | local mlp = nn.Sequential() 18 | mlp:add(Linear(isize,hsize)):add(Transfer(true)) -- hidden layer 1 19 | mlp:add(Linear(hsize,hsize)):add(Transfer(true)) -- hidden layer 2 20 | mlp:add(Linear(hsize,hsize)):add(Transfer(true)) -- hidden layer 3 21 | mlp:add(Linear(hsize,hsize)):add(Transfer(true)) -- hidden layer 4 22 | mlp:add(Linear(hsize,osize)):add(cudnn.LogSoftMax()) -- output layer 23 | 24 | -- Fake data 25 | local bsize = 8192 26 | local inputCPU = torch.randn(bsize,isize) 27 | local input = torch.CudaTensor(inputCPU:size()) 28 | local target = torch.IntTensor(bsize):random(1,bsize):cuda() 29 | 30 | for k=0,2 do 31 | nGPU = bit.lshift(1,k) 32 | 33 | local model = nil 34 | if nGPU > 1 then 35 | model = nn.DataParallelTable(1) 36 | for i=1,nGPU do 37 | cutorch.setDevice(i) 38 | model:add(mlp:clone():cuda(), i) 39 | end 40 | cutorch.setDevice(1) 41 | else 42 | model = mlp:cuda() 43 | end 44 | 45 | -- optimizer declarations 46 | local criterion = nn.ClassNLLCriterion():cuda() 47 | local parameters, gradParameters = model:getParameters() 48 | local optimState = { learningRate = 0.01 } 49 | 50 | collectgarbage() 51 | sys.tic() 52 | for t = 1, steps do 53 | input:copy(inputCPU) -- transfer data to GPU memory 54 | feval = function(x) 55 | model:zeroGradParameters() 56 | local output = model:forward(input) 57 | local err = criterion:forward(output, target) 58 | local gradOutput = criterion:backward(output, target) 59 | local gradInput = model:backward(input, gradOutput) 60 | return err, gradParameters 61 | end 62 | optim.sgd(feval, parameters, optimState) 63 | 64 | -- DataParallelTable's syncParameters 65 | model:apply(function(m) if m.syncParameters then m:syncParameters() end end) 66 | cutorch.synchronize() 67 | end 68 | local elapsed = sys.toc() 69 | 70 | print(string.format("%d GPUs: %0.0f samples per sec", nGPU, steps * bsize / elapsed)) 71 | end 72 | 73 | -------------------------------------------------------------------------------- /caffe/alexnet.prototxt: -------------------------------------------------------------------------------- 1 | name: "AlexNet" 2 | layer { 3 | name: "data" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | data_param { 11 | source: "./fake_image_net.lmdb" 12 | batch_size: 256 13 | backend: LMDB 14 | } 15 | } 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | param { 22 | lr_mult: 1 23 | decay_mult: 1 24 | } 25 | param { 26 | lr_mult: 2 27 | decay_mult: 0 28 | } 29 | convolution_param { 30 | num_output: 96 31 | kernel_size: 11 32 | stride: 4 33 | weight_filler { 34 | 
type: "gaussian" 35 | std: 0.01 36 | } 37 | bias_filler { 38 | type: "constant" 39 | value: 0 40 | } 41 | } 42 | } 43 | layer { 44 | name: "relu1" 45 | type: "ReLU" 46 | bottom: "conv1" 47 | top: "conv1" 48 | } 49 | layer { 50 | name: "pool1" 51 | type: "Pooling" 52 | bottom: "conv1" 53 | top: "pool1" 54 | pooling_param { 55 | pool: MAX 56 | kernel_size: 3 57 | stride: 2 58 | } 59 | } 60 | layer { 61 | name: "conv2" 62 | type: "Convolution" 63 | bottom: "pool1" 64 | top: "conv2" 65 | param { 66 | lr_mult: 1 67 | decay_mult: 1 68 | } 69 | param { 70 | lr_mult: 2 71 | decay_mult: 0 72 | } 73 | convolution_param { 74 | num_output: 256 75 | pad: 2 76 | kernel_size: 5 77 | weight_filler { 78 | type: "gaussian" 79 | std: 0.01 80 | } 81 | bias_filler { 82 | type: "constant" 83 | value: 0.1 84 | } 85 | } 86 | } 87 | layer { 88 | name: "relu2" 89 | type: "ReLU" 90 | bottom: "conv2" 91 | top: "conv2" 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "conv2" 97 | top: "pool2" 98 | pooling_param { 99 | pool: MAX 100 | kernel_size: 3 101 | stride: 2 102 | } 103 | } 104 | layer { 105 | name: "conv3" 106 | type: "Convolution" 107 | bottom: "pool2" 108 | top: "conv3" 109 | param { 110 | lr_mult: 1 111 | decay_mult: 1 112 | } 113 | param { 114 | lr_mult: 2 115 | decay_mult: 0 116 | } 117 | convolution_param { 118 | num_output: 384 119 | pad: 1 120 | kernel_size: 3 121 | weight_filler { 122 | type: "gaussian" 123 | std: 0.01 124 | } 125 | bias_filler { 126 | type: "constant" 127 | value: 0 128 | } 129 | } 130 | } 131 | layer { 132 | name: "relu3" 133 | type: "ReLU" 134 | bottom: "conv3" 135 | top: "conv3" 136 | } 137 | layer { 138 | name: "conv4" 139 | type: "Convolution" 140 | bottom: "conv3" 141 | top: "conv4" 142 | param { 143 | lr_mult: 1 144 | decay_mult: 1 145 | } 146 | param { 147 | lr_mult: 2 148 | decay_mult: 0 149 | } 150 | convolution_param { 151 | num_output: 384 152 | pad: 1 153 | kernel_size: 3 154 | weight_filler { 155 | type: "gaussian" 156 | std: 0.01 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0.1 161 | } 162 | } 163 | } 164 | layer { 165 | name: "relu4" 166 | type: "ReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | layer { 171 | name: "conv5" 172 | type: "Convolution" 173 | bottom: "conv4" 174 | top: "conv5" 175 | param { 176 | lr_mult: 1 177 | decay_mult: 1 178 | } 179 | param { 180 | lr_mult: 2 181 | decay_mult: 0 182 | } 183 | convolution_param { 184 | num_output: 256 185 | pad: 1 186 | kernel_size: 3 187 | weight_filler { 188 | type: "gaussian" 189 | std: 0.01 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0.1 194 | } 195 | } 196 | } 197 | layer { 198 | name: "relu5" 199 | type: "ReLU" 200 | bottom: "conv5" 201 | top: "conv5" 202 | } 203 | layer { 204 | name: "pool5" 205 | type: "Pooling" 206 | bottom: "conv5" 207 | top: "pool5" 208 | pooling_param { 209 | pool: MAX 210 | kernel_size: 3 211 | stride: 2 212 | } 213 | } 214 | layer { 215 | name: "fc6" 216 | type: "InnerProduct" 217 | bottom: "pool5" 218 | top: "fc6" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 0 226 | } 227 | inner_product_param { 228 | num_output: 4096 229 | weight_filler { 230 | type: "gaussian" 231 | std: 0.005 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0.1 236 | } 237 | } 238 | } 239 | layer { 240 | name: "relu6" 241 | type: "ReLU" 242 | bottom: "fc6" 243 | top: "fc6" 244 | } 245 | layer { 246 | name: "drop6" 247 | type: "Dropout" 248 | bottom: "fc6" 249 | top: 
"fc6" 250 | dropout_param { 251 | dropout_ratio: 0.5 252 | } 253 | } 254 | layer { 255 | name: "fc7" 256 | type: "InnerProduct" 257 | bottom: "fc6" 258 | top: "fc7" 259 | param { 260 | lr_mult: 1 261 | decay_mult: 1 262 | } 263 | param { 264 | lr_mult: 2 265 | decay_mult: 0 266 | } 267 | inner_product_param { 268 | num_output: 4096 269 | weight_filler { 270 | type: "gaussian" 271 | std: 0.005 272 | } 273 | bias_filler { 274 | type: "constant" 275 | value: 0.1 276 | } 277 | } 278 | } 279 | layer { 280 | name: "relu7" 281 | type: "ReLU" 282 | bottom: "fc7" 283 | top: "fc7" 284 | } 285 | layer { 286 | name: "drop7" 287 | type: "Dropout" 288 | bottom: "fc7" 289 | top: "fc7" 290 | dropout_param { 291 | dropout_ratio: 0.5 292 | } 293 | } 294 | layer { 295 | name: "fc8" 296 | type: "InnerProduct" 297 | bottom: "fc7" 298 | top: "fc8" 299 | param { 300 | lr_mult: 1 301 | decay_mult: 1 302 | } 303 | param { 304 | lr_mult: 2 305 | decay_mult: 0 306 | } 307 | inner_product_param { 308 | num_output: 1000 309 | weight_filler { 310 | type: "gaussian" 311 | std: 0.01 312 | } 313 | bias_filler { 314 | type: "constant" 315 | value: 0 316 | } 317 | } 318 | } 319 | layer { 320 | name: "accuracy" 321 | type: "Accuracy" 322 | bottom: "fc8" 323 | bottom: "label" 324 | top: "accuracy" 325 | include { 326 | phase: TEST 327 | } 328 | } 329 | layer { 330 | name: "loss" 331 | type: "SoftmaxWithLoss" 332 | bottom: "fc8" 333 | bottom: "label" 334 | top: "loss" 335 | } 336 | 337 | -------------------------------------------------------------------------------- /caffe/alexnet_1GPU.log: -------------------------------------------------------------------------------- 1 | I1209 22:09:55.896893 63236 caffe.cpp:184] Using GPUs 0 2 | I1209 22:10:05.413523 63236 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.01 4 | max_iter: 50 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./alexnet.prototxt" 9 | I1209 22:10:05.413588 63236 solver.cpp:91] Creating training net from net file: ./alexnet.prototxt 10 | I1209 22:10:05.414489 63236 net.cpp:322] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy 11 | I1209 22:10:05.414664 63236 net.cpp:49] Initializing net from parameters: 12 | name: "AlexNet" 13 | state { 14 | phase: TRAIN 15 | } 16 | layer { 17 | name: "data" 18 | type: "Data" 19 | top: "data" 20 | top: "label" 21 | include { 22 | phase: TRAIN 23 | } 24 | data_param { 25 | source: "./fake_image_net.lmdb" 26 | batch_size: 256 27 | backend: LMDB 28 | } 29 | } 30 | layer { 31 | name: "conv1" 32 | type: "Convolution" 33 | bottom: "data" 34 | top: "conv1" 35 | param { 36 | lr_mult: 1 37 | decay_mult: 1 38 | } 39 | param { 40 | lr_mult: 2 41 | decay_mult: 0 42 | } 43 | convolution_param { 44 | num_output: 96 45 | kernel_size: 11 46 | stride: 4 47 | weight_filler { 48 | type: "gaussian" 49 | std: 0.01 50 | } 51 | bias_filler { 52 | type: "constant" 53 | value: 0 54 | } 55 | } 56 | } 57 | layer { 58 | name: "relu1" 59 | type: "ReLU" 60 | bottom: "conv1" 61 | top: "conv1" 62 | } 63 | layer { 64 | name: "pool1" 65 | type: "Pooling" 66 | bottom: "conv1" 67 | top: "pool1" 68 | pooling_param { 69 | pool: MAX 70 | kernel_size: 3 71 | stride: 2 72 | } 73 | } 74 | layer { 75 | name: "conv2" 76 | type: "Convolution" 77 | bottom: "pool1" 78 | top: "conv2" 79 | param { 80 | lr_mult: 1 81 | decay_mult: 1 82 | } 83 | param { 84 | lr_mult: 2 85 | decay_mult: 0 86 | } 87 | convolution_param { 88 | num_output: 256 89 | pad: 2 90 | kernel_size: 5 91 | weight_filler { 92 | 
type: "gaussian" 93 | std: 0.01 94 | } 95 | bias_filler { 96 | type: "constant" 97 | value: 0.1 98 | } 99 | } 100 | } 101 | layer { 102 | name: "relu2" 103 | type: "ReLU" 104 | bottom: "conv2" 105 | top: "conv2" 106 | } 107 | layer { 108 | name: "pool2" 109 | type: "Pooling" 110 | bottom: "conv2" 111 | top: "pool2" 112 | pooling_param { 113 | pool: MAX 114 | kernel_size: 3 115 | stride: 2 116 | } 117 | } 118 | layer { 119 | name: "conv3" 120 | type: "Convolution" 121 | bottom: "pool2" 122 | top: "conv3" 123 | param { 124 | lr_mult: 1 125 | decay_mult: 1 126 | } 127 | param { 128 | lr_mult: 2 129 | decay_mult: 0 130 | } 131 | convolution_param { 132 | num_output: 384 133 | pad: 1 134 | kernel_size: 3 135 | weight_filler { 136 | type: "gaussian" 137 | std: 0.01 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | layer { 146 | name: "relu3" 147 | type: "ReLU" 148 | bottom: "conv3" 149 | top: "conv3" 150 | } 151 | layer { 152 | name: "conv4" 153 | type: "Convolution" 154 | bottom: "conv3" 155 | top: "conv4" 156 | param { 157 | lr_mult: 1 158 | decay_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | decay_mult: 0 163 | } 164 | convolution_param { 165 | num_output: 384 166 | pad: 1 167 | kernel_size: 3 168 | weight_filler { 169 | type: "gaussian" 170 | std: 0.01 171 | } 172 | bias_filler { 173 | type: "constant" 174 | value: 0.1 175 | } 176 | } 177 | } 178 | layer { 179 | name: "relu4" 180 | type: "ReLU" 181 | bottom: "conv4" 182 | top: "conv4" 183 | } 184 | layer { 185 | name: "conv5" 186 | type: "Convolution" 187 | bottom: "conv4" 188 | top: "conv5" 189 | param { 190 | lr_mult: 1 191 | decay_mult: 1 192 | } 193 | param { 194 | lr_mult: 2 195 | decay_mult: 0 196 | } 197 | convolution_param { 198 | num_output: 256 199 | pad: 1 200 | kernel_size: 3 201 | weight_filler { 202 | type: "gaussian" 203 | std: 0.01 204 | } 205 | bias_filler { 206 | type: "constant" 207 | value: 0.1 208 | } 209 | } 210 | } 211 | layer { 212 | name: "relu5" 213 | type: "ReLU" 214 | bottom: "conv5" 215 | top: "conv5" 216 | } 217 | layer { 218 | name: "pool5" 219 | type: "Pooling" 220 | bottom: "conv5" 221 | top: "pool5" 222 | pooling_param { 223 | pool: MAX 224 | kernel_size: 3 225 | stride: 2 226 | } 227 | } 228 | layer { 229 | name: "fc6" 230 | type: "InnerProduct" 231 | bottom: "pool5" 232 | top: "fc6" 233 | param { 234 | lr_mult: 1 235 | decay_mult: 1 236 | } 237 | param { 238 | lr_mult: 2 239 | decay_mult: 0 240 | } 241 | inner_product_param { 242 | num_output: 4096 243 | weight_filler { 244 | type: "gaussian" 245 | std: 0.005 246 | } 247 | bias_filler { 248 | type: "constant" 249 | value: 0.1 250 | } 251 | } 252 | } 253 | layer { 254 | name: "relu6" 255 | type: "ReLU" 256 | bottom: "fc6" 257 | top: "fc6" 258 | } 259 | layer { 260 | name: "drop6" 261 | type: "Dropout" 262 | bottom: "fc6" 263 | top: "fc6" 264 | dropout_param { 265 | dropout_ratio: 0.5 266 | } 267 | } 268 | layer { 269 | name: "fc7" 270 | type: "InnerProduct" 271 | bottom: "fc6" 272 | top: "fc7" 273 | param { 274 | lr_mult: 1 275 | decay_mult: 1 276 | } 277 | param { 278 | lr_mult: 2 279 | decay_mult: 0 280 | } 281 | inner_product_param { 282 | num_output: 4096 283 | weight_filler { 284 | type: "gaussian" 285 | std: 0.005 286 | } 287 | bias_filler { 288 | type: "constant" 289 | value: 0.1 290 | } 291 | } 292 | } 293 | layer { 294 | name: "relu7" 295 | type: "ReLU" 296 | bottom: "fc7" 297 | top: "fc7" 298 | } 299 | layer { 300 | name: "drop7" 301 | type: "Dropout" 302 | bottom: "fc7" 303 | top: "fc7" 304 | 
dropout_param { 305 | dropout_ratio: 0.5 306 | } 307 | } 308 | layer { 309 | name: "fc8" 310 | type: "InnerProduct" 311 | bottom: "fc7" 312 | top: "fc8" 313 | param { 314 | lr_mult: 1 315 | decay_mult: 1 316 | } 317 | param { 318 | lr_mult: 2 319 | decay_mult: 0 320 | } 321 | inner_product_param { 322 | num_output: 1000 323 | weight_filler { 324 | type: "gaussian" 325 | std: 0.01 326 | } 327 | bias_filler { 328 | type: "constant" 329 | value: 0 330 | } 331 | } 332 | } 333 | layer { 334 | name: "loss" 335 | type: "SoftmaxWithLoss" 336 | bottom: "fc8" 337 | bottom: "label" 338 | top: "loss" 339 | } 340 | I1209 22:10:05.414804 63236 layer_factory.hpp:77] Creating layer data 341 | I1209 22:10:05.415355 63236 net.cpp:106] Creating Layer data 342 | I1209 22:10:05.415367 63236 net.cpp:411] data -> data 343 | I1209 22:10:05.415393 63236 net.cpp:411] data -> label 344 | I1209 22:10:05.417349 63238 db_lmdb.cpp:38] Opened lmdb ./fake_image_net.lmdb 345 | I1209 22:10:05.433789 63236 data_layer.cpp:41] output data size: 256,3,224,224 346 | I1209 22:10:05.806865 63236 net.cpp:150] Setting up data 347 | I1209 22:10:05.806946 63236 net.cpp:157] Top shape: 256 3 224 224 (38535168) 348 | I1209 22:10:05.806953 63236 net.cpp:157] Top shape: 256 (256) 349 | I1209 22:10:05.806957 63236 net.cpp:165] Memory required for data: 154141696 350 | I1209 22:10:05.806968 63236 layer_factory.hpp:77] Creating layer conv1 351 | I1209 22:10:05.806993 63236 net.cpp:106] Creating Layer conv1 352 | I1209 22:10:05.807001 63236 net.cpp:454] conv1 <- data 353 | I1209 22:10:05.807014 63236 net.cpp:411] conv1 -> conv1 354 | I1209 22:10:05.955500 63236 net.cpp:150] Setting up conv1 355 | I1209 22:10:05.955545 63236 net.cpp:157] Top shape: 256 96 54 54 (71663616) 356 | I1209 22:10:05.955550 63236 net.cpp:165] Memory required for data: 440796160 357 | I1209 22:10:05.955574 63236 layer_factory.hpp:77] Creating layer relu1 358 | I1209 22:10:05.955590 63236 net.cpp:106] Creating Layer relu1 359 | I1209 22:10:05.955596 63236 net.cpp:454] relu1 <- conv1 360 | I1209 22:10:05.955605 63236 net.cpp:397] relu1 -> conv1 (in-place) 361 | I1209 22:10:05.955847 63236 net.cpp:150] Setting up relu1 362 | I1209 22:10:05.955858 63236 net.cpp:157] Top shape: 256 96 54 54 (71663616) 363 | I1209 22:10:05.955862 63236 net.cpp:165] Memory required for data: 727450624 364 | I1209 22:10:05.955867 63236 layer_factory.hpp:77] Creating layer pool1 365 | I1209 22:10:05.955876 63236 net.cpp:106] Creating Layer pool1 366 | I1209 22:10:05.955881 63236 net.cpp:454] pool1 <- conv1 367 | I1209 22:10:05.955888 63236 net.cpp:411] pool1 -> pool1 368 | I1209 22:10:05.956173 63236 net.cpp:150] Setting up pool1 369 | I1209 22:10:05.956202 63236 net.cpp:157] Top shape: 256 96 27 27 (17915904) 370 | I1209 22:10:05.956207 63236 net.cpp:165] Memory required for data: 799114240 371 | I1209 22:10:05.956210 63236 layer_factory.hpp:77] Creating layer conv2 372 | I1209 22:10:05.956226 63236 net.cpp:106] Creating Layer conv2 373 | I1209 22:10:05.956233 63236 net.cpp:454] conv2 <- pool1 374 | I1209 22:10:05.956238 63236 net.cpp:411] conv2 -> conv2 375 | I1209 22:10:05.973860 63236 net.cpp:150] Setting up conv2 376 | I1209 22:10:05.973872 63236 net.cpp:157] Top shape: 256 256 27 27 (47775744) 377 | I1209 22:10:05.973876 63236 net.cpp:165] Memory required for data: 990217216 378 | I1209 22:10:05.973886 63236 layer_factory.hpp:77] Creating layer relu2 379 | I1209 22:10:05.973894 63236 net.cpp:106] Creating Layer relu2 380 | I1209 22:10:05.973898 63236 net.cpp:454] relu2 <- conv2 381 | 
I1209 22:10:05.973904 63236 net.cpp:397] relu2 -> conv2 (in-place) 382 | I1209 22:10:05.974156 63236 net.cpp:150] Setting up relu2 383 | I1209 22:10:05.974167 63236 net.cpp:157] Top shape: 256 256 27 27 (47775744) 384 | I1209 22:10:05.974171 63236 net.cpp:165] Memory required for data: 1181320192 385 | I1209 22:10:05.974175 63236 layer_factory.hpp:77] Creating layer pool2 386 | I1209 22:10:05.974190 63236 net.cpp:106] Creating Layer pool2 387 | I1209 22:10:05.974195 63236 net.cpp:454] pool2 <- conv2 388 | I1209 22:10:05.974200 63236 net.cpp:411] pool2 -> pool2 389 | I1209 22:10:05.974372 63236 net.cpp:150] Setting up pool2 390 | I1209 22:10:05.974403 63236 net.cpp:157] Top shape: 256 256 13 13 (11075584) 391 | I1209 22:10:05.974407 63236 net.cpp:165] Memory required for data: 1225622528 392 | I1209 22:10:05.974411 63236 layer_factory.hpp:77] Creating layer conv3 393 | I1209 22:10:05.974421 63236 net.cpp:106] Creating Layer conv3 394 | I1209 22:10:05.974424 63236 net.cpp:454] conv3 <- pool2 395 | I1209 22:10:05.974431 63236 net.cpp:411] conv3 -> conv3 396 | I1209 22:10:05.998834 63236 net.cpp:150] Setting up conv3 397 | I1209 22:10:05.998847 63236 net.cpp:157] Top shape: 256 384 13 13 (16613376) 398 | I1209 22:10:05.998850 63236 net.cpp:165] Memory required for data: 1292076032 399 | I1209 22:10:05.998859 63236 layer_factory.hpp:77] Creating layer relu3 400 | I1209 22:10:05.998868 63236 net.cpp:106] Creating Layer relu3 401 | I1209 22:10:05.998873 63236 net.cpp:454] relu3 <- conv3 402 | I1209 22:10:05.998878 63236 net.cpp:397] relu3 -> conv3 (in-place) 403 | I1209 22:10:05.999124 63236 net.cpp:150] Setting up relu3 404 | I1209 22:10:05.999135 63236 net.cpp:157] Top shape: 256 384 13 13 (16613376) 405 | I1209 22:10:05.999137 63236 net.cpp:165] Memory required for data: 1358529536 406 | I1209 22:10:05.999141 63236 layer_factory.hpp:77] Creating layer conv4 407 | I1209 22:10:05.999153 63236 net.cpp:106] Creating Layer conv4 408 | I1209 22:10:05.999157 63236 net.cpp:454] conv4 <- conv3 409 | I1209 22:10:05.999162 63236 net.cpp:411] conv4 -> conv4 410 | I1209 22:10:06.000876 63239 blocking_queue.cpp:50] Waiting for data 411 | I1209 22:10:06.034972 63236 net.cpp:150] Setting up conv4 412 | I1209 22:10:06.034986 63236 net.cpp:157] Top shape: 256 384 13 13 (16613376) 413 | I1209 22:10:06.034989 63236 net.cpp:165] Memory required for data: 1424983040 414 | I1209 22:10:06.034996 63236 layer_factory.hpp:77] Creating layer relu4 415 | I1209 22:10:06.035006 63236 net.cpp:106] Creating Layer relu4 416 | I1209 22:10:06.035009 63236 net.cpp:454] relu4 <- conv4 417 | I1209 22:10:06.035014 63236 net.cpp:397] relu4 -> conv4 (in-place) 418 | I1209 22:10:06.035151 63236 net.cpp:150] Setting up relu4 419 | I1209 22:10:06.035159 63236 net.cpp:157] Top shape: 256 384 13 13 (16613376) 420 | I1209 22:10:06.035163 63236 net.cpp:165] Memory required for data: 1491436544 421 | I1209 22:10:06.035167 63236 layer_factory.hpp:77] Creating layer conv5 422 | I1209 22:10:06.035181 63236 net.cpp:106] Creating Layer conv5 423 | I1209 22:10:06.035187 63236 net.cpp:454] conv5 <- conv4 424 | I1209 22:10:06.035192 63236 net.cpp:411] conv5 -> conv5 425 | I1209 22:10:06.059911 63236 net.cpp:150] Setting up conv5 426 | I1209 22:10:06.059923 63236 net.cpp:157] Top shape: 256 256 13 13 (11075584) 427 | I1209 22:10:06.059927 63236 net.cpp:165] Memory required for data: 1535738880 428 | I1209 22:10:06.059937 63236 layer_factory.hpp:77] Creating layer relu5 429 | I1209 22:10:06.059944 63236 net.cpp:106] Creating Layer relu5 430 | I1209 
22:10:06.059948 63236 net.cpp:454] relu5 <- conv5 431 | I1209 22:10:06.059954 63236 net.cpp:397] relu5 -> conv5 (in-place) 432 | I1209 22:10:06.060103 63236 net.cpp:150] Setting up relu5 433 | I1209 22:10:06.060113 63236 net.cpp:157] Top shape: 256 256 13 13 (11075584) 434 | I1209 22:10:06.060117 63236 net.cpp:165] Memory required for data: 1580041216 435 | I1209 22:10:06.060120 63236 layer_factory.hpp:77] Creating layer pool5 436 | I1209 22:10:06.060127 63236 net.cpp:106] Creating Layer pool5 437 | I1209 22:10:06.060130 63236 net.cpp:454] pool5 <- conv5 438 | I1209 22:10:06.060137 63236 net.cpp:411] pool5 -> pool5 439 | I1209 22:10:06.060431 63236 net.cpp:150] Setting up pool5 440 | I1209 22:10:06.060441 63236 net.cpp:157] Top shape: 256 256 6 6 (2359296) 441 | I1209 22:10:06.060444 63236 net.cpp:165] Memory required for data: 1589478400 442 | I1209 22:10:06.060448 63236 layer_factory.hpp:77] Creating layer fc6 443 | I1209 22:10:06.060457 63236 net.cpp:106] Creating Layer fc6 444 | I1209 22:10:06.060461 63236 net.cpp:454] fc6 <- pool5 445 | I1209 22:10:06.060469 63236 net.cpp:411] fc6 -> fc6 446 | I1209 22:10:07.117044 63236 net.cpp:150] Setting up fc6 447 | I1209 22:10:07.117090 63236 net.cpp:157] Top shape: 256 4096 (1048576) 448 | I1209 22:10:07.117095 63236 net.cpp:165] Memory required for data: 1593672704 449 | I1209 22:10:07.117107 63236 layer_factory.hpp:77] Creating layer relu6 450 | I1209 22:10:07.117120 63236 net.cpp:106] Creating Layer relu6 451 | I1209 22:10:07.117132 63236 net.cpp:454] relu6 <- fc6 452 | I1209 22:10:07.117180 63236 net.cpp:397] relu6 -> fc6 (in-place) 453 | I1209 22:10:07.117430 63236 net.cpp:150] Setting up relu6 454 | I1209 22:10:07.117440 63236 net.cpp:157] Top shape: 256 4096 (1048576) 455 | I1209 22:10:07.117444 63236 net.cpp:165] Memory required for data: 1597867008 456 | I1209 22:10:07.117449 63236 layer_factory.hpp:77] Creating layer drop6 457 | I1209 22:10:07.117480 63236 net.cpp:106] Creating Layer drop6 458 | I1209 22:10:07.117483 63236 net.cpp:454] drop6 <- fc6 459 | I1209 22:10:07.117491 63236 net.cpp:397] drop6 -> fc6 (in-place) 460 | I1209 22:10:07.117530 63236 net.cpp:150] Setting up drop6 461 | I1209 22:10:07.117537 63236 net.cpp:157] Top shape: 256 4096 (1048576) 462 | I1209 22:10:07.117542 63236 net.cpp:165] Memory required for data: 1602061312 463 | I1209 22:10:07.117545 63236 layer_factory.hpp:77] Creating layer fc7 464 | I1209 22:10:07.117558 63236 net.cpp:106] Creating Layer fc7 465 | I1209 22:10:07.117561 63236 net.cpp:454] fc7 <- fc6 466 | I1209 22:10:07.117568 63236 net.cpp:411] fc7 -> fc7 467 | I1209 22:10:07.591801 63236 net.cpp:150] Setting up fc7 468 | I1209 22:10:07.591850 63236 net.cpp:157] Top shape: 256 4096 (1048576) 469 | I1209 22:10:07.591856 63236 net.cpp:165] Memory required for data: 1606255616 470 | I1209 22:10:07.591868 63236 layer_factory.hpp:77] Creating layer relu7 471 | I1209 22:10:07.591886 63236 net.cpp:106] Creating Layer relu7 472 | I1209 22:10:07.591892 63236 net.cpp:454] relu7 <- fc7 473 | I1209 22:10:07.591900 63236 net.cpp:397] relu7 -> fc7 (in-place) 474 | I1209 22:10:07.592543 63236 net.cpp:150] Setting up relu7 475 | I1209 22:10:07.592555 63236 net.cpp:157] Top shape: 256 4096 (1048576) 476 | I1209 22:10:07.592558 63236 net.cpp:165] Memory required for data: 1610449920 477 | I1209 22:10:07.592563 63236 layer_factory.hpp:77] Creating layer drop7 478 | I1209 22:10:07.592572 63236 net.cpp:106] Creating Layer drop7 479 | I1209 22:10:07.592577 63236 net.cpp:454] drop7 <- fc7 480 | I1209 22:10:07.592586 63236 
net.cpp:397] drop7 -> fc7 (in-place) 481 | I1209 22:10:07.592610 63236 net.cpp:150] Setting up drop7 482 | I1209 22:10:07.592617 63236 net.cpp:157] Top shape: 256 4096 (1048576) 483 | I1209 22:10:07.592620 63236 net.cpp:165] Memory required for data: 1614644224 484 | I1209 22:10:07.592624 63236 layer_factory.hpp:77] Creating layer fc8 485 | I1209 22:10:07.592638 63236 net.cpp:106] Creating Layer fc8 486 | I1209 22:10:07.592641 63236 net.cpp:454] fc8 <- fc7 487 | I1209 22:10:07.592650 63236 net.cpp:411] fc8 -> fc8 488 | I1209 22:10:07.704557 63236 net.cpp:150] Setting up fc8 489 | I1209 22:10:07.704577 63236 net.cpp:157] Top shape: 256 1000 (256000) 490 | I1209 22:10:07.704581 63236 net.cpp:165] Memory required for data: 1615668224 491 | I1209 22:10:07.704589 63236 layer_factory.hpp:77] Creating layer loss 492 | I1209 22:10:07.704597 63236 net.cpp:106] Creating Layer loss 493 | I1209 22:10:07.704602 63236 net.cpp:454] loss <- fc8 494 | I1209 22:10:07.704607 63236 net.cpp:454] loss <- label 495 | I1209 22:10:07.704617 63236 net.cpp:411] loss -> loss 496 | I1209 22:10:07.704629 63236 layer_factory.hpp:77] Creating layer loss 497 | I1209 22:10:07.705862 63236 net.cpp:150] Setting up loss 498 | I1209 22:10:07.705873 63236 net.cpp:157] Top shape: (1) 499 | I1209 22:10:07.705876 63236 net.cpp:160] with loss weight 1 500 | I1209 22:10:07.705900 63236 net.cpp:165] Memory required for data: 1615668228 501 | I1209 22:10:07.705905 63236 net.cpp:226] loss needs backward computation. 502 | I1209 22:10:07.705909 63236 net.cpp:226] fc8 needs backward computation. 503 | I1209 22:10:07.705914 63236 net.cpp:226] drop7 needs backward computation. 504 | I1209 22:10:07.705917 63236 net.cpp:226] relu7 needs backward computation. 505 | I1209 22:10:07.705921 63236 net.cpp:226] fc7 needs backward computation. 506 | I1209 22:10:07.705925 63236 net.cpp:226] drop6 needs backward computation. 507 | I1209 22:10:07.705930 63236 net.cpp:226] relu6 needs backward computation. 508 | I1209 22:10:07.705934 63236 net.cpp:226] fc6 needs backward computation. 509 | I1209 22:10:07.705938 63236 net.cpp:226] pool5 needs backward computation. 510 | I1209 22:10:07.705942 63236 net.cpp:226] relu5 needs backward computation. 511 | I1209 22:10:07.705946 63236 net.cpp:226] conv5 needs backward computation. 512 | I1209 22:10:07.705951 63236 net.cpp:226] relu4 needs backward computation. 513 | I1209 22:10:07.705962 63236 net.cpp:226] conv4 needs backward computation. 514 | I1209 22:10:07.706001 63236 net.cpp:226] relu3 needs backward computation. 515 | I1209 22:10:07.706004 63236 net.cpp:226] conv3 needs backward computation. 516 | I1209 22:10:07.706009 63236 net.cpp:226] pool2 needs backward computation. 517 | I1209 22:10:07.706017 63236 net.cpp:226] relu2 needs backward computation. 518 | I1209 22:10:07.706020 63236 net.cpp:226] conv2 needs backward computation. 519 | I1209 22:10:07.706025 63236 net.cpp:226] pool1 needs backward computation. 520 | I1209 22:10:07.706028 63236 net.cpp:226] relu1 needs backward computation. 521 | I1209 22:10:07.706032 63236 net.cpp:226] conv1 needs backward computation. 522 | I1209 22:10:07.706037 63236 net.cpp:228] data does not need backward computation. 523 | I1209 22:10:07.706042 63236 net.cpp:270] This network produces output loss 524 | I1209 22:10:07.706055 63236 net.cpp:283] Network initialization done. 525 | I1209 22:10:07.706135 63236 solver.cpp:60] Solver scaffolding done. 
526 | I1209 22:10:07.706593 63236 caffe.cpp:212] Starting Optimization 527 | I1209 22:10:07.706603 63236 solver.cpp:288] Solving AlexNet 528 | I1209 22:10:07.706606 63236 solver.cpp:289] Learning Rate Policy: fixed 529 | I1209 22:11:05.112869 63236 solver.cpp:459] Snapshotting to binary proto file _iter_50.caffemodel 530 | I1209 22:11:08.473160 63236 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_50.solverstate 531 | I1209 22:11:10.449729 63236 solver.cpp:326] Optimization Done. 532 | I1209 22:11:10.449767 63236 caffe.cpp:215] Optimization Done. 533 | -------------------------------------------------------------------------------- /caffe/alexnet_1GPU.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe train -solver=./alexnet_solver.prototxt -gpu=0 >alexnet_1GPU.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/alexnet_4GPUs.log: -------------------------------------------------------------------------------- 1 | I1210 17:39:15.308357 1423 caffe.cpp:184] Using GPUs 0, 1, 2, 3 2 | I1210 17:39:24.950314 1423 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.01 4 | max_iter: 50 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./alexnet_4GPUs.prototxt" 9 | I1210 17:39:24.950376 1423 solver.cpp:91] Creating training net from net file: ./alexnet_4GPUs.prototxt 10 | I1210 17:39:24.951216 1423 net.cpp:322] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy 11 | I1210 17:39:24.951390 1423 net.cpp:49] Initializing net from parameters: 12 | name: "AlexNet" 13 | state { 14 | phase: TRAIN 15 | } 16 | layer { 17 | name: "data" 18 | type: "Data" 19 | top: "data" 20 | top: "label" 21 | include { 22 | phase: TRAIN 23 | } 24 | data_param { 25 | source: "./fake_image_net.lmdb" 26 | batch_size: 64 27 | backend: LMDB 28 | } 29 | } 30 | layer { 31 | name: "conv1" 32 | type: "Convolution" 33 | bottom: "data" 34 | top: "conv1" 35 | param { 36 | lr_mult: 1 37 | decay_mult: 1 38 | } 39 | param { 40 | lr_mult: 2 41 | decay_mult: 0 42 | } 43 | convolution_param { 44 | num_output: 96 45 | kernel_size: 11 46 | stride: 4 47 | weight_filler { 48 | type: "gaussian" 49 | std: 0.01 50 | } 51 | bias_filler { 52 | type: "constant" 53 | value: 0 54 | } 55 | } 56 | } 57 | layer { 58 | name: "relu1" 59 | type: "ReLU" 60 | bottom: "conv1" 61 | top: "conv1" 62 | } 63 | layer { 64 | name: "pool1" 65 | type: "Pooling" 66 | bottom: "conv1" 67 | top: "pool1" 68 | pooling_param { 69 | pool: MAX 70 | kernel_size: 3 71 | stride: 2 72 | } 73 | } 74 | layer { 75 | name: "conv2" 76 | type: "Convolution" 77 | bottom: "pool1" 78 | top: "conv2" 79 | param { 80 | lr_mult: 1 81 | decay_mult: 1 82 | } 83 | param { 84 | lr_mult: 2 85 | decay_mult: 0 86 | } 87 | convolution_param { 88 | num_output: 256 89 | pad: 2 90 | kernel_size: 5 91 | weight_filler { 92 | type: "gaussian" 93 | std: 0.01 94 | } 95 | bias_filler { 96 | type: "constant" 97 | value: 0.1 98 | } 99 | } 100 | } 101 | layer { 102 | name: "relu2" 103 | type: "ReLU" 104 | bottom: "conv2" 105 | top: "conv2" 106 | } 107 | layer { 108 | name: "pool2" 109 | type: "Pooling" 110 | bottom: "conv2" 111 | top: "pool2" 112 | pooling_param { 113 | pool: MAX 114 | kernel_size: 3 115 | stride: 2 116 | } 117 | } 118 | layer { 119 | name: "conv3" 120 | type: "Convolution" 121 | bottom: "pool2" 122 | top: "conv3" 123 | param { 124 | lr_mult: 1 125 | decay_mult: 1 126 | } 127 | param 
{ 128 | lr_mult: 2 129 | decay_mult: 0 130 | } 131 | convolution_param { 132 | num_output: 384 133 | pad: 1 134 | kernel_size: 3 135 | weight_filler { 136 | type: "gaussian" 137 | std: 0.01 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | layer { 146 | name: "relu3" 147 | type: "ReLU" 148 | bottom: "conv3" 149 | top: "conv3" 150 | } 151 | layer { 152 | name: "conv4" 153 | type: "Convolution" 154 | bottom: "conv3" 155 | top: "conv4" 156 | param { 157 | lr_mult: 1 158 | decay_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | decay_mult: 0 163 | } 164 | convolution_param { 165 | num_output: 384 166 | pad: 1 167 | kernel_size: 3 168 | weight_filler { 169 | type: "gaussian" 170 | std: 0.01 171 | } 172 | bias_filler { 173 | type: "constant" 174 | value: 0.1 175 | } 176 | } 177 | } 178 | layer { 179 | name: "relu4" 180 | type: "ReLU" 181 | bottom: "conv4" 182 | top: "conv4" 183 | } 184 | layer { 185 | name: "conv5" 186 | type: "Convolution" 187 | bottom: "conv4" 188 | top: "conv5" 189 | param { 190 | lr_mult: 1 191 | decay_mult: 1 192 | } 193 | param { 194 | lr_mult: 2 195 | decay_mult: 0 196 | } 197 | convolution_param { 198 | num_output: 256 199 | pad: 1 200 | kernel_size: 3 201 | weight_filler { 202 | type: "gaussian" 203 | std: 0.01 204 | } 205 | bias_filler { 206 | type: "constant" 207 | value: 0.1 208 | } 209 | } 210 | } 211 | layer { 212 | name: "relu5" 213 | type: "ReLU" 214 | bottom: "conv5" 215 | top: "conv5" 216 | } 217 | layer { 218 | name: "pool5" 219 | type: "Pooling" 220 | bottom: "conv5" 221 | top: "pool5" 222 | pooling_param { 223 | pool: MAX 224 | kernel_size: 3 225 | stride: 2 226 | } 227 | } 228 | layer { 229 | name: "fc6" 230 | type: "InnerProduct" 231 | bottom: "pool5" 232 | top: "fc6" 233 | param { 234 | lr_mult: 1 235 | decay_mult: 1 236 | } 237 | param { 238 | lr_mult: 2 239 | decay_mult: 0 240 | } 241 | inner_product_param { 242 | num_output: 4096 243 | weight_filler { 244 | type: "gaussian" 245 | std: 0.005 246 | } 247 | bias_filler { 248 | type: "constant" 249 | value: 0.1 250 | } 251 | } 252 | } 253 | layer { 254 | name: "relu6" 255 | type: "ReLU" 256 | bottom: "fc6" 257 | top: "fc6" 258 | } 259 | layer { 260 | name: "drop6" 261 | type: "Dropout" 262 | bottom: "fc6" 263 | top: "fc6" 264 | dropout_param { 265 | dropout_ratio: 0.5 266 | } 267 | } 268 | layer { 269 | name: "fc7" 270 | type: "InnerProduct" 271 | bottom: "fc6" 272 | top: "fc7" 273 | param { 274 | lr_mult: 1 275 | decay_mult: 1 276 | } 277 | param { 278 | lr_mult: 2 279 | decay_mult: 0 280 | } 281 | inner_product_param { 282 | num_output: 4096 283 | weight_filler { 284 | type: "gaussian" 285 | std: 0.005 286 | } 287 | bias_filler { 288 | type: "constant" 289 | value: 0.1 290 | } 291 | } 292 | } 293 | layer { 294 | name: "relu7" 295 | type: "ReLU" 296 | bottom: "fc7" 297 | top: "fc7" 298 | } 299 | layer { 300 | name: "drop7" 301 | type: "Dropout" 302 | bottom: "fc7" 303 | top: "fc7" 304 | dropout_param { 305 | dropout_ratio: 0.5 306 | } 307 | } 308 | layer { 309 | name: "fc8" 310 | type: "InnerProduct" 311 | bottom: "fc7" 312 | top: "fc8" 313 | param { 314 | lr_mult: 1 315 | decay_mult: 1 316 | } 317 | param { 318 | lr_mult: 2 319 | decay_mult: 0 320 | } 321 | inner_product_param { 322 | num_output: 1000 323 | weight_filler { 324 | type: "gaussian" 325 | std: 0.01 326 | } 327 | bias_filler { 328 | type: "constant" 329 | value: 0 330 | } 331 | } 332 | } 333 | layer { 334 | name: "loss" 335 | type: "SoftmaxWithLoss" 336 | bottom: "fc8" 337 | bottom: "label" 338 
| top: "loss" 339 | } 340 | I1210 17:39:24.951545 1423 layer_factory.hpp:77] Creating layer data 341 | I1210 17:39:24.952008 1423 net.cpp:106] Creating Layer data 342 | I1210 17:39:24.952018 1423 net.cpp:411] data -> data 343 | I1210 17:39:24.952041 1423 net.cpp:411] data -> label 344 | I1210 17:39:24.954187 1425 db_lmdb.cpp:38] Opened lmdb ./fake_image_net.lmdb 345 | I1210 17:39:24.967650 1423 data_layer.cpp:41] output data size: 64,3,224,224 346 | I1210 17:39:25.048879 1423 net.cpp:150] Setting up data 347 | I1210 17:39:25.048928 1423 net.cpp:157] Top shape: 64 3 224 224 (9633792) 348 | I1210 17:39:25.048936 1423 net.cpp:157] Top shape: 64 (64) 349 | I1210 17:39:25.048940 1423 net.cpp:165] Memory required for data: 38535424 350 | I1210 17:39:25.048950 1423 layer_factory.hpp:77] Creating layer conv1 351 | I1210 17:39:25.048975 1423 net.cpp:106] Creating Layer conv1 352 | I1210 17:39:25.048982 1423 net.cpp:454] conv1 <- data 353 | I1210 17:39:25.048998 1423 net.cpp:411] conv1 -> conv1 354 | I1210 17:39:25.054342 1426 blocking_queue.cpp:50] Waiting for data 355 | I1210 17:39:25.152736 1423 net.cpp:150] Setting up conv1 356 | I1210 17:39:25.152753 1423 net.cpp:157] Top shape: 64 96 54 54 (17915904) 357 | I1210 17:39:25.152758 1423 net.cpp:165] Memory required for data: 110199040 358 | I1210 17:39:25.152773 1423 layer_factory.hpp:77] Creating layer relu1 359 | I1210 17:39:25.152782 1423 net.cpp:106] Creating Layer relu1 360 | I1210 17:39:25.152787 1423 net.cpp:454] relu1 <- conv1 361 | I1210 17:39:25.152793 1423 net.cpp:397] relu1 -> conv1 (in-place) 362 | I1210 17:39:25.153028 1423 net.cpp:150] Setting up relu1 363 | I1210 17:39:25.153039 1423 net.cpp:157] Top shape: 64 96 54 54 (17915904) 364 | I1210 17:39:25.153043 1423 net.cpp:165] Memory required for data: 181862656 365 | I1210 17:39:25.153048 1423 layer_factory.hpp:77] Creating layer pool1 366 | I1210 17:39:25.153056 1423 net.cpp:106] Creating Layer pool1 367 | I1210 17:39:25.153060 1423 net.cpp:454] pool1 <- conv1 368 | I1210 17:39:25.153066 1423 net.cpp:411] pool1 -> pool1 369 | I1210 17:39:25.153345 1423 net.cpp:150] Setting up pool1 370 | I1210 17:39:25.153357 1423 net.cpp:157] Top shape: 64 96 27 27 (4478976) 371 | I1210 17:39:25.153360 1423 net.cpp:165] Memory required for data: 199778560 372 | I1210 17:39:25.153365 1423 layer_factory.hpp:77] Creating layer conv2 373 | I1210 17:39:25.153383 1423 net.cpp:106] Creating Layer conv2 374 | I1210 17:39:25.153388 1423 net.cpp:454] conv2 <- pool1 375 | I1210 17:39:25.153393 1423 net.cpp:411] conv2 -> conv2 376 | I1210 17:39:25.171576 1423 net.cpp:150] Setting up conv2 377 | I1210 17:39:25.171588 1423 net.cpp:157] Top shape: 64 256 27 27 (11943936) 378 | I1210 17:39:25.171592 1423 net.cpp:165] Memory required for data: 247554304 379 | I1210 17:39:25.171602 1423 layer_factory.hpp:77] Creating layer relu2 380 | I1210 17:39:25.171609 1423 net.cpp:106] Creating Layer relu2 381 | I1210 17:39:25.171613 1423 net.cpp:454] relu2 <- conv2 382 | I1210 17:39:25.171622 1423 net.cpp:397] relu2 -> conv2 (in-place) 383 | I1210 17:39:25.171888 1423 net.cpp:150] Setting up relu2 384 | I1210 17:39:25.171898 1423 net.cpp:157] Top shape: 64 256 27 27 (11943936) 385 | I1210 17:39:25.171902 1423 net.cpp:165] Memory required for data: 295330048 386 | I1210 17:39:25.171906 1423 layer_factory.hpp:77] Creating layer pool2 387 | I1210 17:39:25.171914 1423 net.cpp:106] Creating Layer pool2 388 | I1210 17:39:25.171918 1423 net.cpp:454] pool2 <- conv2 389 | I1210 17:39:25.171933 1423 net.cpp:411] pool2 -> pool2 390 | 
I1210 17:39:25.172139 1423 net.cpp:150] Setting up pool2 391 | I1210 17:39:25.172149 1423 net.cpp:157] Top shape: 64 256 13 13 (2768896) 392 | I1210 17:39:25.172153 1423 net.cpp:165] Memory required for data: 306405632 393 | I1210 17:39:25.172158 1423 layer_factory.hpp:77] Creating layer conv3 394 | I1210 17:39:25.172168 1423 net.cpp:106] Creating Layer conv3 395 | I1210 17:39:25.172173 1423 net.cpp:454] conv3 <- pool2 396 | I1210 17:39:25.172186 1423 net.cpp:411] conv3 -> conv3 397 | I1210 17:39:25.197000 1423 net.cpp:150] Setting up conv3 398 | I1210 17:39:25.197011 1423 net.cpp:157] Top shape: 64 384 13 13 (4153344) 399 | I1210 17:39:25.197016 1423 net.cpp:165] Memory required for data: 323019008 400 | I1210 17:39:25.197026 1423 layer_factory.hpp:77] Creating layer relu3 401 | I1210 17:39:25.197036 1423 net.cpp:106] Creating Layer relu3 402 | I1210 17:39:25.197041 1423 net.cpp:454] relu3 <- conv3 403 | I1210 17:39:25.197046 1423 net.cpp:397] relu3 -> conv3 (in-place) 404 | I1210 17:39:25.197306 1423 net.cpp:150] Setting up relu3 405 | I1210 17:39:25.197317 1423 net.cpp:157] Top shape: 64 384 13 13 (4153344) 406 | I1210 17:39:25.197321 1423 net.cpp:165] Memory required for data: 339632384 407 | I1210 17:39:25.197325 1423 layer_factory.hpp:77] Creating layer conv4 408 | I1210 17:39:25.197337 1423 net.cpp:106] Creating Layer conv4 409 | I1210 17:39:25.197341 1423 net.cpp:454] conv4 <- conv3 410 | I1210 17:39:25.197350 1423 net.cpp:411] conv4 -> conv4 411 | I1210 17:39:25.235277 1423 net.cpp:150] Setting up conv4 412 | I1210 17:39:25.235290 1423 net.cpp:157] Top shape: 64 384 13 13 (4153344) 413 | I1210 17:39:25.235293 1423 net.cpp:165] Memory required for data: 356245760 414 | I1210 17:39:25.235301 1423 layer_factory.hpp:77] Creating layer relu4 415 | I1210 17:39:25.235311 1423 net.cpp:106] Creating Layer relu4 416 | I1210 17:39:25.235316 1423 net.cpp:454] relu4 <- conv4 417 | I1210 17:39:25.235321 1423 net.cpp:397] relu4 -> conv4 (in-place) 418 | I1210 17:39:25.235462 1423 net.cpp:150] Setting up relu4 419 | I1210 17:39:25.235474 1423 net.cpp:157] Top shape: 64 384 13 13 (4153344) 420 | I1210 17:39:25.235478 1423 net.cpp:165] Memory required for data: 372859136 421 | I1210 17:39:25.235482 1423 layer_factory.hpp:77] Creating layer conv5 422 | I1210 17:39:25.235491 1423 net.cpp:106] Creating Layer conv5 423 | I1210 17:39:25.235496 1423 net.cpp:454] conv5 <- conv4 424 | I1210 17:39:25.235503 1423 net.cpp:411] conv5 -> conv5 425 | I1210 17:39:25.260288 1423 net.cpp:150] Setting up conv5 426 | I1210 17:39:25.260300 1423 net.cpp:157] Top shape: 64 256 13 13 (2768896) 427 | I1210 17:39:25.260304 1423 net.cpp:165] Memory required for data: 383934720 428 | I1210 17:39:25.260316 1423 layer_factory.hpp:77] Creating layer relu5 429 | I1210 17:39:25.260324 1423 net.cpp:106] Creating Layer relu5 430 | I1210 17:39:25.260329 1423 net.cpp:454] relu5 <- conv5 431 | I1210 17:39:25.260334 1423 net.cpp:397] relu5 -> conv5 (in-place) 432 | I1210 17:39:25.260481 1423 net.cpp:150] Setting up relu5 433 | I1210 17:39:25.260490 1423 net.cpp:157] Top shape: 64 256 13 13 (2768896) 434 | I1210 17:39:25.260494 1423 net.cpp:165] Memory required for data: 395010304 435 | I1210 17:39:25.260499 1423 layer_factory.hpp:77] Creating layer pool5 436 | I1210 17:39:25.260505 1423 net.cpp:106] Creating Layer pool5 437 | I1210 17:39:25.260509 1423 net.cpp:454] pool5 <- conv5 438 | I1210 17:39:25.260517 1423 net.cpp:411] pool5 -> pool5 439 | I1210 17:39:25.260802 1423 net.cpp:150] Setting up pool5 440 | I1210 17:39:25.260812 1423 
net.cpp:157] Top shape: 64 256 6 6 (589824) 441 | I1210 17:39:25.260817 1423 net.cpp:165] Memory required for data: 397369600 442 | I1210 17:39:25.260820 1423 layer_factory.hpp:77] Creating layer fc6 443 | I1210 17:39:25.260833 1423 net.cpp:106] Creating Layer fc6 444 | I1210 17:39:25.260838 1423 net.cpp:454] fc6 <- pool5 445 | I1210 17:39:25.260846 1423 net.cpp:411] fc6 -> fc6 446 | I1210 17:39:26.329960 1423 net.cpp:150] Setting up fc6 447 | I1210 17:39:26.330013 1423 net.cpp:157] Top shape: 64 4096 (262144) 448 | I1210 17:39:26.330018 1423 net.cpp:165] Memory required for data: 398418176 449 | I1210 17:39:26.330035 1423 layer_factory.hpp:77] Creating layer relu6 450 | I1210 17:39:26.330051 1423 net.cpp:106] Creating Layer relu6 451 | I1210 17:39:26.330067 1423 net.cpp:454] relu6 <- fc6 452 | I1210 17:39:26.330078 1423 net.cpp:397] relu6 -> fc6 (in-place) 453 | I1210 17:39:26.330458 1423 net.cpp:150] Setting up relu6 454 | I1210 17:39:26.330468 1423 net.cpp:157] Top shape: 64 4096 (262144) 455 | I1210 17:39:26.330471 1423 net.cpp:165] Memory required for data: 399466752 456 | I1210 17:39:26.330476 1423 layer_factory.hpp:77] Creating layer drop6 457 | I1210 17:39:26.330504 1423 net.cpp:106] Creating Layer drop6 458 | I1210 17:39:26.330509 1423 net.cpp:454] drop6 <- fc6 459 | I1210 17:39:26.330514 1423 net.cpp:397] drop6 -> fc6 (in-place) 460 | I1210 17:39:26.330551 1423 net.cpp:150] Setting up drop6 461 | I1210 17:39:26.330559 1423 net.cpp:157] Top shape: 64 4096 (262144) 462 | I1210 17:39:26.330561 1423 net.cpp:165] Memory required for data: 400515328 463 | I1210 17:39:26.330565 1423 layer_factory.hpp:77] Creating layer fc7 464 | I1210 17:39:26.330585 1423 net.cpp:106] Creating Layer fc7 465 | I1210 17:39:26.330590 1423 net.cpp:454] fc7 <- fc6 466 | I1210 17:39:26.330596 1423 net.cpp:411] fc7 -> fc7 467 | I1210 17:39:26.805878 1423 net.cpp:150] Setting up fc7 468 | I1210 17:39:26.805927 1423 net.cpp:157] Top shape: 64 4096 (262144) 469 | I1210 17:39:26.805932 1423 net.cpp:165] Memory required for data: 401563904 470 | I1210 17:39:26.805943 1423 layer_factory.hpp:77] Creating layer relu7 471 | I1210 17:39:26.805958 1423 net.cpp:106] Creating Layer relu7 472 | I1210 17:39:26.805963 1423 net.cpp:454] relu7 <- fc7 473 | I1210 17:39:26.805973 1423 net.cpp:397] relu7 -> fc7 (in-place) 474 | I1210 17:39:26.806596 1423 net.cpp:150] Setting up relu7 475 | I1210 17:39:26.806605 1423 net.cpp:157] Top shape: 64 4096 (262144) 476 | I1210 17:39:26.806609 1423 net.cpp:165] Memory required for data: 402612480 477 | I1210 17:39:26.806614 1423 layer_factory.hpp:77] Creating layer drop7 478 | I1210 17:39:26.806622 1423 net.cpp:106] Creating Layer drop7 479 | I1210 17:39:26.806627 1423 net.cpp:454] drop7 <- fc7 480 | I1210 17:39:26.806637 1423 net.cpp:397] drop7 -> fc7 (in-place) 481 | I1210 17:39:26.806674 1423 net.cpp:150] Setting up drop7 482 | I1210 17:39:26.806684 1423 net.cpp:157] Top shape: 64 4096 (262144) 483 | I1210 17:39:26.806689 1423 net.cpp:165] Memory required for data: 403661056 484 | I1210 17:39:26.806692 1423 layer_factory.hpp:77] Creating layer fc8 485 | I1210 17:39:26.806704 1423 net.cpp:106] Creating Layer fc8 486 | I1210 17:39:26.806709 1423 net.cpp:454] fc8 <- fc7 487 | I1210 17:39:26.806717 1423 net.cpp:411] fc8 -> fc8 488 | I1210 17:39:26.918463 1423 net.cpp:150] Setting up fc8 489 | I1210 17:39:26.918480 1423 net.cpp:157] Top shape: 64 1000 (64000) 490 | I1210 17:39:26.918484 1423 net.cpp:165] Memory required for data: 403917056 491 | I1210 17:39:26.918493 1423 
layer_factory.hpp:77] Creating layer loss 492 | I1210 17:39:26.918500 1423 net.cpp:106] Creating Layer loss 493 | I1210 17:39:26.918504 1423 net.cpp:454] loss <- fc8 494 | I1210 17:39:26.918510 1423 net.cpp:454] loss <- label 495 | I1210 17:39:26.918519 1423 net.cpp:411] loss -> loss 496 | I1210 17:39:26.918534 1423 layer_factory.hpp:77] Creating layer loss 497 | I1210 17:39:26.919003 1423 net.cpp:150] Setting up loss 498 | I1210 17:39:26.919014 1423 net.cpp:157] Top shape: (1) 499 | I1210 17:39:26.919018 1423 net.cpp:160] with loss weight 1 500 | I1210 17:39:26.919049 1423 net.cpp:165] Memory required for data: 403917060 501 | I1210 17:39:26.919054 1423 net.cpp:226] loss needs backward computation. 502 | I1210 17:39:26.919057 1423 net.cpp:226] fc8 needs backward computation. 503 | I1210 17:39:26.919061 1423 net.cpp:226] drop7 needs backward computation. 504 | I1210 17:39:26.919064 1423 net.cpp:226] relu7 needs backward computation. 505 | I1210 17:39:26.919069 1423 net.cpp:226] fc7 needs backward computation. 506 | I1210 17:39:26.919072 1423 net.cpp:226] drop6 needs backward computation. 507 | I1210 17:39:26.919076 1423 net.cpp:226] relu6 needs backward computation. 508 | I1210 17:39:26.919080 1423 net.cpp:226] fc6 needs backward computation. 509 | I1210 17:39:26.919085 1423 net.cpp:226] pool5 needs backward computation. 510 | I1210 17:39:26.919088 1423 net.cpp:226] relu5 needs backward computation. 511 | I1210 17:39:26.919092 1423 net.cpp:226] conv5 needs backward computation. 512 | I1210 17:39:26.919096 1423 net.cpp:226] relu4 needs backward computation. 513 | I1210 17:39:26.919100 1423 net.cpp:226] conv4 needs backward computation. 514 | I1210 17:39:26.919112 1423 net.cpp:226] relu3 needs backward computation. 515 | I1210 17:39:26.919149 1423 net.cpp:226] conv3 needs backward computation. 516 | I1210 17:39:26.919154 1423 net.cpp:226] pool2 needs backward computation. 517 | I1210 17:39:26.919162 1423 net.cpp:226] relu2 needs backward computation. 518 | I1210 17:39:26.919167 1423 net.cpp:226] conv2 needs backward computation. 519 | I1210 17:39:26.919172 1423 net.cpp:226] pool1 needs backward computation. 520 | I1210 17:39:26.919179 1423 net.cpp:226] relu1 needs backward computation. 521 | I1210 17:39:26.919184 1423 net.cpp:226] conv1 needs backward computation. 522 | I1210 17:39:26.919189 1423 net.cpp:228] data does not need backward computation. 523 | I1210 17:39:26.919193 1423 net.cpp:270] This network produces output loss 524 | I1210 17:39:26.919209 1423 net.cpp:283] Network initialization done. 525 | I1210 17:39:26.919297 1423 solver.cpp:60] Solver scaffolding done. 526 | I1210 17:39:26.956542 1423 parallel.cpp:391] GPUs pairs 0:1, 2:3, 0:2 527 | I1210 17:39:27.190856 1423 data_layer.cpp:41] output data size: 64,3,224,224 528 | I1210 17:39:29.479032 1423 data_layer.cpp:41] output data size: 64,3,224,224 529 | I1210 17:39:31.457525 1423 parallel.cpp:234] GPU 2 does not have p2p access to GPU 0 530 | I1210 17:39:31.698340 1423 data_layer.cpp:41] output data size: 64,3,224,224 531 | I1210 17:39:33.793845 1423 parallel.cpp:419] Starting Optimization 532 | I1210 17:39:33.794456 1423 solver.cpp:288] Solving AlexNet 533 | I1210 17:39:33.794497 1423 solver.cpp:289] Learning Rate Policy: fixed 534 | I1210 17:40:02.691781 1423 solver.cpp:459] Snapshotting to binary proto file _iter_50.caffemodel 535 | I1210 17:40:05.501304 1423 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_50.solverstate 536 | I1210 17:40:07.500490 1423 solver.cpp:326] Optimization Done. 
537 | I1210 17:40:07.728231 1423 caffe.cpp:215] Optimization Done. 538 | -------------------------------------------------------------------------------- /caffe/alexnet_4GPUs.prototxt: -------------------------------------------------------------------------------- 1 | name: "AlexNet" 2 | layer { 3 | name: "data" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | data_param { 11 | source: "./fake_image_net.lmdb" 12 | batch_size: 64 13 | backend: LMDB 14 | } 15 | } 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | param { 22 | lr_mult: 1 23 | decay_mult: 1 24 | } 25 | param { 26 | lr_mult: 2 27 | decay_mult: 0 28 | } 29 | convolution_param { 30 | num_output: 96 31 | kernel_size: 11 32 | stride: 4 33 | weight_filler { 34 | type: "gaussian" 35 | std: 0.01 36 | } 37 | bias_filler { 38 | type: "constant" 39 | value: 0 40 | } 41 | } 42 | } 43 | layer { 44 | name: "relu1" 45 | type: "ReLU" 46 | bottom: "conv1" 47 | top: "conv1" 48 | } 49 | layer { 50 | name: "pool1" 51 | type: "Pooling" 52 | bottom: "conv1" 53 | top: "pool1" 54 | pooling_param { 55 | pool: MAX 56 | kernel_size: 3 57 | stride: 2 58 | } 59 | } 60 | layer { 61 | name: "conv2" 62 | type: "Convolution" 63 | bottom: "pool1" 64 | top: "conv2" 65 | param { 66 | lr_mult: 1 67 | decay_mult: 1 68 | } 69 | param { 70 | lr_mult: 2 71 | decay_mult: 0 72 | } 73 | convolution_param { 74 | num_output: 256 75 | pad: 2 76 | kernel_size: 5 77 | weight_filler { 78 | type: "gaussian" 79 | std: 0.01 80 | } 81 | bias_filler { 82 | type: "constant" 83 | value: 0.1 84 | } 85 | } 86 | } 87 | layer { 88 | name: "relu2" 89 | type: "ReLU" 90 | bottom: "conv2" 91 | top: "conv2" 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "conv2" 97 | top: "pool2" 98 | pooling_param { 99 | pool: MAX 100 | kernel_size: 3 101 | stride: 2 102 | } 103 | } 104 | layer { 105 | name: "conv3" 106 | type: "Convolution" 107 | bottom: "pool2" 108 | top: "conv3" 109 | param { 110 | lr_mult: 1 111 | decay_mult: 1 112 | } 113 | param { 114 | lr_mult: 2 115 | decay_mult: 0 116 | } 117 | convolution_param { 118 | num_output: 384 119 | pad: 1 120 | kernel_size: 3 121 | weight_filler { 122 | type: "gaussian" 123 | std: 0.01 124 | } 125 | bias_filler { 126 | type: "constant" 127 | value: 0 128 | } 129 | } 130 | } 131 | layer { 132 | name: "relu3" 133 | type: "ReLU" 134 | bottom: "conv3" 135 | top: "conv3" 136 | } 137 | layer { 138 | name: "conv4" 139 | type: "Convolution" 140 | bottom: "conv3" 141 | top: "conv4" 142 | param { 143 | lr_mult: 1 144 | decay_mult: 1 145 | } 146 | param { 147 | lr_mult: 2 148 | decay_mult: 0 149 | } 150 | convolution_param { 151 | num_output: 384 152 | pad: 1 153 | kernel_size: 3 154 | weight_filler { 155 | type: "gaussian" 156 | std: 0.01 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0.1 161 | } 162 | } 163 | } 164 | layer { 165 | name: "relu4" 166 | type: "ReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | layer { 171 | name: "conv5" 172 | type: "Convolution" 173 | bottom: "conv4" 174 | top: "conv5" 175 | param { 176 | lr_mult: 1 177 | decay_mult: 1 178 | } 179 | param { 180 | lr_mult: 2 181 | decay_mult: 0 182 | } 183 | convolution_param { 184 | num_output: 256 185 | pad: 1 186 | kernel_size: 3 187 | weight_filler { 188 | type: "gaussian" 189 | std: 0.01 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0.1 194 | } 195 | } 196 | } 197 | layer { 198 | name: "relu5" 199 | type: "ReLU" 200 | bottom: "conv5" 
201 | top: "conv5" 202 | } 203 | layer { 204 | name: "pool5" 205 | type: "Pooling" 206 | bottom: "conv5" 207 | top: "pool5" 208 | pooling_param { 209 | pool: MAX 210 | kernel_size: 3 211 | stride: 2 212 | } 213 | } 214 | layer { 215 | name: "fc6" 216 | type: "InnerProduct" 217 | bottom: "pool5" 218 | top: "fc6" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 0 226 | } 227 | inner_product_param { 228 | num_output: 4096 229 | weight_filler { 230 | type: "gaussian" 231 | std: 0.005 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0.1 236 | } 237 | } 238 | } 239 | layer { 240 | name: "relu6" 241 | type: "ReLU" 242 | bottom: "fc6" 243 | top: "fc6" 244 | } 245 | layer { 246 | name: "drop6" 247 | type: "Dropout" 248 | bottom: "fc6" 249 | top: "fc6" 250 | dropout_param { 251 | dropout_ratio: 0.5 252 | } 253 | } 254 | layer { 255 | name: "fc7" 256 | type: "InnerProduct" 257 | bottom: "fc6" 258 | top: "fc7" 259 | param { 260 | lr_mult: 1 261 | decay_mult: 1 262 | } 263 | param { 264 | lr_mult: 2 265 | decay_mult: 0 266 | } 267 | inner_product_param { 268 | num_output: 4096 269 | weight_filler { 270 | type: "gaussian" 271 | std: 0.005 272 | } 273 | bias_filler { 274 | type: "constant" 275 | value: 0.1 276 | } 277 | } 278 | } 279 | layer { 280 | name: "relu7" 281 | type: "ReLU" 282 | bottom: "fc7" 283 | top: "fc7" 284 | } 285 | layer { 286 | name: "drop7" 287 | type: "Dropout" 288 | bottom: "fc7" 289 | top: "fc7" 290 | dropout_param { 291 | dropout_ratio: 0.5 292 | } 293 | } 294 | layer { 295 | name: "fc8" 296 | type: "InnerProduct" 297 | bottom: "fc7" 298 | top: "fc8" 299 | param { 300 | lr_mult: 1 301 | decay_mult: 1 302 | } 303 | param { 304 | lr_mult: 2 305 | decay_mult: 0 306 | } 307 | inner_product_param { 308 | num_output: 1000 309 | weight_filler { 310 | type: "gaussian" 311 | std: 0.01 312 | } 313 | bias_filler { 314 | type: "constant" 315 | value: 0 316 | } 317 | } 318 | } 319 | layer { 320 | name: "accuracy" 321 | type: "Accuracy" 322 | bottom: "fc8" 323 | bottom: "label" 324 | top: "accuracy" 325 | include { 326 | phase: TEST 327 | } 328 | } 329 | layer { 330 | name: "loss" 331 | type: "SoftmaxWithLoss" 332 | bottom: "fc8" 333 | bottom: "label" 334 | top: "loss" 335 | } 336 | 337 | -------------------------------------------------------------------------------- /caffe/alexnet_4GPUs.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe train -solver=./alexnet_4GPUs_solver.prototxt -gpu=0,1,2,3 >alexnet_4GPUs.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/alexnet_4GPUs_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "./alexnet_4GPUs.prototxt" 2 | max_iter: 50 3 | base_lr: 0.01 4 | lr_policy: "fixed" 5 | solver_mode: GPU 6 | -------------------------------------------------------------------------------- /caffe/alexnet_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "./alexnet.prototxt" 2 | max_iter: 50 3 | base_lr: 0.01 4 | lr_policy: "fixed" 5 | solver_mode: GPU 6 | -------------------------------------------------------------------------------- /caffe/alexnet_time_1GPU.log: -------------------------------------------------------------------------------- 1 | I1212 01:16:31.289032 40332 caffe.cpp:297] Use GPU with device ID 0 2 | I1212 01:16:41.500707 40332 net.cpp:322] The 
NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy 3 | I1212 01:16:41.500973 40332 net.cpp:49] Initializing net from parameters: 4 | name: "AlexNet" 5 | state { 6 | phase: TRAIN 7 | } 8 | layer { 9 | name: "data" 10 | type: "Data" 11 | top: "data" 12 | top: "label" 13 | include { 14 | phase: TRAIN 15 | } 16 | data_param { 17 | source: "./fake_image_net.lmdb" 18 | batch_size: 256 19 | backend: LMDB 20 | } 21 | } 22 | layer { 23 | name: "conv1" 24 | type: "Convolution" 25 | bottom: "data" 26 | top: "conv1" 27 | param { 28 | lr_mult: 1 29 | decay_mult: 1 30 | } 31 | param { 32 | lr_mult: 2 33 | decay_mult: 0 34 | } 35 | convolution_param { 36 | num_output: 96 37 | kernel_size: 11 38 | stride: 4 39 | weight_filler { 40 | type: "gaussian" 41 | std: 0.01 42 | } 43 | bias_filler { 44 | type: "constant" 45 | value: 0 46 | } 47 | } 48 | } 49 | layer { 50 | name: "relu1" 51 | type: "ReLU" 52 | bottom: "conv1" 53 | top: "conv1" 54 | } 55 | layer { 56 | name: "pool1" 57 | type: "Pooling" 58 | bottom: "conv1" 59 | top: "pool1" 60 | pooling_param { 61 | pool: MAX 62 | kernel_size: 3 63 | stride: 2 64 | } 65 | } 66 | layer { 67 | name: "conv2" 68 | type: "Convolution" 69 | bottom: "pool1" 70 | top: "conv2" 71 | param { 72 | lr_mult: 1 73 | decay_mult: 1 74 | } 75 | param { 76 | lr_mult: 2 77 | decay_mult: 0 78 | } 79 | convolution_param { 80 | num_output: 256 81 | pad: 2 82 | kernel_size: 5 83 | weight_filler { 84 | type: "gaussian" 85 | std: 0.01 86 | } 87 | bias_filler { 88 | type: "constant" 89 | value: 0.1 90 | } 91 | } 92 | } 93 | layer { 94 | name: "relu2" 95 | type: "ReLU" 96 | bottom: "conv2" 97 | top: "conv2" 98 | } 99 | layer { 100 | name: "pool2" 101 | type: "Pooling" 102 | bottom: "conv2" 103 | top: "pool2" 104 | pooling_param { 105 | pool: MAX 106 | kernel_size: 3 107 | stride: 2 108 | } 109 | } 110 | layer { 111 | name: "conv3" 112 | type: "Convolution" 113 | bottom: "pool2" 114 | top: "conv3" 115 | param { 116 | lr_mult: 1 117 | decay_mult: 1 118 | } 119 | param { 120 | lr_mult: 2 121 | decay_mult: 0 122 | } 123 | convolution_param { 124 | num_output: 384 125 | pad: 1 126 | kernel_size: 3 127 | weight_filler { 128 | type: "gaussian" 129 | std: 0.01 130 | } 131 | bias_filler { 132 | type: "constant" 133 | value: 0 134 | } 135 | } 136 | } 137 | layer { 138 | name: "relu3" 139 | type: "ReLU" 140 | bottom: "conv3" 141 | top: "conv3" 142 | } 143 | layer { 144 | name: "conv4" 145 | type: "Convolution" 146 | bottom: "conv3" 147 | top: "conv4" 148 | param { 149 | lr_mult: 1 150 | decay_mult: 1 151 | } 152 | param { 153 | lr_mult: 2 154 | decay_mult: 0 155 | } 156 | convolution_param { 157 | num_output: 384 158 | pad: 1 159 | kernel_size: 3 160 | weight_filler { 161 | type: "gaussian" 162 | std: 0.01 163 | } 164 | bias_filler { 165 | type: "constant" 166 | value: 0.1 167 | } 168 | } 169 | } 170 | layer { 171 | name: "relu4" 172 | type: "ReLU" 173 | bottom: "conv4" 174 | top: "conv4" 175 | } 176 | layer { 177 | name: "conv5" 178 | type: "Convolution" 179 | bottom: "conv4" 180 | top: "conv5" 181 | param { 182 | lr_mult: 1 183 | decay_mult: 1 184 | } 185 | param { 186 | lr_mult: 2 187 | decay_mult: 0 188 | } 189 | convolution_param { 190 | num_output: 256 191 | pad: 1 192 | kernel_size: 3 193 | weight_filler { 194 | type: "gaussian" 195 | std: 0.01 196 | } 197 | bias_filler { 198 | type: "constant" 199 | value: 0.1 200 | } 201 | } 202 | } 203 | layer { 204 | name: "relu5" 205 | type: "ReLU" 206 | bottom: "conv5" 207 | top: "conv5" 208 | } 209 | layer { 210 | name: 
"pool5" 211 | type: "Pooling" 212 | bottom: "conv5" 213 | top: "pool5" 214 | pooling_param { 215 | pool: MAX 216 | kernel_size: 3 217 | stride: 2 218 | } 219 | } 220 | layer { 221 | name: "fc6" 222 | type: "InnerProduct" 223 | bottom: "pool5" 224 | top: "fc6" 225 | param { 226 | lr_mult: 1 227 | decay_mult: 1 228 | } 229 | param { 230 | lr_mult: 2 231 | decay_mult: 0 232 | } 233 | inner_product_param { 234 | num_output: 4096 235 | weight_filler { 236 | type: "gaussian" 237 | std: 0.005 238 | } 239 | bias_filler { 240 | type: "constant" 241 | value: 0.1 242 | } 243 | } 244 | } 245 | layer { 246 | name: "relu6" 247 | type: "ReLU" 248 | bottom: "fc6" 249 | top: "fc6" 250 | } 251 | layer { 252 | name: "drop6" 253 | type: "Dropout" 254 | bottom: "fc6" 255 | top: "fc6" 256 | dropout_param { 257 | dropout_ratio: 0.5 258 | } 259 | } 260 | layer { 261 | name: "fc7" 262 | type: "InnerProduct" 263 | bottom: "fc6" 264 | top: "fc7" 265 | param { 266 | lr_mult: 1 267 | decay_mult: 1 268 | } 269 | param { 270 | lr_mult: 2 271 | decay_mult: 0 272 | } 273 | inner_product_param { 274 | num_output: 4096 275 | weight_filler { 276 | type: "gaussian" 277 | std: 0.005 278 | } 279 | bias_filler { 280 | type: "constant" 281 | value: 0.1 282 | } 283 | } 284 | } 285 | layer { 286 | name: "relu7" 287 | type: "ReLU" 288 | bottom: "fc7" 289 | top: "fc7" 290 | } 291 | layer { 292 | name: "drop7" 293 | type: "Dropout" 294 | bottom: "fc7" 295 | top: "fc7" 296 | dropout_param { 297 | dropout_ratio: 0.5 298 | } 299 | } 300 | layer { 301 | name: "fc8" 302 | type: "InnerProduct" 303 | bottom: "fc7" 304 | top: "fc8" 305 | param { 306 | lr_mult: 1 307 | decay_mult: 1 308 | } 309 | param { 310 | lr_mult: 2 311 | decay_mult: 0 312 | } 313 | inner_product_param { 314 | num_output: 1000 315 | weight_filler { 316 | type: "gaussian" 317 | std: 0.01 318 | } 319 | bias_filler { 320 | type: "constant" 321 | value: 0 322 | } 323 | } 324 | } 325 | layer { 326 | name: "loss" 327 | type: "SoftmaxWithLoss" 328 | bottom: "fc8" 329 | bottom: "label" 330 | top: "loss" 331 | } 332 | I1212 01:16:41.501101 40332 layer_factory.hpp:77] Creating layer data 333 | I1212 01:16:41.501677 40332 net.cpp:106] Creating Layer data 334 | I1212 01:16:41.501689 40332 net.cpp:411] data -> data 335 | I1212 01:16:41.501711 40332 net.cpp:411] data -> label 336 | I1212 01:16:41.503720 40334 db_lmdb.cpp:38] Opened lmdb ./fake_image_net.lmdb 337 | I1212 01:16:41.518365 40332 data_layer.cpp:41] output data size: 256,3,224,224 338 | I1212 01:16:41.777848 40332 net.cpp:150] Setting up data 339 | I1212 01:16:41.777902 40332 net.cpp:157] Top shape: 256 3 224 224 (38535168) 340 | I1212 01:16:41.777909 40332 net.cpp:157] Top shape: 256 (256) 341 | I1212 01:16:41.777914 40332 net.cpp:165] Memory required for data: 154141696 342 | I1212 01:16:41.777923 40332 layer_factory.hpp:77] Creating layer conv1 343 | I1212 01:16:41.777945 40332 net.cpp:106] Creating Layer conv1 344 | I1212 01:16:41.777951 40332 net.cpp:454] conv1 <- data 345 | I1212 01:16:41.777966 40332 net.cpp:411] conv1 -> conv1 346 | I1212 01:16:41.926776 40332 net.cpp:150] Setting up conv1 347 | I1212 01:16:41.926822 40332 net.cpp:157] Top shape: 256 96 54 54 (71663616) 348 | I1212 01:16:41.926828 40332 net.cpp:165] Memory required for data: 440796160 349 | I1212 01:16:41.926854 40332 layer_factory.hpp:77] Creating layer relu1 350 | I1212 01:16:41.926873 40332 net.cpp:106] Creating Layer relu1 351 | I1212 01:16:41.926879 40332 net.cpp:454] relu1 <- conv1 352 | I1212 01:16:41.926888 40332 net.cpp:397] relu1 -> conv1 
(in-place) 353 | I1212 01:16:41.927146 40332 net.cpp:150] Setting up relu1 354 | I1212 01:16:41.927160 40332 net.cpp:157] Top shape: 256 96 54 54 (71663616) 355 | I1212 01:16:41.927163 40332 net.cpp:165] Memory required for data: 727450624 356 | I1212 01:16:41.927168 40332 layer_factory.hpp:77] Creating layer pool1 357 | I1212 01:16:41.927184 40332 net.cpp:106] Creating Layer pool1 358 | I1212 01:16:41.927189 40332 net.cpp:454] pool1 <- conv1 359 | I1212 01:16:41.927197 40332 net.cpp:411] pool1 -> pool1 360 | I1212 01:16:41.927513 40332 net.cpp:150] Setting up pool1 361 | I1212 01:16:41.927525 40332 net.cpp:157] Top shape: 256 96 27 27 (17915904) 362 | I1212 01:16:41.927530 40332 net.cpp:165] Memory required for data: 799114240 363 | I1212 01:16:41.927534 40332 layer_factory.hpp:77] Creating layer conv2 364 | I1212 01:16:41.927551 40332 net.cpp:106] Creating Layer conv2 365 | I1212 01:16:41.927556 40332 net.cpp:454] conv2 <- pool1 366 | I1212 01:16:41.927563 40332 net.cpp:411] conv2 -> conv2 367 | I1212 01:16:41.946357 40332 net.cpp:150] Setting up conv2 368 | I1212 01:16:41.946400 40332 net.cpp:157] Top shape: 256 256 27 27 (47775744) 369 | I1212 01:16:41.946406 40332 net.cpp:165] Memory required for data: 990217216 370 | I1212 01:16:41.946424 40332 layer_factory.hpp:77] Creating layer relu2 371 | I1212 01:16:41.946437 40332 net.cpp:106] Creating Layer relu2 372 | I1212 01:16:41.946444 40332 net.cpp:454] relu2 <- conv2 373 | I1212 01:16:41.946452 40332 net.cpp:397] relu2 -> conv2 (in-place) 374 | I1212 01:16:41.946724 40332 net.cpp:150] Setting up relu2 375 | I1212 01:16:41.946737 40332 net.cpp:157] Top shape: 256 256 27 27 (47775744) 376 | I1212 01:16:41.946740 40332 net.cpp:165] Memory required for data: 1181320192 377 | I1212 01:16:41.946745 40332 layer_factory.hpp:77] Creating layer pool2 378 | I1212 01:16:41.946753 40332 net.cpp:106] Creating Layer pool2 379 | I1212 01:16:41.946758 40332 net.cpp:454] pool2 <- conv2 380 | I1212 01:16:41.946766 40332 net.cpp:411] pool2 -> pool2 381 | I1212 01:16:41.946939 40332 net.cpp:150] Setting up pool2 382 | I1212 01:16:41.946949 40332 net.cpp:157] Top shape: 256 256 13 13 (11075584) 383 | I1212 01:16:41.946954 40332 net.cpp:165] Memory required for data: 1225622528 384 | I1212 01:16:41.946959 40332 layer_factory.hpp:77] Creating layer conv3 385 | I1212 01:16:41.946976 40332 net.cpp:106] Creating Layer conv3 386 | I1212 01:16:41.947024 40332 net.cpp:454] conv3 <- pool2 387 | I1212 01:16:41.947033 40332 net.cpp:411] conv3 -> conv3 388 | I1212 01:16:41.973142 40332 net.cpp:150] Setting up conv3 389 | I1212 01:16:41.973193 40332 net.cpp:157] Top shape: 256 384 13 13 (16613376) 390 | I1212 01:16:41.973198 40332 net.cpp:165] Memory required for data: 1292076032 391 | I1212 01:16:41.973217 40332 layer_factory.hpp:77] Creating layer relu3 392 | I1212 01:16:41.973234 40332 net.cpp:106] Creating Layer relu3 393 | I1212 01:16:41.973240 40332 net.cpp:454] relu3 <- conv3 394 | I1212 01:16:41.973250 40332 net.cpp:397] relu3 -> conv3 (in-place) 395 | I1212 01:16:41.973515 40332 net.cpp:150] Setting up relu3 396 | I1212 01:16:41.973526 40332 net.cpp:157] Top shape: 256 384 13 13 (16613376) 397 | I1212 01:16:41.973531 40332 net.cpp:165] Memory required for data: 1358529536 398 | I1212 01:16:41.973536 40332 layer_factory.hpp:77] Creating layer conv4 399 | I1212 01:16:41.973552 40332 net.cpp:106] Creating Layer conv4 400 | I1212 01:16:41.973557 40332 net.cpp:454] conv4 <- conv3 401 | I1212 01:16:41.973567 40332 net.cpp:411] conv4 -> conv4 402 | I1212 
01:16:42.002802 40335 blocking_queue.cpp:50] Waiting for data 403 | I1212 01:16:42.011420 40332 net.cpp:150] Setting up conv4 404 | I1212 01:16:42.011440 40332 net.cpp:157] Top shape: 256 384 13 13 (16613376) 405 | I1212 01:16:42.011445 40332 net.cpp:165] Memory required for data: 1424983040 406 | I1212 01:16:42.011456 40332 layer_factory.hpp:77] Creating layer relu4 407 | I1212 01:16:42.011471 40332 net.cpp:106] Creating Layer relu4 408 | I1212 01:16:42.011476 40332 net.cpp:454] relu4 <- conv4 409 | I1212 01:16:42.011492 40332 net.cpp:397] relu4 -> conv4 (in-place) 410 | I1212 01:16:42.011638 40332 net.cpp:150] Setting up relu4 411 | I1212 01:16:42.011647 40332 net.cpp:157] Top shape: 256 384 13 13 (16613376) 412 | I1212 01:16:42.011651 40332 net.cpp:165] Memory required for data: 1491436544 413 | I1212 01:16:42.011656 40332 layer_factory.hpp:77] Creating layer conv5 414 | I1212 01:16:42.011669 40332 net.cpp:106] Creating Layer conv5 415 | I1212 01:16:42.011673 40332 net.cpp:454] conv5 <- conv4 416 | I1212 01:16:42.011692 40332 net.cpp:411] conv5 -> conv5 417 | I1212 01:16:42.050737 40332 net.cpp:150] Setting up conv5 418 | I1212 01:16:42.050758 40332 net.cpp:157] Top shape: 256 256 13 13 (11075584) 419 | I1212 01:16:42.050763 40332 net.cpp:165] Memory required for data: 1535738880 420 | I1212 01:16:42.050776 40332 layer_factory.hpp:77] Creating layer relu5 421 | I1212 01:16:42.050786 40332 net.cpp:106] Creating Layer relu5 422 | I1212 01:16:42.050791 40332 net.cpp:454] relu5 <- conv5 423 | I1212 01:16:42.050798 40332 net.cpp:397] relu5 -> conv5 (in-place) 424 | I1212 01:16:42.050946 40332 net.cpp:150] Setting up relu5 425 | I1212 01:16:42.050956 40332 net.cpp:157] Top shape: 256 256 13 13 (11075584) 426 | I1212 01:16:42.050961 40332 net.cpp:165] Memory required for data: 1580041216 427 | I1212 01:16:42.050964 40332 layer_factory.hpp:77] Creating layer pool5 428 | I1212 01:16:42.050976 40332 net.cpp:106] Creating Layer pool5 429 | I1212 01:16:42.050979 40332 net.cpp:454] pool5 <- conv5 430 | I1212 01:16:42.050997 40332 net.cpp:411] pool5 -> pool5 431 | I1212 01:16:42.051321 40332 net.cpp:150] Setting up pool5 432 | I1212 01:16:42.051331 40332 net.cpp:157] Top shape: 256 256 6 6 (2359296) 433 | I1212 01:16:42.051336 40332 net.cpp:165] Memory required for data: 1589478400 434 | I1212 01:16:42.051340 40332 layer_factory.hpp:77] Creating layer fc6 435 | I1212 01:16:42.051352 40332 net.cpp:106] Creating Layer fc6 436 | I1212 01:16:42.051357 40332 net.cpp:454] fc6 <- pool5 437 | I1212 01:16:42.051363 40332 net.cpp:411] fc6 -> fc6 438 | I1212 01:16:43.094733 40332 net.cpp:150] Setting up fc6 439 | I1212 01:16:43.094801 40332 net.cpp:157] Top shape: 256 4096 (1048576) 440 | I1212 01:16:43.094806 40332 net.cpp:165] Memory required for data: 1593672704 441 | I1212 01:16:43.094828 40332 layer_factory.hpp:77] Creating layer relu6 442 | I1212 01:16:43.094853 40332 net.cpp:106] Creating Layer relu6 443 | I1212 01:16:43.094863 40332 net.cpp:454] relu6 <- fc6 444 | I1212 01:16:43.094873 40332 net.cpp:397] relu6 -> fc6 (in-place) 445 | I1212 01:16:43.095288 40332 net.cpp:150] Setting up relu6 446 | I1212 01:16:43.095300 40332 net.cpp:157] Top shape: 256 4096 (1048576) 447 | I1212 01:16:43.095309 40332 net.cpp:165] Memory required for data: 1597867008 448 | I1212 01:16:43.095340 40332 layer_factory.hpp:77] Creating layer drop6 449 | I1212 01:16:43.095379 40332 net.cpp:106] Creating Layer drop6 450 | I1212 01:16:43.095384 40332 net.cpp:454] drop6 <- fc6 451 | I1212 01:16:43.095391 40332 net.cpp:397] drop6 
-> fc6 (in-place) 452 | I1212 01:16:43.095423 40332 net.cpp:150] Setting up drop6 453 | I1212 01:16:43.095430 40332 net.cpp:157] Top shape: 256 4096 (1048576) 454 | I1212 01:16:43.095434 40332 net.cpp:165] Memory required for data: 1602061312 455 | I1212 01:16:43.095438 40332 layer_factory.hpp:77] Creating layer fc7 456 | I1212 01:16:43.095456 40332 net.cpp:106] Creating Layer fc7 457 | I1212 01:16:43.095461 40332 net.cpp:454] fc7 <- fc6 458 | I1212 01:16:43.095466 40332 net.cpp:411] fc7 -> fc7 459 | I1212 01:16:43.556849 40332 net.cpp:150] Setting up fc7 460 | I1212 01:16:43.556907 40332 net.cpp:157] Top shape: 256 4096 (1048576) 461 | I1212 01:16:43.556912 40332 net.cpp:165] Memory required for data: 1606255616 462 | I1212 01:16:43.556933 40332 layer_factory.hpp:77] Creating layer relu7 463 | I1212 01:16:43.556972 40332 net.cpp:106] Creating Layer relu7 464 | I1212 01:16:43.556982 40332 net.cpp:454] relu7 <- fc7 465 | I1212 01:16:43.556994 40332 net.cpp:397] relu7 -> fc7 (in-place) 466 | I1212 01:16:43.557718 40332 net.cpp:150] Setting up relu7 467 | I1212 01:16:43.557731 40332 net.cpp:157] Top shape: 256 4096 (1048576) 468 | I1212 01:16:43.557735 40332 net.cpp:165] Memory required for data: 1610449920 469 | I1212 01:16:43.557740 40332 layer_factory.hpp:77] Creating layer drop7 470 | I1212 01:16:43.557759 40332 net.cpp:106] Creating Layer drop7 471 | I1212 01:16:43.557764 40332 net.cpp:454] drop7 <- fc7 472 | I1212 01:16:43.557772 40332 net.cpp:397] drop7 -> fc7 (in-place) 473 | I1212 01:16:43.557796 40332 net.cpp:150] Setting up drop7 474 | I1212 01:16:43.557803 40332 net.cpp:157] Top shape: 256 4096 (1048576) 475 | I1212 01:16:43.557807 40332 net.cpp:165] Memory required for data: 1614644224 476 | I1212 01:16:43.557812 40332 layer_factory.hpp:77] Creating layer fc8 477 | I1212 01:16:43.557826 40332 net.cpp:106] Creating Layer fc8 478 | I1212 01:16:43.557832 40332 net.cpp:454] fc8 <- fc7 479 | I1212 01:16:43.557839 40332 net.cpp:411] fc8 -> fc8 480 | I1212 01:16:43.667419 40332 net.cpp:150] Setting up fc8 481 | I1212 01:16:43.667448 40332 net.cpp:157] Top shape: 256 1000 (256000) 482 | I1212 01:16:43.667454 40332 net.cpp:165] Memory required for data: 1615668224 483 | I1212 01:16:43.667466 40332 layer_factory.hpp:77] Creating layer loss 484 | I1212 01:16:43.667481 40332 net.cpp:106] Creating Layer loss 485 | I1212 01:16:43.667486 40332 net.cpp:454] loss <- fc8 486 | I1212 01:16:43.667492 40332 net.cpp:454] loss <- label 487 | I1212 01:16:43.667506 40332 net.cpp:411] loss -> loss 488 | I1212 01:16:43.667521 40332 layer_factory.hpp:77] Creating layer loss 489 | I1212 01:16:43.668579 40332 net.cpp:150] Setting up loss 490 | I1212 01:16:43.668591 40332 net.cpp:157] Top shape: (1) 491 | I1212 01:16:43.668594 40332 net.cpp:160] with loss weight 1 492 | I1212 01:16:43.668633 40332 net.cpp:165] Memory required for data: 1615668228 493 | I1212 01:16:43.668637 40332 net.cpp:226] loss needs backward computation. 494 | I1212 01:16:43.668642 40332 net.cpp:226] fc8 needs backward computation. 495 | I1212 01:16:43.668647 40332 net.cpp:226] drop7 needs backward computation. 496 | I1212 01:16:43.668649 40332 net.cpp:226] relu7 needs backward computation. 497 | I1212 01:16:43.668653 40332 net.cpp:226] fc7 needs backward computation. 498 | I1212 01:16:43.668658 40332 net.cpp:226] drop6 needs backward computation. 499 | I1212 01:16:43.668661 40332 net.cpp:226] relu6 needs backward computation. 500 | I1212 01:16:43.668664 40332 net.cpp:226] fc6 needs backward computation. 
501 | I1212 01:16:43.668670 40332 net.cpp:226] pool5 needs backward computation. 502 | I1212 01:16:43.668674 40332 net.cpp:226] relu5 needs backward computation. 503 | I1212 01:16:43.668679 40332 net.cpp:226] conv5 needs backward computation. 504 | I1212 01:16:43.668684 40332 net.cpp:226] relu4 needs backward computation. 505 | I1212 01:16:43.668689 40332 net.cpp:226] conv4 needs backward computation. 506 | I1212 01:16:43.668692 40332 net.cpp:226] relu3 needs backward computation. 507 | I1212 01:16:43.668697 40332 net.cpp:226] conv3 needs backward computation. 508 | I1212 01:16:43.668702 40332 net.cpp:226] pool2 needs backward computation. 509 | I1212 01:16:43.668711 40332 net.cpp:226] relu2 needs backward computation. 510 | I1212 01:16:43.668748 40332 net.cpp:226] conv2 needs backward computation. 511 | I1212 01:16:43.668753 40332 net.cpp:226] pool1 needs backward computation. 512 | I1212 01:16:43.668757 40332 net.cpp:226] relu1 needs backward computation. 513 | I1212 01:16:43.668761 40332 net.cpp:226] conv1 needs backward computation. 514 | I1212 01:16:43.668767 40332 net.cpp:228] data does not need backward computation. 515 | I1212 01:16:43.668771 40332 net.cpp:270] This network produces output loss 516 | I1212 01:16:43.668787 40332 net.cpp:283] Network initialization done. 517 | I1212 01:16:43.668884 40332 caffe.cpp:309] Performing Forward 518 | I1212 01:16:44.038889 40332 caffe.cpp:314] Initial loss: 6.93382 519 | I1212 01:16:44.038939 40332 caffe.cpp:315] Performing Backward 520 | I1212 01:16:44.043067 40332 caffe.cpp:323] *** Benchmark begins *** 521 | I1212 01:16:44.043078 40332 caffe.cpp:324] Testing for 10 iterations. 522 | I1212 01:16:46.011155 40332 caffe.cpp:352] Iteration: 1 forward-backward time: 1140 ms. 523 | I1212 01:16:47.155781 40332 caffe.cpp:352] Iteration: 2 forward-backward time: 1144.47 ms. 524 | I1212 01:16:48.294250 40332 caffe.cpp:352] Iteration: 3 forward-backward time: 1138.34 ms. 525 | I1212 01:16:49.432665 40332 caffe.cpp:352] Iteration: 4 forward-backward time: 1138.29 ms. 526 | I1212 01:16:50.570600 40332 caffe.cpp:352] Iteration: 5 forward-backward time: 1137.81 ms. 527 | I1212 01:16:51.709350 40332 caffe.cpp:352] Iteration: 6 forward-backward time: 1138.65 ms. 528 | I1212 01:16:52.846112 40332 caffe.cpp:352] Iteration: 7 forward-backward time: 1136.65 ms. 529 | I1212 01:16:53.984618 40332 caffe.cpp:352] Iteration: 8 forward-backward time: 1138.38 ms. 530 | I1212 01:16:55.122740 40332 caffe.cpp:352] Iteration: 9 forward-backward time: 1138.02 ms. 531 | I1212 01:16:56.258673 40332 caffe.cpp:352] Iteration: 10 forward-backward time: 1135.83 ms. 532 | I1212 01:16:56.258728 40332 caffe.cpp:355] Average time per layer: 533 | I1212 01:16:56.258733 40332 caffe.cpp:358] data forward: 1.70459 ms. 534 | I1212 01:16:56.258739 40332 caffe.cpp:361] data backward: 0.0043392 ms. 535 | I1212 01:16:56.258744 40332 caffe.cpp:358] conv1 forward: 38.4802 ms. 536 | I1212 01:16:56.258750 40332 caffe.cpp:361] conv1 backward: 48.4646 ms. 537 | I1212 01:16:56.258754 40332 caffe.cpp:358] relu1 forward: 3.21308 ms. 538 | I1212 01:16:56.258759 40332 caffe.cpp:361] relu1 backward: 4.87886 ms. 539 | I1212 01:16:56.258764 40332 caffe.cpp:358] pool1 forward: 4.21732 ms. 540 | I1212 01:16:56.258767 40332 caffe.cpp:361] pool1 backward: 19.1974 ms. 541 | I1212 01:16:56.258772 40332 caffe.cpp:358] conv2 forward: 105.522 ms. 542 | I1212 01:16:56.258777 40332 caffe.cpp:361] conv2 backward: 287.43 ms. 543 | I1212 01:16:56.258783 40332 caffe.cpp:358] relu2 forward: 2.18696 ms. 
544 | I1212 01:16:56.258787 40332 caffe.cpp:361] relu2 backward: 3.26851 ms. 545 | I1212 01:16:56.258792 40332 caffe.cpp:358] pool2 forward: 2.58741 ms. 546 | I1212 01:16:56.258796 40332 caffe.cpp:361] pool2 backward: 10.4666 ms. 547 | I1212 01:16:56.258801 40332 caffe.cpp:358] conv3 forward: 41.0432 ms. 548 | I1212 01:16:56.258806 40332 caffe.cpp:361] conv3 backward: 117.781 ms. 549 | I1212 01:16:56.258811 40332 caffe.cpp:358] relu3 forward: 0.88577 ms. 550 | I1212 01:16:56.258816 40332 caffe.cpp:361] relu3 backward: 1.39212 ms. 551 | I1212 01:16:56.258821 40332 caffe.cpp:358] conv4 forward: 60.3497 ms. 552 | I1212 01:16:56.258826 40332 caffe.cpp:361] conv4 backward: 177.144 ms. 553 | I1212 01:16:56.258831 40332 caffe.cpp:358] relu4 forward: 0.872438 ms. 554 | I1212 01:16:56.258836 40332 caffe.cpp:361] relu4 backward: 1.37873 ms. 555 | I1212 01:16:56.258841 40332 caffe.cpp:358] conv5 forward: 36.2849 ms. 556 | I1212 01:16:56.258846 40332 caffe.cpp:361] conv5 backward: 117.303 ms. 557 | I1212 01:16:56.258852 40332 caffe.cpp:358] relu5 forward: 0.586986 ms. 558 | I1212 01:16:56.258855 40332 caffe.cpp:361] relu5 backward: 0.916598 ms. 559 | I1212 01:16:56.258860 40332 caffe.cpp:358] pool5 forward: 0.630026 ms. 560 | I1212 01:16:56.258865 40332 caffe.cpp:361] pool5 backward: 2.59878 ms. 561 | I1212 01:16:56.258870 40332 caffe.cpp:358] fc6 forward: 10.3887 ms. 562 | I1212 01:16:56.258884 40332 caffe.cpp:361] fc6 backward: 17.0964 ms. 563 | I1212 01:16:56.258925 40332 caffe.cpp:358] relu6 forward: 0.068016 ms. 564 | I1212 01:16:56.258932 40332 caffe.cpp:361] relu6 backward: 0.0939072 ms. 565 | I1212 01:16:56.258937 40332 caffe.cpp:358] drop6 forward: 0.171498 ms. 566 | I1212 01:16:56.258941 40332 caffe.cpp:361] drop6 backward: 0.0838112 ms. 567 | I1212 01:16:56.258946 40332 caffe.cpp:358] fc7 forward: 5.36998 ms. 568 | I1212 01:16:56.258950 40332 caffe.cpp:361] fc7 backward: 8.89202 ms. 569 | I1212 01:16:56.258955 40332 caffe.cpp:358] relu7 forward: 0.0690176 ms. 570 | I1212 01:16:56.258960 40332 caffe.cpp:361] relu7 backward: 0.103798 ms. 571 | I1212 01:16:56.258965 40332 caffe.cpp:358] drop7 forward: 0.145693 ms. 572 | I1212 01:16:56.258970 40332 caffe.cpp:361] drop7 backward: 0.0841472 ms. 573 | I1212 01:16:56.258975 40332 caffe.cpp:358] fc8 forward: 1.58494 ms. 574 | I1212 01:16:56.258978 40332 caffe.cpp:361] fc8 backward: 2.457 ms. 575 | I1212 01:16:56.258983 40332 caffe.cpp:358] loss forward: 0.234429 ms. 576 | I1212 01:16:56.258987 40332 caffe.cpp:361] loss backward: 0.0610048 ms. 577 | I1212 01:16:56.259012 40332 caffe.cpp:366] Average Forward pass: 317.065 ms. 578 | I1212 01:16:56.259021 40332 caffe.cpp:368] Average Backward pass: 821.549 ms. 579 | I1212 01:16:56.259030 40332 caffe.cpp:370] Average Forward-Backward: 1138.77 ms. 580 | I1212 01:16:56.259039 40332 caffe.cpp:372] Total Time: 11387.7 ms. 
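For reference, the timing summary above converts directly into a throughput figure. A minimal sketch (the batch size of 256 is taken from the data layer echoed at the top of this log, and the 1138.77 ms value from the "Average Forward-Backward" line; this is an illustration, not part of the original scripts):

# Throughput implied by the `caffe time` summary above.
batch_size = 256              # per-iteration batch from the data layer
avg_fwd_bwd_ms = 1138.77      # "Average Forward-Backward" reported above
print("AlexNet: %.1f images/sec" % (batch_size / (avg_fwd_bwd_ms / 1000.0)))
# -> AlexNet: 224.8 images/sec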
581 | I1212 01:16:56.259044 40332 caffe.cpp:373] *** Benchmark ends *** 582 | -------------------------------------------------------------------------------- /caffe/alexnet_time_1GPU.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe time --model=./alexnet.prototxt --iterations=10 -gpu=0 >alexnet_time_1GPU.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/createFakeData.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lmdb 3 | import caffe as c 4 | 5 | featDim = 512 6 | labDim = 10000 7 | mbSize = 8192 8 | totalCount = mbSize * 16 9 | 10 | features = np.random.randn(totalCount, 1, 1, featDim) 11 | labels = np.random.randint(0, labDim, size=(totalCount,)) 12 | 13 | db = lmdb.open('./fake_data.lmdb', map_size=features.nbytes * 10) 14 | 15 | with db.begin(write = True) as txn: 16 | for i in range(totalCount): 17 | d = c.proto.caffe_pb2.Datum() 18 | d.channels = features.shape[1] 19 | d.height = features.shape[2] 20 | d.width = features.shape[3] 21 | d.data = features[i].tostring() 22 | d.label = labels[i] 23 | txn.put('{:08}'.format(i), d.SerializeToString()) 24 | 25 | -------------------------------------------------------------------------------- /caffe/createFakeImageNet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lmdb 3 | import caffe as c 4 | 5 | mbSize = 256 6 | totalCount = mbSize * 16 7 | 8 | features = np.random.randn(totalCount, 3, 224, 224) 9 | labels = np.random.randint(0, 1000, size=(totalCount,)) 10 | 11 | db = lmdb.open('./fake_image_net.lmdb', map_size=features.nbytes * 10) 12 | 13 | with db.begin(write = True) as txn: 14 | for i in range(totalCount): 15 | d = c.proto.caffe_pb2.Datum() 16 | d.channels = features.shape[1] 17 | d.height = features.shape[2] 18 | d.width = features.shape[3] 19 | d.data = features[i].tostring() 20 | d.label = labels[i] 21 | txn.put('{:08}'.format(i), d.SerializeToString()) 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /caffe/ffn.prototxt: -------------------------------------------------------------------------------- 1 | name: "FFN" 2 | layer { 3 | name: "data" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | data_param: { 11 | batch_size: 8192 12 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 13 | backend: LMDB 14 | } 15 | } 16 | 17 | layer { 18 | name: "H1" 19 | type: "InnerProduct" 20 | bottom: "data" 21 | top: "H1" 22 | inner_product_param { 23 | num_output: 2048 24 | } 25 | } 26 | 27 | layer { 28 | name: "H1_A" 29 | type: "Sigmoid" 30 | bottom: "H1" 31 | top: "H1" 32 | } 33 | 34 | layer { 35 | name: "H2" 36 | type: "InnerProduct" 37 | bottom: "H1" 38 | top: "H2" 39 | inner_product_param { 40 | num_output: 2048 41 | } 42 | } 43 | 44 | layer { 45 | name: "H2_A" 46 | type: "Sigmoid" 47 | bottom: "H2" 48 | top: "H2" 49 | } 50 | 51 | layer { 52 | name: "H3" 53 | type: "InnerProduct" 54 | bottom: "H2" 55 | top: "H3" 56 | inner_product_param { 57 | num_output: 2048 58 | } 59 | } 60 | 61 | layer { 62 | name: "H3_A" 63 | type: "Sigmoid" 64 | bottom: "H3" 65 | top: "H3" 66 | } 67 | 68 | layer { 69 | name: "H4" 70 | type: "InnerProduct" 71 | bottom: "H3" 72 | top: "H4" 73 | inner_product_param { 74 | num_output: 2048 75 | } 76 
| } 77 | 78 | layer { 79 | name: "H4_A" 80 | type: "Sigmoid" 81 | bottom: "H4" 82 | top: "H4" 83 | } 84 | 85 | layer { 86 | name: "L" 87 | type: "InnerProduct" 88 | bottom: "H4" 89 | top: "L" 90 | inner_product_param { 91 | num_output: 10000 92 | } 93 | } 94 | 95 | layer { 96 | name: "loss" 97 | type: "SoftmaxWithLoss" 98 | bottom: "L" 99 | bottom: "label" 100 | top: "loss" 101 | } 102 | 103 | -------------------------------------------------------------------------------- /caffe/ffn_1GPU.log: -------------------------------------------------------------------------------- 1 | I1208 05:38:49.425312 49471 caffe.cpp:184] Using GPUs 0 2 | I1208 05:38:59.337173 49471 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.001 4 | max_iter: 100 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./ffn.prototxt" 9 | I1208 05:38:59.337251 49471 solver.cpp:91] Creating training net from net file: ./ffn.prototxt 10 | I1208 05:38:59.337708 49471 net.cpp:49] Initializing net from parameters: 11 | name: "FFN" 12 | state { 13 | phase: TRAIN 14 | } 15 | layer { 16 | name: "data" 17 | type: "Data" 18 | top: "data" 19 | top: "label" 20 | include { 21 | phase: TRAIN 22 | } 23 | data_param { 24 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 25 | batch_size: 8192 26 | backend: LMDB 27 | } 28 | } 29 | layer { 30 | name: "H1" 31 | type: "InnerProduct" 32 | bottom: "data" 33 | top: "H1" 34 | inner_product_param { 35 | num_output: 2048 36 | } 37 | } 38 | layer { 39 | name: "H1_A" 40 | type: "Sigmoid" 41 | bottom: "H1" 42 | top: "H1" 43 | } 44 | layer { 45 | name: "H2" 46 | type: "InnerProduct" 47 | bottom: "H1" 48 | top: "H2" 49 | inner_product_param { 50 | num_output: 2048 51 | } 52 | } 53 | layer { 54 | name: "H2_A" 55 | type: "Sigmoid" 56 | bottom: "H2" 57 | top: "H2" 58 | } 59 | layer { 60 | name: "H3" 61 | type: "InnerProduct" 62 | bottom: "H2" 63 | top: "H3" 64 | inner_product_param { 65 | num_output: 2048 66 | } 67 | } 68 | layer { 69 | name: "H3_A" 70 | type: "Sigmoid" 71 | bottom: "H3" 72 | top: "H3" 73 | } 74 | layer { 75 | name: "H4" 76 | type: "InnerProduct" 77 | bottom: "H3" 78 | top: "H4" 79 | inner_product_param { 80 | num_output: 2048 81 | } 82 | } 83 | layer { 84 | name: "H4_A" 85 | type: "Sigmoid" 86 | bottom: "H4" 87 | top: "H4" 88 | } 89 | layer { 90 | name: "L" 91 | type: "InnerProduct" 92 | bottom: "H4" 93 | top: "L" 94 | inner_product_param { 95 | num_output: 10000 96 | } 97 | } 98 | layer { 99 | name: "loss" 100 | type: "SoftmaxWithLoss" 101 | bottom: "L" 102 | bottom: "label" 103 | top: "loss" 104 | } 105 | I1208 05:38:59.337765 49471 layer_factory.hpp:77] Creating layer data 106 | I1208 05:38:59.342125 49471 net.cpp:106] Creating Layer data 107 | I1208 05:38:59.342136 49471 net.cpp:411] data -> data 108 | I1208 05:38:59.342159 49471 net.cpp:411] data -> label 109 | I1208 05:38:59.344041 49474 db_lmdb.cpp:38] Opened lmdb /var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb 110 | I1208 05:38:59.358345 49471 data_layer.cpp:41] output data size: 8192,1,1,512 111 | I1208 05:38:59.387707 49471 net.cpp:150] Setting up data 112 | I1208 05:38:59.387778 49471 net.cpp:157] Top shape: 8192 1 1 512 (4194304) 113 | I1208 05:38:59.387785 49471 net.cpp:157] Top shape: 8192 (8192) 114 | I1208 05:38:59.387789 49471 net.cpp:165] Memory required for data: 16809984 115 | I1208 05:38:59.387799 49471 layer_factory.hpp:77] Creating layer H1 116 | I1208 05:38:59.387814 49471 
net.cpp:106] Creating Layer H1 117 | I1208 05:38:59.387820 49471 net.cpp:454] H1 <- data 118 | I1208 05:38:59.387833 49471 net.cpp:411] H1 -> H1 119 | I1208 05:38:59.390786 49471 net.cpp:150] Setting up H1 120 | I1208 05:38:59.390800 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 121 | I1208 05:38:59.390805 49471 net.cpp:165] Memory required for data: 83918848 122 | I1208 05:38:59.390818 49471 layer_factory.hpp:77] Creating layer H1_A 123 | I1208 05:38:59.390830 49471 net.cpp:106] Creating Layer H1_A 124 | I1208 05:38:59.390836 49471 net.cpp:454] H1_A <- H1 125 | I1208 05:38:59.390841 49471 net.cpp:397] H1_A -> H1 (in-place) 126 | I1208 05:38:59.412019 49475 blocking_queue.cpp:50] Waiting for data 127 | I1208 05:38:59.438570 49475 blocking_queue.cpp:50] Waiting for data 128 | I1208 05:38:59.451354 49475 blocking_queue.cpp:50] Waiting for data 129 | I1208 05:38:59.464041 49475 blocking_queue.cpp:50] Waiting for data 130 | I1208 05:38:59.485954 49475 blocking_queue.cpp:50] Waiting for data 131 | I1208 05:38:59.498752 49475 blocking_queue.cpp:50] Waiting for data 132 | I1208 05:38:59.511708 49475 blocking_queue.cpp:50] Waiting for data 133 | I1208 05:38:59.513068 49471 net.cpp:150] Setting up H1_A 134 | I1208 05:38:59.513108 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 135 | I1208 05:38:59.513113 49471 net.cpp:165] Memory required for data: 151027712 136 | I1208 05:38:59.513119 49471 layer_factory.hpp:77] Creating layer H2 137 | I1208 05:38:59.513131 49471 net.cpp:106] Creating Layer H2 138 | I1208 05:38:59.513137 49471 net.cpp:454] H2 <- H1 139 | I1208 05:38:59.513147 49471 net.cpp:411] H2 -> H2 140 | I1208 05:38:59.521333 49471 net.cpp:150] Setting up H2 141 | I1208 05:38:59.521363 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 142 | I1208 05:38:59.521409 49471 net.cpp:165] Memory required for data: 218136576 143 | I1208 05:38:59.521428 49471 layer_factory.hpp:77] Creating layer H2_A 144 | I1208 05:38:59.521445 49471 net.cpp:106] Creating Layer H2_A 145 | I1208 05:38:59.521450 49471 net.cpp:454] H2_A <- H2 146 | I1208 05:38:59.521458 49471 net.cpp:397] H2_A -> H2 (in-place) 147 | I1208 05:38:59.521807 49471 net.cpp:150] Setting up H2_A 148 | I1208 05:38:59.521818 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 149 | I1208 05:38:59.521826 49471 net.cpp:165] Memory required for data: 285245440 150 | I1208 05:38:59.521831 49471 layer_factory.hpp:77] Creating layer H3 151 | I1208 05:38:59.521841 49471 net.cpp:106] Creating Layer H3 152 | I1208 05:38:59.521844 49471 net.cpp:454] H3 <- H2 153 | I1208 05:38:59.521850 49471 net.cpp:411] H3 -> H3 154 | I1208 05:38:59.530151 49471 net.cpp:150] Setting up H3 155 | I1208 05:38:59.530181 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 156 | I1208 05:38:59.530185 49471 net.cpp:165] Memory required for data: 352354304 157 | I1208 05:38:59.530199 49471 layer_factory.hpp:77] Creating layer H3_A 158 | I1208 05:38:59.530210 49471 net.cpp:106] Creating Layer H3_A 159 | I1208 05:38:59.530220 49471 net.cpp:454] H3_A <- H3 160 | I1208 05:38:59.530227 49471 net.cpp:397] H3_A -> H3 (in-place) 161 | I1208 05:38:59.530416 49471 net.cpp:150] Setting up H3_A 162 | I1208 05:38:59.530426 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 163 | I1208 05:38:59.530431 49471 net.cpp:165] Memory required for data: 419463168 164 | I1208 05:38:59.530434 49471 layer_factory.hpp:77] Creating layer H4 165 | I1208 05:38:59.530444 49471 net.cpp:106] Creating Layer H4 166 | I1208 05:38:59.530448 49471 net.cpp:454] H4 <- H3 167 | I1208 05:38:59.530455 49471 
net.cpp:411] H4 -> H4 168 | I1208 05:38:59.538918 49475 blocking_queue.cpp:50] Waiting for data 169 | I1208 05:38:59.539352 49471 net.cpp:150] Setting up H4 170 | I1208 05:38:59.539376 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 171 | I1208 05:38:59.539381 49471 net.cpp:165] Memory required for data: 486572032 172 | I1208 05:38:59.539389 49471 layer_factory.hpp:77] Creating layer H4_A 173 | I1208 05:38:59.539404 49471 net.cpp:106] Creating Layer H4_A 174 | I1208 05:38:59.539412 49471 net.cpp:454] H4_A <- H4 175 | I1208 05:38:59.539419 49471 net.cpp:397] H4_A -> H4 (in-place) 176 | I1208 05:38:59.539780 49471 net.cpp:150] Setting up H4_A 177 | I1208 05:38:59.539791 49471 net.cpp:157] Top shape: 8192 2048 (16777216) 178 | I1208 05:38:59.539798 49471 net.cpp:165] Memory required for data: 553680896 179 | I1208 05:38:59.539803 49471 layer_factory.hpp:77] Creating layer L 180 | I1208 05:38:59.539810 49471 net.cpp:106] Creating Layer L 181 | I1208 05:38:59.539814 49471 net.cpp:454] L <- H4 182 | I1208 05:38:59.539822 49471 net.cpp:411] L -> L 183 | I1208 05:38:59.583822 49471 net.cpp:150] Setting up L 184 | I1208 05:38:59.583858 49471 net.cpp:157] Top shape: 8192 10000 (81920000) 185 | I1208 05:38:59.583863 49471 net.cpp:165] Memory required for data: 881360896 186 | I1208 05:38:59.583878 49471 layer_factory.hpp:77] Creating layer loss 187 | I1208 05:38:59.583896 49471 net.cpp:106] Creating Layer loss 188 | I1208 05:38:59.583902 49471 net.cpp:454] loss <- L 189 | I1208 05:38:59.583910 49471 net.cpp:454] loss <- label 190 | I1208 05:38:59.583922 49471 net.cpp:411] loss -> loss 191 | I1208 05:38:59.583940 49471 layer_factory.hpp:77] Creating layer loss 192 | I1208 05:38:59.765748 49471 net.cpp:150] Setting up loss 193 | I1208 05:38:59.765797 49471 net.cpp:157] Top shape: (1) 194 | I1208 05:38:59.765804 49471 net.cpp:160] with loss weight 1 195 | I1208 05:38:59.765825 49471 net.cpp:165] Memory required for data: 881360900 196 | I1208 05:38:59.765832 49471 net.cpp:226] loss needs backward computation. 197 | I1208 05:38:59.765840 49471 net.cpp:226] L needs backward computation. 198 | I1208 05:38:59.765844 49471 net.cpp:226] H4_A needs backward computation. 199 | I1208 05:38:59.765849 49471 net.cpp:226] H4 needs backward computation. 200 | I1208 05:38:59.765854 49471 net.cpp:226] H3_A needs backward computation. 201 | I1208 05:38:59.765859 49471 net.cpp:226] H3 needs backward computation. 202 | I1208 05:38:59.765864 49471 net.cpp:226] H2_A needs backward computation. 203 | I1208 05:38:59.765869 49471 net.cpp:226] H2 needs backward computation. 204 | I1208 05:38:59.765874 49471 net.cpp:226] H1_A needs backward computation. 205 | I1208 05:38:59.765887 49471 net.cpp:226] H1 needs backward computation. 206 | I1208 05:38:59.765934 49471 net.cpp:228] data does not need backward computation. 207 | I1208 05:38:59.765940 49471 net.cpp:270] This network produces output loss 208 | I1208 05:38:59.765955 49471 net.cpp:283] Network initialization done. 209 | I1208 05:38:59.765995 49471 solver.cpp:60] Solver scaffolding done. 
210 | I1208 05:38:59.766335 49471 caffe.cpp:212] Starting Optimization 211 | I1208 05:38:59.766345 49471 solver.cpp:288] Solving FFN 212 | I1208 05:38:59.766347 49471 solver.cpp:289] Learning Rate Policy: fixed 213 | I1208 05:40:03.220875 49471 solver.cpp:459] Snapshotting to binary proto file _iter_100.caffemodel 214 | I1208 05:40:05.152649 49471 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_100.solverstate 215 | I1208 05:40:06.284504 49471 solver.cpp:326] Optimization Done. 216 | I1208 05:40:06.284602 49471 caffe.cpp:215] Optimization Done. 217 | -------------------------------------------------------------------------------- /caffe/ffn_1GPU.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import caffe as c 3 | import time 4 | 5 | mbSize = 512 6 | count = mbSize * 16 7 | 8 | feat = np.random.randn(count, 1, 1, 512).astype(np.float32) 9 | lab = np.random.randint(0, 10000, size=(count, 1, 1, 1)).astype(np.float32) 10 | 11 | def createSolver(solverFile): 12 | c.set_mode_gpu() 13 | solver = c.SGDSolver(solverFile) 14 | solver.net.set_input_arrays(feat, lab) 15 | return solver 16 | 17 | def samplesPerSec(minibatchSize, processingTime): 18 | return minibatchSize / processingTime 19 | 20 | def runBenchmark(solver, iter): 21 | startTime = time.time() 22 | solver.step(iter) 23 | stepTime = time.time() - startTime 24 | print "Samples per sec = %d." % samplesPerSec(mbSize * iter, stepTime) 25 | 26 | s = createSolver('./ffn_solver_md.prototxt') 27 | #runBenchmark(s, 10) 28 | 29 | -------------------------------------------------------------------------------- /caffe/ffn_1GPU.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe train -solver=./ffn_solver.prototxt -gpu=0 >ffn_1GPU.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/ffn_2GPUs.log: -------------------------------------------------------------------------------- 1 | I1208 05:42:01.665724 49481 caffe.cpp:184] Using GPUs 0, 1 2 | I1208 05:42:11.086211 49481 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.001 4 | max_iter: 100 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./ffn_2GPUs.prototxt" 9 | I1208 05:42:11.086272 49481 solver.cpp:91] Creating training net from net file: ./ffn_2GPUs.prototxt 10 | I1208 05:42:11.086724 49481 net.cpp:49] Initializing net from parameters: 11 | name: "FFN" 12 | state { 13 | phase: TRAIN 14 | } 15 | layer { 16 | name: "data" 17 | type: "Data" 18 | top: "data" 19 | top: "label" 20 | include { 21 | phase: TRAIN 22 | } 23 | data_param { 24 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 25 | batch_size: 4096 26 | backend: LMDB 27 | } 28 | } 29 | layer { 30 | name: "H1" 31 | type: "InnerProduct" 32 | bottom: "data" 33 | top: "H1" 34 | inner_product_param { 35 | num_output: 2048 36 | } 37 | } 38 | layer { 39 | name: "H1_A" 40 | type: "Sigmoid" 41 | bottom: "H1" 42 | top: "H1" 43 | } 44 | layer { 45 | name: "H2" 46 | type: "InnerProduct" 47 | bottom: "H1" 48 | top: "H2" 49 | inner_product_param { 50 | num_output: 2048 51 | } 52 | } 53 | layer { 54 | name: "H2_A" 55 | type: "Sigmoid" 56 | bottom: "H2" 57 | top: "H2" 58 | } 59 | layer { 60 | name: "H3" 61 | type: "InnerProduct" 62 | bottom: "H2" 63 | top: "H3" 64 | inner_product_param { 65 | num_output: 2048 66 | } 67 | } 68 | layer { 69 | name: "H3_A" 70 
| type: "Sigmoid" 71 | bottom: "H3" 72 | top: "H3" 73 | } 74 | layer { 75 | name: "H4" 76 | type: "InnerProduct" 77 | bottom: "H3" 78 | top: "H4" 79 | inner_product_param { 80 | num_output: 2048 81 | } 82 | } 83 | layer { 84 | name: "H4_A" 85 | type: "Sigmoid" 86 | bottom: "H4" 87 | top: "H4" 88 | } 89 | layer { 90 | name: "L" 91 | type: "InnerProduct" 92 | bottom: "H4" 93 | top: "L" 94 | inner_product_param { 95 | num_output: 10000 96 | } 97 | } 98 | layer { 99 | name: "loss" 100 | type: "SoftmaxWithLoss" 101 | bottom: "L" 102 | bottom: "label" 103 | top: "loss" 104 | } 105 | I1208 05:42:11.086781 49481 layer_factory.hpp:77] Creating layer data 106 | I1208 05:42:11.089282 49481 net.cpp:106] Creating Layer data 107 | I1208 05:42:11.089293 49481 net.cpp:411] data -> data 108 | I1208 05:42:11.089318 49481 net.cpp:411] data -> label 109 | I1208 05:42:11.091265 49484 db_lmdb.cpp:38] Opened lmdb /var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb 110 | I1208 05:42:11.102735 49481 data_layer.cpp:41] output data size: 4096,1,1,512 111 | I1208 05:42:11.117261 49481 net.cpp:150] Setting up data 112 | I1208 05:42:11.117303 49481 net.cpp:157] Top shape: 4096 1 1 512 (2097152) 113 | I1208 05:42:11.117310 49481 net.cpp:157] Top shape: 4096 (4096) 114 | I1208 05:42:11.117319 49481 net.cpp:165] Memory required for data: 8404992 115 | I1208 05:42:11.117331 49481 layer_factory.hpp:77] Creating layer H1 116 | I1208 05:42:11.117346 49481 net.cpp:106] Creating Layer H1 117 | I1208 05:42:11.117352 49481 net.cpp:454] H1 <- data 118 | I1208 05:42:11.117363 49481 net.cpp:411] H1 -> H1 119 | I1208 05:42:11.120373 49485 blocking_queue.cpp:50] Waiting for data 120 | I1208 05:42:11.120420 49481 net.cpp:150] Setting up H1 121 | I1208 05:42:11.120434 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 122 | I1208 05:42:11.120437 49481 net.cpp:165] Memory required for data: 41959424 123 | I1208 05:42:11.120451 49481 layer_factory.hpp:77] Creating layer H1_A 124 | I1208 05:42:11.120462 49481 net.cpp:106] Creating Layer H1_A 125 | I1208 05:42:11.120466 49481 net.cpp:454] H1_A <- H1 126 | I1208 05:42:11.120472 49481 net.cpp:397] H1_A -> H1 (in-place) 127 | I1208 05:42:11.213544 49481 net.cpp:150] Setting up H1_A 128 | I1208 05:42:11.213587 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 129 | I1208 05:42:11.213593 49481 net.cpp:165] Memory required for data: 75513856 130 | I1208 05:42:11.213599 49481 layer_factory.hpp:77] Creating layer H2 131 | I1208 05:42:11.213611 49481 net.cpp:106] Creating Layer H2 132 | I1208 05:42:11.213616 49481 net.cpp:454] H2 <- H1 133 | I1208 05:42:11.213626 49481 net.cpp:411] H2 -> H2 134 | I1208 05:42:11.222342 49481 net.cpp:150] Setting up H2 135 | I1208 05:42:11.222368 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 136 | I1208 05:42:11.222371 49481 net.cpp:165] Memory required for data: 109068288 137 | I1208 05:42:11.222383 49481 layer_factory.hpp:77] Creating layer H2_A 138 | I1208 05:42:11.222394 49481 net.cpp:106] Creating Layer H2_A 139 | I1208 05:42:11.222400 49481 net.cpp:454] H2_A <- H2 140 | I1208 05:42:11.222406 49481 net.cpp:397] H2_A -> H2 (in-place) 141 | I1208 05:42:11.222707 49481 net.cpp:150] Setting up H2_A 142 | I1208 05:42:11.222725 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 143 | I1208 05:42:11.222767 49481 net.cpp:165] Memory required for data: 142622720 144 | I1208 05:42:11.222772 49481 layer_factory.hpp:77] Creating layer H3 145 | I1208 05:42:11.222779 49481 net.cpp:106] Creating Layer H3 146 | I1208 
05:42:11.222784 49481 net.cpp:454] H3 <- H2 147 | I1208 05:42:11.222790 49481 net.cpp:411] H3 -> H3 148 | I1208 05:42:11.231565 49481 net.cpp:150] Setting up H3 149 | I1208 05:42:11.231588 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 150 | I1208 05:42:11.231592 49481 net.cpp:165] Memory required for data: 176177152 151 | I1208 05:42:11.231603 49481 layer_factory.hpp:77] Creating layer H3_A 152 | I1208 05:42:11.231611 49481 net.cpp:106] Creating Layer H3_A 153 | I1208 05:42:11.231616 49481 net.cpp:454] H3_A <- H3 154 | I1208 05:42:11.231622 49481 net.cpp:397] H3_A -> H3 (in-place) 155 | I1208 05:42:11.231778 49481 net.cpp:150] Setting up H3_A 156 | I1208 05:42:11.231788 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 157 | I1208 05:42:11.231792 49481 net.cpp:165] Memory required for data: 209731584 158 | I1208 05:42:11.231796 49481 layer_factory.hpp:77] Creating layer H4 159 | I1208 05:42:11.231803 49481 net.cpp:106] Creating Layer H4 160 | I1208 05:42:11.231807 49481 net.cpp:454] H4 <- H3 161 | I1208 05:42:11.231813 49481 net.cpp:411] H4 -> H4 162 | I1208 05:42:11.240564 49481 net.cpp:150] Setting up H4 163 | I1208 05:42:11.240587 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 164 | I1208 05:42:11.240592 49481 net.cpp:165] Memory required for data: 243286016 165 | I1208 05:42:11.240599 49481 layer_factory.hpp:77] Creating layer H4_A 166 | I1208 05:42:11.240609 49481 net.cpp:106] Creating Layer H4_A 167 | I1208 05:42:11.240614 49481 net.cpp:454] H4_A <- H4 168 | I1208 05:42:11.240620 49481 net.cpp:397] H4_A -> H4 (in-place) 169 | I1208 05:42:11.240926 49481 net.cpp:150] Setting up H4_A 170 | I1208 05:42:11.240936 49481 net.cpp:157] Top shape: 4096 2048 (8388608) 171 | I1208 05:42:11.240941 49481 net.cpp:165] Memory required for data: 276840448 172 | I1208 05:42:11.240944 49481 layer_factory.hpp:77] Creating layer L 173 | I1208 05:42:11.240952 49481 net.cpp:106] Creating Layer L 174 | I1208 05:42:11.240957 49481 net.cpp:454] L <- H4 175 | I1208 05:42:11.240963 49481 net.cpp:411] L -> L 176 | I1208 05:42:11.288460 49481 net.cpp:150] Setting up L 177 | I1208 05:42:11.288503 49481 net.cpp:157] Top shape: 4096 10000 (40960000) 178 | I1208 05:42:11.288508 49481 net.cpp:165] Memory required for data: 440680448 179 | I1208 05:42:11.288525 49481 layer_factory.hpp:77] Creating layer loss 180 | I1208 05:42:11.288537 49481 net.cpp:106] Creating Layer loss 181 | I1208 05:42:11.288543 49481 net.cpp:454] loss <- L 182 | I1208 05:42:11.288550 49481 net.cpp:454] loss <- label 183 | I1208 05:42:11.288573 49481 net.cpp:411] loss -> loss 184 | I1208 05:42:11.288590 49481 layer_factory.hpp:77] Creating layer loss 185 | I1208 05:42:11.382125 49481 net.cpp:150] Setting up loss 186 | I1208 05:42:11.382170 49481 net.cpp:157] Top shape: (1) 187 | I1208 05:42:11.382175 49481 net.cpp:160] with loss weight 1 188 | I1208 05:42:11.382203 49481 net.cpp:165] Memory required for data: 440680452 189 | I1208 05:42:11.382210 49481 net.cpp:226] loss needs backward computation. 190 | I1208 05:42:11.382215 49481 net.cpp:226] L needs backward computation. 191 | I1208 05:42:11.382220 49481 net.cpp:226] H4_A needs backward computation. 192 | I1208 05:42:11.382225 49481 net.cpp:226] H4 needs backward computation. 193 | I1208 05:42:11.382230 49481 net.cpp:226] H3_A needs backward computation. 194 | I1208 05:42:11.382233 49481 net.cpp:226] H3 needs backward computation. 195 | I1208 05:42:11.382237 49481 net.cpp:226] H2_A needs backward computation. 196 | I1208 05:42:11.382253 49481 net.cpp:226] H2 needs backward computation. 
197 | I1208 05:42:11.382257 49481 net.cpp:226] H1_A needs backward computation. 198 | I1208 05:42:11.382261 49481 net.cpp:226] H1 needs backward computation. 199 | I1208 05:42:11.382266 49481 net.cpp:228] data does not need backward computation. 200 | I1208 05:42:11.382269 49481 net.cpp:270] This network produces output loss 201 | I1208 05:42:11.382282 49481 net.cpp:283] Network initialization done. 202 | I1208 05:42:11.382318 49481 solver.cpp:60] Solver scaffolding done. 203 | I1208 05:42:11.397910 49481 parallel.cpp:391] GPUs pairs 0:1 204 | I1208 05:42:11.596046 49481 data_layer.cpp:41] output data size: 4096,1,1,512 205 | I1208 05:42:11.620034 49485 blocking_queue.cpp:50] Waiting for data 206 | I1208 05:42:11.635661 49485 blocking_queue.cpp:50] Waiting for data 207 | I1208 05:42:11.649173 49485 blocking_queue.cpp:50] Waiting for data 208 | I1208 05:42:11.664270 49485 blocking_queue.cpp:50] Waiting for data 209 | I1208 05:42:11.681982 49485 blocking_queue.cpp:50] Waiting for data 210 | I1208 05:42:11.697202 49485 blocking_queue.cpp:50] Waiting for data 211 | I1208 05:42:11.711992 49485 blocking_queue.cpp:50] Waiting for data 212 | I1208 05:42:11.728345 49485 blocking_queue.cpp:50] Waiting for data 213 | I1208 05:42:11.743849 49485 blocking_queue.cpp:50] Waiting for data 214 | I1208 05:42:11.758334 49485 blocking_queue.cpp:50] Waiting for data 215 | I1208 05:42:11.775403 49485 blocking_queue.cpp:50] Waiting for data 216 | I1208 05:42:12.035506 49481 parallel.cpp:419] Starting Optimization 217 | I1208 05:42:12.035604 49481 solver.cpp:288] Solving FFN 218 | I1208 05:42:12.035619 49481 solver.cpp:289] Learning Rate Policy: fixed 219 | I1208 05:42:48.475808 49481 solver.cpp:459] Snapshotting to binary proto file _iter_100.caffemodel 220 | I1208 05:42:49.984336 49481 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_100.solverstate 221 | I1208 05:42:50.939898 49481 solver.cpp:326] Optimization Done. 222 | I1208 05:42:50.990241 49481 caffe.cpp:215] Optimization Done. 
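The wall-clock timestamps in this log give a rough samples-per-second figure for the 2-GPU run. A small sketch, assuming Caffe's usual multi-GPU semantics in which each GPU consumes its own batch_size (4096 here) per iteration, so the effective batch is 8192; using the snapshot time as the end marker slightly overstates the training time:

# Rough throughput for the 2-GPU FFN run, from the timestamps above.
iters = 100                 # max_iter in ffn_2GPUs_solver.prototxt
effective_batch = 4096 * 2  # per-GPU batch_size times number of GPUs
seconds = 36.44             # 05:42:12.035 -> 05:42:48.475, from the log above
print("FFN, 2 GPUs: ~%.0f samples/sec" % (iters * effective_batch / seconds))
# -> ~22481 samples/sec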
223 | -------------------------------------------------------------------------------- /caffe/ffn_2GPUs.prototxt: -------------------------------------------------------------------------------- 1 | name: "FFN" 2 | layer { 3 | name: "data" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | data_param: { 11 | batch_size: 4096 12 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 13 | backend: LMDB 14 | } 15 | } 16 | 17 | layer { 18 | name: "H1" 19 | type: "InnerProduct" 20 | bottom: "data" 21 | top: "H1" 22 | inner_product_param { 23 | num_output: 2048 24 | } 25 | } 26 | 27 | layer { 28 | name: "H1_A" 29 | type: "Sigmoid" 30 | bottom: "H1" 31 | top: "H1" 32 | } 33 | 34 | layer { 35 | name: "H2" 36 | type: "InnerProduct" 37 | bottom: "H1" 38 | top: "H2" 39 | inner_product_param { 40 | num_output: 2048 41 | } 42 | } 43 | 44 | layer { 45 | name: "H2_A" 46 | type: "Sigmoid" 47 | bottom: "H2" 48 | top: "H2" 49 | } 50 | 51 | layer { 52 | name: "H3" 53 | type: "InnerProduct" 54 | bottom: "H2" 55 | top: "H3" 56 | inner_product_param { 57 | num_output: 2048 58 | } 59 | } 60 | 61 | layer { 62 | name: "H3_A" 63 | type: "Sigmoid" 64 | bottom: "H3" 65 | top: "H3" 66 | } 67 | 68 | layer { 69 | name: "H4" 70 | type: "InnerProduct" 71 | bottom: "H3" 72 | top: "H4" 73 | inner_product_param { 74 | num_output: 2048 75 | } 76 | } 77 | 78 | layer { 79 | name: "H4_A" 80 | type: "Sigmoid" 81 | bottom: "H4" 82 | top: "H4" 83 | } 84 | 85 | layer { 86 | name: "L" 87 | type: "InnerProduct" 88 | bottom: "H4" 89 | top: "L" 90 | inner_product_param { 91 | num_output: 10000 92 | } 93 | } 94 | 95 | layer { 96 | name: "loss" 97 | type: "SoftmaxWithLoss" 98 | bottom: "L" 99 | bottom: "label" 100 | top: "loss" 101 | } 102 | 103 | -------------------------------------------------------------------------------- /caffe/ffn_2GPUs.sh: -------------------------------------------------------------------------------- 1 | ../../caffe/build/tools/caffe train -solver=./ffn_2GPUs_solver.prototxt -gpu=0,1 >ffn_2GPUs.log 2>&1 2 | 3 | -------------------------------------------------------------------------------- /caffe/ffn_2GPUs_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "./ffn_2GPUs.prototxt" 2 | max_iter: 100 3 | base_lr: 0.001 4 | lr_policy: "fixed" 5 | solver_mode: GPU 6 | -------------------------------------------------------------------------------- /caffe/ffn_4GPUs.log: -------------------------------------------------------------------------------- 1 | I1208 05:45:00.539070 49494 caffe.cpp:184] Using GPUs 0, 1, 2, 3 2 | I1208 05:45:10.508654 49494 solver.cpp:48] Initializing solver from parameters: 3 | base_lr: 0.001 4 | max_iter: 100 5 | lr_policy: "fixed" 6 | solver_mode: GPU 7 | device_id: 0 8 | net: "./ffn_4GPUs.prototxt" 9 | I1208 05:45:10.508736 49494 solver.cpp:91] Creating training net from net file: ./ffn_4GPUs.prototxt 10 | I1208 05:45:10.509438 49494 net.cpp:49] Initializing net from parameters: 11 | name: "FFN" 12 | state { 13 | phase: TRAIN 14 | } 15 | layer { 16 | name: "data" 17 | type: "Data" 18 | top: "data" 19 | top: "label" 20 | include { 21 | phase: TRAIN 22 | } 23 | data_param { 24 | source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb" 25 | batch_size: 2048 26 | backend: LMDB 27 | } 28 | } 29 | layer { 30 | name: "H1" 31 | type: "InnerProduct" 32 | bottom: "data" 33 | top: "H1" 34 | 
inner_product_param { 35 | num_output: 2048 36 | } 37 | } 38 | layer { 39 | name: "H1_A" 40 | type: "Sigmoid" 41 | bottom: "H1" 42 | top: "H1" 43 | } 44 | layer { 45 | name: "H2" 46 | type: "InnerProduct" 47 | bottom: "H1" 48 | top: "H2" 49 | inner_product_param { 50 | num_output: 2048 51 | } 52 | } 53 | layer { 54 | name: "H2_A" 55 | type: "Sigmoid" 56 | bottom: "H2" 57 | top: "H2" 58 | } 59 | layer { 60 | name: "H3" 61 | type: "InnerProduct" 62 | bottom: "H2" 63 | top: "H3" 64 | inner_product_param { 65 | num_output: 2048 66 | } 67 | } 68 | layer { 69 | name: "H3_A" 70 | type: "Sigmoid" 71 | bottom: "H3" 72 | top: "H3" 73 | } 74 | layer { 75 | name: "H4" 76 | type: "InnerProduct" 77 | bottom: "H3" 78 | top: "H4" 79 | inner_product_param { 80 | num_output: 2048 81 | } 82 | } 83 | layer { 84 | name: "H4_A" 85 | type: "Sigmoid" 86 | bottom: "H4" 87 | top: "H4" 88 | } 89 | layer { 90 | name: "L" 91 | type: "InnerProduct" 92 | bottom: "H4" 93 | top: "L" 94 | inner_product_param { 95 | num_output: 10000 96 | } 97 | } 98 | layer { 99 | name: "loss" 100 | type: "SoftmaxWithLoss" 101 | bottom: "L" 102 | bottom: "label" 103 | top: "loss" 104 | } 105 | I1208 05:45:10.509542 49494 layer_factory.hpp:77] Creating layer data 106 | I1208 05:45:10.511235 49494 net.cpp:106] Creating Layer data 107 | I1208 05:45:10.511248 49494 net.cpp:411] data -> data 108 | I1208 05:45:10.511288 49494 net.cpp:411] data -> label 109 | I1208 05:45:10.513157 49496 db_lmdb.cpp:38] Opened lmdb /var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb 110 | I1208 05:45:10.525254 49494 data_layer.cpp:41] output data size: 2048,1,1,512 111 | I1208 05:45:10.532680 49494 net.cpp:150] Setting up data 112 | I1208 05:45:10.532718 49494 net.cpp:157] Top shape: 2048 1 1 512 (1048576) 113 | I1208 05:45:10.532724 49494 net.cpp:157] Top shape: 2048 (2048) 114 | I1208 05:45:10.532728 49494 net.cpp:165] Memory required for data: 4202496 115 | I1208 05:45:10.532737 49494 layer_factory.hpp:77] Creating layer H1 116 | I1208 05:45:10.532749 49494 net.cpp:106] Creating Layer H1 117 | I1208 05:45:10.532754 49494 net.cpp:454] H1 <- data 118 | I1208 05:45:10.532766 49494 net.cpp:411] H1 -> H1 119 | I1208 05:45:10.534867 49494 net.cpp:150] Setting up H1 120 | I1208 05:45:10.534879 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 121 | I1208 05:45:10.534884 49494 net.cpp:165] Memory required for data: 20979712 122 | I1208 05:45:10.534898 49494 layer_factory.hpp:77] Creating layer H1_A 123 | I1208 05:45:10.534907 49494 net.cpp:106] Creating Layer H1_A 124 | I1208 05:45:10.534911 49494 net.cpp:454] H1_A <- H1 125 | I1208 05:45:10.534919 49494 net.cpp:397] H1_A -> H1 (in-place) 126 | I1208 05:45:10.535902 49497 blocking_queue.cpp:50] Waiting for data 127 | I1208 05:45:10.626925 49494 net.cpp:150] Setting up H1_A 128 | I1208 05:45:10.626981 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 129 | I1208 05:45:10.626986 49494 net.cpp:165] Memory required for data: 37756928 130 | I1208 05:45:10.626993 49494 layer_factory.hpp:77] Creating layer H2 131 | I1208 05:45:10.627008 49494 net.cpp:106] Creating Layer H2 132 | I1208 05:45:10.627013 49494 net.cpp:454] H2 <- H1 133 | I1208 05:45:10.627024 49494 net.cpp:411] H2 -> H2 134 | I1208 05:45:10.635015 49494 net.cpp:150] Setting up H2 135 | I1208 05:45:10.635040 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 136 | I1208 05:45:10.635043 49494 net.cpp:165] Memory required for data: 54534144 137 | I1208 05:45:10.635058 49494 layer_factory.hpp:77] Creating layer H2_A 138 | 
I1208 05:45:10.635071 49494 net.cpp:106] Creating Layer H2_A 139 | I1208 05:45:10.635076 49494 net.cpp:454] H2_A <- H2 140 | I1208 05:45:10.635082 49494 net.cpp:397] H2_A -> H2 (in-place) 141 | I1208 05:45:10.635387 49494 net.cpp:150] Setting up H2_A 142 | I1208 05:45:10.635404 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 143 | I1208 05:45:10.635445 49494 net.cpp:165] Memory required for data: 71311360 144 | I1208 05:45:10.635450 49494 layer_factory.hpp:77] Creating layer H3 145 | I1208 05:45:10.635457 49494 net.cpp:106] Creating Layer H3 146 | I1208 05:45:10.635462 49494 net.cpp:454] H3 <- H2 147 | I1208 05:45:10.635468 49494 net.cpp:411] H3 -> H3 148 | I1208 05:45:10.643630 49494 net.cpp:150] Setting up H3 149 | I1208 05:45:10.643656 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 150 | I1208 05:45:10.643661 49494 net.cpp:165] Memory required for data: 88088576 151 | I1208 05:45:10.643671 49494 layer_factory.hpp:77] Creating layer H3_A 152 | I1208 05:45:10.643681 49494 net.cpp:106] Creating Layer H3_A 153 | I1208 05:45:10.643685 49494 net.cpp:454] H3_A <- H3 154 | I1208 05:45:10.643692 49494 net.cpp:397] H3_A -> H3 (in-place) 155 | I1208 05:45:10.643851 49494 net.cpp:150] Setting up H3_A 156 | I1208 05:45:10.643860 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 157 | I1208 05:45:10.643864 49494 net.cpp:165] Memory required for data: 104865792 158 | I1208 05:45:10.643869 49494 layer_factory.hpp:77] Creating layer H4 159 | I1208 05:45:10.643877 49494 net.cpp:106] Creating Layer H4 160 | I1208 05:45:10.643880 49494 net.cpp:454] H4 <- H3 161 | I1208 05:45:10.643887 49494 net.cpp:411] H4 -> H4 162 | I1208 05:45:10.651945 49494 net.cpp:150] Setting up H4 163 | I1208 05:45:10.651967 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 164 | I1208 05:45:10.651971 49494 net.cpp:165] Memory required for data: 121643008 165 | I1208 05:45:10.651979 49494 layer_factory.hpp:77] Creating layer H4_A 166 | I1208 05:45:10.651993 49494 net.cpp:106] Creating Layer H4_A 167 | I1208 05:45:10.651998 49494 net.cpp:454] H4_A <- H4 168 | I1208 05:45:10.652004 49494 net.cpp:397] H4_A -> H4 (in-place) 169 | I1208 05:45:10.652295 49494 net.cpp:150] Setting up H4_A 170 | I1208 05:45:10.652307 49494 net.cpp:157] Top shape: 2048 2048 (4194304) 171 | I1208 05:45:10.652310 49494 net.cpp:165] Memory required for data: 138420224 172 | I1208 05:45:10.652314 49494 layer_factory.hpp:77] Creating layer L 173 | I1208 05:45:10.652323 49494 net.cpp:106] Creating Layer L 174 | I1208 05:45:10.652326 49494 net.cpp:454] L <- H4 175 | I1208 05:45:10.652333 49494 net.cpp:411] L -> L 176 | I1208 05:45:10.695912 49494 net.cpp:150] Setting up L 177 | I1208 05:45:10.695956 49494 net.cpp:157] Top shape: 2048 10000 (20480000) 178 | I1208 05:45:10.695961 49494 net.cpp:165] Memory required for data: 220340224 179 | I1208 05:45:10.695978 49494 layer_factory.hpp:77] Creating layer loss 180 | I1208 05:45:10.695992 49494 net.cpp:106] Creating Layer loss 181 | I1208 05:45:10.695997 49494 net.cpp:454] loss <- L 182 | I1208 05:45:10.696004 49494 net.cpp:454] loss <- label 183 | I1208 05:45:10.696014 49494 net.cpp:411] loss -> loss 184 | I1208 05:45:10.696033 49494 layer_factory.hpp:77] Creating layer loss 185 | I1208 05:45:10.739253 49494 net.cpp:150] Setting up loss 186 | I1208 05:45:10.739298 49494 net.cpp:157] Top shape: (1) 187 | I1208 05:45:10.739303 49494 net.cpp:160] with loss weight 1 188 | I1208 05:45:10.739328 49494 net.cpp:165] Memory required for data: 220340228 189 | I1208 05:45:10.739336 49494 net.cpp:226] loss needs backward 
I1208 05:45:10.739342 49494 net.cpp:226] L needs backward computation.
I1208 05:45:10.739347 49494 net.cpp:226] H4_A needs backward computation.
I1208 05:45:10.739351 49494 net.cpp:226] H4 needs backward computation.
I1208 05:45:10.739356 49494 net.cpp:226] H3_A needs backward computation.
I1208 05:45:10.739359 49494 net.cpp:226] H3 needs backward computation.
I1208 05:45:10.739363 49494 net.cpp:226] H2_A needs backward computation.
I1208 05:45:10.739367 49494 net.cpp:226] H2 needs backward computation.
I1208 05:45:10.739372 49494 net.cpp:226] H1_A needs backward computation.
I1208 05:45:10.739374 49494 net.cpp:226] H1 needs backward computation.
I1208 05:45:10.739380 49494 net.cpp:228] data does not need backward computation.
I1208 05:45:10.739384 49494 net.cpp:270] This network produces output loss
I1208 05:45:10.739399 49494 net.cpp:283] Network initialization done.
I1208 05:45:10.739444 49494 solver.cpp:60] Solver scaffolding done.
I1208 05:45:10.766202 49494 parallel.cpp:391] GPUs pairs 0:1, 2:3, 0:2
I1208 05:45:10.960978 49494 data_layer.cpp:41] output data size: 2048,1,1,512
I1208 05:45:11.484361 49494 data_layer.cpp:41] output data size: 2048,1,1,512
I1208 05:45:11.806061 49494 parallel.cpp:234] GPU 2 does not have p2p access to GPU 0
I1208 05:45:12.012104 49494 data_layer.cpp:41] output data size: 2048,1,1,512
I1208 05:45:12.026998 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.038635 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.050448 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.061439 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.072552 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.082746 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.095509 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.106873 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.119169 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.130293 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.141988 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.153851 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.166868 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.178115 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.189239 49501 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.200922 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.211683 49499 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.222765 49497 blocking_queue.cpp:50] Waiting for data
I1208 05:45:12.420398 49494 parallel.cpp:419] Starting Optimization
I1208 05:45:12.420925 49494 solver.cpp:288] Solving FFN
I1208 05:45:12.420943 49494 solver.cpp:289] Learning Rate Policy: fixed
I1208 05:45:46.138041 49494 solver.cpp:459] Snapshotting to binary proto file _iter_100.caffemodel
I1208 05:45:47.682664 49494 sgd_solver.cpp:269] Snapshotting solver state to binary proto file _iter_100.solverstate
I1208 05:45:48.712003 49494 solver.cpp:326] Optimization Done.
I1208 05:45:48.851454 49494 caffe.cpp:215] Optimization Done.
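Taken together, the log above contains everything needed to estimate effective throughput: the solver runs max_iter: 100 iterations at batch_size: 2048, Caffe's multi-GPU mode scales the effective batch by the number of GPUs (4 here), and the wall time is bracketed by the "Starting Optimization" and first "Optimization Done." lines. A minimal parsing sketch follows; it is illustrative only, the function names are made up, and the per-GPU batch scaling is an assumption about Caffe's data-parallel training:

import re
from datetime import datetime

# Matches glog's "ImmDD hh:mm:ss.ffffff" prefix, e.g. "I1208 05:45:12.420398".
STAMP = re.compile(r'^I(\d{4} \d{2}:\d{2}:\d{2}\.\d{6})')

def glog_time(line):
    # glog omits the year; month/day/time suffice for an elapsed-time delta.
    return datetime.strptime(STAMP.match(line).group(1), '%m%d %H:%M:%S.%f')

def samples_per_sec(log_path, iters=100, batch=2048, ngpus=4):
    start = end = None
    for line in open(log_path):
        if 'Starting Optimization' in line:
            start = glog_time(line)
        elif end is None and 'Optimization Done' in line:
            end = glog_time(line)
    # Assumption: effective batch = per-GPU batch * number of GPUs.
    return iters * batch * ngpus / (end - start).total_seconds()

print(samples_per_sec('ffn_4GPUs.log'))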
--------------------------------------------------------------------------------
/caffe/ffn_4GPUs.prototxt:
--------------------------------------------------------------------------------
name: "FFN"
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  data_param {
    batch_size: 2048
    source: "/var/storage/shared/ipgsp/sys/jobs/application_1447977864059_0183/benchmarks/caffe/fake_data.lmdb"
    backend: LMDB
  }
}

layer {
  name: "H1"
  type: "InnerProduct"
  bottom: "data"
  top: "H1"
  inner_product_param {
    num_output: 2048
  }
}

layer {
  name: "H1_A"
  type: "Sigmoid"
  bottom: "H1"
  top: "H1"
}

layer {
  name: "H2"
  type: "InnerProduct"
  bottom: "H1"
  top: "H2"
  inner_product_param {
    num_output: 2048
  }
}

layer {
  name: "H2_A"
  type: "Sigmoid"
  bottom: "H2"
  top: "H2"
}

layer {
  name: "H3"
  type: "InnerProduct"
  bottom: "H2"
  top: "H3"
  inner_product_param {
    num_output: 2048
  }
}

layer {
  name: "H3_A"
  type: "Sigmoid"
  bottom: "H3"
  top: "H3"
}

layer {
  name: "H4"
  type: "InnerProduct"
  bottom: "H3"
  top: "H4"
  inner_product_param {
    num_output: 2048
  }
}

layer {
  name: "H4_A"
  type: "Sigmoid"
  bottom: "H4"
  top: "H4"
}

layer {
  name: "L"
  type: "InnerProduct"
  bottom: "H4"
  top: "L"
  inner_product_param {
    num_output: 10000
  }
}

layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "L"
  bottom: "label"
  top: "loss"
}

--------------------------------------------------------------------------------
/caffe/ffn_4GPUs.sh:
--------------------------------------------------------------------------------
../../caffe/build/tools/caffe train -solver=./ffn_4GPUs_solver.prototxt -gpu=0,1,2,3 >ffn_4GPUs.log 2>&1

--------------------------------------------------------------------------------
/caffe/ffn_4GPUs_solver.prototxt:
--------------------------------------------------------------------------------
net: "./ffn_4GPUs.prototxt"
max_iter: 100
base_lr: 0.001
lr_policy: "fixed"
solver_mode: GPU
--------------------------------------------------------------------------------
/caffe/ffn_solver.prototxt:
--------------------------------------------------------------------------------
net: "./ffn.prototxt"
max_iter: 100
base_lr: 0.001
lr_policy: "fixed"
solver_mode: GPU
--------------------------------------------------------------------------------
/createData.py:
--------------------------------------------------------------------------------
import numpy as np

featDim = 512
labDim = 10000
totalCount = 256 * 1024

def createFakeData(count):
    features = np.random.randn(count, featDim)
    labels = np.random.randint(0, labDim, size=(count, 1))
    return features, labels

f, l = createFakeData(totalCount)

# Each row: an integer label followed by featDim four-decimal float features.
np.savetxt(r'./data.txt', np.hstack((l, f)), fmt='%d' + ' %.4f' * featDim)

--------------------------------------------------------------------------------
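A quick way to sanity-check the generated data.txt is to reload a few rows and confirm the layout createData.py writes: an integer label in column 0 followed by featDim feature values. A small illustrative sketch, not part of the repo (note that loadtxt's max_rows argument assumes a reasonably recent NumPy):

import numpy as np

featDim = 512
rows = np.loadtxt('./data.txt', max_rows=16)  # max_rows requires NumPy >= 1.16
assert rows.shape == (16, 1 + featDim)
labels = rows[:, 0].astype(int)   # column 0: class index in [0, labDim)
features = rows[:, 1:]            # remaining columns: the feature vector
print(labels[:5], features.mean())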
/keras/ffn.log:
--------------------------------------------------------------------------------
Using gpu device 0: Tesla K40m
Using Theano backend.
1 GPU: 4585.43059456 samples per sec

--------------------------------------------------------------------------------
/keras/ffn.py:
--------------------------------------------------------------------------------
import os
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=gpu,floatX=float32"

import theano
import time
import numpy as np

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD

nruns = 5
bsize = 8192
isize = 512
hsize = 2048
osize = 10000

# fake data: random features and one-hot labels drawn uniformly over osize classes
X = np.random.rand(bsize, isize).astype(np.float32)
y = np.zeros((bsize, osize), dtype=bool)
ind = np.random.randint(0, osize, bsize)
y[np.arange(bsize), ind] = True

# model definition: four sigmoid hidden layers of width hsize, softmax output
model = Sequential()
model.add(Dense(hsize, input_dim=isize))
model.add(Activation('sigmoid'))  # hidden layer 1
model.add(Dense(hsize))
model.add(Activation('sigmoid'))  # hidden layer 2
model.add(Dense(hsize))
model.add(Activation('sigmoid'))  # hidden layer 3
model.add(Dense(hsize))
model.add(Activation('sigmoid'))  # hidden layer 4
model.add(Dense(osize))
model.add(Activation('softmax'))  # output layer
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.1))

# start training and measuring
start = time.time()
for i in range(nruns):
    model.train_on_batch(X, y)
end = time.time()
print('1 GPU: {0} samples per sec'.format(nruns * bsize / (end - start)))

--------------------------------------------------------------------------------
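One caveat when reading ffn.log: with the Theano backend the training function is compiled lazily, so the first train_on_batch call inside the timed loop in ffn.py also pays a one-time graph-compilation cost. A possible refinement, reusing the model, X, y, nruns, and bsize defined in ffn.py above (a sketch, not how the logged number was produced), is to run one warm-up batch before starting the clock:

model.train_on_batch(X, y)  # warm-up: triggers Theano graph compilation

start = time.time()
for i in range(nruns):
    model.train_on_batch(X, y)
end = time.time()
print('1 GPU (steady state): {0} samples per sec'.format(nruns * bsize / (end - start)))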