├── LICENSE.md
├── README.md
├── benchmark_caffe.py
├── benchmark_keras.py
├── benchmark_lasagne.py
├── benchmark_mxnet.py
├── benchmark_neon.py
├── benchmark_tensorflow.py
├── images
│   ├── img0000.tiff
│   └── list.txt
├── parse_results.py
├── results
│   ├── gtx_1080
│   │   ├── INFO.md
│   │   ├── benchmark_caffe.dmon
│   │   ├── benchmark_caffe.output
│   │   ├── benchmark_keras_tensorflow.dmon
│   │   ├── benchmark_keras_tensorflow.output
│   │   ├── benchmark_keras_theano.dmon
│   │   ├── benchmark_keras_theano.output
│   │   └── benchmark_neon.output
│   ├── k520
│   │   ├── INFO.md
│   │   ├── benchmark_caffe.dmon
│   │   ├── benchmark_caffe.output
│   │   ├── benchmark_keras_tensorflow.dmon
│   │   ├── benchmark_keras_tensorflow.output
│   │   ├── benchmark_keras_theano.dmon
│   │   ├── benchmark_keras_theano.output
│   │   ├── benchmark_mxnet.dmon
│   │   ├── benchmark_mxnet.output
│   │   ├── benchmark_tensorflow.dmon
│   │   ├── benchmark_tensorflow.output
│   │   ├── benchmark_tensorflow_slim.dmon
│   │   └── benchmark_tensorflow_slim.output
│   ├── k80
│   │   ├── INFO.md
│   │   ├── benchmark_caffe.output
│   │   ├── benchmark_keras_tensorflow.output
│   │   ├── benchmark_keras_theano.output
│   │   ├── benchmark_mxnet.output
│   │   ├── benchmark_neon.output
│   │   ├── benchmark_tensorflow.output
│   │   └── benchmark_tensorflow_slim.output
│   ├── maxwell_titan_x
│   │   ├── INFO.md
│   │   ├── benchmark_caffe.output
│   │   ├── benchmark_keras_tensorflow.output
│   │   ├── benchmark_keras_theano.output
│   │   ├── benchmark_mxnet.output
│   │   ├── benchmark_neon.output
│   │   ├── benchmark_tensorflow.output
│   │   └── benchmark_tensorflow_slim.output
│   └── v100
│       ├── INFO.md
│       ├── benchmark_keras_tensorflow.dmon
│       ├── benchmark_keras_tensorflow.output
│       ├── benchmark_mxnet.dmon
│       ├── benchmark_mxnet.output
│       ├── benchmark_tensorflow.dmon
│       ├── benchmark_tensorflow.output
│       ├── benchmark_tensorflow_slim.dmon
│       └── benchmark_tensorflow_slim.output
├── run.sh
├── vgg16_deploy.prototxt
├── vgg16_solver.prototxt
└── vgg16_train_val.prototxt
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Alex Izvorski
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Deep Learning Benchmark (VGG16)
2 | 
3 | This mini-benchmark compares the speed of several deep learning frameworks on the VGG-16 network architecture.
4 | 
5 | *Contributions welcome!*
6 | 
7 | ## Results
8 | 
9 | The table contains training times in milliseconds per minibatch for the same VGG16 network in each framework. (Lower is faster.) Minibatch size is 16. The time is for a complete SGD step including parameter updates, not just the forward+backward time. (The measurement loop the benchmark scripts share is sketched just below this README.)
10 | 
11 | | Framework | [V100](results/v100/INFO.md) | [GTX 1080](results/gtx_1080/INFO.md) | [Maxwell Titan X](results/maxwell_titan_x/INFO.md) | [K80](results/k80/INFO.md) | [K520](results/k520/INFO.md) |
12 | |:---|:---|:---|:---|:---|:---|
13 | | MXNet | 81.22 | N/A | 324.63 | 1247.47 | OOM |
14 | | TensorFlow | 88.03 | N/A | 332.15 | 1057.28 | 2290.58 |
15 | | TensorFlow (slim) | 176.14 | N/A | 370.89 | 1126.70 | 2488.51 |
16 | | Keras (TensorFlow) | 101.82 | 287.85 | 359.89 | 1020.97 | OOM |
17 | | Keras (Theano) | N/A | 409.95 | 317.30 | 1141.79 | 2445.22 |
18 | | Neon | N/A | 164.53 | 207.41 | N/A | N/A |
19 | | Caffe | N/A | 244.44 | 311.06 | 787.81 | OOM |
20 | | Torch (1) | N/A | 232.55 | 273.54 | N/A | N/A |
21 | 
22 | 
23 | N/A - test not run
24 | 
25 | OOM - test ran but failed due to running out of memory (on the K520, which has only 4GB of memory)
26 | 
27 | (1) The Torch benchmark is from https://github.com/jcjohnson/cnn-benchmarks (it is not included in this repo).
28 | 
29 | ## Running
30 | 
31 | 
32 | ```
33 | bash run.sh
34 | ```
35 | 
36 | Note: this will back up and then restore your ~/.keras/keras.json.
37 | 
38 | Caffe should be built inside caffe/ in the current directory (or a symlink).
39 | 
40 | Neon should be built anywhere and (just for the Neon test) the built Neon virtualenv should be activated.
41 | 
42 | 
43 | 
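All of the per-framework scripts in this repo follow the same measurement pattern: one untimed warmup minibatch, then 100 timed full SGD steps on a fixed 16-image batch, reporting the mean wall-clock time per step. A minimal framework-agnostic sketch of that loop (the `train_step` callable is an assumption of this sketch; it stands in for each framework's train-on-batch call):

```python
import time

def benchmark(train_step, n_iterations=100):
    # train_step() is assumed to run one complete SGD step (forward,
    # backward, and parameter update) on a fixed minibatch, like
    # model.train_on_batch in the Keras script below
    train_step()  # warmup; the first step is typically much slower

    t0 = time.time()
    for i in range(n_iterations):
        tstart = time.time()
        train_step()
        print("Iteration: %d train on batch time: %7.3f ms." % (i, (time.time() - tstart) * 1000))
    t1 = time.time()
    return (t1 - t0) * 1000 / n_iterations  # mean milliseconds per minibatch
```

At 16 images per minibatch, milliseconds per step convert to throughput as 16 / (t / 1000) images per second — for example, the GTX 1080 Keras (TensorFlow) figure of 287.85 ms corresponds to roughly 56 images/s.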
--------------------------------------------------------------------------------
/benchmark_caffe.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import shutil
3 | 
4 | # replicate the sample image so that all 16 entries in images/list.txt exist
5 | for n in range(1, 16):
6 |     shutil.copy("images/img0000.tiff", "images/img%04d.tiff" % (n))
7 | 
8 | CAFFE = "caffe/build/tools/caffe"
9 | 
10 | # this benchmarks forward+backward rather than a full SGD step; it turns out there is quite a significant difference between the two
11 | # subprocess.call(CAFFE + " time -model vgg16_deploy.prototxt -iterations 100 -gpu 0", shell=True)
12 | 
13 | subprocess.call(CAFFE + " train --solver vgg16_solver.prototxt -gpu 0", shell=True)
14 | 
--------------------------------------------------------------------------------
/benchmark_keras.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import os.path
4 | import sys
5 | 
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--backend")
8 | args = parser.parse_args()
9 | 
10 | # Keras reads KERAS_BACKEND when it is first imported, so set it before the import
11 | os.environ['KERAS_BACKEND'] = args.backend
12 | 
13 | import keras.backend
14 | 
15 | if args.backend == "theano":
16 |     keras.backend.set_image_dim_ordering("th")
17 | elif args.backend == "tensorflow":
18 |     keras.backend.set_image_dim_ordering("tf")
19 | else:
20 |     print("Backend must be theano or tensorflow")
21 |     sys.exit(1)
22 | 
23 | keras.backend.set_floatx('float32')
24 | keras.backend.set_epsilon(1e-07)
25 | 
26 | import numpy as np
27 | import time
28 | 
29 | from keras.optimizers import SGD
30 | from keras.applications.vgg16 import VGG16
31 | 
32 | width = 224
33 | height = 224
34 | batch_size = 16
35 | 
36 | model = VGG16(include_top=True, weights=None)
37 | sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
38 | model.compile(loss='categorical_crossentropy', optimizer=sgd)  # loss='hinge'
39 | 
40 | # the input layout depends on the backend: channels-first for Theano, channels-last for TensorFlow
41 | if args.backend == "theano":
42 |     x = np.zeros((batch_size, 3, width, height), dtype=np.float32)
43 | elif args.backend == "tensorflow":
44 |     x = np.zeros((batch_size, width, height, 3), dtype=np.float32)
45 | else:
46 |     print("Backend must be theano or tensorflow")
47 |     sys.exit(1)
48 | y = np.zeros((batch_size, 1000), dtype=np.float32)
49 | 
50 | # warmup
51 | model.train_on_batch(x, y)
52 | 
53 | t0 = time.time()
54 | n = 0
55 | while n < 100:
56 |     tstart = time.time()
57 |     model.train_on_batch(x, y)
58 |     tend = time.time()
59 |     print("Iteration: %d train on batch time: %7.3f ms." % (n, (tend - tstart) * 1000))
60 |     n += 1
61 | t1 = time.time()
62 | 
63 | print("Batch size: %d" % (batch_size))
64 | print("Iterations: %d" % (n))
65 | print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / n))
66 | 
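run.sh itself is not reproduced in this dump. For the two Keras runs it needs to switch the backend per invocation and preserve ~/.keras/keras.json, as the README notes. A minimal Python sketch of an equivalent driver (the output file names follow this repo's naming convention, but the exact steps of run.sh are an assumption):

```python
import os
import shutil
import subprocess

keras_json = os.path.expanduser("~/.keras/keras.json")
backup = keras_json + ".bak"

# back up the Keras config, since switching backends rewrites it
# (assumed to mirror the backup/restore step run.sh performs, per the README)
if os.path.exists(keras_json):
    shutil.copy(keras_json, backup)
try:
    for backend in ("tensorflow", "theano"):
        with open("results/benchmark_keras_%s.output" % backend, "w") as f:
            subprocess.call(["python", "benchmark_keras.py", "--backend", backend],
                            stdout=f, stderr=subprocess.STDOUT)
finally:
    if os.path.exists(backup):
        shutil.move(backup, keras_json)
```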
--------------------------------------------------------------------------------
/benchmark_lasagne.py:
--------------------------------------------------------------------------------
1 | """
2 | Adapted from: https://github.com/Lasagne/Recipes/blob/master/modelzoo/vgg16.py
3 | """
4 | 
5 | # VGG-16, 16-layer model from the paper:
6 | # "Very Deep Convolutional Networks for Large-Scale Image Recognition"
7 | # Original source: https://gist.github.com/ksimonyan/211839e770f7b538e2d8
8 | # License: see http://www.robots.ox.ac.uk/~vgg/research/very_deep/
9 | 
10 | # Download pretrained weights from:
11 | # https://s3.amazonaws.com/lasagne/recipes/pretrained/imagenet/vgg16.pkl
12 | 
13 | from lasagne.layers import InputLayer
14 | from lasagne.layers import DenseLayer
15 | from lasagne.layers import NonlinearityLayer
16 | from lasagne.layers import DropoutLayer
17 | from lasagne.layers import Pool2DLayer as PoolLayer
18 | from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
19 | from lasagne.nonlinearities import softmax
20 | 
21 | 
22 | def build_model(input_var):
23 |     net = {}
24 |     net['input'] = InputLayer((None, 3, 224, 224), input_var=input_var)
25 |     net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=1, flip_filters=False)
26 |     net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad=1, flip_filters=False)
27 |     net['pool1'] = PoolLayer(net['conv1_2'], 2)
28 |     net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1, flip_filters=False)
29 |     net['conv2_2'] = ConvLayer(net['conv2_1'], 128, 3, pad=1, flip_filters=False)
30 |     net['pool2'] = PoolLayer(net['conv2_2'], 2)
31 |     net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad=1, flip_filters=False)
32 |     net['conv3_2'] = ConvLayer(net['conv3_1'], 256, 3, pad=1, flip_filters=False)
33 |     net['conv3_3'] = ConvLayer(net['conv3_2'], 256, 3, pad=1, flip_filters=False)
34 |     net['pool3'] = PoolLayer(net['conv3_3'], 2)
35 |     net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad=1, flip_filters=False)
36 |     net['conv4_2'] = ConvLayer(net['conv4_1'], 512, 3, pad=1, flip_filters=False)
37 |     net['conv4_3'] = ConvLayer(net['conv4_2'], 512, 3, pad=1, flip_filters=False)
38 |     net['pool4'] = PoolLayer(net['conv4_3'], 2)
39 |     net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad=1, flip_filters=False)
40 |     net['conv5_2'] = ConvLayer(net['conv5_1'], 512, 3, pad=1, flip_filters=False)
41 |     net['conv5_3'] = ConvLayer(net['conv5_2'], 512, 3, pad=1, flip_filters=False)
42 |     net['pool5'] = PoolLayer(net['conv5_3'], 2)
43 |     net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
44 |     net['fc6_dropout'] = DropoutLayer(net['fc6'], p=0.5)
45 |     net['fc7'] = DenseLayer(net['fc6_dropout'], num_units=4096)
46 |     net['fc7_dropout'] = DropoutLayer(net['fc7'], p=0.5)
47 |     net['fc8'] = DenseLayer(net['fc7_dropout'], num_units=1000, nonlinearity=None)
48 |     net['prob'] = NonlinearityLayer(net['fc8'], softmax)
49 | 
50 |     return net
51 | 
52 | import time
53 | import numpy as np
54 | import theano
55 | import theano.tensor as T
56 | import lasagne
57 | 
58 | input_var = T.tensor4('inputs')
59 | target_var = T.ivector('targets')
60 | 
61 | network = build_model(input_var)
62 | 
63 | prediction = lasagne.layers.get_output(network['prob'])
64 | loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
65 | loss = loss.mean()
66 | 
67 | params = lasagne.layers.get_all_params(network['prob'], trainable=True)
68 | updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)
69 | 
70 | train_fn = theano.function([input_var, target_var], loss, updates=updates)
71 | 
72 | width = 224
73 | height = 224
74 | batch_size = 16
75 | 
76 | x = np.zeros((batch_size, 3, width, height), dtype=np.float32)
77 | y = np.zeros((batch_size), dtype=np.int32)
78 | 
79 | # warmup
80 | train_fn(x, y)
81 | 
82 | t0 = time.time()
83 | n = 0
84 | while n < 100:
85 |     tstart = time.time()
86 |     train_fn(x, y)
87 |     tend = time.time()
88 |     print "Iteration: %d train on batch time: %7.3f ms." % (n, (tend - tstart) * 1000)
89 |     n += 1
90 | t1 = time.time()
91 | 
92 | print "Batch size: %d" % (batch_size)
93 | print "Iterations: %d" % (n)
94 | print "Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / n)
95 | 
--------------------------------------------------------------------------------
/benchmark_mxnet.py:
--------------------------------------------------------------------------------
1 | import mxnet as mx
2 | import numpy as np
3 | import time
4 | 
5 | 
6 | def vgg16_symbol():
7 |     data = mx.sym.Variable('data')
8 | 
9 |     # conv1
10 |     conv1_1 = mx.sym.Convolution(data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1")
11 |     relu1_1 = mx.sym.Activation(data=conv1_1, act_type="relu", name="relu1_1")
12 |     conv1_2 = mx.sym.Convolution(data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2")
13 |     relu1_2 = mx.sym.Activation(data=conv1_2, act_type="relu", name="relu1_2")
14 |     pool1 = mx.sym.Pooling(data=relu1_2, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool1")
15 | 
16 |     # conv2
17 |     conv2_1 = mx.sym.Convolution(data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1")
18 |     relu2_1 = mx.sym.Activation(data=conv2_1, act_type="relu", name="relu2_1")
19 |     conv2_2 = mx.sym.Convolution(data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2")
20 |     relu2_2 = mx.sym.Activation(data=conv2_2, act_type="relu", name="relu2_2")
21 |     pool2 = mx.sym.Pooling(data=relu2_2, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool2")
22 | 
23 |     # conv3
24 |     conv3_1 = mx.sym.Convolution(data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1")
25 |     relu3_1 = mx.sym.Activation(data=conv3_1, act_type="relu", name="relu3_1")
26 |     conv3_2 = mx.sym.Convolution(data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2")
27 |     relu3_2 = mx.sym.Activation(data=conv3_2, act_type="relu", name="relu3_2")
28 |     conv3_3 = mx.sym.Convolution(data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3")
29 |     relu3_3 = mx.sym.Activation(data=conv3_3, act_type="relu", name="relu3_3")
30 |     pool3 = mx.sym.Pooling(data=relu3_3, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool3")
name="pool3") 31 | 32 | # conv4 33 | conv4_1 = mx.sym.Convolution(data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") 34 | relu4_1 = mx.sym.Activation(data=conv4_1, act_type="relu", name="relu4_1") 35 | conv4_2 = mx.sym.Convolution(data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") 36 | relu4_2 = mx.sym.Activation(data=conv4_2, act_type="relu", name="relu4_2") 37 | conv4_3 = mx.sym.Convolution(data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") 38 | relu4_3 = mx.sym.Activation(data=conv4_3, act_type="relu", name="relu4_3") 39 | pool4 = mx.sym.Pooling(data=relu4_3, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool4") 40 | 41 | # conv5 42 | conv5_1 = mx.sym.Convolution(data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") 43 | relu5_1 = mx.sym.Activation(data=conv5_1, act_type="relu", name="relu5_1") 44 | conv5_2 = mx.sym.Convolution(data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") 45 | relu5_2 = mx.sym.Activation(data=conv5_2, act_type="relu", name="relu5_2") 46 | conv5_3 = mx.sym.Convolution(data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") 47 | relu5_3 = mx.sym.Activation(data=conv5_3, act_type="relu", name="relu5_3") 48 | pool5 = mx.sym.Pooling(data=relu5_3, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool5") 49 | 50 | # fc6 51 | flat6 = mx.sym.Flatten(data=pool5, name="flat6") 52 | fc6 = mx.sym.FullyConnected(data=flat6, num_hidden=4096, name="fc6") 53 | relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6") 54 | drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6") 55 | 56 | # fc7 57 | fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7") 58 | relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7") 59 | drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7") 60 | 61 | # fc8 62 | fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=1000, name="fc8") 63 | softmax = mx.sym.SoftmaxOutput(data=fc8, name="softmax") 64 | 65 | return softmax 66 | 67 | 68 | batch_size = 16 69 | nb_epoch = 400 70 | 71 | imgs = np.random.uniform(0, 1, (batch_size * 3 * 224 * 224)) 72 | imgs = imgs.reshape((batch_size, 3, 224, 224)) 73 | labels = np.random.randint(0,1000, batch_size) 74 | 75 | train_iter = mx.io.NDArrayIter(data=imgs.astype(np.float32), 76 | label=labels.astype(np.float32), 77 | batch_size=batch_size) 78 | 79 | vgg16 = vgg16_symbol() 80 | 81 | model = mx.model.FeedForward( 82 | ctx=mx.gpu(), 83 | symbol=vgg16, 84 | num_epoch=nb_epoch, 85 | learning_rate=0.01, 86 | momentum=0.9, 87 | wd=0.00001, 88 | initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), 89 | ) 90 | 91 | model.fit(X=train_iter) 92 | print("Start") 93 | t0 = time.time() 94 | model.fit(X=train_iter) 95 | t1 = time.time() 96 | print("Batch size: %d" % (batch_size)) 97 | print("Iterations: %d" % (nb_epoch)) 98 | print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / nb_epoch)) 99 | -------------------------------------------------------------------------------- /benchmark_neon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ----------------------------------------------------------------------------
16 | """
17 | VGG_A Benchmark
18 | https://github.com/soumith/convnet-benchmarks
19 | 
20 | ./vgg_a.py
21 | ./vgg_a.py -d f16
22 | """
23 | 
24 | from neon import NervanaObject
25 | from neon.util.argparser import NeonArgparser
26 | from neon.initializers import Constant, GlorotUniform, Xavier
27 | from neon.layers import Conv, Dropout, Pooling, GeneralizedCost, Affine
28 | from neon.optimizers import GradientDescentMomentum, MultiOptimizer, Schedule
29 | from neon.transforms import Rectlin, Softmax, CrossEntropyMulti
30 | from neon.models import Model
31 | from neon.data import ArrayIterator
32 | from neon.callbacks.callbacks import Callbacks
33 | 
34 | import numpy as np
35 | parser = NeonArgparser(__doc__)
36 | args = parser.parse_args()
37 | 
38 | NervanaObject.be.bsz = 16
39 | NervanaObject.be.enable_winograd = 4
40 | 
41 | # setup data provider
42 | X_train = np.random.uniform(-1, 1, (16, 3 * 224 * 224))
43 | y_train = np.random.randint(0, 999, (16, 1000))
44 | train = ArrayIterator(X_train, y_train, nclass=1000, lshape=(3, 224, 224))
45 | 
46 | # layers = [Conv((3, 3, 64), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
47 | #           Pooling(2, strides=2),
48 | #           Conv((3, 3, 128), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
49 | #           Pooling(2, strides=2),
50 | #           Conv((3, 3, 256), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
51 | #           Conv((3, 3, 256), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
52 | #           Pooling(2, strides=2),
53 | #           Conv((3, 3, 512), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
54 | #           Conv((3, 3, 512), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
55 | #           Pooling(2, strides=2),
56 | #           Conv((3, 3, 512), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
57 | #           Conv((3, 3, 512), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
58 | #           Pooling(2, strides=2),
59 | #           Affine(nout=4096, init=Gaussian(scale=0.01), activation=Rectlin()),
60 | #           Affine(nout=4096, init=Gaussian(scale=0.01), activation=Rectlin()),
61 | #           Affine(nout=1000, init=Gaussian(scale=0.01), activation=Softmax())]
62 | # model = Model(layers=layers)
63 | 
64 | """
65 | Modified from https://github.com/NervanaSystems/ModelZoo/blob/master/ImageClassification/ILSVRC2012/VGG/vgg_neon.py
66 | """
67 | 
68 | init1 = Xavier(local=True)
69 | initfc = GlorotUniform()
70 | 
71 | relu = Rectlin()
72 | conv_params = {'init': init1,
73 |                'strides': 1,
74 |                'padding': 1,
75 |                'bias': Constant(0),
76 |                'activation': relu}
77 | 
78 | # Set up the model layers
79 | layers = []
80 | for nofm in [64, 128, 256, 512, 512]:
81 |     layers.append(Conv((3, 3, nofm), **conv_params))
82 |     layers.append(Conv((3, 3, nofm), **conv_params))
83 |     if nofm > 128:
84 |         layers.append(Conv((3, 3, nofm), **conv_params))
85 |         # if args.vgg_version == 'E':
86 |         #     layers.append(Conv((3, 3, nofm), **conv_params))
87 |     layers.append(Pooling(2, strides=2))
88 | 
89 | layers.append(Affine(nout=4096, init=initfc, bias=Constant(0), activation=relu))
90 | layers.append(Dropout(keep=0.5))
91 | layers.append(Affine(nout=4096, init=initfc, bias=Constant(0), activation=relu))
92 | layers.append(Dropout(keep=0.5))
93 | layers.append(Affine(nout=1000, init=initfc, bias=Constant(0), activation=Softmax()))
94 | 
95 | model = Model(layers=layers)
96 | 
97 | weight_sched = Schedule([22, 44, 65], (1 / 250.)**(1 / 3.))
98 | opt_gdm = GradientDescentMomentum(0.01, 0.0, wdecay=0.0005, schedule=weight_sched)
99 | opt = MultiOptimizer({'default': opt_gdm})
100 | cost = GeneralizedCost(costfunc=CrossEntropyMulti())
101 | callbacks = Callbacks(model)
102 | 
103 | import time
104 | 
105 | num_epochs = 100
106 | t0 = time.time()
107 | # model.benchmark(train, cost=cost, optimizer=opt, niterations=100, nskip=1)
108 | model.fit(train, cost=cost, optimizer=opt, num_epochs=num_epochs, callbacks=callbacks)
109 | t1 = time.time()
110 | 
111 | print("Batch size: %d" % (NervanaObject.be.bsz))
112 | print("Iterations: %d" % (num_epochs))
113 | print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / num_epochs))
114 | 
115 | 
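Since X_train/y_train above hold exactly one 16-image minibatch, each neon "epoch" is a single SGD iteration, so num_epochs doubles as the iteration count and the mean printed at the end is directly comparable to the per-minibatch times from the other scripts. A quick sanity check of that assumption (nbatches is assumed here to be the attribute neon's data iterators expose for this):

```python
# one batch per epoch means epochs == SGD iterations in this benchmark
print("minibatches per epoch: %d" % train.nbatches)  # expected: 1
```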
--------------------------------------------------------------------------------
/benchmark_tensorflow.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow.contrib.slim as slim
3 | import numpy as np
4 | import time
5 | import argparse
6 | 
7 | 
8 | def vgg16(inputs, num_classes, batch_size):
9 |     with slim.arg_scope([slim.conv2d, slim.fully_connected],
10 |                         activation_fn=tf.nn.relu,
11 |                         weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):
12 |         net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], padding="SAME", scope='conv1')
13 |         net = slim.max_pool2d(net, [2, 2], scope='pool1')
14 |         net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], padding="SAME", scope='conv2')
15 |         net = slim.max_pool2d(net, [2, 2], scope='pool2')
16 |         net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], padding="SAME", scope='conv3')
17 |         net = slim.max_pool2d(net, [2, 2], scope='pool3')
18 |         net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], padding="SAME", scope='conv4')
19 |         net = slim.max_pool2d(net, [2, 2], scope='pool4')
20 |         net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], padding="SAME", scope='conv5')
21 |         net = slim.max_pool2d(net, [2, 2], scope='pool5')
22 |         net = tf.reshape(net, (batch_size, 7 * 7 * 512))
23 |         net = slim.fully_connected(net, 4096, scope='fc6')
24 |         net = slim.dropout(net, 0.5, scope='dropout6')
25 |         net = slim.fully_connected(net, 4096, scope='fc7')
26 |         net = slim.dropout(net, 0.5, scope='dropout7')
27 |         net = slim.fully_connected(net, 1000, activation_fn=None, scope='fc8')
28 |     return net
29 | 
30 | 
31 | def train_slim(batch_size, height, width, num_classes, learning_rate):
32 |     """Built-in slim training schedule"""
33 | 
34 |     # number of iterations
35 |     n = 100
36 | 
37 |     logdir = None  # Don't store checkpoints
38 | 
39 |     with tf.Session():
40 |         train_inputs = tf.random_uniform((batch_size, height, width, 3))
41 |         labels = tf.one_hot(np.arange(batch_size), on_value=1.0, off_value=0.0, depth=num_classes)
42 | 
43 |         predictions = vgg16(train_inputs, num_classes, batch_size)
44 |         loss = slim.losses.softmax_cross_entropy(predictions, labels)
45 | 
46 |         optimizer = tf.train.GradientDescentOptimizer(learning_rate)
47 | 
48 |         train_op = slim.learning.create_train_op(loss, optimizer)
49 | 
50 |         t0 = time.time()
51 |         slim.learning.train(
52 |             train_op,
53 |             logdir,
54 |             number_of_steps=n,
55 |             save_summaries_secs=3000,
56 |             save_interval_secs=6000)
57 |         t1 = time.time()
58 | 
59 |         print("Batch size: %d" % (batch_size))
60 |         print("Iterations: %d" % (n))
61 |         print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / n))
62 | 
63 | 
64 | def train_pure_tf(batch_size, height, width, num_classes, learning_rate):
65 |     """pure tensorflow training schedule, possibly with less overhead than slim"""
66 | 
67 |     # number of iterations
68 |     n = 100
69 | 
70 |     with tf.Graph().as_default(), tf.device('/gpu:0'):
71 | 
72 |         train_inputs = tf.random_uniform((batch_size, height, width, 3))
73 |         labels = tf.one_hot(np.arange(batch_size), on_value=1.0, off_value=0.0, depth=num_classes)
74 | 
75 |         # Predictions
76 |         predictions = vgg16(train_inputs, num_classes, batch_size)
77 |         # Loss function
78 |         loss = slim.losses.softmax_cross_entropy(predictions, labels)
79 |         # Optimizer
80 |         opt = tf.train.GradientDescentOptimizer(learning_rate)
81 | 
82 |         # Calculate the gradients for the batch of data
83 |         grads = opt.compute_gradients(loss)
84 | 
85 |         # Apply the gradients to adjust the shared variables.
86 |         apply_gradient_op = opt.apply_gradients(grads)
87 | 
88 |         # Run a session
89 |         with tf.Session() as sess:
90 |             # Build an initialization operation to run below.
91 |             init = tf.initialize_all_variables()
92 |             sess.run(init)
93 | 
94 |             # warmup run (generally the first run is much slower than the others)
95 |             sess.run([apply_gradient_op])
96 | 
97 |             t0 = time.time()
98 |             for i in range(n):
99 |                 tstart = time.time()
100 |                 sess.run([apply_gradient_op])
101 |                 tend = time.time()
102 |                 print("Iteration: %d train on batch time: %7.3f ms." % (i, (tend - tstart) * 1000))
103 |             t1 = time.time()
104 | 
105 |             print("Batch size: %d" % (batch_size))
106 |             print("Iterations: %d" % (n))
107 |             print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / n))
108 | 
109 | 
110 | if __name__ == '__main__':
111 | 
112 |     parser = argparse.ArgumentParser()
113 |     parser.add_argument("--train_schedule", default="pure_tf")
114 |     parser.add_argument("--batch_size", type=int, default=16)
115 |     parser.add_argument("--height", type=int, default=224)
116 |     parser.add_argument("--width", type=int, default=224)
117 |     parser.add_argument("--num_classes", type=int, default=1000)
118 |     parser.add_argument("--learning_rate", type=float, default=0.1)
119 |     args = parser.parse_args()
120 | 
121 |     if args.train_schedule == "slim":
122 |         train_slim(args.batch_size, args.height, args.width, args.num_classes, args.learning_rate)
123 |     elif args.train_schedule == "pure_tf":
124 |         train_pure_tf(args.batch_size, args.height, args.width, args.num_classes, args.learning_rate)
125 |     else:
126 |         print("Train schedule must be slim or pure_tf")
127 | 
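One caveat when comparing the TensorFlow numbers against the other frameworks: both schedules above use plain gradient descent (learning rate 0.1, no momentum), while the Keras, Lasagne and MXNet scripts use SGD with momentum 0.9. A closer apples-to-apples configuration would be a one-line optimizer swap — a sketch, with everything else exactly as in train_pure_tf above:

```python
# drop-in replacement for tf.train.GradientDescentOptimizer(learning_rate),
# matching the momentum-0.9 SGD used by the Keras/Lasagne/MXNet scripts
opt = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
grads = opt.compute_gradients(loss)
apply_gradient_op = opt.apply_gradients(grads)
```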
--------------------------------------------------------------------------------
/images/img0000.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aizvorski/vgg-benchmarks/4f551e19c89a83df0f9e0e6156d53e6e37463a8e/images/img0000.tiff
--------------------------------------------------------------------------------
/images/list.txt:
--------------------------------------------------------------------------------
1 | images/img0000.tiff 0
2 | images/img0001.tiff 0
3 | images/img0002.tiff 0
4 | images/img0003.tiff 0
5 | images/img0004.tiff 0
6 | images/img0005.tiff 0
7 | images/img0006.tiff 0
8 | images/img0007.tiff 0
9 | images/img0008.tiff 0
10 | images/img0009.tiff 0
11 | images/img0010.tiff 0
12 | images/img0011.tiff 0
13 | images/img0012.tiff 0
14 | images/img0013.tiff 0
15 | images/img0014.tiff 0
16 | images/img0015.tiff 0
17 | 
--------------------------------------------------------------------------------
/parse_results.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import numpy as np
3 | import re
4 | 
5 | def parse_output(output_file):
6 |     """ Utility to parse the "Time per iteration" summary line printed by the keras, neon, mxnet and tensorflow benchmarks """
7 | 
8 |     with open("./results/%s" % output_file, "r") as f:
9 |         list_lines = f.readlines()
10 |         for line in list_lines:
11 |             if "Time per iteration" in line:
12 |                 result = line.split(" ")[-2]
13 |                 return float(result)
14 | 
15 | 
16 | def parse_output_per_iter(output_file):
17 |     """ Utility to compute the mean per-iteration time from the "train on batch time" lines, skipping the first 20 iterations as warmup """
18 |     with open("./results/%s" % output_file, "r") as fh:
19 |         results = []
20 |         for line in fh:
21 |             m = re.search(r'train on batch time:\s*(\d+\.\d+) ms', line)
22 |             if m:
23 |                 r = m.group(1)
24 |                 results.append(float(r))
25 |     return np.mean(results[20:])
26 | 
27 | def parse_output_caffe(output_file):
28 |     """ Utility to parse the output of caffe
29 | 
30 |     Count the time as the clock difference between two consecutive appearances of solver.cpp:228
31 | 
32 |     """
33 | 
34 |     # timestamp format in caffe's log output
35 |     FMT = '%H:%M:%S.%f'
36 | 
37 |     list_time = []
38 | 
39 |     with open("./results/%s" % output_file, "r") as f:
40 |         list_lines = f.readlines()
41 |         for line in list_lines:
42 |             if "solver.cpp:228" in line:
43 |                 time = line.split(" ")[1]
44 |                 time = datetime.strptime(time, FMT)
45 |                 list_time.append(time)
46 | 
47 |     arr_time = np.array(list_time)
48 |     arr_delta = arr_time[1:] - arr_time[:-1]
49 |     # use total_seconds() rather than .microseconds, so that deltas of one second or more are not truncated
50 |     arr_delta = [d.total_seconds() * 1E6 for d in arr_delta]
51 | 
52 |     return 1E-3 * np.mean(arr_delta)
53 | 
54 | 
55 | def name_to_path(name):
56 |     return name.lower().replace('(', '').replace(')', '').replace(' ', '_')
57 | 
58 | 
59 | if __name__ == '__main__':
60 |     benchmark_names = ('Neon', 'Caffe', 'Keras (TensorFlow)', 'Keras (Theano)',
61 |                        'TensorFlow', 'TensorFlow (slim)', 'MXNet')
62 |     run_names = ('V100', 'GTX 1080', 'Maxwell Titan X', 'K80', 'K520')
63 | 
64 |     line = "| Framework | "
65 |     for run_name in run_names:
66 |         line += run_name + " | "
67 |     print(line)
68 | 
69 |     for name in benchmark_names:
70 |         line = "| " + name + " | "
71 |         for run_name in run_names:
72 |             result_path = name_to_path(run_name) + "/benchmark_" + name_to_path(name) + ".output"
73 |             try:
74 |                 if name == 'Caffe':
75 |                     time_ = parse_output_caffe(result_path)
76 |                 elif name == 'TensorFlow' or name == 'Keras (TensorFlow)':
77 |                     time_ = parse_output_per_iter(result_path)
78 |                 else:
79 |                     time_ = parse_output(result_path)
80 |                 time_ = "%.2f" % (time_)
81 |             except Exception as e:
82 |                 print(repr(e))
83 |                 time_ = "N/A"
84 |             line += time_
85 |             line += " | "
86 |         print(line)
87 | 
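Running python parse_results.py from the repository root regenerates the README results table from the .output files under results/. The helpers can also be called directly; for example (paths are relative to results/, as the functions expect):

```python
from parse_results import parse_output, parse_output_per_iter

# summary line written by the script itself (Neon on the GTX 1080): ~164.5 ms
print(parse_output("gtx_1080/benchmark_neon.output"))

# mean of the per-iteration lines, skipping 20 warmup batches
# (Keras/TensorFlow on the GTX 1080): ~287.85 ms, the value in the README table
print(parse_output_per_iter("gtx_1080/benchmark_keras_tensorflow.output"))
```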
--------------------------------------------------------------------------------
/results/gtx_1080/INFO.md:
--------------------------------------------------------------------------------
1 | ## Results GTX 1080
2 | 
3 | The minibatch size is 16.
4 | 
5 | | Framework | Time (per minibatch) |
6 | |:---|:---|
7 | | Neon | 164.527 ms |
8 | | Torch (1) | 232.55 ms |
9 | | Caffe (2) | 244.445 ms |
10 | | Keras (TensorFlow) | 287.693 ms |
11 | | Keras (Theano) | 409.953 ms |
12 | 
13 | The environment for the results listed above is as follows:
14 | 
15 | - Hardware: GTX 1080 (EVGA GTX 1080 Founders Edition)
16 | - OS: Ubuntu 14.04.3 LTS
17 | - CUDA: 8.0 (cuda-repo-ubuntu1404-8-0-rc_8.0.27-1_amd64.deb)
18 | - CuDNN: 5.0 (cudnn-8.0-linux-x64-v5.0-ga.tgz)
19 | - Caffe: df412ac (built from source)
20 | - Keras: 1.0.7
21 | - Theano: 0.8.2
22 | - TensorFlow: 85f76f5 (built from source)
23 | - Neon: 1.5.4 (485033c)
24 | - Python: 2.7.6
25 | 
26 | The TensorFlow version was *very* recent; it has to be in order to work with CUDA 8.0.
27 | 
28 | (1) The Torch benchmark is from https://github.com/jcjohnson/cnn-benchmarks (it has an essentially identical setup: VGG-16, GTX 1080, CUDA 8, cuDNN 5, minibatch size 16).
29 | 
30 | (2) The time is for a complete SGD step including parameter updates, not just the forward+backward time.
31 | 
--------------------------------------------------------------------------------
/results/gtx_1080/benchmark_caffe.dmon:
--------------------------------------------------------------------------------
1 | 0 38 46 2 0 0 0 5005 1592
2 | 0 141 50 99 52 0 0 5005 1592
3 | 0 139 51 99 48 0 0 5005 1820
4 | 0 158 51 99 47 0 0 5005 1835
5 | 0 166 52 100 54 0 0 5005 1837
6 | 0 165 53 100 62 0 0 5005 1837
7 | 0 123 54 100 52 0 0 5005 1837
8 | 0 153 54 100 55 0 0 5005 1836
9 | 0 135 54 100 61 0 0 5005 1836
10 | 0 157 55 99 58 0 0 5005 1836
11 | 0 169 56 99 60 0 0 5005 1836
12 | 0 161 56 99 58 0 0 5005 1836
13 | 0 137 57 99 60 0 0 5005 1824
14 | 0 132 57 99 53 0 0 5005 1821
15 | 0 152 57 99 48 0 0 5005 1821
16 | 0 140 58 99 45 0 0 5005 1821
17 | 0 160 58 100 53 0 0 5005 1820
18 | 0 167 59 100 57 0 0 5005 1820
19 | 0 153 59 99 64 0 0 5005 1820
20 | # gpu pwr temp sm mem enc dec mclk pclk
21 | # Idx W C % % % % MHz MHz
22 | 0 118 60 99 51 0 0 5005 1820
23 | 0 155 60 100 56 0 0 5005 1820
24 | 0 137 60 100 63 0 0 5005 1820
25 | 0 157 61 100 58 0 0 5005 1820
26 | 0 166 61 99 61 0 0 5005 1815
27 | 0 166 62 99 60 0 0 5005 1812
28 | 0 45 59 0 0 0 0 5005 1816
29 | 
--------------------------------------------------------------------------------
/results/gtx_1080/benchmark_keras_tensorflow.dmon:
--------------------------------------------------------------------------------
1 | 0 44 46 6 3 0 0 5005 1590
2 | 0 161 48 28 19 0 0 5005 1754
3 | 0 145 50 100 59 0 0 5005 1827
4 | 0 181 50 98 63 0 0 5005 1832
5 | 0 166 51 99 85 0 0 5005 1830
6 | 0 191 52 98 61 0 0 5005 1827
7 | 0 159 53 97 84 0 0 5005 1826
8 | 0 133 53 98 61 0 0 5005 1829
9 | 0 146 54 96 82 0 0 5005 1829
10 | 0 119 54 98 60 0 0 5005 1828
11 | 0 154 54 97 85 0 0 5005 1825
12 | 0 155 55 98 61 0 0 5005 1829
13 | 0 102 55 97 83 0 0 5005 1821
14 | 0 161 56 98 61 0 0 5005 1810
15 | # gpu pwr temp sm mem enc dec mclk pclk
16 | # Idx W C % % % % MHz MHz
17 | 0 126 56 97 83 0 0 5005 1813
18 | 0 149 57 99 85 0 0 5005 1811
19 | 0 148 57 98 60 0 0 5005 1815
20 | 0 164 57 97 84 0 0 5005 1813
21 | 0 181 58 98 60 0 0 5005 1800
22 | 0 168 58 97 82 0 0 5005 1808
23 | 0 179 58 100 64 0 0 5005 1806
24 | 0 166 59 97 82 0 0 5005 1810
25 | 0 193 59 100 69 0 0 5005 1807
26 | 0 164 60 97 81 0 0 5005 1812
27 | 0 137 60 100 69 0 0 5005 1805
28 | 0 155 60 98 81 0 0 5005 1802
29 | 0 117 61 100 71 0 0 5005 1802
30 | 0 143 61 98 75 0 0 5005 1798
31 | 0 109 61 99 73 0 0 5005 1801
32 | 0 148 62 100 73 0 0 5005 1798
33 | 0 137 62 97 76 0 0 5005 1798
34 | 0 45 60 0 0 0 0 5005 1807
35 | 
--------------------------------------------------------------------------------
/results/gtx_1080/benchmark_keras_tensorflow.output:
--------------------------------------------------------------------------------
1 | Using TensorFlow backend.
2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 6 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so locally 7 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:118] Found device 0 with properties: 8 | name: Graphics Device 9 | major: 6 minor: 1 memoryClockRate (GHz) 1.7335 10 | pciBusID 0000:03:00.0 11 | Total memory: 7.92GiB 12 | Free memory: 7.81GiB 13 | W tensorflow/stream_executor/cuda/cuda_driver.cc:572] creating context when one is currently active; existing: 0x2a01cc0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:118] Found device 1 with properties: 15 | name: Graphics Device 16 | major: 6 minor: 1 memoryClockRate (GHz) 1.7335 17 | pciBusID 0000:05:00.0 18 | Total memory: 7.92GiB 19 | Free memory: 7.81GiB 20 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:138] DMA: 0 1 21 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:148] 0: Y Y 22 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:148] 1: Y Y 23 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:868] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Graphics Device, pci bus id: 0000:03:00.0) 24 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:868] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Graphics Device, pci bus id: 0000:05:00.0) 25 | Iteration: 0 train on batch time: 287.307 ms. 26 | Iteration: 1 train on batch time: 285.672 ms. 27 | Iteration: 2 train on batch time: 286.286 ms. 28 | Iteration: 3 train on batch time: 287.512 ms. 29 | Iteration: 4 train on batch time: 288.259 ms. 30 | Iteration: 5 train on batch time: 286.027 ms. 31 | Iteration: 6 train on batch time: 286.402 ms. 32 | Iteration: 7 train on batch time: 286.703 ms. 33 | Iteration: 8 train on batch time: 287.772 ms. 34 | Iteration: 9 train on batch time: 285.709 ms. 35 | Iteration: 10 train on batch time: 286.838 ms. 36 | Iteration: 11 train on batch time: 288.765 ms. 37 | Iteration: 12 train on batch time: 286.001 ms. 38 | Iteration: 13 train on batch time: 286.320 ms. 39 | Iteration: 14 train on batch time: 286.807 ms. 40 | Iteration: 15 train on batch time: 285.734 ms. 41 | Iteration: 16 train on batch time: 287.607 ms. 42 | Iteration: 17 train on batch time: 287.108 ms. 43 | Iteration: 18 train on batch time: 286.929 ms. 44 | Iteration: 19 train on batch time: 286.319 ms. 45 | Iteration: 20 train on batch time: 286.867 ms. 46 | Iteration: 21 train on batch time: 286.533 ms. 47 | Iteration: 22 train on batch time: 286.395 ms. 48 | Iteration: 23 train on batch time: 286.727 ms. 49 | Iteration: 24 train on batch time: 286.351 ms. 50 | Iteration: 25 train on batch time: 284.861 ms. 51 | Iteration: 26 train on batch time: 284.247 ms. 52 | Iteration: 27 train on batch time: 287.205 ms. 53 | Iteration: 28 train on batch time: 286.115 ms. 54 | Iteration: 29 train on batch time: 288.505 ms. 55 | Iteration: 30 train on batch time: 285.953 ms. 56 | Iteration: 31 train on batch time: 286.647 ms. 57 | Iteration: 32 train on batch time: 287.382 ms. 58 | Iteration: 33 train on batch time: 285.452 ms. 59 | Iteration: 34 train on batch time: 287.961 ms. 60 | Iteration: 35 train on batch time: 291.466 ms. 
61 | Iteration: 36 train on batch time: 288.147 ms. 62 | Iteration: 37 train on batch time: 286.794 ms. 63 | Iteration: 38 train on batch time: 287.992 ms. 64 | Iteration: 39 train on batch time: 288.307 ms. 65 | Iteration: 40 train on batch time: 286.961 ms. 66 | Iteration: 41 train on batch time: 287.075 ms. 67 | Iteration: 42 train on batch time: 288.158 ms. 68 | Iteration: 43 train on batch time: 290.944 ms. 69 | Iteration: 44 train on batch time: 286.356 ms. 70 | Iteration: 45 train on batch time: 286.673 ms. 71 | Iteration: 46 train on batch time: 289.705 ms. 72 | Iteration: 47 train on batch time: 286.751 ms. 73 | Iteration: 48 train on batch time: 291.429 ms. 74 | Iteration: 49 train on batch time: 287.108 ms. 75 | Iteration: 50 train on batch time: 288.143 ms. 76 | Iteration: 51 train on batch time: 286.341 ms. 77 | Iteration: 52 train on batch time: 287.558 ms. 78 | Iteration: 53 train on batch time: 289.172 ms. 79 | Iteration: 54 train on batch time: 289.523 ms. 80 | Iteration: 55 train on batch time: 287.128 ms. 81 | Iteration: 56 train on batch time: 288.374 ms. 82 | Iteration: 57 train on batch time: 288.952 ms. 83 | Iteration: 58 train on batch time: 291.691 ms. 84 | Iteration: 59 train on batch time: 287.035 ms. 85 | Iteration: 60 train on batch time: 288.489 ms. 86 | Iteration: 61 train on batch time: 289.944 ms. 87 | Iteration: 62 train on batch time: 286.632 ms. 88 | Iteration: 63 train on batch time: 288.199 ms. 89 | Iteration: 64 train on batch time: 288.198 ms. 90 | Iteration: 65 train on batch time: 287.945 ms. 91 | Iteration: 66 train on batch time: 287.715 ms. 92 | Iteration: 67 train on batch time: 285.903 ms. 93 | Iteration: 68 train on batch time: 289.123 ms. 94 | Iteration: 69 train on batch time: 286.140 ms. 95 | Iteration: 70 train on batch time: 289.240 ms. 96 | Iteration: 71 train on batch time: 289.794 ms. 97 | Iteration: 72 train on batch time: 286.609 ms. 98 | Iteration: 73 train on batch time: 287.146 ms. 99 | Iteration: 74 train on batch time: 288.039 ms. 100 | Iteration: 75 train on batch time: 286.723 ms. 101 | Iteration: 76 train on batch time: 286.833 ms. 102 | Iteration: 77 train on batch time: 287.447 ms. 103 | Iteration: 78 train on batch time: 286.315 ms. 104 | Iteration: 79 train on batch time: 287.332 ms. 105 | Iteration: 80 train on batch time: 287.199 ms. 106 | Iteration: 81 train on batch time: 287.768 ms. 107 | Iteration: 82 train on batch time: 290.930 ms. 108 | Iteration: 83 train on batch time: 288.858 ms. 109 | Iteration: 84 train on batch time: 291.169 ms. 110 | Iteration: 85 train on batch time: 287.972 ms. 111 | Iteration: 86 train on batch time: 288.800 ms. 112 | Iteration: 87 train on batch time: 286.622 ms. 113 | Iteration: 88 train on batch time: 286.225 ms. 114 | Iteration: 89 train on batch time: 287.903 ms. 115 | Iteration: 90 train on batch time: 290.274 ms. 116 | Iteration: 91 train on batch time: 286.583 ms. 117 | Iteration: 92 train on batch time: 287.495 ms. 118 | Iteration: 93 train on batch time: 287.426 ms. 119 | Iteration: 94 train on batch time: 287.889 ms. 120 | Iteration: 95 train on batch time: 288.170 ms. 121 | Iteration: 96 train on batch time: 288.284 ms. 122 | Iteration: 97 train on batch time: 290.761 ms. 123 | Iteration: 98 train on batch time: 287.909 ms. 124 | Iteration: 99 train on batch time: 291.303 ms. 
125 | Batch size: 16 126 | Iterations: 100 127 | Time per iteration: 287.693 ms 128 | -------------------------------------------------------------------------------- /results/gtx_1080/benchmark_keras_theano.dmon: -------------------------------------------------------------------------------- 1 | 0 36 31 0 0 0 0 5005 1603 2 | 0 104 33 17 6 0 0 5005 1603 3 | 0 165 36 95 31 0 0 5005 1748 4 | 0 153 37 97 28 0 0 5005 1863 5 | 0 150 38 96 31 0 0 5005 1854 6 | 0 152 38 97 24 0 0 5005 1847 7 | 0 153 39 95 34 0 0 5005 1847 8 | 0 153 40 95 37 0 0 5005 1847 9 | # gpu pwr temp sm mem enc dec mclk pclk 10 | # Idx W C % % % % MHz MHz 11 | 0 150 40 97 26 0 0 5005 1847 12 | 0 171 41 95 35 0 0 5005 1847 13 | 0 150 42 97 29 0 0 5005 1846 14 | 0 172 42 96 36 0 0 5005 1847 15 | 0 153 43 95 30 0 0 5005 1847 16 | 0 167 44 95 35 0 0 5005 1849 17 | 0 155 44 96 30 0 0 5005 1849 18 | 0 170 45 95 34 0 0 5005 1850 19 | 0 152 46 96 29 0 0 5005 1850 20 | 0 180 46 97 32 0 0 5005 1850 21 | 0 150 47 96 30 0 0 5005 1850 22 | 0 151 47 95 35 0 0 5005 1850 23 | 0 154 48 97 24 0 0 5005 1850 24 | 0 178 48 95 35 0 0 5005 1840 25 | 0 143 49 96 28 0 0 5005 1839 26 | 0 156 49 94 35 0 0 5005 1834 27 | 0 163 50 96 29 0 0 5005 1839 28 | 0 159 51 97 31 0 0 5005 1837 29 | 0 121 51 91 30 0 0 5005 1836 30 | 0 133 51 96 24 0 0 5005 1837 31 | 0 155 52 91 35 0 0 5005 1834 32 | 0 104 52 95 30 0 0 5005 1837 33 | # gpu pwr temp sm mem enc dec mclk pclk 34 | # Idx W C % % % % MHz MHz 35 | 0 175 54 95 34 0 0 5005 1837 36 | 0 142 54 96 23 0 0 5005 1837 37 | 0 184 54 91 35 0 0 5005 1834 38 | 0 154 55 97 26 0 0 5005 1832 39 | 0 161 55 95 36 0 0 5005 1834 40 | 0 166 55 94 31 0 0 5005 1831 41 | 0 148 56 93 32 0 0 5005 1833 42 | 0 140 56 94 32 0 0 5005 1831 43 | 0 123 56 95 23 0 0 5005 1831 44 | 0 145 58 94 35 0 0 5005 1834 45 | 0 116 58 96 31 0 0 5005 1830 46 | 0 164 58 93 34 0 0 5005 1835 47 | 0 44 56 0 0 0 0 5005 1826 48 | -------------------------------------------------------------------------------- /results/gtx_1080/benchmark_keras_theano.output: -------------------------------------------------------------------------------- 1 | Using Theano backend. 2 | Using gpu device 0: Graphics Device (CNMeM is disabled, cuDNN 5005) 3 | Iteration: 0 train on batch time: 408.542 ms. 4 | Iteration: 1 train on batch time: 404.476 ms. 5 | Iteration: 2 train on batch time: 403.843 ms. 6 | Iteration: 3 train on batch time: 402.034 ms. 7 | Iteration: 4 train on batch time: 403.209 ms. 8 | Iteration: 5 train on batch time: 405.481 ms. 9 | Iteration: 6 train on batch time: 404.473 ms. 10 | Iteration: 7 train on batch time: 404.539 ms. 11 | Iteration: 8 train on batch time: 404.342 ms. 12 | Iteration: 9 train on batch time: 406.517 ms. 13 | Iteration: 10 train on batch time: 404.390 ms. 14 | Iteration: 11 train on batch time: 404.463 ms. 15 | Iteration: 12 train on batch time: 404.048 ms. 16 | Iteration: 13 train on batch time: 404.791 ms. 17 | Iteration: 14 train on batch time: 404.642 ms. 18 | Iteration: 15 train on batch time: 404.891 ms. 19 | Iteration: 16 train on batch time: 405.597 ms. 20 | Iteration: 17 train on batch time: 404.474 ms. 21 | Iteration: 18 train on batch time: 403.967 ms. 22 | Iteration: 19 train on batch time: 405.766 ms. 23 | Iteration: 20 train on batch time: 404.242 ms. 24 | Iteration: 21 train on batch time: 405.184 ms. 25 | Iteration: 22 train on batch time: 404.359 ms. 26 | Iteration: 23 train on batch time: 404.569 ms. 27 | Iteration: 24 train on batch time: 404.223 ms. 28 | Iteration: 25 train on batch time: 412.660 ms. 
29 | Iteration: 26 train on batch time: 413.368 ms. 30 | Iteration: 27 train on batch time: 407.599 ms. 31 | Iteration: 28 train on batch time: 406.539 ms. 32 | Iteration: 29 train on batch time: 404.855 ms. 33 | Iteration: 30 train on batch time: 403.773 ms. 34 | Iteration: 31 train on batch time: 406.304 ms. 35 | Iteration: 32 train on batch time: 403.726 ms. 36 | Iteration: 33 train on batch time: 404.343 ms. 37 | Iteration: 34 train on batch time: 403.955 ms. 38 | Iteration: 35 train on batch time: 404.055 ms. 39 | Iteration: 36 train on batch time: 403.440 ms. 40 | Iteration: 37 train on batch time: 404.931 ms. 41 | Iteration: 38 train on batch time: 403.599 ms. 42 | Iteration: 39 train on batch time: 405.201 ms. 43 | Iteration: 40 train on batch time: 403.820 ms. 44 | Iteration: 41 train on batch time: 404.627 ms. 45 | Iteration: 42 train on batch time: 403.844 ms. 46 | Iteration: 43 train on batch time: 404.620 ms. 47 | Iteration: 44 train on batch time: 408.143 ms. 48 | Iteration: 45 train on batch time: 412.360 ms. 49 | Iteration: 46 train on batch time: 414.074 ms. 50 | Iteration: 47 train on batch time: 409.032 ms. 51 | Iteration: 48 train on batch time: 413.834 ms. 52 | Iteration: 49 train on batch time: 407.383 ms. 53 | Iteration: 50 train on batch time: 407.769 ms. 54 | Iteration: 51 train on batch time: 419.355 ms. 55 | Iteration: 52 train on batch time: 416.029 ms. 56 | Iteration: 53 train on batch time: 412.512 ms. 57 | Iteration: 54 train on batch time: 408.329 ms. 58 | Iteration: 55 train on batch time: 409.426 ms. 59 | Iteration: 56 train on batch time: 410.921 ms. 60 | Iteration: 57 train on batch time: 412.376 ms. 61 | Iteration: 58 train on batch time: 405.777 ms. 62 | Iteration: 59 train on batch time: 412.392 ms. 63 | Iteration: 60 train on batch time: 413.598 ms. 64 | Iteration: 61 train on batch time: 422.148 ms. 65 | Iteration: 62 train on batch time: 415.627 ms. 66 | Iteration: 63 train on batch time: 417.485 ms. 67 | Iteration: 64 train on batch time: 418.663 ms. 68 | Iteration: 65 train on batch time: 418.118 ms. 69 | Iteration: 66 train on batch time: 419.845 ms. 70 | Iteration: 67 train on batch time: 408.545 ms. 71 | Iteration: 68 train on batch time: 415.054 ms. 72 | Iteration: 69 train on batch time: 417.838 ms. 73 | Iteration: 70 train on batch time: 408.967 ms. 74 | Iteration: 71 train on batch time: 409.098 ms. 75 | Iteration: 72 train on batch time: 412.858 ms. 76 | Iteration: 73 train on batch time: 413.543 ms. 77 | Iteration: 74 train on batch time: 416.946 ms. 78 | Iteration: 75 train on batch time: 413.417 ms. 79 | Iteration: 76 train on batch time: 415.105 ms. 80 | Iteration: 77 train on batch time: 409.024 ms. 81 | Iteration: 78 train on batch time: 409.921 ms. 82 | Iteration: 79 train on batch time: 411.784 ms. 83 | Iteration: 80 train on batch time: 412.612 ms. 84 | Iteration: 81 train on batch time: 416.431 ms. 85 | Iteration: 82 train on batch time: 413.436 ms. 86 | Iteration: 83 train on batch time: 414.391 ms. 87 | Iteration: 84 train on batch time: 413.834 ms. 88 | Iteration: 85 train on batch time: 419.902 ms. 89 | Iteration: 86 train on batch time: 426.593 ms. 90 | Iteration: 87 train on batch time: 415.107 ms. 91 | Iteration: 88 train on batch time: 412.426 ms. 92 | Iteration: 89 train on batch time: 411.061 ms. 93 | Iteration: 90 train on batch time: 419.293 ms. 94 | Iteration: 91 train on batch time: 416.285 ms. 95 | Iteration: 92 train on batch time: 416.264 ms. 96 | Iteration: 93 train on batch time: 416.440 ms. 
97 | Iteration: 94 train on batch time: 414.160 ms. 98 | Iteration: 95 train on batch time: 411.973 ms. 99 | Iteration: 96 train on batch time: 408.613 ms. 100 | Iteration: 97 train on batch time: 413.174 ms. 101 | Iteration: 98 train on batch time: 417.960 ms. 102 | Iteration: 99 train on batch time: 413.599 ms. 103 | Batch size: 16 104 | Iterations: 100 105 | Time per iteration: 409.953 ms 106 | -------------------------------------------------------------------------------- /results/gtx_1080/benchmark_neon.output: -------------------------------------------------------------------------------- 1 | Epoch 0 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.16s] 2 | Epoch 1 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.06s] 3 | Epoch 2 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.15s] 4 | Epoch 3 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 5 | Epoch 4 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 6 | Epoch 5 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 7 | Epoch 6 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 8 | Epoch 7 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 9 | Epoch 8 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 10 | Epoch 9 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.14s] 11 | Epoch 10 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 12 | Epoch 11 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 13 | Epoch 12 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 14 | Epoch 13 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 15 | Epoch 14 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 16 | Epoch 15 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 17 | Epoch 16 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 18 | Epoch 17 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 19 | Epoch 18 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 20 | Epoch 19 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 21 | Epoch 20 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 22 | Epoch 21 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 23 | Epoch 22 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 24 | Epoch 23 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 25 | Epoch 24 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 26 | Epoch 25 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 27 | Epoch 26 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 28 | Epoch 27 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 29 | Epoch 28 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 30 | Epoch 29 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 31 | Epoch 30 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 32 | Epoch 31 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 33 | Epoch 32 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 34 | Epoch 33 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 35 | Epoch 34 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 36 | Epoch 35 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 37 | Epoch 36 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 38 | Epoch 37 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 39 | Epoch 38 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 40 | Epoch 39 [Train |████████████████████| 
1/1 batches, 6.87 cost, 0.15s] 41 | Epoch 40 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 42 | Epoch 41 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 43 | Epoch 42 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 44 | Epoch 43 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 45 | Epoch 44 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 46 | Epoch 45 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 47 | Epoch 46 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 48 | Epoch 47 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 49 | Epoch 48 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 50 | Epoch 49 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 51 | Epoch 50 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 52 | Epoch 51 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 53 | Epoch 52 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 54 | Epoch 53 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 55 | Epoch 54 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 56 | Epoch 55 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 57 | Epoch 56 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 58 | Epoch 57 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 59 | Epoch 58 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 60 | Epoch 59 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 61 | Epoch 60 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 62 | Epoch 61 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 63 | Epoch 62 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 64 | Epoch 63 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 65 | Epoch 64 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 66 | Epoch 65 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 67 | Epoch 66 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 68 | Epoch 67 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 69 | Epoch 68 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 70 | Epoch 69 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 71 | Epoch 70 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 72 | Epoch 71 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 73 | Epoch 72 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 74 | Epoch 73 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 75 | Epoch 74 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 76 | Epoch 75 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 77 | Epoch 76 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 78 | Epoch 77 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 79 | Epoch 78 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 80 | Epoch 79 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 81 | Epoch 80 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 82 | Epoch 81 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 83 | Epoch 82 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 84 | Epoch 83 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 85 | Epoch 84 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 86 | Epoch 85 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 87 | Epoch 86 [Train |████████████████████| 1/1 batches, 6.87 cost, 
0.15s] 88 | Epoch 87 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 89 | Epoch 88 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 90 | Epoch 89 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 91 | Epoch 90 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 92 | Epoch 91 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 93 | Epoch 92 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 94 | Epoch 93 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 95 | Epoch 94 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 96 | Epoch 95 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 97 | Epoch 96 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 98 | Epoch 97 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 99 | Epoch 98 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 100 | Epoch 99 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 164.527 ms 104 | 105 | real 0m18.269s 106 | user 0m15.268s 107 | sys 0m3.460s 108 | -------------------------------------------------------------------------------- /results/k520/INFO.md: -------------------------------------------------------------------------------- 1 | - Hardware: AWS EC2 g2.2xlarge (half of NVIDIA K520) 2 | - OS: Deep Learning AMI for Amazon Linux, v1.5 3 | - CUDA: 7.5 4 | - CuDNN: 5.1 5 | - NVIDIA drivers: 352.99 6 | - Caffe: 1.0.0-rc3 7 | - Keras: 1.0.8 8 | - Theano: 0.8.2.dev-127aa70c6d34421ba7808b23d8bea506dc878068 9 | - TensorFlow: 0.10.0 10 | - Neon: N/A 11 | - MXNet: 0.7.0 12 | - Python: 2.7.12 13 | -------------------------------------------------------------------------------- /results/k520/benchmark_caffe.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 17 30 0 0 0 0 324 324 4 | 0 17 30 0 0 0 0 324 324 5 | 0 17 30 0 0 0 0 324 324 6 | 0 17 30 0 0 0 0 324 324 7 | 0 17 30 0 0 0 0 324 324 8 | 0 17 30 0 0 0 0 324 324 9 | 0 17 30 0 0 0 0 324 324 10 | 0 17 30 0 0 0 0 324 324 11 | 0 17 29 0 0 0 0 324 324 12 | 0 45 30 0 0 0 0 2500 797 13 | 0 45 31 0 0 0 0 2500 797 14 | 0 44 31 0 0 0 0 2500 797 15 | 0 45 31 0 0 0 0 2500 797 16 | 0 44 31 0 0 0 0 2500 797 17 | 0 44 31 0 0 0 0 2500 797 18 | 0 45 31 0 0 0 0 2500 797 19 | 0 45 31 0 0 0 0 2500 797 20 | 0 44 32 0 0 0 0 2500 797 21 | 0 44 32 0 0 0 0 2500 797 22 | 0 44 32 0 0 0 0 2500 797 23 | 0 44 32 0 0 0 0 2500 797 24 | 0 44 32 0 0 0 0 2500 797 25 | 0 44 32 0 0 0 0 2500 797 26 | 0 44 32 0 0 0 0 2500 797 27 | 0 44 32 0 0 0 0 2500 797 28 | 0 44 32 0 0 0 0 2500 797 29 | 0 45 32 0 0 0 0 2500 797 30 | 0 45 32 0 0 0 0 2500 797 31 | 0 44 32 0 0 0 0 2500 797 32 | 0 44 32 0 0 0 0 2500 797 33 | 0 44 32 0 0 0 0 2500 797 34 | 0 44 32 0 0 0 0 2500 797 35 | 0 44 32 0 0 0 0 2500 797 36 | 0 45 32 0 0 0 0 2500 797 37 | 0 90 35 98 42 0 0 2500 797 38 | 0 82 35 99 32 0 0 2500 797 39 | 0 47 35 70 6 0 0 2500 797 40 | -------------------------------------------------------------------------------- /results/k520/benchmark_keras_tensorflow.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 45 55 0 0 0 0 2500 797 4 | 0 45 54 0 0 0 0 2500 797 5 | 0 46 53 6 0 0 0 2500 797 6 | 0 45 53 0 0 0 0 2500 797 7 | 0 46 53 0 0 0 0 2500 797 8 | 0 89 54 99 47 0 0 2500 797 9 | 0 96 55 99 34 0 0 2500 797 
10 | 0 74 54 99 34 0 0 2500 797 11 | 0 75 54 99 36 0 0 2500 797 12 | 0 75 54 99 36 0 0 2500 797 13 | 0 95 55 99 22 0 0 2500 797 14 | 0 74 54 99 31 0 0 2500 797 15 | 0 74 55 99 27 0 0 2500 797 16 | 0 45 53 0 0 0 0 2500 797 17 | 0 45 52 0 0 0 0 2500 797 18 | 0 45 52 0 0 0 0 2500 797 19 | 0 45 52 0 0 0 0 2500 797 20 | 0 45 52 0 0 0 0 2500 797 21 | 0 45 51 0 0 0 0 2500 797 22 | 0 45 51 0 0 0 0 2500 797 23 | 0 45 51 0 0 0 0 2500 797 24 | 0 45 50 0 0 0 0 2500 797 25 | 0 45 50 0 0 0 0 2500 797 26 | 0 88 52 99 35 0 0 2500 797 27 | 0 74 52 99 35 0 0 2500 797 28 | 0 74 52 100 35 0 0 2500 797 29 | 0 74 52 99 32 0 0 2500 797 30 | 0 74 52 99 36 0 0 2500 797 31 | 0 74 52 99 36 0 0 2500 797 32 | 0 74 52 99 34 0 0 2500 797 33 | 0 74 52 100 34 0 0 2500 797 34 | 0 79 52 99 34 0 0 2500 797 35 | 0 74 52 99 34 0 0 2500 797 36 | 0 86 52 99 8 0 0 2500 797 37 | -------------------------------------------------------------------------------- /results/k520/benchmark_keras_theano.output: -------------------------------------------------------------------------------- 1 | Using Theano backend. 2 | Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN 5103) 3 | /usr/local/lib/python2.7/site-packages/Theano-0.8.2-py2.7.egg/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5. 4 | warnings.warn(warn) 5 | Iteration: 0 train on batch time: 2444.628 ms. 6 | Iteration: 1 train on batch time: 2446.240 ms. 7 | Iteration: 2 train on batch time: 2443.903 ms. 8 | Iteration: 3 train on batch time: 2443.137 ms. 9 | Iteration: 4 train on batch time: 2442.147 ms. 10 | Iteration: 5 train on batch time: 2444.695 ms. 11 | Iteration: 6 train on batch time: 2444.683 ms. 12 | Iteration: 7 train on batch time: 2442.783 ms. 13 | Iteration: 8 train on batch time: 2444.892 ms. 14 | Iteration: 9 train on batch time: 2442.176 ms. 15 | Iteration: 10 train on batch time: 2441.531 ms. 16 | Iteration: 11 train on batch time: 2442.843 ms. 17 | Iteration: 12 train on batch time: 2443.993 ms. 18 | Iteration: 13 train on batch time: 2442.021 ms. 19 | Iteration: 14 train on batch time: 2442.236 ms. 20 | Iteration: 15 train on batch time: 2445.117 ms. 21 | Iteration: 16 train on batch time: 2443.993 ms. 22 | Iteration: 17 train on batch time: 2443.142 ms. 23 | Iteration: 18 train on batch time: 2442.801 ms. 24 | Iteration: 19 train on batch time: 2445.849 ms. 25 | Iteration: 20 train on batch time: 2446.833 ms. 26 | Iteration: 21 train on batch time: 2444.769 ms. 27 | Iteration: 22 train on batch time: 2446.223 ms. 28 | Iteration: 23 train on batch time: 2442.227 ms. 29 | Iteration: 24 train on batch time: 2444.088 ms. 30 | Iteration: 25 train on batch time: 2444.733 ms. 31 | Iteration: 26 train on batch time: 2443.682 ms. 32 | Iteration: 27 train on batch time: 2442.925 ms. 33 | Iteration: 28 train on batch time: 2444.028 ms. 34 | Iteration: 29 train on batch time: 2444.721 ms. 35 | Iteration: 30 train on batch time: 2446.014 ms. 36 | Iteration: 31 train on batch time: 2440.911 ms. 37 | Iteration: 32 train on batch time: 2443.289 ms. 38 | Iteration: 33 train on batch time: 2443.552 ms. 39 | Iteration: 34 train on batch time: 2445.217 ms. 40 | Iteration: 35 train on batch time: 2442.353 ms. 41 | Iteration: 36 train on batch time: 2442.496 ms. 42 | Iteration: 37 train on batch time: 2444.923 ms. 43 | Iteration: 38 train on batch time: 2448.871 ms. 44 | Iteration: 39 train on batch time: 2449.978 ms. 
45 | Iteration: 40 train on batch time: 2446.498 ms. 46 | Iteration: 41 train on batch time: 2447.461 ms. 47 | Iteration: 42 train on batch time: 2445.459 ms. 48 | Iteration: 43 train on batch time: 2444.839 ms. 49 | Iteration: 44 train on batch time: 2445.726 ms. 50 | Iteration: 45 train on batch time: 2450.397 ms. 51 | Iteration: 46 train on batch time: 2447.203 ms. 52 | Iteration: 47 train on batch time: 2444.968 ms. 53 | Iteration: 48 train on batch time: 2445.119 ms. 54 | Iteration: 49 train on batch time: 2446.987 ms. 55 | Iteration: 50 train on batch time: 2446.264 ms. 56 | Iteration: 51 train on batch time: 2443.741 ms. 57 | Iteration: 52 train on batch time: 2445.557 ms. 58 | Iteration: 53 train on batch time: 2448.646 ms. 59 | Iteration: 54 train on batch time: 2447.709 ms. 60 | Iteration: 55 train on batch time: 2447.929 ms. 61 | Iteration: 56 train on batch time: 2441.227 ms. 62 | Iteration: 57 train on batch time: 2447.513 ms. 63 | Iteration: 58 train on batch time: 2445.976 ms. 64 | Iteration: 59 train on batch time: 2448.064 ms. 65 | Iteration: 60 train on batch time: 2445.141 ms. 66 | Iteration: 61 train on batch time: 2447.166 ms. 67 | Iteration: 62 train on batch time: 2447.258 ms. 68 | Iteration: 63 train on batch time: 2445.542 ms. 69 | Iteration: 64 train on batch time: 2447.106 ms. 70 | Iteration: 65 train on batch time: 2446.711 ms. 71 | Iteration: 66 train on batch time: 2445.068 ms. 72 | Iteration: 67 train on batch time: 2444.910 ms. 73 | Iteration: 68 train on batch time: 2444.447 ms. 74 | Iteration: 69 train on batch time: 2445.627 ms. 75 | Iteration: 70 train on batch time: 2445.507 ms. 76 | Iteration: 71 train on batch time: 2448.621 ms. 77 | Iteration: 72 train on batch time: 2443.731 ms. 78 | Iteration: 73 train on batch time: 2447.027 ms. 79 | Iteration: 74 train on batch time: 2446.535 ms. 80 | Iteration: 75 train on batch time: 2444.343 ms. 81 | Iteration: 76 train on batch time: 2446.748 ms. 82 | Iteration: 77 train on batch time: 2446.792 ms. 83 | Iteration: 78 train on batch time: 2447.206 ms. 84 | Iteration: 79 train on batch time: 2444.551 ms. 85 | Iteration: 80 train on batch time: 2442.678 ms. 86 | Iteration: 81 train on batch time: 2442.499 ms. 87 | Iteration: 82 train on batch time: 2442.150 ms. 88 | Iteration: 83 train on batch time: 2450.410 ms. 89 | Iteration: 84 train on batch time: 2444.117 ms. 90 | Iteration: 85 train on batch time: 2444.983 ms. 91 | Iteration: 86 train on batch time: 2447.025 ms. 92 | Iteration: 87 train on batch time: 2445.948 ms. 93 | Iteration: 88 train on batch time: 2444.379 ms. 94 | Iteration: 89 train on batch time: 2444.040 ms. 95 | Iteration: 90 train on batch time: 2445.782 ms. 96 | Iteration: 91 train on batch time: 2445.015 ms. 97 | Iteration: 92 train on batch time: 2444.981 ms. 98 | Iteration: 93 train on batch time: 2445.117 ms. 99 | Iteration: 94 train on batch time: 2444.239 ms. 100 | Iteration: 95 train on batch time: 2445.872 ms. 101 | Iteration: 96 train on batch time: 2445.494 ms. 102 | Iteration: 97 train on batch time: 2446.789 ms. 103 | Iteration: 98 train on batch time: 2446.074 ms. 104 | Iteration: 99 train on batch time: 2450.346 ms. 
105 | Batch size: 16 106 | Iterations: 100 107 | Time per iteration: 2445.223 ms 108 | -------------------------------------------------------------------------------- /results/k520/benchmark_mxnet.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 17 22 0 0 0 0 324 324 4 | 0 17 22 0 0 0 0 324 324 5 | 0 17 22 0 0 0 0 324 324 6 | 0 17 22 0 0 0 0 324 324 7 | 0 17 22 0 0 0 0 324 324 8 | 0 17 22 0 0 0 0 324 324 9 | 0 17 22 0 0 0 0 324 324 10 | 0 17 22 0 0 0 0 324 324 11 | 0 17 22 0 0 0 0 324 324 12 | 0 17 22 0 0 0 0 324 324 13 | 0 17 22 0 0 0 0 324 324 14 | 0 17 22 0 0 0 0 324 324 15 | 0 17 22 0 0 0 0 324 324 16 | 0 17 22 0 0 0 0 324 324 17 | 0 17 22 0 0 0 0 324 324 18 | 0 17 22 0 0 0 0 324 324 19 | 0 17 22 0 0 0 0 324 324 20 | 0 17 22 0 0 0 0 324 324 21 | 0 17 22 0 0 0 0 324 324 22 | 0 17 22 0 0 0 0 324 324 23 | 0 17 22 0 0 0 0 324 324 24 | 0 17 22 0 0 0 0 324 324 25 | 0 17 22 0 0 0 0 324 324 26 | 0 17 22 0 0 0 0 324 324 27 | 0 17 22 0 0 0 0 324 324 28 | 0 17 22 0 0 0 0 324 324 29 | 0 17 22 0 0 0 0 324 324 30 | 0 17 22 0 0 0 0 324 324 31 | 0 17 22 0 0 0 0 324 324 32 | 0 17 22 0 0 0 0 324 324 33 | 0 17 22 0 0 0 0 324 324 34 | 0 17 22 0 0 0 0 324 324 35 | 0 17 22 0 0 0 0 324 324 36 | 0 17 22 0 0 0 0 324 324 37 | 0 17 22 0 0 0 0 324 324 38 | 0 17 22 0 0 0 0 324 324 39 | 0 17 22 0 0 0 0 324 324 40 | 0 44 23 0 0 0 0 2500 797 41 | 0 45 23 0 0 0 0 2500 797 42 | 0 45 24 0 0 0 0 2500 797 43 | 0 44 24 0 0 0 0 2500 797 44 | 0 44 24 0 0 0 0 2500 797 45 | 0 44 24 0 0 0 0 2500 797 46 | 0 44 24 0 0 0 0 2500 797 47 | # gpu pwr temp sm mem enc dec mclk pclk 48 | # Idx W C % % % % MHz MHz 49 | 0 45 25 0 0 0 0 2500 797 50 | 0 45 25 0 0 0 0 2500 797 51 | 0 44 25 0 0 0 0 2500 797 52 | 0 45 25 0 0 0 0 2500 797 53 | 0 44 25 0 0 0 0 2500 797 54 | 0 45 25 0 0 0 0 2500 797 55 | 0 45 26 0 0 0 0 2500 797 56 | 0 44 25 0 0 0 0 2500 797 57 | 0 44 26 0 0 0 0 2500 797 58 | 0 45 26 2 0 0 0 2500 797 59 | 0 45 26 0 0 0 0 2500 797 60 | 0 45 26 0 0 0 0 2500 797 61 | 0 45 26 0 0 0 0 2500 797 62 | 0 44 26 0 0 0 0 2500 797 63 | 0 44 26 0 0 0 0 2500 797 64 | 0 44 26 0 0 0 0 2500 797 65 | 0 45 27 0 0 0 0 2500 797 66 | 0 44 27 0 0 0 0 2500 797 67 | 0 44 27 0 0 0 0 2500 797 68 | 0 44 27 0 0 0 0 2500 797 69 | 0 44 27 0 0 0 0 2500 797 70 | 0 44 27 0 0 0 0 2500 797 71 | 0 44 27 0 0 0 0 2500 797 72 | 0 44 28 0 0 0 0 2500 797 73 | 0 44 27 0 0 0 0 2500 797 74 | 0 44 28 0 0 0 0 2500 797 75 | 0 44 28 0 0 0 0 2500 797 76 | 0 44 28 0 0 0 0 2500 797 77 | 0 44 28 0 0 0 0 2500 797 78 | 0 45 28 0 0 0 0 2500 797 79 | 0 44 28 0 0 0 0 2500 797 80 | 0 44 28 0 0 0 0 2500 797 81 | 0 45 28 0 0 0 0 2500 797 82 | -------------------------------------------------------------------------------- /results/k520/benchmark_mxnet.output: -------------------------------------------------------------------------------- 1 | [09:36:46] /home/ec2-user/src/mxnet/dmlc-core/include/dmlc/./logging.h:235: [09:36:46] src/storage/./pooled_storage_manager.h:79: cudaMalloc failed: out of memory 2 | [09:36:46] /home/ec2-user/src/mxnet/dmlc-core/include/dmlc/logging.h:235: [09:36:46] src/engine/./threaded_engine.h:306: [09:36:46] src/storage/./pooled_storage_manager.h:79: cudaMalloc failed: out of memory 3 | An fatal error occurred in asynchronous engine operation. If you do not know what caused this error, you can try set environment variable MXNET_ENGINE_TYPE to NaiveEngine and run with debugger (i.e. gdb). 
This will force all operations to be synchronous and backtrace will give you the series of calls that lead to this error. Remember to set MXNET_ENGINE_TYPE back to empty after debugging. 4 | terminate called after throwing an instance of 'dmlc::Error' 5 | what(): [09:36:46] src/engine/./threaded_engine.h:306: [09:36:46] src/storage/./pooled_storage_manager.h:79: cudaMalloc failed: out of memory 6 | An fatal error occurred in asynchronous engine operation. If you do not know what caused this error, you can try set environment variable MXNET_ENGINE_TYPE to NaiveEngine and run with debugger (i.e. gdb). This will force all operations to be synchronous and backtrace will give you the series of calls that lead to this error. Remember to set MXNET_ENGINE_TYPE back to empty after debugging. 7 | -------------------------------------------------------------------------------- /results/k520/benchmark_tensorflow.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 47 35 70 6 0 0 2500 797 4 | 0 44 34 0 0 0 0 2500 797 5 | 0 45 34 0 0 0 0 2500 797 6 | 0 54 35 62 45 0 0 2500 797 7 | 0 78 36 99 34 0 0 2500 797 8 | 0 108 37 99 23 0 0 2500 797 9 | 0 90 37 99 50 0 0 2500 797 10 | 0 78 37 99 36 0 0 2500 797 11 | 0 77 37 99 36 0 0 2500 797 12 | 0 78 38 99 35 0 0 2500 797 13 | 0 78 38 99 35 0 0 2500 797 14 | 0 78 38 99 36 0 0 2500 797 15 | 0 100 39 99 30 0 0 2500 797 16 | 0 86 39 99 35 0 0 2500 797 17 | 0 102 40 99 21 0 0 2500 797 18 | 0 100 40 99 26 0 0 2500 797 19 | 0 105 41 99 34 0 0 2500 797 20 | 0 99 40 99 26 0 0 2500 797 21 | 0 87 40 99 36 0 0 2500 797 22 | 0 101 40 99 32 0 0 2500 797 23 | 0 94 41 99 35 0 0 2500 797 24 | 0 97 41 99 35 0 0 2500 797 25 | 0 78 40 99 34 0 0 2500 797 26 | 0 91 42 99 8 0 0 2500 797 27 | 0 100 42 99 21 0 0 2500 797 28 | 0 96 42 99 36 0 0 2500 797 29 | 0 95 42 99 38 0 0 2500 797 30 | 0 91 42 99 34 0 0 2500 797 31 | 0 91 43 99 17 0 0 2500 797 32 | 0 89 43 99 34 0 0 2500 797 33 | 0 94 43 99 39 0 0 2500 797 34 | 0 92 44 99 12 0 0 2500 797 35 | 0 95 44 99 36 0 0 2500 797 36 | 0 98 44 99 39 0 0 2500 797 37 | 0 99 44 99 37 0 0 2500 797 38 | 0 93 44 99 29 0 0 2500 797 39 | 0 87 44 99 27 0 0 2500 797 40 | 0 87 45 99 43 0 0 2500 797 41 | 0 88 45 99 31 0 0 2500 797 42 | 0 97 45 99 40 0 0 2500 797 43 | 0 97 46 99 21 0 0 2500 797 44 | 0 100 45 99 36 0 0 2500 797 45 | 0 98 45 99 38 0 0 2500 797 46 | 0 89 46 99 33 0 0 2500 797 47 | # gpu pwr temp sm mem enc dec mclk pclk 48 | # Idx W C % % % % MHz MHz 49 | 0 91 46 99 17 0 0 2500 797 50 | 0 89 46 99 35 0 0 2500 797 51 | 0 88 46 99 39 0 0 2500 797 52 | 0 85 46 99 12 0 0 2500 797 53 | 0 95 46 99 36 0 0 2500 797 54 | 0 95 47 99 39 0 0 2500 797 55 | 0 95 47 99 37 0 0 2500 797 56 | 0 94 47 99 30 0 0 2500 797 57 | 0 94 47 99 25 0 0 2500 797 58 | 0 89 47 99 43 0 0 2500 797 59 | 0 87 48 99 32 0 0 2500 797 60 | 0 96 48 99 40 0 0 2500 797 61 | 0 87 48 99 20 0 0 2500 797 62 | 0 98 48 99 36 0 0 2500 797 63 | 0 93 48 99 38 0 0 2500 797 64 | 0 91 48 99 38 0 0 2500 797 65 | 0 92 48 99 35 0 0 2500 797 66 | 0 93 48 99 26 0 0 2500 797 67 | 0 87 48 99 41 0 0 2500 797 68 | 0 93 48 99 32 0 0 2500 797 69 | 0 96 49 99 41 0 0 2500 797 70 | 0 100 50 99 15 0 0 2500 797 71 | 0 95 49 99 36 0 0 2500 797 72 | 0 93 49 99 35 0 0 2500 797 73 | 0 89 49 99 34 0 0 2500 797 74 | 0 90 49 99 10 0 0 2500 797 75 | 0 89 49 99 30 0 0 2500 797 76 | 0 96 49 99 42 0 0 2500 797 77 | 0 90 50 99 15 0 0 2500 797 78 | 0 97 50 99 37 0 0 2500 797 79 | 0 96 50 99 38 0 0 2500 797 80 | 0 
97 50 99 38 0 0 2500 797 81 | 0 92 50 99 35 0 0 2500 797 82 | 0 89 50 99 26 0 0 2500 797 83 | 0 87 50 99 40 0 0 2500 797 84 | 0 92 50 99 32 0 0 2500 797 85 | 0 96 50 99 41 0 0 2500 797 86 | 0 95 51 99 14 0 0 2500 797 87 | 0 99 51 99 35 0 0 2500 797 88 | 0 91 51 99 36 0 0 2500 797 89 | 0 90 51 99 35 0 0 2500 797 90 | 0 90 51 99 9 0 0 2500 797 91 | 0 91 51 99 29 0 0 2500 797 92 | 0 86 51 99 43 0 0 2500 797 93 | # gpu pwr temp sm mem enc dec mclk pclk 94 | # Idx W C % % % % MHz MHz 95 | 0 86 51 99 16 0 0 2500 797 96 | 0 94 51 99 38 0 0 2500 797 97 | 0 97 52 99 39 0 0 2500 797 98 | 0 97 52 99 38 0 0 2500 797 99 | 0 94 52 99 36 0 0 2500 797 100 | 0 94 51 99 27 0 0 2500 797 101 | 0 90 52 99 39 0 0 2500 797 102 | 0 87 52 99 33 0 0 2500 797 103 | 0 96 52 99 41 0 0 2500 797 104 | 0 97 52 99 13 0 0 2500 797 105 | 0 97 52 99 36 0 0 2500 797 106 | 0 94 52 99 36 0 0 2500 797 107 | 0 91 52 99 35 0 0 2500 797 108 | 0 92 52 100 8 0 0 2500 797 109 | 0 91 52 99 28 0 0 2500 797 110 | 0 85 52 99 44 0 0 2500 797 111 | 0 94 52 99 16 0 0 2500 797 112 | 0 97 53 99 38 0 0 2500 797 113 | 0 98 53 99 39 0 0 2500 797 114 | 0 97 53 99 37 0 0 2500 797 115 | 0 94 53 99 37 0 0 2500 797 116 | 0 95 53 99 29 0 0 2500 797 117 | 0 91 53 99 11 0 0 2500 797 118 | 0 94 53 99 27 0 0 2500 797 119 | 0 96 53 99 44 0 0 2500 797 120 | 0 86 53 99 20 0 0 2500 797 121 | 0 97 53 99 39 0 0 2500 797 122 | 0 95 53 99 32 0 0 2500 797 123 | 0 95 53 99 36 0 0 2500 797 124 | 0 90 53 99 43 0 0 2500 797 125 | 0 93 54 99 32 0 0 2500 797 126 | 0 89 54 99 28 0 0 2500 797 127 | 0 93 53 99 36 0 0 2500 797 128 | 0 97 54 99 39 0 0 2500 797 129 | 0 96 54 99 9 0 0 2500 797 130 | 0 98 54 99 36 0 0 2500 797 131 | 0 94 53 99 37 0 0 2500 797 132 | 0 89 54 99 36 0 0 2500 797 133 | 0 90 54 99 13 0 0 2500 797 134 | 0 95 54 99 25 0 0 2500 797 135 | 0 87 54 99 44 0 0 2500 797 136 | 0 90 54 99 22 0 0 2500 797 137 | 0 93 54 99 39 0 0 2500 797 138 | 0 98 55 99 33 0 0 2500 797 139 | # gpu pwr temp sm mem enc dec mclk pclk 140 | # Idx W C % % % % MHz MHz 141 | 0 97 54 99 37 0 0 2500 797 142 | 0 91 54 99 44 0 0 2500 797 143 | 0 93 54 99 32 0 0 2500 797 144 | 0 91 54 99 28 0 0 2500 797 145 | 0 87 54 99 36 0 0 2500 797 146 | 0 95 55 99 39 0 0 2500 797 147 | 0 97 55 99 9 0 0 2500 797 148 | 0 98 55 99 35 0 0 2500 797 149 | 0 91 55 99 37 0 0 2500 797 150 | 0 93 55 99 36 0 0 2500 797 151 | 0 91 55 99 13 0 0 2500 797 152 | 0 89 55 99 24 0 0 2500 797 153 | 0 86 54 99 44 0 0 2500 797 154 | 0 91 55 99 22 0 0 2500 797 155 | 0 98 55 99 39 0 0 2500 797 156 | 0 98 56 99 33 0 0 2500 797 157 | 0 94 55 99 36 0 0 2500 797 158 | 0 96 55 99 44 0 0 2500 797 159 | 0 95 55 99 30 0 0 2500 797 160 | 0 91 55 99 28 0 0 2500 797 161 | 0 95 55 99 36 0 0 2500 797 162 | 0 94 55 99 39 0 0 2500 797 163 | 0 87 55 99 9 0 0 2500 797 164 | 0 96 55 99 36 0 0 2500 797 165 | 0 95 55 99 37 0 0 2500 797 166 | 0 90 55 99 36 0 0 2500 797 167 | 0 91 55 99 13 0 0 2500 797 168 | 0 93 55 99 28 0 0 2500 797 169 | 0 89 55 99 44 0 0 2500 797 170 | 0 94 55 99 21 0 0 2500 797 171 | 0 96 55 99 39 0 0 2500 797 172 | 0 98 56 99 8 0 0 2500 797 173 | 0 95 55 99 35 0 0 2500 797 174 | 0 93 56 99 35 0 0 2500 797 175 | 0 88 55 99 36 0 0 2500 797 176 | 0 91 55 99 19 0 0 2500 797 177 | 0 94 55 99 25 0 0 2500 797 178 | 0 96 55 99 43 0 0 2500 797 179 | 0 92 55 99 27 0 0 2500 797 180 | 0 91 55 99 40 0 0 2500 797 181 | 0 99 56 99 27 0 0 2500 797 182 | 0 93 56 99 36 0 0 2500 797 183 | 0 90 56 99 41 0 0 2500 797 184 | 0 93 55 99 34 0 0 2500 797 185 | # gpu pwr temp sm mem enc dec mclk pclk 186 | # Idx W C % % % % MHz MHz 187 | 0 91 56 99 23 
0 0 2500 797 188 | 0 89 56 99 34 0 0 2500 797 189 | 0 94 56 99 39 0 0 2500 797 190 | 0 96 56 99 9 0 0 2500 797 191 | 0 96 56 99 35 0 0 2500 797 192 | 0 94 56 99 34 0 0 2500 797 193 | 0 94 56 99 38 0 0 2500 797 194 | 0 92 56 99 20 0 0 2500 797 195 | 0 87 56 99 24 0 0 2500 797 196 | 0 87 55 99 43 0 0 2500 797 197 | 0 92 56 99 27 0 0 2500 797 198 | 0 90 56 99 40 0 0 2500 797 199 | 0 99 57 99 27 0 0 2500 797 200 | 0 95 56 99 36 0 0 2500 797 201 | 0 94 56 99 41 0 0 2500 797 202 | 0 91 56 99 34 0 0 2500 797 203 | 0 91 56 99 23 0 0 2500 797 204 | 0 85 56 99 34 0 0 2500 797 205 | 0 93 56 99 39 0 0 2500 797 206 | 0 88 56 99 8 0 0 2500 797 207 | 0 95 56 99 36 0 0 2500 797 208 | 0 94 56 99 35 0 0 2500 797 209 | 0 92 56 99 36 0 0 2500 797 210 | 0 92 56 99 20 0 0 2500 797 211 | 0 95 56 99 25 0 0 2500 797 212 | 0 89 56 99 43 0 0 2500 797 213 | 0 94 56 99 27 0 0 2500 797 214 | 0 98 56 99 40 0 0 2500 797 215 | 0 98 57 99 27 0 0 2500 797 216 | 0 95 56 99 36 0 0 2500 797 217 | 0 92 56 99 41 0 0 2500 797 218 | 0 91 56 99 34 0 0 2500 797 219 | 0 89 56 99 23 0 0 2500 797 220 | 0 97 56 99 34 0 0 2500 797 221 | 0 96 56 99 39 0 0 2500 797 222 | 0 87 56 99 9 0 0 2500 797 223 | 0 92 56 99 35 0 0 2500 797 224 | 0 99 57 99 35 0 0 2500 797 225 | 0 92 57 99 36 0 0 2500 797 226 | 0 91 57 99 39 0 0 2500 797 227 | 0 95 57 99 34 0 0 2500 797 228 | 0 87 57 99 18 0 0 2500 797 229 | 0 88 57 99 34 0 0 2500 797 230 | 0 94 57 99 39 0 0 2500 797 231 | # gpu pwr temp sm mem enc dec mclk pclk 232 | # Idx W C % % % % MHz MHz 233 | 0 95 57 99 10 0 0 2500 797 234 | 0 96 57 99 36 0 0 2500 797 235 | 0 93 57 99 38 0 0 2500 797 236 | 0 88 57 99 37 0 0 2500 797 237 | 0 92 57 99 28 0 0 2500 797 238 | 0 85 57 99 21 0 0 2500 797 239 | 0 88 56 99 42 0 0 2500 797 240 | 0 92 57 99 30 0 0 2500 797 241 | 0 93 57 99 40 0 0 2500 797 242 | 0 96 58 99 22 0 0 2500 797 243 | 0 95 57 99 36 0 0 2500 797 244 | 0 95 57 99 38 0 0 2500 797 245 | 0 92 57 99 34 0 0 2500 797 246 | 0 91 57 99 18 0 0 2500 797 247 | 0 89 57 99 34 0 0 2500 797 248 | 0 95 56 99 39 0 0 2500 797 249 | 0 88 57 99 11 0 0 2500 797 250 | 0 95 57 99 36 0 0 2500 797 251 | 0 95 57 99 39 0 0 2500 797 252 | 0 93 57 99 37 0 0 2500 797 253 | 0 92 57 99 28 0 0 2500 797 254 | 0 88 57 99 25 0 0 2500 797 255 | 0 87 57 99 43 0 0 2500 797 256 | 0 94 57 99 30 0 0 2500 797 257 | 0 100 57 99 40 0 0 2500 797 258 | 0 100 58 99 22 0 0 2500 797 259 | 0 96 57 99 36 0 0 2500 797 260 | 0 95 57 99 38 0 0 2500 797 261 | 0 89 57 99 34 0 0 2500 797 262 | 0 90 57 99 18 0 0 2500 797 263 | 0 93 57 99 34 0 0 2500 797 264 | 0 95 57 99 39 0 0 2500 797 265 | -------------------------------------------------------------------------------- /results/k520/benchmark_tensorflow.output: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 6 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 7 | I 
tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 8 | name: GRID K520 9 | major: 3 minor: 0 memoryClockRate (GHz) 0.797 10 | pciBusID 0000:00:03.0 11 | Total memory: 4.00GiB 12 | Free memory: 3.95GiB 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 15 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GRID K520, pci bus id: 0000:00:03.0) 16 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.52GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 17 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.29GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 18 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.19GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 19 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.19GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 20 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 21 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 22 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 23 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.15GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 24 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 25 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.15GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 26 | Iteration: 0 train on batch time: 2288.014 ms. 27 | Iteration: 1 train on batch time: 2290.354 ms. 28 | Iteration: 2 train on batch time: 2289.724 ms. 29 | Iteration: 3 train on batch time: 2290.773 ms. 30 | Iteration: 4 train on batch time: 2290.362 ms. 31 | Iteration: 5 train on batch time: 2291.452 ms. 32 | Iteration: 6 train on batch time: 2288.561 ms. 33 | Iteration: 7 train on batch time: 2292.294 ms. 34 | Iteration: 8 train on batch time: 2290.141 ms. 35 | Iteration: 9 train on batch time: 2289.341 ms. 36 | Iteration: 10 train on batch time: 2289.933 ms. 37 | Iteration: 11 train on batch time: 2290.004 ms. 38 | Iteration: 12 train on batch time: 2288.960 ms. 
39 | Iteration: 13 train on batch time: 2291.385 ms. 40 | Iteration: 14 train on batch time: 2289.223 ms. 41 | Iteration: 15 train on batch time: 2291.922 ms. 42 | Iteration: 16 train on batch time: 2290.300 ms. 43 | Iteration: 17 train on batch time: 2289.473 ms. 44 | Iteration: 18 train on batch time: 2290.086 ms. 45 | Iteration: 19 train on batch time: 2290.494 ms. 46 | Iteration: 20 train on batch time: 2291.832 ms. 47 | Iteration: 21 train on batch time: 2290.415 ms. 48 | Iteration: 22 train on batch time: 2289.739 ms. 49 | Iteration: 23 train on batch time: 2289.833 ms. 50 | Iteration: 24 train on batch time: 2289.924 ms. 51 | Iteration: 25 train on batch time: 2290.042 ms. 52 | Iteration: 26 train on batch time: 2290.865 ms. 53 | Iteration: 27 train on batch time: 2289.665 ms. 54 | Iteration: 28 train on batch time: 2291.388 ms. 55 | Iteration: 29 train on batch time: 2289.806 ms. 56 | Iteration: 30 train on batch time: 2290.459 ms. 57 | Iteration: 31 train on batch time: 2290.359 ms. 58 | Iteration: 32 train on batch time: 2290.220 ms. 59 | Iteration: 33 train on batch time: 2289.432 ms. 60 | Iteration: 34 train on batch time: 2289.253 ms. 61 | Iteration: 35 train on batch time: 2289.966 ms. 62 | Iteration: 36 train on batch time: 2290.815 ms. 63 | Iteration: 37 train on batch time: 2290.038 ms. 64 | Iteration: 38 train on batch time: 2289.975 ms. 65 | Iteration: 39 train on batch time: 2289.872 ms. 66 | Iteration: 40 train on batch time: 2290.819 ms. 67 | Iteration: 41 train on batch time: 2290.014 ms. 68 | Iteration: 42 train on batch time: 2290.018 ms. 69 | Iteration: 43 train on batch time: 2290.331 ms. 70 | Iteration: 44 train on batch time: 2291.564 ms. 71 | Iteration: 45 train on batch time: 2289.280 ms. 72 | Iteration: 46 train on batch time: 2290.379 ms. 73 | Iteration: 47 train on batch time: 2290.502 ms. 74 | Iteration: 48 train on batch time: 2291.871 ms. 75 | Iteration: 49 train on batch time: 2291.065 ms. 76 | Iteration: 50 train on batch time: 2289.859 ms. 77 | Iteration: 51 train on batch time: 2292.050 ms. 78 | Iteration: 52 train on batch time: 2290.174 ms. 79 | Iteration: 53 train on batch time: 2290.171 ms. 80 | Iteration: 54 train on batch time: 2291.706 ms. 81 | Iteration: 55 train on batch time: 2290.779 ms. 82 | Iteration: 56 train on batch time: 2290.103 ms. 83 | Iteration: 57 train on batch time: 2290.064 ms. 84 | Iteration: 58 train on batch time: 2291.862 ms. 85 | Iteration: 59 train on batch time: 2291.895 ms. 86 | Iteration: 60 train on batch time: 2291.257 ms. 87 | Iteration: 61 train on batch time: 2289.168 ms. 88 | Iteration: 62 train on batch time: 2288.342 ms. 89 | Iteration: 63 train on batch time: 2289.242 ms. 90 | Iteration: 64 train on batch time: 2289.082 ms. 91 | Iteration: 65 train on batch time: 2292.338 ms. 92 | Iteration: 66 train on batch time: 2292.939 ms. 93 | Iteration: 67 train on batch time: 2289.538 ms. 94 | Iteration: 68 train on batch time: 2291.592 ms. 95 | Iteration: 69 train on batch time: 2289.194 ms. 96 | Iteration: 70 train on batch time: 2291.248 ms. 97 | Iteration: 71 train on batch time: 2291.441 ms. 98 | Iteration: 72 train on batch time: 2290.788 ms. 99 | Iteration: 73 train on batch time: 2291.898 ms. 100 | Iteration: 74 train on batch time: 2290.556 ms. 101 | Iteration: 75 train on batch time: 2291.116 ms. 102 | Iteration: 76 train on batch time: 2291.694 ms. 103 | Iteration: 77 train on batch time: 2291.067 ms. 104 | Iteration: 78 train on batch time: 2291.529 ms. 
105 | Iteration: 79 train on batch time: 2289.126 ms. 106 | Iteration: 80 train on batch time: 2291.065 ms. 107 | Iteration: 81 train on batch time: 2290.981 ms. 108 | Iteration: 82 train on batch time: 2291.545 ms. 109 | Iteration: 83 train on batch time: 2289.720 ms. 110 | Iteration: 84 train on batch time: 2291.157 ms. 111 | Iteration: 85 train on batch time: 2291.588 ms. 112 | Iteration: 86 train on batch time: 2292.701 ms. 113 | Iteration: 87 train on batch time: 2288.769 ms. 114 | Iteration: 88 train on batch time: 2289.381 ms. 115 | Iteration: 89 train on batch time: 2292.850 ms. 116 | Iteration: 90 train on batch time: 2289.262 ms. 117 | Iteration: 91 train on batch time: 2291.040 ms. 118 | Iteration: 92 train on batch time: 2291.041 ms. 119 | Iteration: 93 train on batch time: 2291.432 ms. 120 | Iteration: 94 train on batch time: 2290.171 ms. 121 | Iteration: 95 train on batch time: 2291.687 ms. 122 | Iteration: 96 train on batch time: 2289.717 ms. 123 | Iteration: 97 train on batch time: 2291.111 ms. 124 | Iteration: 98 train on batch time: 2290.549 ms. 125 | Iteration: 99 train on batch time: 2289.340 ms. 126 | Batch size: 16 127 | Iterations: 100 128 | Time per iteration: 2290.514 ms 129 | -------------------------------------------------------------------------------- /results/k520/benchmark_tensorflow_slim.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 45 55 35 12 0 0 2500 797 4 | 0 45 54 0 0 0 0 2500 797 5 | 0 45 54 0 0 0 0 2500 797 6 | 0 90 55 14 6 0 0 2500 797 7 | 0 78 55 99 35 0 0 2500 797 8 | 0 78 55 100 34 0 0 2500 797 9 | 0 87 56 99 34 0 0 2500 797 10 | 0 79 55 99 36 0 0 2500 797 11 | 0 79 55 99 35 0 0 2500 797 12 | 0 107 57 99 20 0 0 2500 797 13 | 0 95 56 99 25 0 0 2500 797 14 | 0 81 56 99 35 0 0 2500 797 15 | 0 86 55 99 35 0 0 2500 797 16 | 0 100 57 99 33 0 0 2500 797 17 | 0 97 56 99 34 0 0 2500 797 18 | 0 78 55 99 35 0 0 2500 797 19 | 0 100 56 99 35 0 0 2500 797 20 | 0 85 56 99 35 0 0 2500 797 21 | 0 79 55 100 35 0 0 2500 797 22 | 0 78 55 99 42 0 0 2500 797 23 | 0 78 55 99 34 0 0 2500 797 24 | 0 78 55 99 69 0 0 2500 797 25 | 0 78 56 99 28 0 0 2500 797 26 | 0 91 56 99 16 0 0 2500 797 27 | 0 96 56 95 45 0 0 2500 797 28 | 0 93 56 99 34 0 0 2500 797 29 | 0 91 56 99 28 0 0 2500 797 30 | 0 95 56 99 34 0 0 2500 797 31 | 0 92 56 99 43 0 0 2500 797 32 | 0 85 56 99 26 0 0 2500 797 33 | 0 97 56 99 40 0 0 2500 797 34 | 0 96 57 99 29 0 0 2500 797 35 | 0 95 56 99 36 0 0 2500 797 36 | 0 90 57 98 42 0 0 2500 797 37 | 0 96 57 99 33 0 0 2500 797 38 | 0 90 57 99 24 0 0 2500 797 39 | 0 90 56 99 36 0 0 2500 797 40 | 0 99 57 99 39 0 0 2500 797 41 | 0 93 57 99 8 0 0 2500 797 42 | 0 97 57 99 36 0 0 2500 797 43 | 0 93 57 99 35 0 0 2500 797 44 | 0 91 57 99 38 0 0 2500 797 45 | 0 90 56 99 17 0 0 2500 797 46 | 0 95 56 99 34 0 0 2500 797 47 | # gpu pwr temp sm mem enc dec mclk pclk 48 | # Idx W C % % % % MHz MHz 49 | 0 89 57 99 43 0 0 2500 797 50 | 0 90 57 99 26 0 0 2500 797 51 | 0 92 57 99 40 0 0 2500 797 52 | 0 99 57 99 29 0 0 2500 797 53 | 0 95 57 99 35 0 0 2500 797 54 | 0 90 57 98 42 0 0 2500 797 55 | 0 90 57 99 33 0 0 2500 797 56 | 0 91 57 99 24 0 0 2500 797 57 | 0 87 57 99 35 0 0 2500 797 58 | 0 95 57 99 39 0 0 2500 797 59 | 0 95 57 99 8 0 0 2500 797 60 | 0 98 57 99 36 0 0 2500 797 61 | 0 96 57 99 35 0 0 2500 797 62 | 0 98 57 99 36 0 0 2500 797 63 | 0 93 57 99 17 0 0 2500 797 64 | 0 85 57 99 34 0 0 2500 797 65 | 0 89 57 99 43 0 0 2500 797 66 | 0 91 57 99 25 0 0 2500 797 
67 | 0 100 57 99 40 0 0 2500 797 68 | 0 100 58 99 30 0 0 2500 797 69 | 0 99 58 99 36 0 0 2500 797 70 | 0 95 57 98 43 0 0 2500 797 71 | 0 90 57 99 34 0 0 2500 797 72 | 0 91 57 99 25 0 0 2500 797 73 | 0 95 57 99 36 0 0 2500 797 74 | 0 92 57 99 39 0 0 2500 797 75 | 0 85 58 99 9 0 0 2500 797 76 | 0 96 58 99 35 0 0 2500 797 77 | 0 95 58 99 36 0 0 2500 797 78 | 0 94 57 99 36 0 0 2500 797 79 | 0 91 57 99 16 0 0 2500 797 80 | 0 94 57 99 35 0 0 2500 797 81 | 0 89 57 99 43 0 0 2500 797 82 | 0 94 57 99 24 0 0 2500 797 83 | 0 98 58 99 40 0 0 2500 797 84 | 0 99 58 99 31 0 0 2500 797 85 | 0 98 58 99 36 0 0 2500 797 86 | 0 94 58 99 35 0 0 2500 797 87 | 0 89 57 99 36 0 0 2500 797 88 | 0 90 58 99 21 0 0 2500 797 89 | 0 94 58 99 33 0 0 2500 797 90 | 0 95 57 99 43 0 0 2500 797 91 | 0 92 57 99 27 0 0 2500 797 92 | 0 90 58 99 40 0 0 2500 797 93 | # gpu pwr temp sm mem enc dec mclk pclk 94 | # Idx W C % % % % MHz MHz 95 | 0 98 58 99 26 0 0 2500 797 96 | 0 96 58 99 36 0 0 2500 797 97 | 0 91 58 98 41 0 0 2500 797 98 | 0 92 58 99 34 0 0 2500 797 99 | 0 91 58 99 22 0 0 2500 797 100 | 0 86 58 99 34 0 0 2500 797 101 | 0 93 58 99 39 0 0 2500 797 102 | 0 95 58 99 8 0 0 2500 797 103 | 0 100 58 99 36 0 0 2500 797 104 | 0 95 58 99 34 0 0 2500 797 105 | 0 95 58 99 38 0 0 2500 797 106 | 0 92 58 99 19 0 0 2500 797 107 | 0 87 58 99 34 0 0 2500 797 108 | 0 87 58 99 43 0 0 2500 797 109 | 0 89 58 99 26 0 0 2500 797 110 | 0 93 58 99 40 0 0 2500 797 111 | 0 100 59 99 28 0 0 2500 797 112 | 0 97 58 99 36 0 0 2500 797 113 | 0 96 58 98 42 0 0 2500 797 114 | 0 93 58 99 34 0 0 2500 797 115 | 0 91 58 99 24 0 0 2500 797 116 | 0 86 58 99 36 0 0 2500 797 117 | 0 94 57 99 39 0 0 2500 797 118 | 0 86 58 99 8 0 0 2500 797 119 | 0 97 58 99 36 0 0 2500 797 120 | 0 94 58 99 36 0 0 2500 797 121 | 0 92 58 99 36 0 0 2500 797 122 | 0 92 58 99 17 0 0 2500 797 123 | 0 94 58 99 34 0 0 2500 797 124 | 0 89 58 99 43 0 0 2500 797 125 | 0 92 58 99 24 0 0 2500 797 126 | 0 97 58 99 39 0 0 2500 797 127 | 0 99 59 99 30 0 0 2500 797 128 | 0 96 58 99 37 0 0 2500 797 129 | 0 94 58 98 43 0 0 2500 797 130 | 0 91 58 99 34 0 0 2500 797 131 | 0 92 58 99 26 0 0 2500 797 132 | 0 91 58 99 37 0 0 2500 797 133 | 0 97 58 99 39 0 0 2500 797 134 | 0 90 58 99 9 0 0 2500 797 135 | 0 92 58 99 36 0 0 2500 797 136 | 0 97 59 99 38 0 0 2500 797 137 | 0 97 59 99 36 0 0 2500 797 138 | 0 92 58 99 13 0 0 2500 797 139 | # gpu pwr temp sm mem enc dec mclk pclk 140 | # Idx W C % % % % MHz MHz 141 | 0 87 59 99 34 0 0 2500 797 142 | 0 89 58 99 24 0 0 2500 797 143 | 0 90 58 99 36 0 0 2500 797 144 | 0 97 58 99 39 0 0 2500 797 145 | 0 98 59 99 9 0 0 2500 797 146 | 0 99 59 99 36 0 0 2500 797 147 | 0 97 58 99 36 0 0 2500 797 148 | 0 89 59 99 36 0 0 2500 797 149 | 0 92 58 99 17 0 0 2500 797 150 | 0 96 58 99 35 0 0 2500 797 151 | 0 89 58 99 43 0 0 2500 797 152 | 0 87 58 99 24 0 0 2500 797 153 | 0 92 59 99 39 0 0 2500 797 154 | 0 97 59 99 31 0 0 2500 797 155 | 0 95 59 99 37 0 0 2500 797 156 | 0 95 59 98 43 0 0 2500 797 157 | 0 98 58 99 33 0 0 2500 797 158 | 0 90 58 99 26 0 0 2500 797 159 | 0 95 58 99 37 0 0 2500 797 160 | 0 100 59 99 39 0 0 2500 797 161 | 0 91 59 99 9 0 0 2500 797 162 | 0 97 59 99 36 0 0 2500 797 163 | 0 94 59 99 37 0 0 2500 797 164 | 0 90 58 99 36 0 0 2500 797 165 | 0 90 58 99 14 0 0 2500 797 166 | 0 97 58 99 34 0 0 2500 797 167 | 0 89 58 99 44 0 0 2500 797 168 | 0 92 58 99 22 0 0 2500 797 169 | 0 94 59 99 39 0 0 2500 797 170 | 0 99 59 99 33 0 0 2500 797 171 | 0 93 59 99 36 0 0 2500 797 172 | 0 91 59 98 43 0 0 2500 797 173 | 0 91 58 99 34 0 0 2500 797 174 | 0 92 59 99 28 0 0 2500 797 
175 | 0 89 59 99 34 0 0 2500 797 176 | 0 96 59 99 39 0 0 2500 797 177 | 0 96 59 99 9 0 0 2500 797 178 | 0 96 59 99 36 0 0 2500 797 179 | 0 99 59 99 36 0 0 2500 797 180 | 0 93 59 99 35 0 0 2500 797 181 | 0 91 59 99 11 0 0 2500 797 182 | 0 88 59 99 35 0 0 2500 797 183 | 0 87 59 99 44 0 0 2500 797 184 | 0 92 59 99 17 0 0 2500 797 185 | # gpu pwr temp sm mem enc dec mclk pclk 186 | # Idx W C % % % % MHz MHz 187 | 0 100 59 99 39 0 0 2500 797 188 | 0 100 59 99 36 0 0 2500 797 189 | 0 95 59 99 37 0 0 2500 797 190 | 0 95 59 98 39 0 0 2500 797 191 | 0 92 59 99 34 0 0 2500 797 192 | 0 92 59 99 33 0 0 2500 797 193 | 0 97 59 99 33 0 0 2500 797 194 | 0 95 58 99 40 0 0 2500 797 195 | 0 84 59 99 21 0 0 2500 797 196 | 0 96 59 99 39 0 0 2500 797 197 | 0 94 59 99 33 0 0 2500 797 198 | 0 91 59 99 37 0 0 2500 797 199 | 0 92 59 98 43 0 0 2500 797 200 | 0 96 59 99 34 0 0 2500 797 201 | 0 89 59 99 28 0 0 2500 797 202 | 0 95 59 99 35 0 0 2500 797 203 | 0 97 59 99 39 0 0 2500 797 204 | 0 98 59 99 9 0 0 2500 797 205 | 0 97 59 99 36 0 0 2500 797 206 | 0 95 59 99 36 0 0 2500 797 207 | 0 88 58 99 35 0 0 2500 797 208 | 0 92 59 99 11 0 0 2500 797 209 | 0 95 59 99 34 0 0 2500 797 210 | 0 97 59 99 44 0 0 2500 797 211 | 0 88 58 99 18 0 0 2500 797 212 | 0 93 59 99 39 0 0 2500 797 213 | 0 96 59 99 35 0 0 2500 797 214 | 0 94 59 99 37 0 0 2500 797 215 | 0 92 59 98 40 0 0 2500 797 216 | 0 89 59 99 34 0 0 2500 797 217 | 0 91 59 99 32 0 0 2500 797 218 | 0 87 59 99 33 0 0 2500 797 219 | 0 95 59 99 39 0 0 2500 797 220 | 0 97 59 99 10 0 0 2500 797 221 | 0 98 59 99 35 0 0 2500 797 222 | 0 94 59 99 36 0 0 2500 797 223 | 0 89 59 99 35 0 0 2500 797 224 | 0 93 59 99 8 0 0 2500 797 225 | 0 93 59 99 34 0 0 2500 797 226 | 0 87 59 99 44 0 0 2500 797 227 | 0 88 59 99 17 0 0 2500 797 228 | 0 92 59 99 39 0 0 2500 797 229 | 0 98 59 99 38 0 0 2500 797 230 | 0 95 59 99 38 0 0 2500 797 231 | # gpu pwr temp sm mem enc dec mclk pclk 232 | # Idx W C % % % % MHz MHz 233 | 0 94 59 98 37 0 0 2500 797 234 | 0 96 59 99 34 0 0 2500 797 235 | 0 90 59 99 37 0 0 2500 797 236 | 0 91 59 99 34 0 0 2500 797 237 | 0 96 59 99 41 0 0 2500 797 238 | 0 88 59 99 12 0 0 2500 797 239 | 0 97 59 99 35 0 0 2500 797 240 | 0 94 59 99 36 0 0 2500 797 241 | 0 91 59 99 35 0 0 2500 797 242 | 0 92 59 99 8 0 0 2500 797 243 | 0 89 59 99 34 0 0 2500 797 244 | 0 89 59 99 44 0 0 2500 797 245 | 0 93 59 99 16 0 0 2500 797 246 | 0 96 59 99 38 0 0 2500 797 247 | 0 99 60 99 39 0 0 2500 797 248 | 0 94 59 99 38 0 0 2500 797 249 | 0 97 59 99 36 0 0 2500 797 250 | 0 90 59 99 35 0 0 2500 797 251 | 0 91 59 99 9 0 0 2500 797 252 | 0 89 59 99 35 0 0 2500 797 253 | 0 96 58 99 44 0 0 2500 797 254 | 0 93 59 99 17 0 0 2500 797 255 | 0 94 59 99 39 0 0 2500 797 256 | 0 97 59 99 38 0 0 2500 797 257 | 0 95 59 99 38 0 0 2500 797 258 | 0 93 59 98 37 0 0 2500 797 259 | 0 89 59 99 34 0 0 2500 797 260 | 0 89 59 99 37 0 0 2500 797 261 | 0 94 59 99 34 0 0 2500 797 262 | 0 99 59 99 41 0 0 2500 797 263 | -------------------------------------------------------------------------------- /results/k520/benchmark_tensorflow_slim.output: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library 
libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 6 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 7 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 8 | name: GRID K520 9 | major: 3 minor: 0 memoryClockRate (GHz) 0.797 10 | pciBusID 0000:00:03.0 11 | Total memory: 4.00GiB 12 | Free memory: 3.95GiB 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 15 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GRID K520, pci bus id: 0000:00:03.0) 16 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GRID K520, pci bus id: 0000:00:03.0) 17 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.52GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 18 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.29GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 19 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.19GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 20 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.19GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 21 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 22 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 23 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 24 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.15GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 25 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 26 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.15GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 
27 | Batch size: 16 28 | Iterations: 100 29 | Time per iteration: 2488.508 ms 30 | -------------------------------------------------------------------------------- /results/k80/INFO.md: -------------------------------------------------------------------------------- 1 | - Hardware: AWS EC2 p2.xlarge (half of NVIDIA K80) 2 | - OS: Deep Learning AMI for Amazon Linux, v1.5 3 | - CUDA: 7.5 4 | - CuDNN: 5.1 5 | - Caffe: 1.0.0-rc3 6 | - Keras: 1.0.8 7 | - Theano: 0.8.2.dev-127aa70c6d34421ba7808b23d8bea506dc878068 8 | - TensorFlow: 0.10.0 9 | - Neon: N/A 10 | - MXNet: 0.7.0 11 | - Python: 2.7.12 12 | -------------------------------------------------------------------------------- /results/k80/benchmark_keras_tensorflow.output: -------------------------------------------------------------------------------- 1 | Using TensorFlow backend. 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 6 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 7 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 8 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 9 | name: Tesla K80 10 | major: 3 minor: 7 memoryClockRate (GHz) 0.8235 11 | pciBusID 0000:00:1e.0 12 | Total memory: 11.25GiB 13 | Free memory: 11.13GiB 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 15 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 16 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0) 17 | Iteration: 0 train on batch time: 1017.236 ms. 18 | Iteration: 1 train on batch time: 1020.970 ms. 19 | Iteration: 2 train on batch time: 1018.325 ms. 20 | Iteration: 3 train on batch time: 1012.985 ms. 21 | Iteration: 4 train on batch time: 1020.162 ms. 22 | Iteration: 5 train on batch time: 1032.775 ms. 23 | Iteration: 6 train on batch time: 1014.535 ms. 24 | Iteration: 7 train on batch time: 1011.061 ms. 25 | Iteration: 8 train on batch time: 1105.583 ms. 26 | Iteration: 9 train on batch time: 1017.483 ms. 27 | Iteration: 10 train on batch time: 1018.831 ms. 28 | Iteration: 11 train on batch time: 1021.091 ms. 29 | Iteration: 12 train on batch time: 1026.713 ms. 30 | Iteration: 13 train on batch time: 1021.117 ms. 31 | Iteration: 14 train on batch time: 1013.455 ms. 32 | Iteration: 15 train on batch time: 1036.014 ms. 33 | Iteration: 16 train on batch time: 1023.874 ms. 34 | Iteration: 17 train on batch time: 1016.743 ms. 35 | Iteration: 18 train on batch time: 1019.208 ms. 36 | Iteration: 19 train on batch time: 1032.383 ms. 37 | Iteration: 20 train on batch time: 1016.591 ms. 38 | Iteration: 21 train on batch time: 1026.585 ms. 39 | Iteration: 22 train on batch time: 1039.200 ms. 40 | Iteration: 23 train on batch time: 1032.526 ms. 41 | Iteration: 24 train on batch time: 1026.185 ms. 42 | Iteration: 25 train on batch time: 1027.078 ms. 43 | Iteration: 26 train on batch time: 1026.375 ms. 
44 | Iteration: 27 train on batch time: 1030.565 ms. 45 | Iteration: 28 train on batch time: 1023.217 ms. 46 | Iteration: 29 train on batch time: 1038.791 ms. 47 | Iteration: 30 train on batch time: 1051.703 ms. 48 | Iteration: 31 train on batch time: 1028.012 ms. 49 | Iteration: 32 train on batch time: 1026.479 ms. 50 | Iteration: 33 train on batch time: 1025.240 ms. 51 | Iteration: 34 train on batch time: 1028.033 ms. 52 | Iteration: 35 train on batch time: 1024.030 ms. 53 | Iteration: 36 train on batch time: 1031.428 ms. 54 | Iteration: 37 train on batch time: 1047.575 ms. 55 | Iteration: 38 train on batch time: 1025.456 ms. 56 | Iteration: 39 train on batch time: 1024.458 ms. 57 | Iteration: 40 train on batch time: 1036.066 ms. 58 | Iteration: 41 train on batch time: 1030.012 ms. 59 | Iteration: 42 train on batch time: 1023.693 ms. 60 | Iteration: 43 train on batch time: 1034.144 ms. 61 | Iteration: 44 train on batch time: 1037.713 ms. 62 | Iteration: 45 train on batch time: 1023.031 ms. 63 | Iteration: 46 train on batch time: 1013.374 ms. 64 | Iteration: 47 train on batch time: 1040.580 ms. 65 | Iteration: 48 train on batch time: 1022.442 ms. 66 | Iteration: 49 train on batch time: 1017.749 ms. 67 | Iteration: 50 train on batch time: 1014.432 ms. 68 | Iteration: 51 train on batch time: 1027.870 ms. 69 | Iteration: 52 train on batch time: 1013.653 ms. 70 | Iteration: 53 train on batch time: 1013.838 ms. 71 | Iteration: 54 train on batch time: 1029.386 ms. 72 | Iteration: 55 train on batch time: 1013.163 ms. 73 | Iteration: 56 train on batch time: 1007.933 ms. 74 | Iteration: 57 train on batch time: 1009.042 ms. 75 | Iteration: 58 train on batch time: 1030.207 ms. 76 | Iteration: 59 train on batch time: 1013.684 ms. 77 | Iteration: 60 train on batch time: 1012.273 ms. 78 | Iteration: 61 train on batch time: 1124.486 ms. 79 | Iteration: 62 train on batch time: 1010.607 ms. 80 | Iteration: 63 train on batch time: 1007.886 ms. 81 | Iteration: 64 train on batch time: 1008.700 ms. 82 | Iteration: 65 train on batch time: 1022.307 ms. 83 | Iteration: 66 train on batch time: 1010.578 ms. 84 | Iteration: 67 train on batch time: 1006.925 ms. 85 | Iteration: 68 train on batch time: 1011.284 ms. 86 | Iteration: 69 train on batch time: 1021.331 ms. 87 | Iteration: 70 train on batch time: 1006.844 ms. 88 | Iteration: 71 train on batch time: 1008.393 ms. 89 | Iteration: 72 train on batch time: 1009.937 ms. 90 | Iteration: 73 train on batch time: 1008.786 ms. 91 | Iteration: 74 train on batch time: 1005.366 ms. 92 | Iteration: 75 train on batch time: 1011.294 ms. 93 | Iteration: 76 train on batch time: 1016.031 ms. 94 | Iteration: 77 train on batch time: 1009.339 ms. 95 | Iteration: 78 train on batch time: 1010.431 ms. 96 | Iteration: 79 train on batch time: 1095.084 ms. 97 | Iteration: 80 train on batch time: 1014.021 ms. 98 | Iteration: 81 train on batch time: 1007.499 ms. 99 | Iteration: 82 train on batch time: 1014.144 ms. 100 | Iteration: 83 train on batch time: 1010.009 ms. 101 | Iteration: 84 train on batch time: 1006.829 ms. 102 | Iteration: 85 train on batch time: 1004.976 ms. 103 | Iteration: 86 train on batch time: 1031.637 ms. 104 | Iteration: 87 train on batch time: 1008.206 ms. 105 | Iteration: 88 train on batch time: 1002.442 ms. 106 | Iteration: 89 train on batch time: 1005.437 ms. 107 | Iteration: 90 train on batch time: 1016.133 ms. 108 | Iteration: 91 train on batch time: 1007.179 ms. 109 | Iteration: 92 train on batch time: 1004.678 ms. 
110 | Iteration: 93 train on batch time: 1019.152 ms. 111 | Iteration: 94 train on batch time: 1010.556 ms. 112 | Iteration: 95 train on batch time: 1005.696 ms. 113 | Iteration: 96 train on batch time: 1006.815 ms. 114 | Iteration: 97 train on batch time: 1013.493 ms. 115 | Iteration: 98 train on batch time: 1011.643 ms. 116 | Iteration: 99 train on batch time: 1007.593 ms. 117 | Batch size: 16 118 | Iterations: 100 119 | Time per iteration: 1021.813 ms 120 | -------------------------------------------------------------------------------- /results/k80/benchmark_keras_theano.output: -------------------------------------------------------------------------------- 1 | Iteration: 0 train on batch time: 1156.279 ms. 2 | Iteration: 1 train on batch time: 1152.773 ms. 3 | Iteration: 2 train on batch time: 1143.487 ms. 4 | Iteration: 3 train on batch time: 1143.189 ms. 5 | Iteration: 4 train on batch time: 1156.002 ms. 6 | Iteration: 5 train on batch time: 1152.428 ms. 7 | Iteration: 6 train on batch time: 1142.399 ms. 8 | Iteration: 7 train on batch time: 1145.704 ms. 9 | Iteration: 8 train on batch time: 1138.460 ms. 10 | Iteration: 9 train on batch time: 1189.352 ms. 11 | Iteration: 10 train on batch time: 1143.049 ms. 12 | Iteration: 11 train on batch time: 1146.637 ms. 13 | Iteration: 12 train on batch time: 1167.725 ms. 14 | Iteration: 13 train on batch time: 1173.385 ms. 15 | Iteration: 14 train on batch time: 1145.851 ms. 16 | Iteration: 15 train on batch time: 1156.808 ms. 17 | Iteration: 16 train on batch time: 1160.264 ms. 18 | Iteration: 17 train on batch time: 1155.437 ms. 19 | Iteration: 18 train on batch time: 1136.610 ms. 20 | Iteration: 19 train on batch time: 1151.689 ms. 21 | Iteration: 20 train on batch time: 1148.285 ms. 22 | Iteration: 21 train on batch time: 1150.323 ms. 23 | Iteration: 22 train on batch time: 1136.111 ms. 24 | Iteration: 23 train on batch time: 1151.901 ms. 25 | Iteration: 24 train on batch time: 1138.016 ms. 26 | Iteration: 25 train on batch time: 1153.965 ms. 27 | Iteration: 26 train on batch time: 1127.675 ms. 28 | Iteration: 27 train on batch time: 1142.117 ms. 29 | Iteration: 28 train on batch time: 1151.639 ms. 30 | Iteration: 29 train on batch time: 1146.955 ms. 31 | Iteration: 30 train on batch time: 1130.169 ms. 32 | Iteration: 31 train on batch time: 1129.715 ms. 33 | Iteration: 32 train on batch time: 1142.414 ms. 34 | Iteration: 33 train on batch time: 1136.229 ms. 35 | Iteration: 34 train on batch time: 1134.486 ms. 36 | Iteration: 35 train on batch time: 1141.239 ms. 37 | Iteration: 36 train on batch time: 1132.407 ms. 38 | Iteration: 37 train on batch time: 1133.712 ms. 39 | Iteration: 38 train on batch time: 1207.620 ms. 40 | Iteration: 39 train on batch time: 1139.921 ms. 41 | Iteration: 40 train on batch time: 1127.063 ms. 42 | Iteration: 41 train on batch time: 1138.914 ms. 43 | Iteration: 42 train on batch time: 1121.130 ms. 44 | Iteration: 43 train on batch time: 1132.552 ms. 45 | Iteration: 44 train on batch time: 1140.805 ms. 46 | Iteration: 45 train on batch time: 1134.416 ms. 47 | Iteration: 46 train on batch time: 1119.847 ms. 48 | Iteration: 47 train on batch time: 1137.358 ms. 49 | Iteration: 48 train on batch time: 1143.902 ms. 50 | Iteration: 49 train on batch time: 1134.764 ms. 51 | Iteration: 50 train on batch time: 1119.605 ms. 52 | Iteration: 51 train on batch time: 1137.893 ms. 53 | Iteration: 52 train on batch time: 1124.636 ms. 54 | Iteration: 53 train on batch time: 1134.856 ms. 
55 | Iteration: 54 train on batch time: 1115.432 ms. 56 | Iteration: 55 train on batch time: 1129.295 ms. 57 | Iteration: 56 train on batch time: 1115.512 ms. 58 | Iteration: 57 train on batch time: 1149.882 ms. 59 | Iteration: 58 train on batch time: 1121.193 ms. 60 | Iteration: 59 train on batch time: 1124.862 ms. 61 | Iteration: 60 train on batch time: 1150.790 ms. 62 | Iteration: 61 train on batch time: 1128.271 ms. 63 | Iteration: 62 train on batch time: 1120.504 ms. 64 | Iteration: 63 train on batch time: 1135.152 ms. 65 | Iteration: 64 train on batch time: 1144.666 ms. 66 | Iteration: 65 train on batch time: 1132.896 ms. 67 | Iteration: 66 train on batch time: 1125.409 ms. 68 | Iteration: 67 train on batch time: 1138.033 ms. 69 | Iteration: 68 train on batch time: 1137.423 ms. 70 | Iteration: 69 train on batch time: 1141.092 ms. 71 | Iteration: 70 train on batch time: 1123.861 ms. 72 | Iteration: 71 train on batch time: 1132.350 ms. 73 | Iteration: 72 train on batch time: 1127.056 ms. 74 | Iteration: 73 train on batch time: 1141.376 ms. 75 | Iteration: 74 train on batch time: 1125.757 ms. 76 | Iteration: 75 train on batch time: 1131.033 ms. 77 | Iteration: 76 train on batch time: 1153.230 ms. 78 | Iteration: 77 train on batch time: 1135.386 ms. 79 | Iteration: 78 train on batch time: 1128.103 ms. 80 | Iteration: 79 train on batch time: 1158.865 ms. 81 | Iteration: 80 train on batch time: 1145.302 ms. 82 | Iteration: 81 train on batch time: 1140.393 ms. 83 | Iteration: 82 train on batch time: 1135.405 ms. 84 | Iteration: 83 train on batch time: 1157.408 ms. 85 | Iteration: 84 train on batch time: 1134.044 ms. 86 | Iteration: 85 train on batch time: 1145.173 ms. 87 | Iteration: 86 train on batch time: 1138.993 ms. 88 | Iteration: 87 train on batch time: 1140.910 ms. 89 | Iteration: 88 train on batch time: 1147.842 ms. 90 | Iteration: 89 train on batch time: 1144.074 ms. 91 | Iteration: 90 train on batch time: 1140.689 ms. 92 | Iteration: 91 train on batch time: 1142.731 ms. 93 | Iteration: 92 train on batch time: 1156.817 ms. 94 | Iteration: 93 train on batch time: 1145.158 ms. 95 | Iteration: 94 train on batch time: 1137.080 ms. 96 | Iteration: 95 train on batch time: 1164.166 ms. 97 | Iteration: 96 train on batch time: 1142.763 ms. 98 | Iteration: 97 train on batch time: 1140.657 ms. 99 | Iteration: 98 train on batch time: 1168.779 ms. 100 | Iteration: 99 train on batch time: 1161.280 ms. 
101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 1141.791 ms 104 | -------------------------------------------------------------------------------- /results/k80/benchmark_mxnet.output: -------------------------------------------------------------------------------- 1 | Batch size: 16 2 | Iterations: 400 3 | Time per iteration: 1247.469 ms 4 | -------------------------------------------------------------------------------- /results/k80/benchmark_neon.output: -------------------------------------------------------------------------------- 1 | Epoch 0 [Train |████████████████████| 1/1 batches, 6.91 cost, 130.02s] 2 | Epoch 1 [Train |████████████████████| 1/1 batches, 6.91 cost, 130.25s] 3 | Epoch 2 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.64s] 4 | Epoch 3 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.35s] 5 | Epoch 4 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.55s] 6 | Epoch 5 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.53s] 7 | Epoch 6 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.70s] 8 | Epoch 7 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.68s] 9 | Epoch 8 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.45s] 10 | Epoch 9 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.41s] 11 | Epoch 10 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.48s] 12 | Epoch 11 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.21s] 13 | Epoch 12 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.64s] 14 | Epoch 13 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.41s] 15 | Epoch 14 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.79s] 16 | Epoch 15 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.61s] 17 | Epoch 16 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.91s] 18 | Epoch 17 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.72s] 19 | -------------------------------------------------------------------------------- /results/k80/benchmark_tensorflow.output: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 6 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 7 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 8 | name: Tesla K80 9 | major: 3 minor: 7 memoryClockRate (GHz) 0.8235 10 | pciBusID 0000:00:1e.0 11 | Total memory: 11.25GiB 12 | Free memory: 11.13GiB 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 15 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0) 16 | Iteration: 0 train on batch time: 1085.995 ms. 17 | Iteration: 1 train on batch time: 1049.083 ms. 
18 | Iteration: 2 train on batch time: 1039.483 ms. 19 | Iteration: 3 train on batch time: 1051.917 ms. 20 | Iteration: 4 train on batch time: 1053.509 ms. 21 | Iteration: 5 train on batch time: 1046.565 ms. 22 | Iteration: 6 train on batch time: 1040.916 ms. 23 | Iteration: 7 train on batch time: 1063.469 ms. 24 | Iteration: 8 train on batch time: 1054.336 ms. 25 | Iteration: 9 train on batch time: 1048.346 ms. 26 | Iteration: 10 train on batch time: 1050.751 ms. 27 | Iteration: 11 train on batch time: 1052.788 ms. 28 | Iteration: 12 train on batch time: 1057.438 ms. 29 | Iteration: 13 train on batch time: 1054.737 ms. 30 | Iteration: 14 train on batch time: 1071.120 ms. 31 | Iteration: 15 train on batch time: 1061.348 ms. 32 | Iteration: 16 train on batch time: 1055.097 ms. 33 | Iteration: 17 train on batch time: 1082.748 ms. 34 | Iteration: 18 train on batch time: 1057.704 ms. 35 | Iteration: 19 train on batch time: 1050.902 ms. 36 | Iteration: 20 train on batch time: 1069.026 ms. 37 | Iteration: 21 train on batch time: 1064.463 ms. 38 | Iteration: 22 train on batch time: 1059.791 ms. 39 | Iteration: 23 train on batch time: 1063.129 ms. 40 | Iteration: 24 train on batch time: 1081.003 ms. 41 | Iteration: 25 train on batch time: 1065.987 ms. 42 | Iteration: 26 train on batch time: 1061.391 ms. 43 | Iteration: 27 train on batch time: 1075.312 ms. 44 | Iteration: 28 train on batch time: 1082.355 ms. 45 | Iteration: 29 train on batch time: 1066.022 ms. 46 | Iteration: 30 train on batch time: 1068.718 ms. 47 | Iteration: 31 train on batch time: 1084.667 ms. 48 | Iteration: 32 train on batch time: 1068.265 ms. 49 | Iteration: 33 train on batch time: 1065.237 ms. 50 | Iteration: 34 train on batch time: 1085.944 ms. 51 | Iteration: 35 train on batch time: 1071.550 ms. 52 | Iteration: 36 train on batch time: 1061.112 ms. 53 | Iteration: 37 train on batch time: 1072.375 ms. 54 | Iteration: 38 train on batch time: 1092.962 ms. 55 | Iteration: 39 train on batch time: 1081.078 ms. 56 | Iteration: 40 train on batch time: 1064.120 ms. 57 | Iteration: 41 train on batch time: 1103.913 ms. 58 | Iteration: 42 train on batch time: 1078.496 ms. 59 | Iteration: 43 train on batch time: 1065.931 ms. 60 | Iteration: 44 train on batch time: 1067.544 ms. 61 | Iteration: 45 train on batch time: 1091.250 ms. 62 | Iteration: 46 train on batch time: 1071.544 ms. 63 | Iteration: 47 train on batch time: 1060.247 ms. 64 | Iteration: 48 train on batch time: 1183.992 ms. 65 | Iteration: 49 train on batch time: 1064.452 ms. 66 | Iteration: 50 train on batch time: 1059.961 ms. 67 | Iteration: 51 train on batch time: 1064.271 ms. 68 | Iteration: 52 train on batch time: 1081.608 ms. 69 | Iteration: 53 train on batch time: 1052.056 ms. 70 | Iteration: 54 train on batch time: 1043.359 ms. 71 | Iteration: 55 train on batch time: 1065.859 ms. 72 | Iteration: 56 train on batch time: 1055.350 ms. 73 | Iteration: 57 train on batch time: 1047.651 ms. 74 | Iteration: 58 train on batch time: 1075.331 ms. 75 | Iteration: 59 train on batch time: 1046.515 ms. 76 | Iteration: 60 train on batch time: 1043.636 ms. 77 | Iteration: 61 train on batch time: 1049.837 ms. 78 | Iteration: 62 train on batch time: 1053.220 ms. 79 | Iteration: 63 train on batch time: 1056.382 ms. 80 | Iteration: 64 train on batch time: 1042.745 ms. 81 | Iteration: 65 train on batch time: 1051.990 ms. 82 | Iteration: 66 train on batch time: 1046.271 ms. 83 | Iteration: 67 train on batch time: 1037.513 ms. 84 | Iteration: 68 train on batch time: 1031.894 ms. 
85 | Iteration: 69 train on batch time: 1052.511 ms. 86 | Iteration: 70 train on batch time: 1050.446 ms. 87 | Iteration: 71 train on batch time: 1024.116 ms. 88 | Iteration: 72 train on batch time: 1147.200 ms. 89 | Iteration: 73 train on batch time: 1029.651 ms. 90 | Iteration: 74 train on batch time: 1029.550 ms. 91 | Iteration: 75 train on batch time: 1048.105 ms. 92 | Iteration: 76 train on batch time: 1050.782 ms. 93 | Iteration: 77 train on batch time: 1024.452 ms. 94 | Iteration: 78 train on batch time: 1027.308 ms. 95 | Iteration: 79 train on batch time: 1042.816 ms. 96 | Iteration: 80 train on batch time: 1043.986 ms. 97 | Iteration: 81 train on batch time: 1033.075 ms. 98 | Iteration: 82 train on batch time: 1048.731 ms. 99 | Iteration: 83 train on batch time: 1048.694 ms. 100 | Iteration: 84 train on batch time: 1021.870 ms. 101 | Iteration: 85 train on batch time: 1025.438 ms. 102 | Iteration: 86 train on batch time: 1051.195 ms. 103 | Iteration: 87 train on batch time: 1040.411 ms. 104 | Iteration: 88 train on batch time: 1023.682 ms. 105 | Iteration: 89 train on batch time: 1043.288 ms. 106 | Iteration: 90 train on batch time: 1042.701 ms. 107 | Iteration: 91 train on batch time: 1022.932 ms. 108 | Iteration: 92 train on batch time: 1045.106 ms. 109 | Iteration: 93 train on batch time: 1050.785 ms. 110 | Iteration: 94 train on batch time: 1037.357 ms. 111 | Iteration: 95 train on batch time: 1025.389 ms. 112 | Iteration: 96 train on batch time: 1038.959 ms. 113 | Iteration: 97 train on batch time: 1045.040 ms. 114 | Iteration: 98 train on batch time: 1033.745 ms. 115 | Iteration: 99 train on batch time: 1035.373 ms. 116 | Batch size: 16 117 | Iterations: 100 118 | Time per iteration: 1057.119 ms 119 | -------------------------------------------------------------------------------- /results/k80/benchmark_tensorflow_slim.output: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 6 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 7 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 8 | name: Tesla K80 9 | major: 3 minor: 7 memoryClockRate (GHz) 0.8235 10 | pciBusID 0000:00:1e.0 11 | Total memory: 11.25GiB 12 | Free memory: 11.13GiB 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 15 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0) 16 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0) 17 | Batch size: 16 18 | Iterations: 100 19 | Time per iteration: 1126.704 ms 20 | -------------------------------------------------------------------------------- 
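The Maxwell Titan X results below were collected under thermal control: the fan was pinned to 100% and each test was started at a GPU temperature of 45 °C. A minimal sketch of how such a pre-run setup could be scripted is shown here; it is not part of this repo, and the `nvidia-settings` attribute names are assumptions that vary by driver version (manual fan control also requires an X server with the Coolbits option enabled):

```
# Hypothetical pre-run helper (not in this repo): pin the fan to 100%, then wait
# for the GPU to cool back down to the 45 C baseline before launching a benchmark.
nvidia-settings -a "[gpu:0]/GPUFanControlState=1" \
                -a "[fan:0]/GPUTargetFanSpeed=100"

# Poll the GPU temperature until it reaches the baseline.
while [ "$(nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits)" -gt 45 ]; do
  sleep 5
done
```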
/results/maxwell_titan_x/INFO.md: -------------------------------------------------------------------------------- 1 | ## Results Maxwell Titan X 2 | 3 | The minibatch size is 16. Because speed varied with GPU temperature, the fan speed was fixed at 100% and every test was started at a GPU temperature of 45 °C. 4 | 5 | | Framework | Time (per minibatch) | 6 | |:---|:---| 7 | | Neon | 207.406 ms | 8 | | Torch (1) | 273.542 ms | 9 | | Caffe (2) | 311.061 ms | 10 | | Keras (TensorFlow) | 360.753 ms | 11 | | Keras (Theano) | 317.298 ms | 12 | | TensorFlow | 332.27 ms | 13 | | TensorFlow (slim) (3) | 370.89 ms | 14 | | MXNet | 324.635 ms | 15 | 16 | (1) The code in https://github.com/jcjohnson/cnn-benchmarks was re-run with 100 iterations, 100% GPU fan speed, and a starting temperature of 45 °C. 17 | 18 | (2) The time is for a complete SGD step including parameter updates, not just the forward+backward time. 19 | 20 | (3) Uses the built-in slim training function, which likely adds some overhead. 21 | 22 | - Hardware: Titan X Maxwell 23 | - OS: Ubuntu 14.04.3 LTS 24 | - CUDA: 8.0 (cuda-repo-ubuntu1404-8-0-rc_8.0.27-1_amd64.deb) 25 | - CuDNN: 5.0 (cudnn-8.0-linux-x64-v5.0-ga.tgz) 26 | - Caffe: b2982c7 (from source) 27 | - Keras: 1.0.8 28 | - Theano: 0.9.0dev2.dev-338384adeabd2a56ccae22a9f1105a9f82ce9b8f 29 | - TensorFlow: 2a6d751 (from source) 30 | - Neon: 1.5.4 (485033c) 31 | - MXNet: 066373a (from source) 32 | - Python: 2.7.6 33 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_keras_tensorflow.output: -------------------------------------------------------------------------------- 1 | Iteration: 0 train on batch time: 361.193 ms. 2 | Iteration: 1 train on batch time: 362.416 ms. 3 | Iteration: 2 train on batch time: 358.185 ms. 4 | Iteration: 3 train on batch time: 376.177 ms. 5 | Iteration: 4 train on batch time: 382.268 ms. 6 | Iteration: 5 train on batch time: 356.856 ms. 7 | Iteration: 6 train on batch time: 374.421 ms. 8 | Iteration: 7 train on batch time: 362.809 ms. 9 | Iteration: 8 train on batch time: 355.884 ms. 10 | Iteration: 9 train on batch time: 361.131 ms. 11 | Iteration: 10 train on batch time: 387.957 ms. 12 | Iteration: 11 train on batch time: 379.649 ms. 13 | Iteration: 12 train on batch time: 357.392 ms. 14 | Iteration: 13 train on batch time: 357.279 ms. 15 | Iteration: 14 train on batch time: 358.012 ms. 16 | Iteration: 15 train on batch time: 356.508 ms. 17 | Iteration: 16 train on batch time: 357.140 ms. 18 | Iteration: 17 train on batch time: 358.781 ms. 19 | Iteration: 18 train on batch time: 358.696 ms. 20 | Iteration: 19 train on batch time: 358.779 ms. 21 | Iteration: 20 train on batch time: 357.467 ms. 22 | Iteration: 21 train on batch time: 357.330 ms. 23 | Iteration: 22 train on batch time: 356.411 ms. 24 | Iteration: 23 train on batch time: 358.103 ms. 25 | Iteration: 24 train on batch time: 355.624 ms. 26 | Iteration: 25 train on batch time: 354.981 ms. 27 | Iteration: 26 train on batch time: 357.243 ms. 28 | Iteration: 27 train on batch time: 357.966 ms. 29 | Iteration: 28 train on batch time: 358.989 ms. 30 | Iteration: 29 train on batch time: 356.666 ms. 31 | Iteration: 30 train on batch time: 356.416 ms. 32 | Iteration: 31 train on batch time: 356.045 ms. 33 | Iteration: 32 train on batch time: 358.270 ms. 34 | Iteration: 33 train on batch time: 357.858 ms. 35 | Iteration: 34 train on batch time: 358.622 ms. 36 | Iteration: 35 train on batch time: 356.075 ms.
37 | Iteration: 36 train on batch time: 356.267 ms. 38 | Iteration: 37 train on batch time: 357.824 ms. 39 | Iteration: 38 train on batch time: 357.307 ms. 40 | Iteration: 39 train on batch time: 357.824 ms. 41 | Iteration: 40 train on batch time: 355.557 ms. 42 | Iteration: 41 train on batch time: 357.795 ms. 43 | Iteration: 42 train on batch time: 355.751 ms. 44 | Iteration: 43 train on batch time: 356.666 ms. 45 | Iteration: 44 train on batch time: 356.345 ms. 46 | Iteration: 45 train on batch time: 356.759 ms. 47 | Iteration: 46 train on batch time: 357.225 ms. 48 | Iteration: 47 train on batch time: 358.491 ms. 49 | Iteration: 48 train on batch time: 356.588 ms. 50 | Iteration: 49 train on batch time: 356.175 ms. 51 | Iteration: 50 train on batch time: 358.338 ms. 52 | Iteration: 51 train on batch time: 357.325 ms. 53 | Iteration: 52 train on batch time: 357.810 ms. 54 | Iteration: 53 train on batch time: 357.431 ms. 55 | Iteration: 54 train on batch time: 357.332 ms. 56 | Iteration: 55 train on batch time: 358.000 ms. 57 | Iteration: 56 train on batch time: 357.014 ms. 58 | Iteration: 57 train on batch time: 359.220 ms. 59 | Iteration: 58 train on batch time: 355.329 ms. 60 | Iteration: 59 train on batch time: 356.514 ms. 61 | Iteration: 60 train on batch time: 356.627 ms. 62 | Iteration: 61 train on batch time: 358.517 ms. 63 | Iteration: 62 train on batch time: 356.074 ms. 64 | Iteration: 63 train on batch time: 356.139 ms. 65 | Iteration: 64 train on batch time: 356.988 ms. 66 | Iteration: 65 train on batch time: 355.899 ms. 67 | Iteration: 66 train on batch time: 357.607 ms. 68 | Iteration: 67 train on batch time: 358.560 ms. 69 | Iteration: 68 train on batch time: 357.264 ms. 70 | Iteration: 69 train on batch time: 358.806 ms. 71 | Iteration: 70 train on batch time: 357.964 ms. 72 | Iteration: 71 train on batch time: 357.494 ms. 73 | Iteration: 72 train on batch time: 356.260 ms. 74 | Iteration: 73 train on batch time: 357.511 ms. 75 | Iteration: 74 train on batch time: 357.707 ms. 76 | Iteration: 75 train on batch time: 358.153 ms. 77 | Iteration: 76 train on batch time: 357.256 ms. 78 | Iteration: 77 train on batch time: 357.494 ms. 79 | Iteration: 78 train on batch time: 358.091 ms. 80 | Iteration: 79 train on batch time: 356.644 ms. 81 | Iteration: 80 train on batch time: 357.613 ms. 82 | Iteration: 81 train on batch time: 357.086 ms. 83 | Iteration: 82 train on batch time: 358.372 ms. 84 | Iteration: 83 train on batch time: 361.066 ms. 85 | Iteration: 84 train on batch time: 367.126 ms. 86 | Iteration: 85 train on batch time: 388.391 ms. 87 | Iteration: 86 train on batch time: 366.225 ms. 88 | Iteration: 87 train on batch time: 364.328 ms. 89 | Iteration: 88 train on batch time: 357.749 ms. 90 | Iteration: 89 train on batch time: 361.414 ms. 91 | Iteration: 90 train on batch time: 381.775 ms. 92 | Iteration: 91 train on batch time: 378.080 ms. 93 | Iteration: 92 train on batch time: 382.169 ms. 94 | Iteration: 93 train on batch time: 367.850 ms. 95 | Iteration: 94 train on batch time: 381.222 ms. 96 | Iteration: 95 train on batch time: 358.042 ms. 97 | Iteration: 96 train on batch time: 361.979 ms. 98 | Iteration: 97 train on batch time: 358.980 ms. 99 | Iteration: 98 train on batch time: 376.588 ms. 100 | Iteration: 99 train on batch time: 373.197 ms. 
101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 360.753 ms 104 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_keras_theano.output: -------------------------------------------------------------------------------- 1 | Iteration: 0 train on batch time: 316.571 ms. 2 | Iteration: 1 train on batch time: 317.315 ms. 3 | Iteration: 2 train on batch time: 317.677 ms. 4 | Iteration: 3 train on batch time: 317.544 ms. 5 | Iteration: 4 train on batch time: 317.125 ms. 6 | Iteration: 5 train on batch time: 316.060 ms. 7 | Iteration: 6 train on batch time: 317.397 ms. 8 | Iteration: 7 train on batch time: 316.203 ms. 9 | Iteration: 8 train on batch time: 315.844 ms. 10 | Iteration: 9 train on batch time: 318.185 ms. 11 | Iteration: 10 train on batch time: 318.545 ms. 12 | Iteration: 11 train on batch time: 317.864 ms. 13 | Iteration: 12 train on batch time: 317.422 ms. 14 | Iteration: 13 train on batch time: 318.078 ms. 15 | Iteration: 14 train on batch time: 316.676 ms. 16 | Iteration: 15 train on batch time: 316.480 ms. 17 | Iteration: 16 train on batch time: 317.987 ms. 18 | Iteration: 17 train on batch time: 316.802 ms. 19 | Iteration: 18 train on batch time: 317.709 ms. 20 | Iteration: 19 train on batch time: 318.014 ms. 21 | Iteration: 20 train on batch time: 317.708 ms. 22 | Iteration: 21 train on batch time: 317.348 ms. 23 | Iteration: 22 train on batch time: 318.015 ms. 24 | Iteration: 23 train on batch time: 316.362 ms. 25 | Iteration: 24 train on batch time: 316.876 ms. 26 | Iteration: 25 train on batch time: 317.187 ms. 27 | Iteration: 26 train on batch time: 316.574 ms. 28 | Iteration: 27 train on batch time: 317.994 ms. 29 | Iteration: 28 train on batch time: 316.957 ms. 30 | Iteration: 29 train on batch time: 316.761 ms. 31 | Iteration: 30 train on batch time: 316.354 ms. 32 | Iteration: 31 train on batch time: 317.619 ms. 33 | Iteration: 32 train on batch time: 317.344 ms. 34 | Iteration: 33 train on batch time: 316.610 ms. 35 | Iteration: 34 train on batch time: 316.180 ms. 36 | Iteration: 35 train on batch time: 317.139 ms. 37 | Iteration: 36 train on batch time: 316.756 ms. 38 | Iteration: 37 train on batch time: 316.698 ms. 39 | Iteration: 38 train on batch time: 316.859 ms. 40 | Iteration: 39 train on batch time: 316.749 ms. 41 | Iteration: 40 train on batch time: 316.636 ms. 42 | Iteration: 41 train on batch time: 318.698 ms. 43 | Iteration: 42 train on batch time: 317.036 ms. 44 | Iteration: 43 train on batch time: 316.692 ms. 45 | Iteration: 44 train on batch time: 317.117 ms. 46 | Iteration: 45 train on batch time: 317.050 ms. 47 | Iteration: 46 train on batch time: 316.263 ms. 48 | Iteration: 47 train on batch time: 319.322 ms. 49 | Iteration: 48 train on batch time: 318.018 ms. 50 | Iteration: 49 train on batch time: 316.385 ms. 51 | Iteration: 50 train on batch time: 318.187 ms. 52 | Iteration: 51 train on batch time: 317.213 ms. 53 | Iteration: 52 train on batch time: 317.435 ms. 54 | Iteration: 53 train on batch time: 317.183 ms. 55 | Iteration: 54 train on batch time: 317.591 ms. 56 | Iteration: 55 train on batch time: 317.128 ms. 57 | Iteration: 56 train on batch time: 317.527 ms. 58 | Iteration: 57 train on batch time: 317.905 ms. 59 | Iteration: 58 train on batch time: 316.414 ms. 60 | Iteration: 59 train on batch time: 317.886 ms. 61 | Iteration: 60 train on batch time: 316.230 ms. 62 | Iteration: 61 train on batch time: 317.368 ms. 
63 | Iteration: 62 train on batch time: 316.102 ms. 64 | Iteration: 63 train on batch time: 317.860 ms. 65 | Iteration: 64 train on batch time: 316.927 ms. 66 | Iteration: 65 train on batch time: 317.274 ms. 67 | Iteration: 66 train on batch time: 316.880 ms. 68 | Iteration: 67 train on batch time: 317.347 ms. 69 | Iteration: 68 train on batch time: 317.733 ms. 70 | Iteration: 69 train on batch time: 318.467 ms. 71 | Iteration: 70 train on batch time: 318.170 ms. 72 | Iteration: 71 train on batch time: 316.283 ms. 73 | Iteration: 72 train on batch time: 319.012 ms. 74 | Iteration: 73 train on batch time: 317.762 ms. 75 | Iteration: 74 train on batch time: 316.997 ms. 76 | Iteration: 75 train on batch time: 316.797 ms. 77 | Iteration: 76 train on batch time: 317.759 ms. 78 | Iteration: 77 train on batch time: 316.534 ms. 79 | Iteration: 78 train on batch time: 316.722 ms. 80 | Iteration: 79 train on batch time: 316.897 ms. 81 | Iteration: 80 train on batch time: 316.676 ms. 82 | Iteration: 81 train on batch time: 317.529 ms. 83 | Iteration: 82 train on batch time: 318.468 ms. 84 | Iteration: 83 train on batch time: 317.519 ms. 85 | Iteration: 84 train on batch time: 317.008 ms. 86 | Iteration: 85 train on batch time: 318.626 ms. 87 | Iteration: 86 train on batch time: 316.383 ms. 88 | Iteration: 87 train on batch time: 317.032 ms. 89 | Iteration: 88 train on batch time: 317.510 ms. 90 | Iteration: 89 train on batch time: 317.324 ms. 91 | Iteration: 90 train on batch time: 317.124 ms. 92 | Iteration: 91 train on batch time: 318.425 ms. 93 | Iteration: 92 train on batch time: 316.791 ms. 94 | Iteration: 93 train on batch time: 317.963 ms. 95 | Iteration: 94 train on batch time: 317.323 ms. 96 | Iteration: 95 train on batch time: 318.413 ms. 97 | Iteration: 96 train on batch time: 317.258 ms. 98 | Iteration: 97 train on batch time: 316.941 ms. 99 | Iteration: 98 train on batch time: 317.517 ms. 100 | Iteration: 99 train on batch time: 318.043 ms. 
101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 317.298 ms 104 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_mxnet.output: -------------------------------------------------------------------------------- 1 | Batch size: 16 2 | Iterations: 400 3 | Time per iteration: 324.635 ms 4 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_neon.output: -------------------------------------------------------------------------------- 1 | Epoch 0 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.22s] 2 | Epoch 1 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.08s] 3 | Epoch 2 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.19s] 4 | Epoch 3 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.19s] 5 | Epoch 4 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.19s] 6 | Epoch 5 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.19s] 7 | Epoch 6 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 8 | Epoch 7 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 9 | Epoch 8 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 10 | Epoch 9 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 11 | Epoch 10 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 12 | Epoch 11 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 13 | Epoch 12 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 14 | Epoch 13 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 15 | Epoch 14 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 16 | Epoch 15 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 17 | Epoch 16 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 18 | Epoch 17 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 19 | Epoch 18 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 20 | Epoch 19 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 21 | Epoch 20 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 22 | Epoch 21 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 23 | Epoch 22 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 24 | Epoch 23 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 25 | Epoch 24 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 26 | Epoch 25 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 27 | Epoch 26 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 28 | Epoch 27 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 29 | Epoch 28 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 30 | Epoch 29 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 31 | Epoch 30 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 32 | Epoch 31 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 33 | Epoch 32 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 34 | Epoch 33 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 35 | Epoch 34 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 36 | Epoch 35 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 37 | Epoch 36 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 38 | Epoch 37 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 39 | Epoch 38 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 40 | Epoch 39 [Train |████████████████████| 1/1 batches, 6.88 
cost, 0.19s] 41 | Epoch 40 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 42 | Epoch 41 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 43 | Epoch 42 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 44 | Epoch 43 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 45 | Epoch 44 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 46 | Epoch 45 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 47 | Epoch 46 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 48 | Epoch 47 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 49 | Epoch 48 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 50 | Epoch 49 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 51 | Epoch 50 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 52 | Epoch 51 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 53 | Epoch 52 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 54 | Epoch 53 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 55 | Epoch 54 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 56 | Epoch 55 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 57 | Epoch 56 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 58 | Epoch 57 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 59 | Epoch 58 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 60 | Epoch 59 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 61 | Epoch 60 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 62 | Epoch 61 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 63 | Epoch 62 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 64 | Epoch 63 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 65 | Epoch 64 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 66 | Epoch 65 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 67 | Epoch 66 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 68 | Epoch 67 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 69 | Epoch 68 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 70 | Epoch 69 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 71 | Epoch 70 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 72 | Epoch 71 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 73 | Epoch 72 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 74 | Epoch 73 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 75 | Epoch 74 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 76 | Epoch 75 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 77 | Epoch 76 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 78 | Epoch 77 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 79 | Epoch 78 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 80 | Epoch 79 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 81 | Epoch 80 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 82 | Epoch 81 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 83 | Epoch 82 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 84 | Epoch 83 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 85 | Epoch 84 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 86 | Epoch 85 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 87 | Epoch 86 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 88 | Epoch 87 
[Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 89 | Epoch 88 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.20s] 90 | Epoch 89 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 91 | Epoch 90 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 92 | Epoch 91 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 93 | Epoch 92 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 94 | Epoch 93 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 95 | Epoch 94 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.20s] 96 | Epoch 95 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 97 | Epoch 96 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 98 | Epoch 97 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 99 | Epoch 98 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 100 | Epoch 99 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 207.406 ms 104 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_tensorflow.output: -------------------------------------------------------------------------------- 1 | Iteration: 0 train on batch time: 328.818 ms. 2 | Iteration: 1 train on batch time: 333.407 ms. 3 | Iteration: 2 train on batch time: 333.047 ms. 4 | Iteration: 3 train on batch time: 332.079 ms. 5 | Iteration: 4 train on batch time: 333.472 ms. 6 | Iteration: 5 train on batch time: 334.534 ms. 7 | Iteration: 6 train on batch time: 332.017 ms. 8 | Iteration: 7 train on batch time: 333.512 ms. 9 | Iteration: 8 train on batch time: 334.184 ms. 10 | Iteration: 9 train on batch time: 330.417 ms. 11 | Iteration: 10 train on batch time: 333.059 ms. 12 | Iteration: 11 train on batch time: 333.217 ms. 13 | Iteration: 12 train on batch time: 330.912 ms. 14 | Iteration: 13 train on batch time: 333.879 ms. 15 | Iteration: 14 train on batch time: 333.876 ms. 16 | Iteration: 15 train on batch time: 331.909 ms. 17 | Iteration: 16 train on batch time: 332.823 ms. 18 | Iteration: 17 train on batch time: 333.824 ms. 19 | Iteration: 18 train on batch time: 332.093 ms. 20 | Iteration: 19 train on batch time: 332.637 ms. 21 | Iteration: 20 train on batch time: 333.547 ms. 22 | Iteration: 21 train on batch time: 330.950 ms. 23 | Iteration: 22 train on batch time: 332.907 ms. 24 | Iteration: 23 train on batch time: 333.391 ms. 25 | Iteration: 24 train on batch time: 332.391 ms. 26 | Iteration: 25 train on batch time: 334.819 ms. 27 | Iteration: 26 train on batch time: 333.753 ms. 28 | Iteration: 27 train on batch time: 332.266 ms. 29 | Iteration: 28 train on batch time: 334.893 ms. 30 | Iteration: 29 train on batch time: 334.298 ms. 31 | Iteration: 30 train on batch time: 333.276 ms. 32 | Iteration: 31 train on batch time: 335.117 ms. 33 | Iteration: 32 train on batch time: 333.548 ms. 34 | Iteration: 33 train on batch time: 331.597 ms. 35 | Iteration: 34 train on batch time: 333.420 ms. 36 | Iteration: 35 train on batch time: 333.415 ms. 37 | Iteration: 36 train on batch time: 332.857 ms. 38 | Iteration: 37 train on batch time: 334.195 ms. 39 | Iteration: 38 train on batch time: 334.632 ms. 40 | Iteration: 39 train on batch time: 333.794 ms. 41 | Iteration: 40 train on batch time: 334.871 ms. 42 | Iteration: 41 train on batch time: 334.118 ms. 43 | Iteration: 42 train on batch time: 333.610 ms. 44 | Iteration: 43 train on batch time: 335.168 ms. 
45 | Iteration: 44 train on batch time: 336.150 ms. 46 | Iteration: 45 train on batch time: 329.581 ms. 47 | Iteration: 46 train on batch time: 335.177 ms. 48 | Iteration: 47 train on batch time: 332.751 ms. 49 | Iteration: 48 train on batch time: 331.409 ms. 50 | Iteration: 49 train on batch time: 333.099 ms. 51 | Iteration: 50 train on batch time: 333.680 ms. 52 | Iteration: 51 train on batch time: 329.999 ms. 53 | Iteration: 52 train on batch time: 331.872 ms. 54 | Iteration: 53 train on batch time: 333.053 ms. 55 | Iteration: 54 train on batch time: 331.581 ms. 56 | Iteration: 55 train on batch time: 330.702 ms. 57 | Iteration: 56 train on batch time: 333.057 ms. 58 | Iteration: 57 train on batch time: 330.838 ms. 59 | Iteration: 58 train on batch time: 331.483 ms. 60 | Iteration: 59 train on batch time: 330.193 ms. 61 | Iteration: 60 train on batch time: 331.512 ms. 62 | Iteration: 61 train on batch time: 330.924 ms. 63 | Iteration: 62 train on batch time: 330.957 ms. 64 | Iteration: 63 train on batch time: 333.135 ms. 65 | Iteration: 64 train on batch time: 330.593 ms. 66 | Iteration: 65 train on batch time: 331.780 ms. 67 | Iteration: 66 train on batch time: 331.242 ms. 68 | Iteration: 67 train on batch time: 330.998 ms. 69 | Iteration: 68 train on batch time: 329.692 ms. 70 | Iteration: 69 train on batch time: 332.326 ms. 71 | Iteration: 70 train on batch time: 330.668 ms. 72 | Iteration: 71 train on batch time: 331.139 ms. 73 | Iteration: 72 train on batch time: 330.898 ms. 74 | Iteration: 73 train on batch time: 331.845 ms. 75 | Iteration: 74 train on batch time: 331.395 ms. 76 | Iteration: 75 train on batch time: 330.457 ms. 77 | Iteration: 76 train on batch time: 332.719 ms. 78 | Iteration: 77 train on batch time: 330.476 ms. 79 | Iteration: 78 train on batch time: 332.566 ms. 80 | Iteration: 79 train on batch time: 333.037 ms. 81 | Iteration: 80 train on batch time: 329.615 ms. 82 | Iteration: 81 train on batch time: 331.192 ms. 83 | Iteration: 82 train on batch time: 333.480 ms. 84 | Iteration: 83 train on batch time: 330.991 ms. 85 | Iteration: 84 train on batch time: 330.611 ms. 86 | Iteration: 85 train on batch time: 332.442 ms. 87 | Iteration: 86 train on batch time: 330.716 ms. 88 | Iteration: 87 train on batch time: 330.958 ms. 89 | Iteration: 88 train on batch time: 331.185 ms. 90 | Iteration: 89 train on batch time: 331.274 ms. 91 | Iteration: 90 train on batch time: 327.980 ms. 92 | Iteration: 91 train on batch time: 330.826 ms. 93 | Iteration: 92 train on batch time: 332.512 ms. 94 | Iteration: 93 train on batch time: 330.908 ms. 95 | Iteration: 94 train on batch time: 331.749 ms. 96 | Iteration: 95 train on batch time: 333.341 ms. 97 | Iteration: 96 train on batch time: 330.234 ms. 98 | Iteration: 97 train on batch time: 329.032 ms. 99 | Iteration: 98 train on batch time: 333.054 ms. 100 | Iteration: 99 train on batch time: 330.318 ms. 
101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 332.274 ms 104 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_tensorflow_slim.output: -------------------------------------------------------------------------------- 1 | Batch size: 16 2 | Iterations: 100 3 | Time per iteration: 370.890 ms 4 | -------------------------------------------------------------------------------- /results/v100/INFO.md: -------------------------------------------------------------------------------- 1 | ## Results V100 2 | 3 | The minibatch size is 16. 4 | 5 | | Framework | Time (per minibatch) | 6 | |:---|:---| 7 | | TensorFlow | 117.11 ms | 8 | | TensorFlow (slim) | 176.14 ms | 9 | | Keras (TensorFlow) | 149.85 ms | 10 | | MXNet | 81.22 ms | 11 | 12 | - Hardware: Tesla V100-SXM2-16GB (Amazon AWS EC2 p3.2xlarge) 13 | - OS: Ubuntu 16.04.3 LTS (NVIDIA Volta Deep Learning AMI ami-b933cac4) 14 | - Python: 2.7.6 15 | - CUDA: 9.0.333 16 | - CuDNN: ? 17 | - TensorFlow: 1.4.0 18 | - Keras: 2.0.8-tf 19 | - MXNet: 1.1.0 20 | - Caffe: n/a 21 | - Theano: n/a 22 | - Neon: n/a 23 | 24 | (1) TensorFlow and Keras were run in the NGC Docker container nvcr.io/nvidia/tensorflow:18.03-py3, and MXNet in nvcr.io/nvidia/mxnet:18.03-py3. 25 | (2) The time is for a complete SGD step including parameter updates, not just the forward+backward time. 26 | -------------------------------------------------------------------------------- /results/v100/benchmark_keras_tensorflow.output: -------------------------------------------------------------------------------- 1 | 2018-04-17 01:32:20.656147: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2 | 2018-04-17 01:32:20.656557: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 3 | name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53 4 | pciBusID: 0000:00:1e.0 5 | totalMemory: 15.77GiB freeMemory: 15.36GiB 6 | 2018-04-17 01:32:20.656581: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) 7 | Iteration: 0 train on batch time: 1405.454 ms. 8 | Iteration: 1 train on batch time: 915.364 ms. 9 | Iteration: 2 train on batch time: 914.906 ms. 10 | Iteration: 3 train on batch time: 476.989 ms. 11 | Iteration: 4 train on batch time: 474.223 ms. 12 | Iteration: 5 train on batch time: 419.132 ms. 13 | Iteration: 6 train on batch time: 303.623 ms. 14 | Iteration: 7 train on batch time: 196.285 ms. 15 | Iteration: 8 train on batch time: 195.734 ms. 16 | Iteration: 9 train on batch time: 196.247 ms. 17 | Iteration: 10 train on batch time: 196.194 ms. 18 | Iteration: 11 train on batch time: 126.764 ms. 19 | Iteration: 12 train on batch time: 126.625 ms. 20 | Iteration: 13 train on batch time: 127.042 ms. 21 | Iteration: 14 train on batch time: 127.317 ms. 22 | Iteration: 15 train on batch time: 127.124 ms. 23 | Iteration: 16 train on batch time: 127.184 ms. 24 | Iteration: 17 train on batch time: 126.979 ms. 25 | Iteration: 18 train on batch time: 127.601 ms. 26 | Iteration: 19 train on batch time: 126.789 ms. 27 | Iteration: 20 train on batch time: 127.373 ms. 28 | Iteration: 21 train on batch time: 126.867 ms. 29 | Iteration: 22 train on batch time: 126.948 ms.
30 | Iteration: 23 train on batch time: 126.527 ms. 31 | Iteration: 24 train on batch time: 126.791 ms. 32 | Iteration: 25 train on batch time: 99.899 ms. 33 | Iteration: 26 train on batch time: 100.132 ms. 34 | Iteration: 27 train on batch time: 100.119 ms. 35 | Iteration: 28 train on batch time: 100.211 ms. 36 | Iteration: 29 train on batch time: 100.334 ms. 37 | Iteration: 30 train on batch time: 100.300 ms. 38 | Iteration: 31 train on batch time: 100.119 ms. 39 | Iteration: 32 train on batch time: 100.117 ms. 40 | Iteration: 33 train on batch time: 100.148 ms. 41 | Iteration: 34 train on batch time: 100.061 ms. 42 | Iteration: 35 train on batch time: 100.078 ms. 43 | Iteration: 36 train on batch time: 100.421 ms. 44 | Iteration: 37 train on batch time: 100.193 ms. 45 | Iteration: 38 train on batch time: 99.708 ms. 46 | Iteration: 39 train on batch time: 100.033 ms. 47 | Iteration: 40 train on batch time: 100.316 ms. 48 | Iteration: 41 train on batch time: 100.313 ms. 49 | Iteration: 42 train on batch time: 100.267 ms. 50 | Iteration: 43 train on batch time: 100.075 ms. 51 | Iteration: 44 train on batch time: 99.874 ms. 52 | Iteration: 45 train on batch time: 100.243 ms. 53 | Iteration: 46 train on batch time: 100.165 ms. 54 | Iteration: 47 train on batch time: 100.256 ms. 55 | Iteration: 48 train on batch time: 100.179 ms. 56 | Iteration: 49 train on batch time: 99.802 ms. 57 | Iteration: 50 train on batch time: 100.371 ms. 58 | Iteration: 51 train on batch time: 100.073 ms. 59 | Iteration: 52 train on batch time: 100.186 ms. 60 | Iteration: 53 train on batch time: 100.362 ms. 61 | Iteration: 54 train on batch time: 100.454 ms. 62 | Iteration: 55 train on batch time: 100.064 ms. 63 | Iteration: 56 train on batch time: 100.130 ms. 64 | Iteration: 57 train on batch time: 100.151 ms. 65 | Iteration: 58 train on batch time: 100.061 ms. 66 | Iteration: 59 train on batch time: 100.596 ms. 67 | Iteration: 60 train on batch time: 100.262 ms. 68 | Iteration: 61 train on batch time: 100.859 ms. 69 | Iteration: 62 train on batch time: 99.812 ms. 70 | Iteration: 63 train on batch time: 100.214 ms. 71 | Iteration: 64 train on batch time: 100.019 ms. 72 | Iteration: 65 train on batch time: 99.767 ms. 73 | Iteration: 66 train on batch time: 100.262 ms. 74 | Iteration: 67 train on batch time: 100.151 ms. 75 | Iteration: 68 train on batch time: 100.474 ms. 76 | Iteration: 69 train on batch time: 99.996 ms. 77 | Iteration: 70 train on batch time: 100.012 ms. 78 | Iteration: 71 train on batch time: 100.011 ms. 79 | Iteration: 72 train on batch time: 100.111 ms. 80 | Iteration: 73 train on batch time: 100.047 ms. 81 | Iteration: 74 train on batch time: 100.210 ms. 82 | Iteration: 75 train on batch time: 100.140 ms. 83 | Iteration: 76 train on batch time: 100.271 ms. 84 | Iteration: 77 train on batch time: 100.267 ms. 85 | Iteration: 78 train on batch time: 100.216 ms. 86 | Iteration: 79 train on batch time: 100.512 ms. 87 | Iteration: 80 train on batch time: 100.171 ms. 88 | Iteration: 81 train on batch time: 100.132 ms. 89 | Iteration: 82 train on batch time: 100.168 ms. 90 | Iteration: 83 train on batch time: 100.218 ms. 91 | Iteration: 84 train on batch time: 100.365 ms. 92 | Iteration: 85 train on batch time: 99.833 ms. 93 | Iteration: 86 train on batch time: 100.144 ms. 94 | Iteration: 87 train on batch time: 100.059 ms. 95 | Iteration: 88 train on batch time: 100.269 ms. 96 | Iteration: 89 train on batch time: 100.168 ms. 97 | Iteration: 90 train on batch time: 100.107 ms. 
98 | Iteration: 91 train on batch time: 100.015 ms. 99 | Iteration: 92 train on batch time: 100.058 ms. 100 | Iteration: 93 train on batch time: 99.955 ms. 101 | Iteration: 94 train on batch time: 99.883 ms. 102 | Iteration: 95 train on batch time: 100.128 ms. 103 | Iteration: 96 train on batch time: 99.910 ms. 104 | Iteration: 97 train on batch time: 100.212 ms. 105 | Iteration: 98 train on batch time: 99.941 ms. 106 | Iteration: 99 train on batch time: 100.041 ms. 107 | Batch size: 16 108 | Iterations: 100 109 | Time per iteration: 149.847 ms 110 | -------------------------------------------------------------------------------- /results/v100/benchmark_mxnet.output: -------------------------------------------------------------------------------- 1 | [04:39:44] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:130: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable) 2 | /opt/mxnet/python/mxnet/optimizer.py:136: UserWarning: WARNING: New optimizer mxnet.optimizer.NAG is overriding existing optimizer mxnet.optimizer.NAG 3 | Optimizer.opt_registry[name].__name__)) 4 | benchmark_mxnet.py:88: DeprecationWarning: mxnet.model.FeedForward has been deprecated. Please use mxnet.mod.Module instead. 5 | initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), 6 | /opt/mxnet/python/mxnet/model.py:573: DeprecationWarning: Calling initializer with init(str, NDArray) has been deprecated.please use init(mx.init.InitDesc(...), NDArray) instead. 7 | self.initializer(k, v) 8 | Start 9 | Batch size: 16 10 | Iterations: 400 11 | Time per iteration: 81.217 ms 12 | -------------------------------------------------------------------------------- /results/v100/benchmark_tensorflow.output: -------------------------------------------------------------------------------- 1 | WARNING:tensorflow:From benchmark_tensorflow.py:78: softmax_cross_entropy (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 2 | Instructions for updating: 3 | Use tf.losses.softmax_cross_entropy instead. Note that the order of the logits and labels arguments has been changed. 4 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/losses/python/losses/loss_ops.py:398: compute_weighted_loss (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 5 | Instructions for updating: 6 | Use tf.losses.compute_weighted_loss instead. 7 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/losses/python/losses/loss_ops.py:151: add_arg_scope..func_with_args (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 8 | Instructions for updating: 9 | Use tf.losses.add_loss instead. 
10 | 2018-04-17 01:31:44.259351: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 11 | 2018-04-17 01:31:44.259751: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 12 | name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53 13 | pciBusID: 0000:00:1e.0 14 | totalMemory: 15.77GiB freeMemory: 15.36GiB 15 | 2018-04-17 01:31:44.259774: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) 16 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02. 17 | Instructions for updating: 18 | Use `tf.global_variables_initializer` instead. 19 | Iteration: 0 train on batch time: 1095.193 ms. 20 | Iteration: 1 train on batch time: 650.711 ms. 21 | Iteration: 2 train on batch time: 652.697 ms. 22 | Iteration: 3 train on batch time: 344.923 ms. 23 | Iteration: 4 train on batch time: 345.038 ms. 24 | Iteration: 5 train on batch time: 179.180 ms. 25 | Iteration: 6 train on batch time: 179.378 ms. 26 | Iteration: 7 train on batch time: 125.554 ms. 27 | Iteration: 8 train on batch time: 125.383 ms. 28 | Iteration: 9 train on batch time: 88.094 ms. 29 | Iteration: 10 train on batch time: 88.195 ms. 30 | Iteration: 11 train on batch time: 87.938 ms. 31 | Iteration: 12 train on batch time: 88.148 ms. 32 | Iteration: 13 train on batch time: 88.087 ms. 33 | Iteration: 14 train on batch time: 88.032 ms. 34 | Iteration: 15 train on batch time: 87.999 ms. 35 | Iteration: 16 train on batch time: 88.042 ms. 36 | Iteration: 17 train on batch time: 88.144 ms. 37 | Iteration: 18 train on batch time: 88.071 ms. 38 | Iteration: 19 train on batch time: 88.006 ms. 39 | Iteration: 20 train on batch time: 88.110 ms. 40 | Iteration: 21 train on batch time: 88.095 ms. 41 | Iteration: 22 train on batch time: 88.169 ms. 42 | Iteration: 23 train on batch time: 87.847 ms. 43 | Iteration: 24 train on batch time: 87.962 ms. 44 | Iteration: 25 train on batch time: 88.184 ms. 45 | Iteration: 26 train on batch time: 88.016 ms. 46 | Iteration: 27 train on batch time: 88.027 ms. 47 | Iteration: 28 train on batch time: 88.076 ms. 48 | Iteration: 29 train on batch time: 87.893 ms. 49 | Iteration: 30 train on batch time: 88.014 ms. 50 | Iteration: 31 train on batch time: 87.990 ms. 51 | Iteration: 32 train on batch time: 87.952 ms. 52 | Iteration: 33 train on batch time: 88.172 ms. 53 | Iteration: 34 train on batch time: 87.979 ms. 54 | Iteration: 35 train on batch time: 88.007 ms. 55 | Iteration: 36 train on batch time: 87.991 ms. 56 | Iteration: 37 train on batch time: 87.919 ms. 57 | Iteration: 38 train on batch time: 88.131 ms. 58 | Iteration: 39 train on batch time: 87.980 ms. 59 | Iteration: 40 train on batch time: 88.053 ms. 60 | Iteration: 41 train on batch time: 88.027 ms. 61 | Iteration: 42 train on batch time: 88.001 ms. 62 | Iteration: 43 train on batch time: 88.121 ms. 63 | Iteration: 44 train on batch time: 88.205 ms. 64 | Iteration: 45 train on batch time: 88.191 ms. 65 | Iteration: 46 train on batch time: 88.133 ms. 66 | Iteration: 47 train on batch time: 88.099 ms. 67 | Iteration: 48 train on batch time: 88.160 ms. 
68 | Iteration: 49 train on batch time: 88.120 ms. 69 | Iteration: 50 train on batch time: 88.138 ms. 70 | Iteration: 51 train on batch time: 88.142 ms. 71 | Iteration: 52 train on batch time: 88.001 ms. 72 | Iteration: 53 train on batch time: 87.923 ms. 73 | Iteration: 54 train on batch time: 88.105 ms. 74 | Iteration: 55 train on batch time: 88.137 ms. 75 | Iteration: 56 train on batch time: 88.028 ms. 76 | Iteration: 57 train on batch time: 87.955 ms. 77 | Iteration: 58 train on batch time: 88.229 ms. 78 | Iteration: 59 train on batch time: 88.140 ms. 79 | Iteration: 60 train on batch time: 88.050 ms. 80 | Iteration: 61 train on batch time: 88.076 ms. 81 | Iteration: 62 train on batch time: 87.965 ms. 82 | Iteration: 63 train on batch time: 88.112 ms. 83 | Iteration: 64 train on batch time: 87.935 ms. 84 | Iteration: 65 train on batch time: 88.025 ms. 85 | Iteration: 66 train on batch time: 88.126 ms. 86 | Iteration: 67 train on batch time: 88.147 ms. 87 | Iteration: 68 train on batch time: 88.022 ms. 88 | Iteration: 69 train on batch time: 88.084 ms. 89 | Iteration: 70 train on batch time: 88.057 ms. 90 | Iteration: 71 train on batch time: 88.007 ms. 91 | Iteration: 72 train on batch time: 88.055 ms. 92 | Iteration: 73 train on batch time: 87.965 ms. 93 | Iteration: 74 train on batch time: 88.014 ms. 94 | Iteration: 75 train on batch time: 88.025 ms. 95 | Iteration: 76 train on batch time: 87.869 ms. 96 | Iteration: 77 train on batch time: 88.068 ms. 97 | Iteration: 78 train on batch time: 87.859 ms. 98 | Iteration: 79 train on batch time: 88.062 ms. 99 | Iteration: 80 train on batch time: 87.958 ms. 100 | Iteration: 81 train on batch time: 87.940 ms. 101 | Iteration: 82 train on batch time: 87.945 ms. 102 | Iteration: 83 train on batch time: 88.151 ms. 103 | Iteration: 84 train on batch time: 88.000 ms. 104 | Iteration: 85 train on batch time: 88.052 ms. 105 | Iteration: 86 train on batch time: 87.935 ms. 106 | Iteration: 87 train on batch time: 87.997 ms. 107 | Iteration: 88 train on batch time: 87.870 ms. 108 | Iteration: 89 train on batch time: 88.095 ms. 109 | Iteration: 90 train on batch time: 87.870 ms. 110 | Iteration: 91 train on batch time: 88.105 ms. 111 | Iteration: 92 train on batch time: 87.972 ms. 112 | Iteration: 93 train on batch time: 87.895 ms. 113 | Iteration: 94 train on batch time: 87.955 ms. 114 | Iteration: 95 train on batch time: 87.976 ms. 115 | Iteration: 96 train on batch time: 87.909 ms. 116 | Iteration: 97 train on batch time: 87.986 ms. 117 | Iteration: 98 train on batch time: 87.991 ms. 118 | Iteration: 99 train on batch time: 87.980 ms. 
119 | Batch size: 16 120 | Iterations: 100 121 | Time per iteration: 117.105 ms 122 | -------------------------------------------------------------------------------- /results/v100/benchmark_tensorflow_slim.output: -------------------------------------------------------------------------------- 1 | 2018-04-17 01:32:00.668931: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2 | 2018-04-17 01:32:00.669350: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 3 | name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53 4 | pciBusID: 0000:00:1e.0 5 | totalMemory: 15.77GiB freeMemory: 15.36GiB 6 | 2018-04-17 01:32:00.669374: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) 7 | WARNING:tensorflow:From benchmark_tensorflow.py:44: softmax_cross_entropy (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 8 | Instructions for updating: 9 | Use tf.losses.softmax_cross_entropy instead. Note that the order of the logits and labels arguments has been changed. 10 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/losses/python/losses/loss_ops.py:398: compute_weighted_loss (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 11 | Instructions for updating: 12 | Use tf.losses.compute_weighted_loss instead. 13 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/losses/python/losses/loss_ops.py:151: add_arg_scope..func_with_args (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 14 | Instructions for updating: 15 | Use tf.losses.add_loss instead. 16 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/training/python/training/training.py:412: get_or_create_global_step (from tensorflow.contrib.framework.python.ops.variables) is deprecated and will be removed in a future version. 17 | Instructions for updating: 18 | Please switch to tf.train.get_or_create_global_step 19 | 2018-04-17 01:32:01.530145: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) 20 | Batch size: 16 21 | Iterations: 100 22 | Time per iteration: 176.140 ms 23 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -d deep-learning-models ]; 4 | then 5 | cd deep-learning-models ; git pull ; cd ..
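# first run: the repo is not present yet, so the else branch below clones it fresh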
6 | else 7 | git clone https://github.com/fchollet/deep-learning-models 8 | fi 9 | 10 | nvidia-smi dmon > results/benchmark_caffe.dmon 2>&1 & 11 | python benchmark_caffe.py > results/benchmark_caffe.output 2>&1 12 | killall -9 nvidia-smi 13 | 14 | nvidia-smi dmon > results/benchmark_mxnet.dmon 2>&1 & 15 | python benchmark_mxnet.py > results/benchmark_mxnet.output 2>&1 16 | killall -9 nvidia-smi 17 | 18 | # PATH=/usr/local/cuda/bin/:$PATH python benchmark_neon.py > results/benchmark_neon.output 19 | 20 | nvidia-smi dmon > results/benchmark_tensorflow.dmon 2>&1 & 21 | python benchmark_tensorflow.py --train_schedule pure_tf > results/benchmark_tensorflow.output 2>&1 22 | killall -9 nvidia-smi 23 | 24 | nvidia-smi dmon > results/benchmark_tensorflow_slim.dmon 2>&1 & 25 | python benchmark_tensorflow.py --train_schedule slim > results/benchmark_tensorflow_slim.output 2>&1 26 | killall -9 nvidia-smi 27 | 28 | nvidia-smi dmon > results/benchmark_keras_theano.dmon 2>&1 & 29 | python benchmark_keras.py --backend theano > results/benchmark_keras_theano.output 2>&1 30 | killall -9 nvidia-smi 31 | 32 | nvidia-smi dmon > results/benchmark_keras_tensorflow.dmon 2>&1 & 33 | python benchmark_keras.py --backend tensorflow > results/benchmark_keras_tensorflow.output 2>&1 34 | killall -9 nvidia-smi 35 | 36 | 37 | -------------------------------------------------------------------------------- /vgg16_deploy.prototxt: -------------------------------------------------------------------------------- 1 | # From https://gist.githubusercontent.com/ksimonyan/211839e770f7b538e2d8/raw/ded9363bd93ec0c770134f4e387d8aaaaa2407ce/VGG_ILSVRC_16_layers_deploy.prototxt 2 | 3 | name: "VGG_ILSVRC_16_layers" 4 | input: "data" 5 | input_dim: 16 6 | input_dim: 3 7 | input_dim: 224 8 | input_dim: 224 9 | layers { 10 | bottom: "data" 11 | top: "conv1_1" 12 | name: "conv1_1" 13 | type: CONVOLUTION 14 | convolution_param { 15 | num_output: 64 16 | pad: 1 17 | kernel_size: 3 18 | } 19 | } 20 | layers { 21 | bottom: "conv1_1" 22 | top: "conv1_1" 23 | name: "relu1_1" 24 | type: RELU 25 | } 26 | layers { 27 | bottom: "conv1_1" 28 | top: "conv1_2" 29 | name: "conv1_2" 30 | type: CONVOLUTION 31 | convolution_param { 32 | num_output: 64 33 | pad: 1 34 | kernel_size: 3 35 | } 36 | } 37 | layers { 38 | bottom: "conv1_2" 39 | top: "conv1_2" 40 | name: "relu1_2" 41 | type: RELU 42 | } 43 | layers { 44 | bottom: "conv1_2" 45 | top: "pool1" 46 | name: "pool1" 47 | type: POOLING 48 | pooling_param { 49 | pool: MAX 50 | kernel_size: 2 51 | stride: 2 52 | } 53 | } 54 | layers { 55 | bottom: "pool1" 56 | top: "conv2_1" 57 | name: "conv2_1" 58 | type: CONVOLUTION 59 | convolution_param { 60 | num_output: 128 61 | pad: 1 62 | kernel_size: 3 63 | } 64 | } 65 | layers { 66 | bottom: "conv2_1" 67 | top: "conv2_1" 68 | name: "relu2_1" 69 | type: RELU 70 | } 71 | layers { 72 | bottom: "conv2_1" 73 | top: "conv2_2" 74 | name: "conv2_2" 75 | type: CONVOLUTION 76 | convolution_param { 77 | num_output: 128 78 | pad: 1 79 | kernel_size: 3 80 | } 81 | } 82 | layers { 83 | bottom: "conv2_2" 84 | top: "conv2_2" 85 | name: "relu2_2" 86 | type: RELU 87 | } 88 | layers { 89 | bottom: "conv2_2" 90 | top: "pool2" 91 | name: "pool2" 92 | type: POOLING 93 | pooling_param { 94 | pool: MAX 95 | kernel_size: 2 96 | stride: 2 97 | } 98 | } 99 | layers { 100 | bottom: "pool2" 101 | top: "conv3_1" 102 | name: "conv3_1" 103 | type: CONVOLUTION 104 | convolution_param { 105 | num_output: 256 106 | pad: 1 107 | kernel_size: 3 108 | } 109 | } 110 | layers { 111 | bottom: "conv3_1" 
112 | top: "conv3_1" 113 | name: "relu3_1" 114 | type: RELU 115 | } 116 | layers { 117 | bottom: "conv3_1" 118 | top: "conv3_2" 119 | name: "conv3_2" 120 | type: CONVOLUTION 121 | convolution_param { 122 | num_output: 256 123 | pad: 1 124 | kernel_size: 3 125 | } 126 | } 127 | layers { 128 | bottom: "conv3_2" 129 | top: "conv3_2" 130 | name: "relu3_2" 131 | type: RELU 132 | } 133 | layers { 134 | bottom: "conv3_2" 135 | top: "conv3_3" 136 | name: "conv3_3" 137 | type: CONVOLUTION 138 | convolution_param { 139 | num_output: 256 140 | pad: 1 141 | kernel_size: 3 142 | } 143 | } 144 | layers { 145 | bottom: "conv3_3" 146 | top: "conv3_3" 147 | name: "relu3_3" 148 | type: RELU 149 | } 150 | layers { 151 | bottom: "conv3_3" 152 | top: "pool3" 153 | name: "pool3" 154 | type: POOLING 155 | pooling_param { 156 | pool: MAX 157 | kernel_size: 2 158 | stride: 2 159 | } 160 | } 161 | layers { 162 | bottom: "pool3" 163 | top: "conv4_1" 164 | name: "conv4_1" 165 | type: CONVOLUTION 166 | convolution_param { 167 | num_output: 512 168 | pad: 1 169 | kernel_size: 3 170 | } 171 | } 172 | layers { 173 | bottom: "conv4_1" 174 | top: "conv4_1" 175 | name: "relu4_1" 176 | type: RELU 177 | } 178 | layers { 179 | bottom: "conv4_1" 180 | top: "conv4_2" 181 | name: "conv4_2" 182 | type: CONVOLUTION 183 | convolution_param { 184 | num_output: 512 185 | pad: 1 186 | kernel_size: 3 187 | } 188 | } 189 | layers { 190 | bottom: "conv4_2" 191 | top: "conv4_2" 192 | name: "relu4_2" 193 | type: RELU 194 | } 195 | layers { 196 | bottom: "conv4_2" 197 | top: "conv4_3" 198 | name: "conv4_3" 199 | type: CONVOLUTION 200 | convolution_param { 201 | num_output: 512 202 | pad: 1 203 | kernel_size: 3 204 | } 205 | } 206 | layers { 207 | bottom: "conv4_3" 208 | top: "conv4_3" 209 | name: "relu4_3" 210 | type: RELU 211 | } 212 | layers { 213 | bottom: "conv4_3" 214 | top: "pool4" 215 | name: "pool4" 216 | type: POOLING 217 | pooling_param { 218 | pool: MAX 219 | kernel_size: 2 220 | stride: 2 221 | } 222 | } 223 | layers { 224 | bottom: "pool4" 225 | top: "conv5_1" 226 | name: "conv5_1" 227 | type: CONVOLUTION 228 | convolution_param { 229 | num_output: 512 230 | pad: 1 231 | kernel_size: 3 232 | } 233 | } 234 | layers { 235 | bottom: "conv5_1" 236 | top: "conv5_1" 237 | name: "relu5_1" 238 | type: RELU 239 | } 240 | layers { 241 | bottom: "conv5_1" 242 | top: "conv5_2" 243 | name: "conv5_2" 244 | type: CONVOLUTION 245 | convolution_param { 246 | num_output: 512 247 | pad: 1 248 | kernel_size: 3 249 | } 250 | } 251 | layers { 252 | bottom: "conv5_2" 253 | top: "conv5_2" 254 | name: "relu5_2" 255 | type: RELU 256 | } 257 | layers { 258 | bottom: "conv5_2" 259 | top: "conv5_3" 260 | name: "conv5_3" 261 | type: CONVOLUTION 262 | convolution_param { 263 | num_output: 512 264 | pad: 1 265 | kernel_size: 3 266 | } 267 | } 268 | layers { 269 | bottom: "conv5_3" 270 | top: "conv5_3" 271 | name: "relu5_3" 272 | type: RELU 273 | } 274 | layers { 275 | bottom: "conv5_3" 276 | top: "pool5" 277 | name: "pool5" 278 | type: POOLING 279 | pooling_param { 280 | pool: MAX 281 | kernel_size: 2 282 | stride: 2 283 | } 284 | } 285 | layers { 286 | bottom: "pool5" 287 | top: "fc6" 288 | name: "fc6" 289 | type: INNER_PRODUCT 290 | inner_product_param { 291 | num_output: 4096 292 | } 293 | } 294 | layers { 295 | bottom: "fc6" 296 | top: "fc6" 297 | name: "relu6" 298 | type: RELU 299 | } 300 | layers { 301 | bottom: "fc6" 302 | top: "fc6" 303 | name: "drop6" 304 | type: DROPOUT 305 | dropout_param { 306 | dropout_ratio: 0.5 307 | } 308 | } 309 | layers { 310 
| bottom: "fc6" 311 | top: "fc7" 312 | name: "fc7" 313 | type: INNER_PRODUCT 314 | inner_product_param { 315 | num_output: 4096 316 | } 317 | } 318 | layers { 319 | bottom: "fc7" 320 | top: "fc7" 321 | name: "relu7" 322 | type: RELU 323 | } 324 | layers { 325 | bottom: "fc7" 326 | top: "fc7" 327 | name: "drop7" 328 | type: DROPOUT 329 | dropout_param { 330 | dropout_ratio: 0.5 331 | } 332 | } 333 | layers { 334 | bottom: "fc7" 335 | top: "fc8" 336 | name: "fc8" 337 | type: INNER_PRODUCT 338 | inner_product_param { 339 | num_output: 1000 340 | } 341 | } 342 | layers { 343 | bottom: "fc8" 344 | top: "prob" 345 | name: "prob" 346 | type: SOFTMAX 347 | } 348 | -------------------------------------------------------------------------------- /vgg16_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "vgg16_train_val.prototxt" 2 | test_iter: 1 3 | test_interval: 1000 4 | base_lr: 0.002 5 | stepsize: 50000 6 | lr_policy: "step" 7 | gamma: 0.1 8 | display: 1 9 | max_iter: 100 10 | momentum: 0.9 11 | weight_decay: 0.0005 12 | snapshot: 1000 13 | snapshot_prefix: "./" 14 | solver_mode: GPU 15 | -------------------------------------------------------------------------------- /vgg16_train_val.prototxt: -------------------------------------------------------------------------------- 1 | # from http://cs.stanford.edu/people/karpathy/vgg_train_val.prototxt 2 | name: "VGG_ILSVRC_16_layers" 3 | layers { 4 | name: "data" 5 | type: IMAGE_DATA 6 | top: "data" 7 | top: "label" 8 | include { 9 | phase: TRAIN 10 | } 11 | transform_param { 12 | crop_size: 224 13 | } 14 | image_data_param { 15 | source: "images/list.txt" 16 | batch_size: 16 17 | } 18 | } 19 | layers { 20 | name: "data" 21 | type: IMAGE_DATA 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST 26 | } 27 | transform_param { 28 | crop_size: 224 29 | } 30 | image_data_param { 31 | source: "images/list.txt" 32 | batch_size: 16 33 | } 34 | } 35 | layers { 36 | bottom: "data" 37 | top: "conv1_1" 38 | name: "conv1_1" 39 | type: CONVOLUTION 40 | convolution_param { 41 | num_output: 64 42 | pad: 1 43 | kernel_size: 3 44 | } 45 | blobs_lr: 0.001 46 | blobs_lr: 0.001 47 | } 48 | layers { 49 | bottom: "conv1_1" 50 | top: "conv1_1" 51 | name: "relu1_1" 52 | type: RELU 53 | } 54 | layers { 55 | bottom: "conv1_1" 56 | top: "conv1_2" 57 | name: "conv1_2" 58 | type: CONVOLUTION 59 | convolution_param { 60 | num_output: 64 61 | pad: 1 62 | kernel_size: 3 63 | } 64 | blobs_lr: 0.001 65 | blobs_lr: 0.001 66 | } 67 | layers { 68 | bottom: "conv1_2" 69 | top: "conv1_2" 70 | name: "relu1_2" 71 | type: RELU 72 | } 73 | layers { 74 | bottom: "conv1_2" 75 | top: "pool1" 76 | name: "pool1" 77 | type: POOLING 78 | pooling_param { 79 | pool: MAX 80 | kernel_size: 2 81 | stride: 2 82 | } 83 | } 84 | layers { 85 | bottom: "pool1" 86 | top: "conv2_1" 87 | name: "conv2_1" 88 | type: CONVOLUTION 89 | convolution_param { 90 | num_output: 128 91 | pad: 1 92 | kernel_size: 3 93 | } 94 | blobs_lr: 0.001 95 | blobs_lr: 0.001 96 | } 97 | layers { 98 | bottom: "conv2_1" 99 | top: "conv2_1" 100 | name: "relu2_1" 101 | type: RELU 102 | } 103 | layers { 104 | bottom: "conv2_1" 105 | top: "conv2_2" 106 | name: "conv2_2" 107 | type: CONVOLUTION 108 | convolution_param { 109 | num_output: 128 110 | pad: 1 111 | kernel_size: 3 112 | } 113 | blobs_lr: 0.001 114 | blobs_lr: 0.001 115 | } 116 | layers { 117 | bottom: "conv2_2" 118 | top: "conv2_2" 119 | name: "relu2_2" 120 | type: RELU 121 | } 122 | layers { 123 | bottom: "conv2_2" 
124 |   top: "pool2"
125 |   name: "pool2"
126 |   type: POOLING
127 |   pooling_param {
128 |     pool: MAX
129 |     kernel_size: 2
130 |     stride: 2
131 |   }
132 | }
133 | layers {
134 |   bottom: "pool2"
135 |   top: "conv3_1"
136 |   name: "conv3_1"
137 |   type: CONVOLUTION
138 |   convolution_param {
139 |     num_output: 256
140 |     pad: 1
141 |     kernel_size: 3
142 |   }
143 |   blobs_lr: 0.001
144 |   blobs_lr: 0.001
145 | }
146 | layers {
147 |   bottom: "conv3_1"
148 |   top: "conv3_1"
149 |   name: "relu3_1"
150 |   type: RELU
151 | }
152 | layers {
153 |   bottom: "conv3_1"
154 |   top: "conv3_2"
155 |   name: "conv3_2"
156 |   type: CONVOLUTION
157 |   convolution_param {
158 |     num_output: 256
159 |     pad: 1
160 |     kernel_size: 3
161 |   }
162 |   blobs_lr: 0.001
163 |   blobs_lr: 0.001
164 | }
165 | layers {
166 |   bottom: "conv3_2"
167 |   top: "conv3_2"
168 |   name: "relu3_2"
169 |   type: RELU
170 | }
171 | layers {
172 |   bottom: "conv3_2"
173 |   top: "conv3_3"
174 |   name: "conv3_3"
175 |   type: CONVOLUTION
176 |   convolution_param {
177 |     num_output: 256
178 |     pad: 1
179 |     kernel_size: 3
180 |   }
181 |   blobs_lr: 0.001
182 |   blobs_lr: 0.001
183 | }
184 | layers {
185 |   bottom: "conv3_3"
186 |   top: "conv3_3"
187 |   name: "relu3_3"
188 |   type: RELU
189 | }
190 | layers {
191 |   bottom: "conv3_3"
192 |   top: "pool3"
193 |   name: "pool3"
194 |   type: POOLING
195 |   pooling_param {
196 |     pool: MAX
197 |     kernel_size: 2
198 |     stride: 2
199 |   }
200 | }
201 | layers {
202 |   bottom: "pool3"
203 |   top: "conv4_1"
204 |   name: "conv4_1"
205 |   type: CONVOLUTION
206 |   convolution_param {
207 |     num_output: 512
208 |     pad: 1
209 |     kernel_size: 3
210 |   }
211 |   blobs_lr: 0.001
212 |   blobs_lr: 0.001
213 | }
214 | layers {
215 |   bottom: "conv4_1"
216 |   top: "conv4_1"
217 |   name: "relu4_1"
218 |   type: RELU
219 | }
220 | layers {
221 |   bottom: "conv4_1"
222 |   top: "conv4_2"
223 |   name: "conv4_2"
224 |   type: CONVOLUTION
225 |   convolution_param {
226 |     num_output: 512
227 |     pad: 1
228 |     kernel_size: 3
229 |   }
230 |   blobs_lr: 0.001
231 |   blobs_lr: 0.001
232 | }
233 | layers {
234 |   bottom: "conv4_2"
235 |   top: "conv4_2"
236 |   name: "relu4_2"
237 |   type: RELU
238 | }
239 | layers {
240 |   bottom: "conv4_2"
241 |   top: "conv4_3"
242 |   name: "conv4_3"
243 |   type: CONVOLUTION
244 |   convolution_param {
245 |     num_output: 512
246 |     pad: 1
247 |     kernel_size: 3
248 |   }
249 |   blobs_lr: 0.001
250 |   blobs_lr: 0.001
251 | }
252 | layers {
253 |   bottom: "conv4_3"
254 |   top: "conv4_3"
255 |   name: "relu4_3"
256 |   type: RELU
257 | }
258 | layers {
259 |   bottom: "conv4_3"
260 |   top: "pool4"
261 |   name: "pool4"
262 |   type: POOLING
263 |   pooling_param {
264 |     pool: MAX
265 |     kernel_size: 2
266 |     stride: 2
267 |   }
268 | }
269 | layers {
270 |   bottom: "pool4"
271 |   top: "conv5_1"
272 |   name: "conv5_1"
273 |   type: CONVOLUTION
274 |   convolution_param {
275 |     num_output: 512
276 |     pad: 1
277 |     kernel_size: 3
278 |   }
279 |   blobs_lr: 0.001
280 |   blobs_lr: 0.001
281 | }
282 | layers {
283 |   bottom: "conv5_1"
284 |   top: "conv5_1"
285 |   name: "relu5_1"
286 |   type: RELU
287 | }
288 | layers {
289 |   bottom: "conv5_1"
290 |   top: "conv5_2"
291 |   name: "conv5_2"
292 |   type: CONVOLUTION
293 |   convolution_param {
294 |     num_output: 512
295 |     pad: 1
296 |     kernel_size: 3
297 |   }
298 |   blobs_lr: 0.001
299 |   blobs_lr: 0.001
300 | }
301 | layers {
302 |   bottom: "conv5_2"
303 |   top: "conv5_2"
304 |   name: "relu5_2"
305 |   type: RELU
306 | }
307 | layers {
308 |   bottom: "conv5_2"
309 |   top: "conv5_3"
310 |   name: "conv5_3"
311 |   type: CONVOLUTION
312 |   convolution_param {
313 |     num_output: 512
314 |     pad: 1
315 |     kernel_size: 3
316 |   }
317 |   blobs_lr: 0.001
318 |   blobs_lr: 0.001
319 | }
320 | layers {
321 |   bottom: "conv5_3"
322 |   top: "conv5_3"
323 |   name: "relu5_3"
324 |   type: RELU
325 | }
326 | layers {
327 |   bottom: "conv5_3"
328 |   top: "pool5"
329 |   name: "pool5"
330 |   type: POOLING
331 |   pooling_param {
332 |     pool: MAX
333 |     kernel_size: 2
334 |     stride: 2
335 |   }
336 | }
337 | layers {
338 |   bottom: "pool5"
339 |   top: "fc6"
340 |   name: "fc6"
341 |   type: INNER_PRODUCT
342 |   inner_product_param {
343 |     num_output: 4096
344 |   }
345 |   blobs_lr: 0.001
346 |   blobs_lr: 0.001
347 | }
348 | layers {
349 |   bottom: "fc6"
350 |   top: "fc6"
351 |   name: "relu6"
352 |   type: RELU
353 | }
354 | layers {
355 |   bottom: "fc6"
356 |   top: "fc6"
357 |   name: "drop6"
358 |   type: DROPOUT
359 |   dropout_param {
360 |     dropout_ratio: 0.5
361 |   }
362 | }
363 | layers {
364 |   bottom: "fc6"
365 |   top: "fc7"
366 |   name: "fc7"
367 |   type: INNER_PRODUCT
368 |   inner_product_param {
369 |     num_output: 4096
370 |   }
371 |   blobs_lr: 0.001
372 |   blobs_lr: 0.001
373 | }
374 | layers {
375 |   bottom: "fc7"
376 |   top: "fc7"
377 |   name: "relu7"
378 |   type: RELU
379 | }
380 | layers {
381 |   bottom: "fc7"
382 |   top: "fc7"
383 |   name: "drop7"
384 |   type: DROPOUT
385 |   dropout_param {
386 |     dropout_ratio: 0.5
387 |   }
388 | }
389 | layers {
390 |   name: "fc8"
391 |   bottom: "fc7"
392 |   top: "fc8"
393 |   type: INNER_PRODUCT
394 |   inner_product_param {
395 |     num_output: 1000
396 |   }
397 |   blobs_lr: 0.001
398 |   blobs_lr: 0.001
399 | }
400 | layers {
401 |   name: "loss"
402 |   type: SOFTMAX_LOSS
403 |   bottom: "fc8"
404 |   bottom: "label"
405 |   top: "loss/loss"
406 | }
407 | layers {
408 |   name: "accuracy/top1"
409 |   type: ACCURACY
410 |   bottom: "fc8"
411 |   bottom: "label"
412 |   top: "accuracy@1"
413 |   include: { phase: TEST }
414 |   accuracy_param {
415 |     top_k: 1
416 |   }
417 | }
418 | layers {
419 |   name: "accuracy/top5"
420 |   type: ACCURACY
421 |   bottom: "fc8"
422 |   bottom: "label"
423 |   top: "accuracy@5"
424 |   include: { phase: TEST }
425 |   accuracy_param {
426 |     top_k: 5
427 |   }
428 | }
429 | 
--------------------------------------------------------------------------------
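A note on syntax: the prototxt files above use Caffe's legacy V1 schema (`layers` blocks with enum layer types such as `CONVOLUTION`; `vgg16_train_val.prototxt` additionally sets per-blob `blobs_lr` learning-rate multipliers), which matches the Caffe builds these benchmarks were run against. Newer Caffe releases expect the V2 schema and can convert V1 files automatically with the bundled `upgrade_net_proto_text` tool. As a rough sketch for reference only (not used by `run.sh`), the V2 form of the `conv1_1` layer would look like:

```
layer {
  name: "conv1_1"
  type: "Convolution"       # string type replaces the V1 CONVOLUTION enum
  bottom: "data"
  top: "conv1_1"
  param { lr_mult: 0.001 }  # weight LR multiplier, replaces the first blobs_lr
  param { lr_mult: 0.001 }  # bias LR multiplier, replaces the second blobs_lr
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
  }
}
```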