├── LICENSE.md
├── README.md
├── benchmark_caffe.py
├── benchmark_keras.py
├── benchmark_lasagne.py
├── benchmark_mxnet.py
├── benchmark_neon.py
├── benchmark_tensorflow.py
├── images
│   ├── img0000.tiff
│   └── list.txt
├── parse_results.py
├── results
│   ├── gtx_1080
│   │   ├── INFO.md
│   │   ├── benchmark_caffe.dmon
│   │   ├── benchmark_caffe.output
│   │   ├── benchmark_keras_tensorflow.dmon
│   │   ├── benchmark_keras_tensorflow.output
│   │   ├── benchmark_keras_theano.dmon
│   │   ├── benchmark_keras_theano.output
│   │   └── benchmark_neon.output
│   ├── k520
│   │   ├── INFO.md
│   │   ├── benchmark_caffe.dmon
│   │   ├── benchmark_caffe.output
│   │   ├── benchmark_keras_tensorflow.dmon
│   │   ├── benchmark_keras_tensorflow.output
│   │   ├── benchmark_keras_theano.dmon
│   │   ├── benchmark_keras_theano.output
│   │   ├── benchmark_mxnet.dmon
│   │   ├── benchmark_mxnet.output
│   │   ├── benchmark_tensorflow.dmon
│   │   ├── benchmark_tensorflow.output
│   │   ├── benchmark_tensorflow_slim.dmon
│   │   └── benchmark_tensorflow_slim.output
│   ├── k80
│   │   ├── INFO.md
│   │   ├── benchmark_caffe.output
│   │   ├── benchmark_keras_tensorflow.output
│   │   ├── benchmark_keras_theano.output
│   │   ├── benchmark_mxnet.output
│   │   ├── benchmark_neon.output
│   │   ├── benchmark_tensorflow.output
│   │   └── benchmark_tensorflow_slim.output
│   ├── maxwell_titan_x
│   │   ├── INFO.md
│   │   ├── benchmark_caffe.output
│   │   ├── benchmark_keras_tensorflow.output
│   │   ├── benchmark_keras_theano.output
│   │   ├── benchmark_mxnet.output
│   │   ├── benchmark_neon.output
│   │   ├── benchmark_tensorflow.output
│   │   └── benchmark_tensorflow_slim.output
│   └── v100
│       ├── INFO.md
│       ├── benchmark_keras_tensorflow.dmon
│       ├── benchmark_keras_tensorflow.output
│       ├── benchmark_mxnet.dmon
│       ├── benchmark_mxnet.output
│       ├── benchmark_tensorflow.dmon
│       ├── benchmark_tensorflow.output
│       ├── benchmark_tensorflow_slim.dmon
│       └── benchmark_tensorflow_slim.output
├── run.sh
├── vgg16_deploy.prototxt
├── vgg16_solver.prototxt
└── vgg16_train_val.prototxt
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Alex Izvorski
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Deep Learning Benchmark (VGG16)
2 | 
3 | This mini-benchmark compares the speed of several deep learning frameworks on the VGG-16 network architecture.
4 | 
5 | *Contributions welcome!*
6 | 
7 | ## Results
8 | 
9 | The table contains training times in milliseconds per minibatch for the same VGG16 network in each framework. (Lower is faster.) Minibatch size is 16. The time is for a complete SGD step including parameter updates, not just the forward+backward time. (The measurement loop the benchmark scripts share is sketched just below this README.)
10 | 
11 | | Framework | [V100](results/v100/INFO.md) | [GTX 1080](results/gtx_1080/INFO.md) | [Maxwell Titan X](results/maxwell_titan_x/INFO.md) | [K80](results/k80/INFO.md) | [K520](results/k520/INFO.md) |
12 | |:---|:---|:---|:---|:---|:---|
13 | | MXNet | 81.22 | N/A | 324.63 | 1247.47 | OOM |
14 | | TensorFlow | 88.03 | N/A | 332.15 | 1057.28 | 2290.58 |
15 | | TensorFlow (slim) | 176.14 | N/A | 370.89 | 1126.70 | 2488.51 |
16 | | Keras (TensorFlow) | 101.82 | 287.85 | 359.89 | 1020.97 | OOM |
17 | | Keras (Theano) | N/A | 409.95 | 317.30 | 1141.79 | 2445.22 |
18 | | Neon | N/A | 164.53 | 207.41 | N/A | N/A |
19 | | Caffe | N/A | 244.44 | 311.06 | 787.81 | OOM |
20 | | Torch (1) | N/A | 232.55 | 273.54 | N/A | N/A |
21 | 
22 | 
23 | N/A - test not run
24 | 
25 | OOM - test ran but failed due to running out of memory (on the K520, which has only 4GB of memory)
26 | 
27 | (1) The Torch benchmark is from https://github.com/jcjohnson/cnn-benchmarks (it is not included in this repo).
28 | 
29 | ## Running
30 | 
31 | 
32 | ```
33 | bash run.sh
34 | ```
35 | 
36 | Note: this will back up and then restore your ~/.keras/keras.json.
37 | 
38 | Caffe should be built inside caffe/ in the current directory (or a symlink).
39 | 
40 | Neon should be built anywhere and (just for the Neon test) the built Neon virtualenv should be activated.
41 | 
42 | 
43 | 
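All of the per-framework scripts in this repo follow the same measurement pattern: one untimed warmup minibatch, then 100 timed full SGD steps on a fixed 16-image batch, reporting the mean wall-clock time per step. A minimal framework-agnostic sketch of that loop (the `train_step` callable is an assumption of this sketch; it stands in for each framework's train-on-batch call):

```python
import time

def benchmark(train_step, n_iterations=100):
    # train_step() is assumed to run one complete SGD step (forward,
    # backward, and parameter update) on a fixed minibatch, like
    # model.train_on_batch in the Keras script below
    train_step()  # warmup; the first step is typically much slower

    t0 = time.time()
    for i in range(n_iterations):
        tstart = time.time()
        train_step()
        print("Iteration: %d train on batch time: %7.3f ms." % (i, (time.time() - tstart) * 1000))
    t1 = time.time()
    return (t1 - t0) * 1000 / n_iterations  # mean milliseconds per minibatch
```

At 16 images per minibatch, milliseconds per step convert to throughput as 16 / (t / 1000) images per second — for example, the GTX 1080 Keras (TensorFlow) figure of 287.85 ms corresponds to roughly 56 images/s.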
--------------------------------------------------------------------------------
/benchmark_caffe.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import shutil
3 | 
4 | # replicate the sample image so that all 16 entries in images/list.txt exist
5 | for n in range(1, 16):
6 |     shutil.copy("images/img0000.tiff", "images/img%04d.tiff" % (n))
7 | 
8 | CAFFE = "caffe/build/tools/caffe"
9 | 
10 | # this benchmarks forward+backward rather than a full SGD step; it turns out there is quite a significant difference between the two
11 | # subprocess.call(CAFFE + " time -model vgg16_deploy.prototxt -iterations 100 -gpu 0", shell=True)
12 | 
13 | subprocess.call(CAFFE + " train --solver vgg16_solver.prototxt -gpu 0", shell=True)
14 | 
--------------------------------------------------------------------------------
/benchmark_keras.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import os.path
4 | import sys
5 | 
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--backend")
8 | args = parser.parse_args()
9 | 
10 | # Keras reads KERAS_BACKEND when it is first imported, so set it before the import
11 | os.environ['KERAS_BACKEND'] = args.backend
12 | 
13 | import keras.backend
14 | 
15 | if args.backend == "theano":
16 |     keras.backend.set_image_dim_ordering("th")
17 | elif args.backend == "tensorflow":
18 |     keras.backend.set_image_dim_ordering("tf")
19 | else:
20 |     print("Backend must be theano or tensorflow")
21 |     sys.exit(1)
22 | 
23 | keras.backend.set_floatx('float32')
24 | keras.backend.set_epsilon(1e-07)
25 | 
26 | import numpy as np
27 | import time
28 | 
29 | from keras.optimizers import SGD
30 | from keras.applications.vgg16 import VGG16
31 | 
32 | width = 224
33 | height = 224
34 | batch_size = 16
35 | 
36 | model = VGG16(include_top=True, weights=None)
37 | sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
38 | model.compile(loss='categorical_crossentropy', optimizer=sgd)  # loss='hinge'
39 | 
40 | # the input layout depends on the backend: channels-first for Theano, channels-last for TensorFlow
41 | if args.backend == "theano":
42 |     x = np.zeros((batch_size, 3, width, height), dtype=np.float32)
43 | elif args.backend == "tensorflow":
44 |     x = np.zeros((batch_size, width, height, 3), dtype=np.float32)
45 | else:
46 |     print("Backend must be theano or tensorflow")
47 |     sys.exit(1)
48 | y = np.zeros((batch_size, 1000), dtype=np.float32)
49 | 
50 | # warmup
51 | model.train_on_batch(x, y)
52 | 
53 | t0 = time.time()
54 | n = 0
55 | while n < 100:
56 |     tstart = time.time()
57 |     model.train_on_batch(x, y)
58 |     tend = time.time()
59 |     print("Iteration: %d train on batch time: %7.3f ms." % (n, (tend - tstart) * 1000))
60 |     n += 1
61 | t1 = time.time()
62 | 
63 | print("Batch size: %d" % (batch_size))
64 | print("Iterations: %d" % (n))
65 | print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / n))
66 | 
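run.sh itself is not reproduced in this dump. For the two Keras runs it needs to switch the backend per invocation and preserve ~/.keras/keras.json, as the README notes. A minimal Python sketch of an equivalent driver (the output file names follow this repo's naming convention, but the exact steps of run.sh are an assumption):

```python
import os
import shutil
import subprocess

keras_json = os.path.expanduser("~/.keras/keras.json")
backup = keras_json + ".bak"

# back up the Keras config, since switching backends rewrites it
# (assumed to mirror the backup/restore step run.sh performs, per the README)
if os.path.exists(keras_json):
    shutil.copy(keras_json, backup)
try:
    for backend in ("tensorflow", "theano"):
        with open("results/benchmark_keras_%s.output" % backend, "w") as f:
            subprocess.call(["python", "benchmark_keras.py", "--backend", backend],
                            stdout=f, stderr=subprocess.STDOUT)
finally:
    if os.path.exists(backup):
        shutil.move(backup, keras_json)
```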
--------------------------------------------------------------------------------
/benchmark_lasagne.py:
--------------------------------------------------------------------------------
1 | """
2 | Adapted from: https://github.com/Lasagne/Recipes/blob/master/modelzoo/vgg16.py
3 | """
4 | 
5 | # VGG-16, 16-layer model from the paper:
6 | # "Very Deep Convolutional Networks for Large-Scale Image Recognition"
7 | # Original source: https://gist.github.com/ksimonyan/211839e770f7b538e2d8
8 | # License: see http://www.robots.ox.ac.uk/~vgg/research/very_deep/
9 | 
10 | # Download pretrained weights from:
11 | # https://s3.amazonaws.com/lasagne/recipes/pretrained/imagenet/vgg16.pkl
12 | 
13 | from lasagne.layers import InputLayer
14 | from lasagne.layers import DenseLayer
15 | from lasagne.layers import NonlinearityLayer
16 | from lasagne.layers import DropoutLayer
17 | from lasagne.layers import Pool2DLayer as PoolLayer
18 | from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
19 | from lasagne.nonlinearities import softmax
20 | 
21 | 
22 | def build_model(input_var):
23 |     net = {}
24 |     net['input'] = InputLayer((None, 3, 224, 224), input_var=input_var)
25 |     net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=1, flip_filters=False)
26 |     net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad=1, flip_filters=False)
27 |     net['pool1'] = PoolLayer(net['conv1_2'], 2)
28 |     net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1, flip_filters=False)
29 |     net['conv2_2'] = ConvLayer(net['conv2_1'], 128, 3, pad=1, flip_filters=False)
30 |     net['pool2'] = PoolLayer(net['conv2_2'], 2)
31 |     net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad=1, flip_filters=False)
32 |     net['conv3_2'] = ConvLayer(net['conv3_1'], 256, 3, pad=1, flip_filters=False)
33 |     net['conv3_3'] = ConvLayer(net['conv3_2'], 256, 3, pad=1, flip_filters=False)
34 |     net['pool3'] = PoolLayer(net['conv3_3'], 2)
35 |     net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad=1, flip_filters=False)
36 |     net['conv4_2'] = ConvLayer(net['conv4_1'], 512, 3, pad=1, flip_filters=False)
37 |     net['conv4_3'] = ConvLayer(net['conv4_2'], 512, 3, pad=1, flip_filters=False)
38 |     net['pool4'] = PoolLayer(net['conv4_3'], 2)
39 |     net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad=1, flip_filters=False)
40 |     net['conv5_2'] = ConvLayer(net['conv5_1'], 512, 3, pad=1, flip_filters=False)
41 |     net['conv5_3'] = ConvLayer(net['conv5_2'], 512, 3, pad=1, flip_filters=False)
42 |     net['pool5'] = PoolLayer(net['conv5_3'], 2)
43 |     net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
44 |     net['fc6_dropout'] = DropoutLayer(net['fc6'], p=0.5)
45 |     net['fc7'] = DenseLayer(net['fc6_dropout'], num_units=4096)
46 |     net['fc7_dropout'] = DropoutLayer(net['fc7'], p=0.5)
47 |     net['fc8'] = DenseLayer(net['fc7_dropout'], num_units=1000, nonlinearity=None)
48 |     net['prob'] = NonlinearityLayer(net['fc8'], softmax)
49 | 
50 |     return net
51 | 
52 | import time
53 | import numpy as np
54 | import theano
55 | import theano.tensor as T
56 | import lasagne
57 | 
58 | input_var = T.tensor4('inputs')
59 | target_var = T.ivector('targets')
60 | 
61 | network = build_model(input_var)
62 | 
63 | prediction = lasagne.layers.get_output(network['prob'])
64 | loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
65 | loss = loss.mean()
66 | 
67 | params = lasagne.layers.get_all_params(network['prob'], trainable=True)
68 | updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)
69 | 
70 | train_fn = theano.function([input_var, target_var], loss, updates=updates)
71 | 
72 | width = 224
73 | height = 224
74 | batch_size = 16
75 | 
76 | x = np.zeros((batch_size, 3, width, height), dtype=np.float32)
77 | y = np.zeros((batch_size), dtype=np.int32)
78 | 
79 | # warmup
80 | train_fn(x, y)
81 | 
82 | t0 = time.time()
83 | n = 0
84 | while n < 100:
85 |     tstart = time.time()
86 |     train_fn(x, y)
87 |     tend = time.time()
88 |     print "Iteration: %d train on batch time: %7.3f ms." % (n, (tend - tstart) * 1000)
89 |     n += 1
90 | t1 = time.time()
91 | 
92 | print "Batch size: %d" % (batch_size)
93 | print "Iterations: %d" % (n)
94 | print "Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / n)
95 | 
--------------------------------------------------------------------------------
/benchmark_mxnet.py:
--------------------------------------------------------------------------------
1 | import mxnet as mx
2 | import numpy as np
3 | import time
4 | 
5 | 
6 | def vgg16_symbol():
7 |     data = mx.sym.Variable('data')
8 | 
9 |     # conv1
10 |     conv1_1 = mx.sym.Convolution(data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1")
11 |     relu1_1 = mx.sym.Activation(data=conv1_1, act_type="relu", name="relu1_1")
12 |     conv1_2 = mx.sym.Convolution(data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2")
13 |     relu1_2 = mx.sym.Activation(data=conv1_2, act_type="relu", name="relu1_2")
14 |     pool1 = mx.sym.Pooling(data=relu1_2, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool1")
15 | 
16 |     # conv2
17 |     conv2_1 = mx.sym.Convolution(data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1")
18 |     relu2_1 = mx.sym.Activation(data=conv2_1, act_type="relu", name="relu2_1")
19 |     conv2_2 = mx.sym.Convolution(data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2")
20 |     relu2_2 = mx.sym.Activation(data=conv2_2, act_type="relu", name="relu2_2")
21 |     pool2 = mx.sym.Pooling(data=relu2_2, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool2")
22 | 
23 |     # conv3
24 |     conv3_1 = mx.sym.Convolution(data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1")
25 |     relu3_1 = mx.sym.Activation(data=conv3_1, act_type="relu", name="relu3_1")
26 |     conv3_2 = mx.sym.Convolution(data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2")
27 |     relu3_2 = mx.sym.Activation(data=conv3_2, act_type="relu", name="relu3_2")
28 |     conv3_3 = mx.sym.Convolution(data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3")
29 |     relu3_3 = mx.sym.Activation(data=conv3_3, act_type="relu", name="relu3_3")
30 |     pool3 = mx.sym.Pooling(data=relu3_3, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool3")
name="pool3") 31 | 32 | # conv4 33 | conv4_1 = mx.sym.Convolution(data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") 34 | relu4_1 = mx.sym.Activation(data=conv4_1, act_type="relu", name="relu4_1") 35 | conv4_2 = mx.sym.Convolution(data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") 36 | relu4_2 = mx.sym.Activation(data=conv4_2, act_type="relu", name="relu4_2") 37 | conv4_3 = mx.sym.Convolution(data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") 38 | relu4_3 = mx.sym.Activation(data=conv4_3, act_type="relu", name="relu4_3") 39 | pool4 = mx.sym.Pooling(data=relu4_3, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool4") 40 | 41 | # conv5 42 | conv5_1 = mx.sym.Convolution(data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") 43 | relu5_1 = mx.sym.Activation(data=conv5_1, act_type="relu", name="relu5_1") 44 | conv5_2 = mx.sym.Convolution(data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") 45 | relu5_2 = mx.sym.Activation(data=conv5_2, act_type="relu", name="relu5_2") 46 | conv5_3 = mx.sym.Convolution(data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") 47 | relu5_3 = mx.sym.Activation(data=conv5_3, act_type="relu", name="relu5_3") 48 | pool5 = mx.sym.Pooling(data=relu5_3, kernel=(2, 2), stride=(2, 2), pool_type="max", name="pool5") 49 | 50 | # fc6 51 | flat6 = mx.sym.Flatten(data=pool5, name="flat6") 52 | fc6 = mx.sym.FullyConnected(data=flat6, num_hidden=4096, name="fc6") 53 | relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6") 54 | drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6") 55 | 56 | # fc7 57 | fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7") 58 | relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7") 59 | drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7") 60 | 61 | # fc8 62 | fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=1000, name="fc8") 63 | softmax = mx.sym.SoftmaxOutput(data=fc8, name="softmax") 64 | 65 | return softmax 66 | 67 | 68 | batch_size = 16 69 | nb_epoch = 400 70 | 71 | imgs = np.random.uniform(0, 1, (batch_size * 3 * 224 * 224)) 72 | imgs = imgs.reshape((batch_size, 3, 224, 224)) 73 | labels = np.random.randint(0,1000, batch_size) 74 | 75 | train_iter = mx.io.NDArrayIter(data=imgs.astype(np.float32), 76 | label=labels.astype(np.float32), 77 | batch_size=batch_size) 78 | 79 | vgg16 = vgg16_symbol() 80 | 81 | model = mx.model.FeedForward( 82 | ctx=mx.gpu(), 83 | symbol=vgg16, 84 | num_epoch=nb_epoch, 85 | learning_rate=0.01, 86 | momentum=0.9, 87 | wd=0.00001, 88 | initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), 89 | ) 90 | 91 | model.fit(X=train_iter) 92 | print("Start") 93 | t0 = time.time() 94 | model.fit(X=train_iter) 95 | t1 = time.time() 96 | print("Batch size: %d" % (batch_size)) 97 | print("Iterations: %d" % (nb_epoch)) 98 | print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / nb_epoch)) 99 | -------------------------------------------------------------------------------- /benchmark_neon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ----------------------------------------------------------------------------
16 | """
17 | VGG_A Benchmark
18 | https://github.com/soumith/convnet-benchmarks
19 | 
20 | ./vgg_a.py
21 | ./vgg_a.py -d f16
22 | """
23 | 
24 | from neon import NervanaObject
25 | from neon.util.argparser import NeonArgparser
26 | from neon.initializers import Constant, GlorotUniform, Xavier
27 | from neon.layers import Conv, Dropout, Pooling, GeneralizedCost, Affine
28 | from neon.optimizers import GradientDescentMomentum, MultiOptimizer, Schedule
29 | from neon.transforms import Rectlin, Softmax, CrossEntropyMulti
30 | from neon.models import Model
31 | from neon.data import ArrayIterator
32 | from neon.callbacks.callbacks import Callbacks
33 | 
34 | import numpy as np
35 | parser = NeonArgparser(__doc__)
36 | args = parser.parse_args()
37 | 
38 | NervanaObject.be.bsz = 16
39 | NervanaObject.be.enable_winograd = 4
40 | 
41 | # setup data provider
42 | X_train = np.random.uniform(-1, 1, (16, 3 * 224 * 224))
43 | y_train = np.random.randint(0, 999, (16, 1000))
44 | train = ArrayIterator(X_train, y_train, nclass=1000, lshape=(3, 224, 224))
45 | 
46 | # layers = [Conv((3, 3, 64), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
47 | #           Pooling(2, strides=2),
48 | #           Conv((3, 3, 128), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
49 | #           Pooling(2, strides=2),
50 | #           Conv((3, 3, 256), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
51 | #           Conv((3, 3, 256), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
52 | #           Pooling(2, strides=2),
53 | #           Conv((3, 3, 512), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
54 | #           Conv((3, 3, 512), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
55 | #           Pooling(2, strides=2),
56 | #           Conv((3, 3, 512), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
57 | #           Conv((3, 3, 512), init=Gaussian(scale=0.01), activation=Rectlin(), padding=1),
58 | #           Pooling(2, strides=2),
59 | #           Affine(nout=4096, init=Gaussian(scale=0.01), activation=Rectlin()),
60 | #           Affine(nout=4096, init=Gaussian(scale=0.01), activation=Rectlin()),
61 | #           Affine(nout=1000, init=Gaussian(scale=0.01), activation=Softmax())]
62 | # model = Model(layers=layers)
63 | 
64 | """
65 | Modified from https://github.com/NervanaSystems/ModelZoo/blob/master/ImageClassification/ILSVRC2012/VGG/vgg_neon.py
66 | """
67 | 
68 | init1 = Xavier(local=True)
69 | initfc = GlorotUniform()
70 | 
71 | relu = Rectlin()
72 | conv_params = {'init': init1,
73 |                'strides': 1,
74 |                'padding': 1,
75 |                'bias': Constant(0),
76 |                'activation': relu}
77 | 
78 | # Set up the model layers
79 | layers = []
80 | for nofm in [64, 128, 256, 512, 512]:
81 |     layers.append(Conv((3, 3, nofm), **conv_params))
82 |     layers.append(Conv((3, 3, nofm), **conv_params))
83 |     if nofm > 128:
84 |         layers.append(Conv((3, 3, nofm), **conv_params))
85 |         # if args.vgg_version == 'E':
86 |         #     layers.append(Conv((3, 3, nofm), **conv_params))
87 |     layers.append(Pooling(2, strides=2))
88 | 
89 | layers.append(Affine(nout=4096, init=initfc, bias=Constant(0), activation=relu))
90 | layers.append(Dropout(keep=0.5))
91 | layers.append(Affine(nout=4096, init=initfc, bias=Constant(0), activation=relu))
92 | layers.append(Dropout(keep=0.5))
93 | layers.append(Affine(nout=1000, init=initfc, bias=Constant(0), activation=Softmax()))
94 | 
95 | model = Model(layers=layers)
96 | 
97 | weight_sched = Schedule([22, 44, 65], (1 / 250.)**(1 / 3.))
98 | opt_gdm = GradientDescentMomentum(0.01, 0.0, wdecay=0.0005, schedule=weight_sched)
99 | opt = MultiOptimizer({'default': opt_gdm})
100 | cost = GeneralizedCost(costfunc=CrossEntropyMulti())
101 | callbacks = Callbacks(model)
102 | 
103 | import time
104 | 
105 | num_epochs = 100
106 | t0 = time.time()
107 | # model.benchmark(train, cost=cost, optimizer=opt, niterations=100, nskip=1)
108 | model.fit(train, cost=cost, optimizer=opt, num_epochs=num_epochs, callbacks=callbacks)
109 | t1 = time.time()
110 | 
111 | print("Batch size: %d" % (NervanaObject.be.bsz))
112 | print("Iterations: %d" % (num_epochs))
113 | print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / num_epochs))
114 | 
115 | 
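Since X_train/y_train above hold exactly one 16-image minibatch, each neon "epoch" is a single SGD iteration, so num_epochs doubles as the iteration count and the mean printed at the end is directly comparable to the per-minibatch times from the other scripts. A quick sanity check of that assumption (nbatches is assumed here to be the attribute neon's data iterators expose for this):

```python
# one batch per epoch means epochs == SGD iterations in this benchmark
print("minibatches per epoch: %d" % train.nbatches)  # expected: 1
```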
--------------------------------------------------------------------------------
/benchmark_tensorflow.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow.contrib.slim as slim
3 | import numpy as np
4 | import time
5 | import argparse
6 | 
7 | 
8 | def vgg16(inputs, num_classes, batch_size):
9 |     with slim.arg_scope([slim.conv2d, slim.fully_connected],
10 |                         activation_fn=tf.nn.relu,
11 |                         weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):
12 |         net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], padding="SAME", scope='conv1')
13 |         net = slim.max_pool2d(net, [2, 2], scope='pool1')
14 |         net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], padding="SAME", scope='conv2')
15 |         net = slim.max_pool2d(net, [2, 2], scope='pool2')
16 |         net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], padding="SAME", scope='conv3')
17 |         net = slim.max_pool2d(net, [2, 2], scope='pool3')
18 |         net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], padding="SAME", scope='conv4')
19 |         net = slim.max_pool2d(net, [2, 2], scope='pool4')
20 |         net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], padding="SAME", scope='conv5')
21 |         net = slim.max_pool2d(net, [2, 2], scope='pool5')
22 |         net = tf.reshape(net, (batch_size, 7 * 7 * 512))
23 |         net = slim.fully_connected(net, 4096, scope='fc6')
24 |         net = slim.dropout(net, 0.5, scope='dropout6')
25 |         net = slim.fully_connected(net, 4096, scope='fc7')
26 |         net = slim.dropout(net, 0.5, scope='dropout7')
27 |         net = slim.fully_connected(net, 1000, activation_fn=None, scope='fc8')
28 |     return net
29 | 
30 | 
31 | def train_slim(batch_size, height, width, num_classes, learning_rate):
32 |     """Built-in slim training schedule"""
33 | 
34 |     # number of iterations
35 |     n = 100
36 | 
37 |     logdir = None  # Don't store checkpoints
38 | 
39 |     with tf.Session():
40 |         train_inputs = tf.random_uniform((batch_size, height, width, 3))
41 |         labels = tf.one_hot(np.arange(batch_size), on_value=1.0, off_value=0.0, depth=num_classes)
42 | 
43 |         predictions = vgg16(train_inputs, num_classes, batch_size)
44 |         loss = slim.losses.softmax_cross_entropy(predictions, labels)
45 | 
46 |         optimizer = tf.train.GradientDescentOptimizer(learning_rate)
47 | 
48 |         train_op = slim.learning.create_train_op(loss, optimizer)
49 | 
50 |         t0 = time.time()
51 |         slim.learning.train(
52 |             train_op,
53 |             logdir,
54 |             number_of_steps=n,
55 |             save_summaries_secs=3000,
56 |             save_interval_secs=6000)
57 |         t1 = time.time()
58 | 
59 |         print("Batch size: %d" % (batch_size))
60 |         print("Iterations: %d" % (n))
61 |         print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / n))
62 | 
63 | 
64 | def train_pure_tf(batch_size, height, width, num_classes, learning_rate):
65 |     """pure tensorflow training schedule, possibly with less overhead than slim"""
66 | 
67 |     # number of iterations
68 |     n = 100
69 | 
70 |     with tf.Graph().as_default(), tf.device('/gpu:0'):
71 | 
72 |         train_inputs = tf.random_uniform((batch_size, height, width, 3))
73 |         labels = tf.one_hot(np.arange(batch_size), on_value=1.0, off_value=0.0, depth=num_classes)
74 | 
75 |         # Predictions
76 |         predictions = vgg16(train_inputs, num_classes, batch_size)
77 |         # Loss function
78 |         loss = slim.losses.softmax_cross_entropy(predictions, labels)
79 |         # Optimizer
80 |         opt = tf.train.GradientDescentOptimizer(learning_rate)
81 | 
82 |         # Calculate the gradients for the batch of data
83 |         grads = opt.compute_gradients(loss)
84 | 
85 |         # Apply the gradients to adjust the shared variables.
86 |         apply_gradient_op = opt.apply_gradients(grads)
87 | 
88 |         # Run a session
89 |         with tf.Session() as sess:
90 |             # Build an initialization operation to run below.
91 |             init = tf.initialize_all_variables()
92 |             sess.run(init)
93 | 
94 |             # warmup run (generally the first run is much slower than the others)
95 |             sess.run([apply_gradient_op])
96 | 
97 |             t0 = time.time()
98 |             for i in range(n):
99 |                 tstart = time.time()
100 |                 sess.run([apply_gradient_op])
101 |                 tend = time.time()
102 |                 print("Iteration: %d train on batch time: %7.3f ms." % (i, (tend - tstart) * 1000))
103 |             t1 = time.time()
104 | 
105 |             print("Batch size: %d" % (batch_size))
106 |             print("Iterations: %d" % (n))
107 |             print("Time per iteration: %7.3f ms" % ((t1 - t0) * 1000 / n))
108 | 
109 | 
110 | if __name__ == '__main__':
111 | 
112 |     parser = argparse.ArgumentParser()
113 |     parser.add_argument("--train_schedule", default="pure_tf")
114 |     parser.add_argument("--batch_size", type=int, default=16)
115 |     parser.add_argument("--height", type=int, default=224)
116 |     parser.add_argument("--width", type=int, default=224)
117 |     parser.add_argument("--num_classes", type=int, default=1000)
118 |     parser.add_argument("--learning_rate", type=float, default=0.1)
119 |     args = parser.parse_args()
120 | 
121 |     if args.train_schedule == "slim":
122 |         train_slim(args.batch_size, args.height, args.width, args.num_classes, args.learning_rate)
123 |     elif args.train_schedule == "pure_tf":
124 |         train_pure_tf(args.batch_size, args.height, args.width, args.num_classes, args.learning_rate)
125 |     else:
126 |         print("Train schedule must be slim or pure_tf")
127 | 
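One caveat when comparing the TensorFlow numbers against the other frameworks: both schedules above use plain gradient descent (learning rate 0.1, no momentum), while the Keras, Lasagne and MXNet scripts use SGD with momentum 0.9. A closer apples-to-apples configuration would be a one-line optimizer swap — a sketch, with everything else exactly as in train_pure_tf above:

```python
# drop-in replacement for tf.train.GradientDescentOptimizer(learning_rate),
# matching the momentum-0.9 SGD used by the Keras/Lasagne/MXNet scripts
opt = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
grads = opt.compute_gradients(loss)
apply_gradient_op = opt.apply_gradients(grads)
```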
--------------------------------------------------------------------------------
/images/img0000.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aizvorski/vgg-benchmarks/4f551e19c89a83df0f9e0e6156d53e6e37463a8e/images/img0000.tiff
--------------------------------------------------------------------------------
/images/list.txt:
--------------------------------------------------------------------------------
1 | images/img0000.tiff 0
2 | images/img0001.tiff 0
3 | images/img0002.tiff 0
4 | images/img0003.tiff 0
5 | images/img0004.tiff 0
6 | images/img0005.tiff 0
7 | images/img0006.tiff 0
8 | images/img0007.tiff 0
9 | images/img0008.tiff 0
10 | images/img0009.tiff 0
11 | images/img0010.tiff 0
12 | images/img0011.tiff 0
13 | images/img0012.tiff 0
14 | images/img0013.tiff 0
15 | images/img0014.tiff 0
16 | images/img0015.tiff 0
17 | 
--------------------------------------------------------------------------------
/parse_results.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import numpy as np
3 | import re
4 | 
5 | def parse_output(output_file):
6 |     """ Utility to parse the "Time per iteration" summary line printed by the keras, neon, mxnet and tensorflow benchmarks """
7 | 
8 |     with open("./results/%s" % output_file, "r") as f:
9 |         list_lines = f.readlines()
10 |         for line in list_lines:
11 |             if "Time per iteration" in line:
12 |                 result = line.split(" ")[-2]
13 |                 return float(result)
14 | 
15 | 
16 | def parse_output_per_iter(output_file):
17 |     """ Utility to compute the mean per-iteration time from the "train on batch time" lines, skipping the first 20 iterations as warmup """
18 |     with open("./results/%s" % output_file, "r") as fh:
19 |         results = []
20 |         for line in fh:
21 |             m = re.search(r'train on batch time:\s*(\d+\.\d+) ms', line)
22 |             if m:
23 |                 r = m.group(1)
24 |                 results.append(float(r))
25 |     return np.mean(results[20:])
26 | 
27 | def parse_output_caffe(output_file):
28 |     """ Utility to parse the output of caffe
29 | 
30 |     Count the time as the clock difference between two consecutive appearances of solver.cpp:228
31 | 
32 |     """
33 | 
34 |     # timestamp format in caffe's log output
35 |     FMT = '%H:%M:%S.%f'
36 | 
37 |     list_time = []
38 | 
39 |     with open("./results/%s" % output_file, "r") as f:
40 |         list_lines = f.readlines()
41 |         for line in list_lines:
42 |             if "solver.cpp:228" in line:
43 |                 time = line.split(" ")[1]
44 |                 time = datetime.strptime(time, FMT)
45 |                 list_time.append(time)
46 | 
47 |     arr_time = np.array(list_time)
48 |     arr_delta = arr_time[1:] - arr_time[:-1]
49 |     # use total_seconds() rather than .microseconds, so that deltas of one second or more are not truncated
50 |     arr_delta = [d.total_seconds() * 1E6 for d in arr_delta]
51 | 
52 |     return 1E-3 * np.mean(arr_delta)
53 | 
54 | 
55 | def name_to_path(name):
56 |     return name.lower().replace('(', '').replace(')', '').replace(' ', '_')
57 | 
58 | 
59 | if __name__ == '__main__':
60 |     benchmark_names = ('Neon', 'Caffe', 'Keras (TensorFlow)', 'Keras (Theano)',
61 |                        'TensorFlow', 'TensorFlow (slim)', 'MXNet')
62 |     run_names = ('V100', 'GTX 1080', 'Maxwell Titan X', 'K80', 'K520')
63 | 
64 |     line = "| Framework | "
65 |     for run_name in run_names:
66 |         line += run_name + " | "
67 |     print(line)
68 | 
69 |     for name in benchmark_names:
70 |         line = "| " + name + " | "
71 |         for run_name in run_names:
72 |             result_path = name_to_path(run_name) + "/benchmark_" + name_to_path(name) + ".output"
73 |             try:
74 |                 if name == 'Caffe':
75 |                     time_ = parse_output_caffe(result_path)
76 |                 elif name == 'TensorFlow' or name == 'Keras (TensorFlow)':
77 |                     time_ = parse_output_per_iter(result_path)
78 |                 else:
79 |                     time_ = parse_output(result_path)
80 |                 time_ = "%.2f" % (time_)
81 |             except Exception as e:
82 |                 print(repr(e))
83 |                 time_ = "N/A"
84 |             line += time_
85 |             line += " | "
86 |         print(line)
87 | 
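Running python parse_results.py from the repository root regenerates the README results table from the .output files under results/. The helpers can also be called directly; for example (paths are relative to results/, as the functions expect):

```python
from parse_results import parse_output, parse_output_per_iter

# summary line written by the script itself (Neon on the GTX 1080): ~164.5 ms
print(parse_output("gtx_1080/benchmark_neon.output"))

# mean of the per-iteration lines, skipping 20 warmup batches
# (Keras/TensorFlow on the GTX 1080): ~287.85 ms, the value in the README table
print(parse_output_per_iter("gtx_1080/benchmark_keras_tensorflow.output"))
```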
--------------------------------------------------------------------------------
/results/gtx_1080/INFO.md:
--------------------------------------------------------------------------------
1 | ## Results GTX 1080
2 | 
3 | The minibatch size is 16.
4 | 
5 | | Framework | Time (per minibatch) |
6 | |:---|:---|
7 | | Neon | 164.527 ms |
8 | | Torch (1) | 232.55 ms |
9 | | Caffe (2) | 244.445 ms |
10 | | Keras (TensorFlow) | 287.693 ms |
11 | | Keras (Theano) | 409.953 ms |
12 | 
13 | The environment for the results listed above is as follows:
14 | 
15 | - Hardware: GTX 1080 (EVGA GTX 1080 Founders Edition)
16 | - OS: Ubuntu 14.04.3 LTS
17 | - CUDA: 8.0 (cuda-repo-ubuntu1404-8-0-rc_8.0.27-1_amd64.deb)
18 | - CuDNN: 5.0 (cudnn-8.0-linux-x64-v5.0-ga.tgz)
19 | - Caffe: df412ac (built from source)
20 | - Keras: 1.0.7
21 | - Theano: 0.8.2
22 | - TensorFlow: 85f76f5 (built from source)
23 | - Neon: 1.5.4 (485033c)
24 | - Python: 2.7.6
25 | 
26 | The TensorFlow version was *very* recent; it has to be in order to work with CUDA 8.0.
27 | 
28 | (1) The Torch benchmark is from https://github.com/jcjohnson/cnn-benchmarks (it has an essentially identical setup: VGG-16, GTX 1080, CUDA 8, cuDNN 5, minibatch size 16).
29 | 
30 | (2) The time is for a complete SGD step including parameter updates, not just the forward+backward time.
31 | 
--------------------------------------------------------------------------------
/results/gtx_1080/benchmark_caffe.dmon:
--------------------------------------------------------------------------------
1 | 0 38 46 2 0 0 0 5005 1592
2 | 0 141 50 99 52 0 0 5005 1592
3 | 0 139 51 99 48 0 0 5005 1820
4 | 0 158 51 99 47 0 0 5005 1835
5 | 0 166 52 100 54 0 0 5005 1837
6 | 0 165 53 100 62 0 0 5005 1837
7 | 0 123 54 100 52 0 0 5005 1837
8 | 0 153 54 100 55 0 0 5005 1836
9 | 0 135 54 100 61 0 0 5005 1836
10 | 0 157 55 99 58 0 0 5005 1836
11 | 0 169 56 99 60 0 0 5005 1836
12 | 0 161 56 99 58 0 0 5005 1836
13 | 0 137 57 99 60 0 0 5005 1824
14 | 0 132 57 99 53 0 0 5005 1821
15 | 0 152 57 99 48 0 0 5005 1821
16 | 0 140 58 99 45 0 0 5005 1821
17 | 0 160 58 100 53 0 0 5005 1820
18 | 0 167 59 100 57 0 0 5005 1820
19 | 0 153 59 99 64 0 0 5005 1820
20 | # gpu pwr temp sm mem enc dec mclk pclk
21 | # Idx W C % % % % MHz MHz
22 | 0 118 60 99 51 0 0 5005 1820
23 | 0 155 60 100 56 0 0 5005 1820
24 | 0 137 60 100 63 0 0 5005 1820
25 | 0 157 61 100 58 0 0 5005 1820
26 | 0 166 61 99 61 0 0 5005 1815
27 | 0 166 62 99 60 0 0 5005 1812
28 | 0 45 59 0 0 0 0 5005 1816
29 | 
--------------------------------------------------------------------------------
/results/gtx_1080/benchmark_keras_tensorflow.dmon:
--------------------------------------------------------------------------------
1 | 0 44 46 6 3 0 0 5005 1590
2 | 0 161 48 28 19 0 0 5005 1754
3 | 0 145 50 100 59 0 0 5005 1827
4 | 0 181 50 98 63 0 0 5005 1832
5 | 0 166 51 99 85 0 0 5005 1830
6 | 0 191 52 98 61 0 0 5005 1827
7 | 0 159 53 97 84 0 0 5005 1826
8 | 0 133 53 98 61 0 0 5005 1829
9 | 0 146 54 96 82 0 0 5005 1829
10 | 0 119 54 98 60 0 0 5005 1828
11 | 0 154 54 97 85 0 0 5005 1825
12 | 0 155 55 98 61 0 0 5005 1829
13 | 0 102 55 97 83 0 0 5005 1821
14 | 0 161 56 98 61 0 0 5005 1810
15 | # gpu pwr temp sm mem enc dec mclk pclk
16 | # Idx W C % % % % MHz MHz
17 | 0 126 56 97 83 0 0 5005 1813
18 | 0 149 57 99 85 0 0 5005 1811
19 | 0 148 57 98 60 0 0 5005 1815
20 | 0 164 57 97 84 0 0 5005 1813
21 | 0 181 58 98 60 0 0 5005 1800
22 | 0 168 58 97 82 0 0 5005 1808
23 | 0 179 58 100 64 0 0 5005 1806
24 | 0 166 59 97 82 0 0 5005 1810
25 | 0 193 59 100 69 0 0 5005 1807
26 | 0 164 60 97 81 0 0 5005 1812
27 | 0 137 60 100 69 0 0 5005 1805
28 | 0 155 60 98 81 0 0 5005 1802
29 | 0 117 61 100 71 0 0 5005 1802
30 | 0 143 61 98 75 0 0 5005 1798
31 | 0 109 61 99 73 0 0 5005 1801
32 | 0 148 62 100 73 0 0 5005 1798
33 | 0 137 62 97 76 0 0 5005 1798
34 | 0 45 60 0 0 0 0 5005 1807
35 | 
--------------------------------------------------------------------------------
/results/gtx_1080/benchmark_keras_tensorflow.output:
--------------------------------------------------------------------------------
1 | Using TensorFlow backend.
2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 6 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so locally 7 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:118] Found device 0 with properties: 8 | name: Graphics Device 9 | major: 6 minor: 1 memoryClockRate (GHz) 1.7335 10 | pciBusID 0000:03:00.0 11 | Total memory: 7.92GiB 12 | Free memory: 7.81GiB 13 | W tensorflow/stream_executor/cuda/cuda_driver.cc:572] creating context when one is currently active; existing: 0x2a01cc0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:118] Found device 1 with properties: 15 | name: Graphics Device 16 | major: 6 minor: 1 memoryClockRate (GHz) 1.7335 17 | pciBusID 0000:05:00.0 18 | Total memory: 7.92GiB 19 | Free memory: 7.81GiB 20 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:138] DMA: 0 1 21 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:148] 0: Y Y 22 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:148] 1: Y Y 23 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:868] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Graphics Device, pci bus id: 0000:03:00.0) 24 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:868] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Graphics Device, pci bus id: 0000:05:00.0) 25 | Iteration: 0 train on batch time: 287.307 ms. 26 | Iteration: 1 train on batch time: 285.672 ms. 27 | Iteration: 2 train on batch time: 286.286 ms. 28 | Iteration: 3 train on batch time: 287.512 ms. 29 | Iteration: 4 train on batch time: 288.259 ms. 30 | Iteration: 5 train on batch time: 286.027 ms. 31 | Iteration: 6 train on batch time: 286.402 ms. 32 | Iteration: 7 train on batch time: 286.703 ms. 33 | Iteration: 8 train on batch time: 287.772 ms. 34 | Iteration: 9 train on batch time: 285.709 ms. 35 | Iteration: 10 train on batch time: 286.838 ms. 36 | Iteration: 11 train on batch time: 288.765 ms. 37 | Iteration: 12 train on batch time: 286.001 ms. 38 | Iteration: 13 train on batch time: 286.320 ms. 39 | Iteration: 14 train on batch time: 286.807 ms. 40 | Iteration: 15 train on batch time: 285.734 ms. 41 | Iteration: 16 train on batch time: 287.607 ms. 42 | Iteration: 17 train on batch time: 287.108 ms. 43 | Iteration: 18 train on batch time: 286.929 ms. 44 | Iteration: 19 train on batch time: 286.319 ms. 45 | Iteration: 20 train on batch time: 286.867 ms. 46 | Iteration: 21 train on batch time: 286.533 ms. 47 | Iteration: 22 train on batch time: 286.395 ms. 48 | Iteration: 23 train on batch time: 286.727 ms. 49 | Iteration: 24 train on batch time: 286.351 ms. 50 | Iteration: 25 train on batch time: 284.861 ms. 51 | Iteration: 26 train on batch time: 284.247 ms. 52 | Iteration: 27 train on batch time: 287.205 ms. 53 | Iteration: 28 train on batch time: 286.115 ms. 54 | Iteration: 29 train on batch time: 288.505 ms. 55 | Iteration: 30 train on batch time: 285.953 ms. 56 | Iteration: 31 train on batch time: 286.647 ms. 57 | Iteration: 32 train on batch time: 287.382 ms. 58 | Iteration: 33 train on batch time: 285.452 ms. 59 | Iteration: 34 train on batch time: 287.961 ms. 60 | Iteration: 35 train on batch time: 291.466 ms. 
61 | Iteration: 36 train on batch time: 288.147 ms. 62 | Iteration: 37 train on batch time: 286.794 ms. 63 | Iteration: 38 train on batch time: 287.992 ms. 64 | Iteration: 39 train on batch time: 288.307 ms. 65 | Iteration: 40 train on batch time: 286.961 ms. 66 | Iteration: 41 train on batch time: 287.075 ms. 67 | Iteration: 42 train on batch time: 288.158 ms. 68 | Iteration: 43 train on batch time: 290.944 ms. 69 | Iteration: 44 train on batch time: 286.356 ms. 70 | Iteration: 45 train on batch time: 286.673 ms. 71 | Iteration: 46 train on batch time: 289.705 ms. 72 | Iteration: 47 train on batch time: 286.751 ms. 73 | Iteration: 48 train on batch time: 291.429 ms. 74 | Iteration: 49 train on batch time: 287.108 ms. 75 | Iteration: 50 train on batch time: 288.143 ms. 76 | Iteration: 51 train on batch time: 286.341 ms. 77 | Iteration: 52 train on batch time: 287.558 ms. 78 | Iteration: 53 train on batch time: 289.172 ms. 79 | Iteration: 54 train on batch time: 289.523 ms. 80 | Iteration: 55 train on batch time: 287.128 ms. 81 | Iteration: 56 train on batch time: 288.374 ms. 82 | Iteration: 57 train on batch time: 288.952 ms. 83 | Iteration: 58 train on batch time: 291.691 ms. 84 | Iteration: 59 train on batch time: 287.035 ms. 85 | Iteration: 60 train on batch time: 288.489 ms. 86 | Iteration: 61 train on batch time: 289.944 ms. 87 | Iteration: 62 train on batch time: 286.632 ms. 88 | Iteration: 63 train on batch time: 288.199 ms. 89 | Iteration: 64 train on batch time: 288.198 ms. 90 | Iteration: 65 train on batch time: 287.945 ms. 91 | Iteration: 66 train on batch time: 287.715 ms. 92 | Iteration: 67 train on batch time: 285.903 ms. 93 | Iteration: 68 train on batch time: 289.123 ms. 94 | Iteration: 69 train on batch time: 286.140 ms. 95 | Iteration: 70 train on batch time: 289.240 ms. 96 | Iteration: 71 train on batch time: 289.794 ms. 97 | Iteration: 72 train on batch time: 286.609 ms. 98 | Iteration: 73 train on batch time: 287.146 ms. 99 | Iteration: 74 train on batch time: 288.039 ms. 100 | Iteration: 75 train on batch time: 286.723 ms. 101 | Iteration: 76 train on batch time: 286.833 ms. 102 | Iteration: 77 train on batch time: 287.447 ms. 103 | Iteration: 78 train on batch time: 286.315 ms. 104 | Iteration: 79 train on batch time: 287.332 ms. 105 | Iteration: 80 train on batch time: 287.199 ms. 106 | Iteration: 81 train on batch time: 287.768 ms. 107 | Iteration: 82 train on batch time: 290.930 ms. 108 | Iteration: 83 train on batch time: 288.858 ms. 109 | Iteration: 84 train on batch time: 291.169 ms. 110 | Iteration: 85 train on batch time: 287.972 ms. 111 | Iteration: 86 train on batch time: 288.800 ms. 112 | Iteration: 87 train on batch time: 286.622 ms. 113 | Iteration: 88 train on batch time: 286.225 ms. 114 | Iteration: 89 train on batch time: 287.903 ms. 115 | Iteration: 90 train on batch time: 290.274 ms. 116 | Iteration: 91 train on batch time: 286.583 ms. 117 | Iteration: 92 train on batch time: 287.495 ms. 118 | Iteration: 93 train on batch time: 287.426 ms. 119 | Iteration: 94 train on batch time: 287.889 ms. 120 | Iteration: 95 train on batch time: 288.170 ms. 121 | Iteration: 96 train on batch time: 288.284 ms. 122 | Iteration: 97 train on batch time: 290.761 ms. 123 | Iteration: 98 train on batch time: 287.909 ms. 124 | Iteration: 99 train on batch time: 291.303 ms. 
125 | Batch size: 16 126 | Iterations: 100 127 | Time per iteration: 287.693 ms 128 | -------------------------------------------------------------------------------- /results/gtx_1080/benchmark_keras_theano.dmon: -------------------------------------------------------------------------------- 1 | 0 36 31 0 0 0 0 5005 1603 2 | 0 104 33 17 6 0 0 5005 1603 3 | 0 165 36 95 31 0 0 5005 1748 4 | 0 153 37 97 28 0 0 5005 1863 5 | 0 150 38 96 31 0 0 5005 1854 6 | 0 152 38 97 24 0 0 5005 1847 7 | 0 153 39 95 34 0 0 5005 1847 8 | 0 153 40 95 37 0 0 5005 1847 9 | # gpu pwr temp sm mem enc dec mclk pclk 10 | # Idx W C % % % % MHz MHz 11 | 0 150 40 97 26 0 0 5005 1847 12 | 0 171 41 95 35 0 0 5005 1847 13 | 0 150 42 97 29 0 0 5005 1846 14 | 0 172 42 96 36 0 0 5005 1847 15 | 0 153 43 95 30 0 0 5005 1847 16 | 0 167 44 95 35 0 0 5005 1849 17 | 0 155 44 96 30 0 0 5005 1849 18 | 0 170 45 95 34 0 0 5005 1850 19 | 0 152 46 96 29 0 0 5005 1850 20 | 0 180 46 97 32 0 0 5005 1850 21 | 0 150 47 96 30 0 0 5005 1850 22 | 0 151 47 95 35 0 0 5005 1850 23 | 0 154 48 97 24 0 0 5005 1850 24 | 0 178 48 95 35 0 0 5005 1840 25 | 0 143 49 96 28 0 0 5005 1839 26 | 0 156 49 94 35 0 0 5005 1834 27 | 0 163 50 96 29 0 0 5005 1839 28 | 0 159 51 97 31 0 0 5005 1837 29 | 0 121 51 91 30 0 0 5005 1836 30 | 0 133 51 96 24 0 0 5005 1837 31 | 0 155 52 91 35 0 0 5005 1834 32 | 0 104 52 95 30 0 0 5005 1837 33 | # gpu pwr temp sm mem enc dec mclk pclk 34 | # Idx W C % % % % MHz MHz 35 | 0 175 54 95 34 0 0 5005 1837 36 | 0 142 54 96 23 0 0 5005 1837 37 | 0 184 54 91 35 0 0 5005 1834 38 | 0 154 55 97 26 0 0 5005 1832 39 | 0 161 55 95 36 0 0 5005 1834 40 | 0 166 55 94 31 0 0 5005 1831 41 | 0 148 56 93 32 0 0 5005 1833 42 | 0 140 56 94 32 0 0 5005 1831 43 | 0 123 56 95 23 0 0 5005 1831 44 | 0 145 58 94 35 0 0 5005 1834 45 | 0 116 58 96 31 0 0 5005 1830 46 | 0 164 58 93 34 0 0 5005 1835 47 | 0 44 56 0 0 0 0 5005 1826 48 | -------------------------------------------------------------------------------- /results/gtx_1080/benchmark_keras_theano.output: -------------------------------------------------------------------------------- 1 | Using Theano backend. 2 | Using gpu device 0: Graphics Device (CNMeM is disabled, cuDNN 5005) 3 | Iteration: 0 train on batch time: 408.542 ms. 4 | Iteration: 1 train on batch time: 404.476 ms. 5 | Iteration: 2 train on batch time: 403.843 ms. 6 | Iteration: 3 train on batch time: 402.034 ms. 7 | Iteration: 4 train on batch time: 403.209 ms. 8 | Iteration: 5 train on batch time: 405.481 ms. 9 | Iteration: 6 train on batch time: 404.473 ms. 10 | Iteration: 7 train on batch time: 404.539 ms. 11 | Iteration: 8 train on batch time: 404.342 ms. 12 | Iteration: 9 train on batch time: 406.517 ms. 13 | Iteration: 10 train on batch time: 404.390 ms. 14 | Iteration: 11 train on batch time: 404.463 ms. 15 | Iteration: 12 train on batch time: 404.048 ms. 16 | Iteration: 13 train on batch time: 404.791 ms. 17 | Iteration: 14 train on batch time: 404.642 ms. 18 | Iteration: 15 train on batch time: 404.891 ms. 19 | Iteration: 16 train on batch time: 405.597 ms. 20 | Iteration: 17 train on batch time: 404.474 ms. 21 | Iteration: 18 train on batch time: 403.967 ms. 22 | Iteration: 19 train on batch time: 405.766 ms. 23 | Iteration: 20 train on batch time: 404.242 ms. 24 | Iteration: 21 train on batch time: 405.184 ms. 25 | Iteration: 22 train on batch time: 404.359 ms. 26 | Iteration: 23 train on batch time: 404.569 ms. 27 | Iteration: 24 train on batch time: 404.223 ms. 28 | Iteration: 25 train on batch time: 412.660 ms. 
29 | Iteration: 26 train on batch time: 413.368 ms. 30 | Iteration: 27 train on batch time: 407.599 ms. 31 | Iteration: 28 train on batch time: 406.539 ms. 32 | Iteration: 29 train on batch time: 404.855 ms. 33 | Iteration: 30 train on batch time: 403.773 ms. 34 | Iteration: 31 train on batch time: 406.304 ms. 35 | Iteration: 32 train on batch time: 403.726 ms. 36 | Iteration: 33 train on batch time: 404.343 ms. 37 | Iteration: 34 train on batch time: 403.955 ms. 38 | Iteration: 35 train on batch time: 404.055 ms. 39 | Iteration: 36 train on batch time: 403.440 ms. 40 | Iteration: 37 train on batch time: 404.931 ms. 41 | Iteration: 38 train on batch time: 403.599 ms. 42 | Iteration: 39 train on batch time: 405.201 ms. 43 | Iteration: 40 train on batch time: 403.820 ms. 44 | Iteration: 41 train on batch time: 404.627 ms. 45 | Iteration: 42 train on batch time: 403.844 ms. 46 | Iteration: 43 train on batch time: 404.620 ms. 47 | Iteration: 44 train on batch time: 408.143 ms. 48 | Iteration: 45 train on batch time: 412.360 ms. 49 | Iteration: 46 train on batch time: 414.074 ms. 50 | Iteration: 47 train on batch time: 409.032 ms. 51 | Iteration: 48 train on batch time: 413.834 ms. 52 | Iteration: 49 train on batch time: 407.383 ms. 53 | Iteration: 50 train on batch time: 407.769 ms. 54 | Iteration: 51 train on batch time: 419.355 ms. 55 | Iteration: 52 train on batch time: 416.029 ms. 56 | Iteration: 53 train on batch time: 412.512 ms. 57 | Iteration: 54 train on batch time: 408.329 ms. 58 | Iteration: 55 train on batch time: 409.426 ms. 59 | Iteration: 56 train on batch time: 410.921 ms. 60 | Iteration: 57 train on batch time: 412.376 ms. 61 | Iteration: 58 train on batch time: 405.777 ms. 62 | Iteration: 59 train on batch time: 412.392 ms. 63 | Iteration: 60 train on batch time: 413.598 ms. 64 | Iteration: 61 train on batch time: 422.148 ms. 65 | Iteration: 62 train on batch time: 415.627 ms. 66 | Iteration: 63 train on batch time: 417.485 ms. 67 | Iteration: 64 train on batch time: 418.663 ms. 68 | Iteration: 65 train on batch time: 418.118 ms. 69 | Iteration: 66 train on batch time: 419.845 ms. 70 | Iteration: 67 train on batch time: 408.545 ms. 71 | Iteration: 68 train on batch time: 415.054 ms. 72 | Iteration: 69 train on batch time: 417.838 ms. 73 | Iteration: 70 train on batch time: 408.967 ms. 74 | Iteration: 71 train on batch time: 409.098 ms. 75 | Iteration: 72 train on batch time: 412.858 ms. 76 | Iteration: 73 train on batch time: 413.543 ms. 77 | Iteration: 74 train on batch time: 416.946 ms. 78 | Iteration: 75 train on batch time: 413.417 ms. 79 | Iteration: 76 train on batch time: 415.105 ms. 80 | Iteration: 77 train on batch time: 409.024 ms. 81 | Iteration: 78 train on batch time: 409.921 ms. 82 | Iteration: 79 train on batch time: 411.784 ms. 83 | Iteration: 80 train on batch time: 412.612 ms. 84 | Iteration: 81 train on batch time: 416.431 ms. 85 | Iteration: 82 train on batch time: 413.436 ms. 86 | Iteration: 83 train on batch time: 414.391 ms. 87 | Iteration: 84 train on batch time: 413.834 ms. 88 | Iteration: 85 train on batch time: 419.902 ms. 89 | Iteration: 86 train on batch time: 426.593 ms. 90 | Iteration: 87 train on batch time: 415.107 ms. 91 | Iteration: 88 train on batch time: 412.426 ms. 92 | Iteration: 89 train on batch time: 411.061 ms. 93 | Iteration: 90 train on batch time: 419.293 ms. 94 | Iteration: 91 train on batch time: 416.285 ms. 95 | Iteration: 92 train on batch time: 416.264 ms. 96 | Iteration: 93 train on batch time: 416.440 ms. 
97 | Iteration: 94 train on batch time: 414.160 ms. 98 | Iteration: 95 train on batch time: 411.973 ms. 99 | Iteration: 96 train on batch time: 408.613 ms. 100 | Iteration: 97 train on batch time: 413.174 ms. 101 | Iteration: 98 train on batch time: 417.960 ms. 102 | Iteration: 99 train on batch time: 413.599 ms. 103 | Batch size: 16 104 | Iterations: 100 105 | Time per iteration: 409.953 ms 106 | -------------------------------------------------------------------------------- /results/gtx_1080/benchmark_neon.output: -------------------------------------------------------------------------------- 1 | Epoch 0 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.16s] 2 | Epoch 1 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.06s] 3 | Epoch 2 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.15s] 4 | Epoch 3 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 5 | Epoch 4 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 6 | Epoch 5 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 7 | Epoch 6 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 8 | Epoch 7 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 9 | Epoch 8 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 10 | Epoch 9 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.14s] 11 | Epoch 10 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 12 | Epoch 11 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 13 | Epoch 12 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 14 | Epoch 13 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 15 | Epoch 14 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.15s] 16 | Epoch 15 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 17 | Epoch 16 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 18 | Epoch 17 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 19 | Epoch 18 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 20 | Epoch 19 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 21 | Epoch 20 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 22 | Epoch 21 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.15s] 23 | Epoch 22 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 24 | Epoch 23 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 25 | Epoch 24 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 26 | Epoch 25 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 27 | Epoch 26 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 28 | Epoch 27 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 29 | Epoch 28 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 30 | Epoch 29 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 31 | Epoch 30 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 32 | Epoch 31 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 33 | Epoch 32 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 34 | Epoch 33 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 35 | Epoch 34 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 36 | Epoch 35 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 37 | Epoch 36 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 38 | Epoch 37 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 39 | Epoch 38 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.15s] 40 | Epoch 39 [Train |████████████████████| 
1/1 batches, 6.87 cost, 0.15s] 41 | Epoch 40 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 42 | Epoch 41 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 43 | Epoch 42 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 44 | Epoch 43 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 45 | Epoch 44 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 46 | Epoch 45 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 47 | Epoch 46 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 48 | Epoch 47 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 49 | Epoch 48 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 50 | Epoch 49 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 51 | Epoch 50 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 52 | Epoch 51 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 53 | Epoch 52 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 54 | Epoch 53 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 55 | Epoch 54 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 56 | Epoch 55 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 57 | Epoch 56 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 58 | Epoch 57 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 59 | Epoch 58 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 60 | Epoch 59 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 61 | Epoch 60 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 62 | Epoch 61 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 63 | Epoch 62 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 64 | Epoch 63 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 65 | Epoch 64 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 66 | Epoch 65 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 67 | Epoch 66 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 68 | Epoch 67 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 69 | Epoch 68 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 70 | Epoch 69 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 71 | Epoch 70 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 72 | Epoch 71 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 73 | Epoch 72 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 74 | Epoch 73 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 75 | Epoch 74 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 76 | Epoch 75 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 77 | Epoch 76 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 78 | Epoch 77 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 79 | Epoch 78 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 80 | Epoch 79 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 81 | Epoch 80 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 82 | Epoch 81 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 83 | Epoch 82 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 84 | Epoch 83 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 85 | Epoch 84 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 86 | Epoch 85 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 87 | Epoch 86 [Train |████████████████████| 1/1 batches, 6.87 cost, 
0.15s] 88 | Epoch 87 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 89 | Epoch 88 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 90 | Epoch 89 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 91 | Epoch 90 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 92 | Epoch 91 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 93 | Epoch 92 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 94 | Epoch 93 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 95 | Epoch 94 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 96 | Epoch 95 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 97 | Epoch 96 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 98 | Epoch 97 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 99 | Epoch 98 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 100 | Epoch 99 [Train |████████████████████| 1/1 batches, 6.87 cost, 0.15s] 101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 164.527 ms 104 | 105 | real 0m18.269s 106 | user 0m15.268s 107 | sys 0m3.460s 108 | -------------------------------------------------------------------------------- /results/k520/INFO.md: -------------------------------------------------------------------------------- 1 | - Hardware: AWS EC2 g2.2xlarge (half of NVIDIA K520) 2 | - OS: Deep Learning AMI for Amazon Linux, v1.5 3 | - CUDA: 7.5 4 | - CuDNN: 5.1 5 | - NVIDIA drivers: 352.99 6 | - Caffe: 1.0.0-rc3 7 | - Keras: 1.0.8 8 | - Theano: 0.8.2.dev-127aa70c6d34421ba7808b23d8bea506dc878068 9 | - TensorFlow: 0.10.0 10 | - Neon: N/A 11 | - MXNet: 0.7.0 12 | - Python: 2.7.12 13 | -------------------------------------------------------------------------------- /results/k520/benchmark_caffe.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 17 30 0 0 0 0 324 324 4 | 0 17 30 0 0 0 0 324 324 5 | 0 17 30 0 0 0 0 324 324 6 | 0 17 30 0 0 0 0 324 324 7 | 0 17 30 0 0 0 0 324 324 8 | 0 17 30 0 0 0 0 324 324 9 | 0 17 30 0 0 0 0 324 324 10 | 0 17 30 0 0 0 0 324 324 11 | 0 17 29 0 0 0 0 324 324 12 | 0 45 30 0 0 0 0 2500 797 13 | 0 45 31 0 0 0 0 2500 797 14 | 0 44 31 0 0 0 0 2500 797 15 | 0 45 31 0 0 0 0 2500 797 16 | 0 44 31 0 0 0 0 2500 797 17 | 0 44 31 0 0 0 0 2500 797 18 | 0 45 31 0 0 0 0 2500 797 19 | 0 45 31 0 0 0 0 2500 797 20 | 0 44 32 0 0 0 0 2500 797 21 | 0 44 32 0 0 0 0 2500 797 22 | 0 44 32 0 0 0 0 2500 797 23 | 0 44 32 0 0 0 0 2500 797 24 | 0 44 32 0 0 0 0 2500 797 25 | 0 44 32 0 0 0 0 2500 797 26 | 0 44 32 0 0 0 0 2500 797 27 | 0 44 32 0 0 0 0 2500 797 28 | 0 44 32 0 0 0 0 2500 797 29 | 0 45 32 0 0 0 0 2500 797 30 | 0 45 32 0 0 0 0 2500 797 31 | 0 44 32 0 0 0 0 2500 797 32 | 0 44 32 0 0 0 0 2500 797 33 | 0 44 32 0 0 0 0 2500 797 34 | 0 44 32 0 0 0 0 2500 797 35 | 0 44 32 0 0 0 0 2500 797 36 | 0 45 32 0 0 0 0 2500 797 37 | 0 90 35 98 42 0 0 2500 797 38 | 0 82 35 99 32 0 0 2500 797 39 | 0 47 35 70 6 0 0 2500 797 40 | -------------------------------------------------------------------------------- /results/k520/benchmark_keras_tensorflow.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 45 55 0 0 0 0 2500 797 4 | 0 45 54 0 0 0 0 2500 797 5 | 0 46 53 6 0 0 0 2500 797 6 | 0 45 53 0 0 0 0 2500 797 7 | 0 46 53 0 0 0 0 2500 797 8 | 0 89 54 99 47 0 0 2500 797 9 | 0 96 55 99 34 0 0 2500 797 
10 | 0 74 54 99 34 0 0 2500 797 11 | 0 75 54 99 36 0 0 2500 797 12 | 0 75 54 99 36 0 0 2500 797 13 | 0 95 55 99 22 0 0 2500 797 14 | 0 74 54 99 31 0 0 2500 797 15 | 0 74 55 99 27 0 0 2500 797 16 | 0 45 53 0 0 0 0 2500 797 17 | 0 45 52 0 0 0 0 2500 797 18 | 0 45 52 0 0 0 0 2500 797 19 | 0 45 52 0 0 0 0 2500 797 20 | 0 45 52 0 0 0 0 2500 797 21 | 0 45 51 0 0 0 0 2500 797 22 | 0 45 51 0 0 0 0 2500 797 23 | 0 45 51 0 0 0 0 2500 797 24 | 0 45 50 0 0 0 0 2500 797 25 | 0 45 50 0 0 0 0 2500 797 26 | 0 88 52 99 35 0 0 2500 797 27 | 0 74 52 99 35 0 0 2500 797 28 | 0 74 52 100 35 0 0 2500 797 29 | 0 74 52 99 32 0 0 2500 797 30 | 0 74 52 99 36 0 0 2500 797 31 | 0 74 52 99 36 0 0 2500 797 32 | 0 74 52 99 34 0 0 2500 797 33 | 0 74 52 100 34 0 0 2500 797 34 | 0 79 52 99 34 0 0 2500 797 35 | 0 74 52 99 34 0 0 2500 797 36 | 0 86 52 99 8 0 0 2500 797 37 | -------------------------------------------------------------------------------- /results/k520/benchmark_keras_theano.output: -------------------------------------------------------------------------------- 1 | Using Theano backend. 2 | Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN 5103) 3 | /usr/local/lib/python2.7/site-packages/Theano-0.8.2-py2.7.egg/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5. 4 | warnings.warn(warn) 5 | Iteration: 0 train on batch time: 2444.628 ms. 6 | Iteration: 1 train on batch time: 2446.240 ms. 7 | Iteration: 2 train on batch time: 2443.903 ms. 8 | Iteration: 3 train on batch time: 2443.137 ms. 9 | Iteration: 4 train on batch time: 2442.147 ms. 10 | Iteration: 5 train on batch time: 2444.695 ms. 11 | Iteration: 6 train on batch time: 2444.683 ms. 12 | Iteration: 7 train on batch time: 2442.783 ms. 13 | Iteration: 8 train on batch time: 2444.892 ms. 14 | Iteration: 9 train on batch time: 2442.176 ms. 15 | Iteration: 10 train on batch time: 2441.531 ms. 16 | Iteration: 11 train on batch time: 2442.843 ms. 17 | Iteration: 12 train on batch time: 2443.993 ms. 18 | Iteration: 13 train on batch time: 2442.021 ms. 19 | Iteration: 14 train on batch time: 2442.236 ms. 20 | Iteration: 15 train on batch time: 2445.117 ms. 21 | Iteration: 16 train on batch time: 2443.993 ms. 22 | Iteration: 17 train on batch time: 2443.142 ms. 23 | Iteration: 18 train on batch time: 2442.801 ms. 24 | Iteration: 19 train on batch time: 2445.849 ms. 25 | Iteration: 20 train on batch time: 2446.833 ms. 26 | Iteration: 21 train on batch time: 2444.769 ms. 27 | Iteration: 22 train on batch time: 2446.223 ms. 28 | Iteration: 23 train on batch time: 2442.227 ms. 29 | Iteration: 24 train on batch time: 2444.088 ms. 30 | Iteration: 25 train on batch time: 2444.733 ms. 31 | Iteration: 26 train on batch time: 2443.682 ms. 32 | Iteration: 27 train on batch time: 2442.925 ms. 33 | Iteration: 28 train on batch time: 2444.028 ms. 34 | Iteration: 29 train on batch time: 2444.721 ms. 35 | Iteration: 30 train on batch time: 2446.014 ms. 36 | Iteration: 31 train on batch time: 2440.911 ms. 37 | Iteration: 32 train on batch time: 2443.289 ms. 38 | Iteration: 33 train on batch time: 2443.552 ms. 39 | Iteration: 34 train on batch time: 2445.217 ms. 40 | Iteration: 35 train on batch time: 2442.353 ms. 41 | Iteration: 36 train on batch time: 2442.496 ms. 42 | Iteration: 37 train on batch time: 2444.923 ms. 43 | Iteration: 38 train on batch time: 2448.871 ms. 44 | Iteration: 39 train on batch time: 2449.978 ms. 
45 | Iteration: 40 train on batch time: 2446.498 ms. 46 | Iteration: 41 train on batch time: 2447.461 ms. 47 | Iteration: 42 train on batch time: 2445.459 ms. 48 | Iteration: 43 train on batch time: 2444.839 ms. 49 | Iteration: 44 train on batch time: 2445.726 ms. 50 | Iteration: 45 train on batch time: 2450.397 ms. 51 | Iteration: 46 train on batch time: 2447.203 ms. 52 | Iteration: 47 train on batch time: 2444.968 ms. 53 | Iteration: 48 train on batch time: 2445.119 ms. 54 | Iteration: 49 train on batch time: 2446.987 ms. 55 | Iteration: 50 train on batch time: 2446.264 ms. 56 | Iteration: 51 train on batch time: 2443.741 ms. 57 | Iteration: 52 train on batch time: 2445.557 ms. 58 | Iteration: 53 train on batch time: 2448.646 ms. 59 | Iteration: 54 train on batch time: 2447.709 ms. 60 | Iteration: 55 train on batch time: 2447.929 ms. 61 | Iteration: 56 train on batch time: 2441.227 ms. 62 | Iteration: 57 train on batch time: 2447.513 ms. 63 | Iteration: 58 train on batch time: 2445.976 ms. 64 | Iteration: 59 train on batch time: 2448.064 ms. 65 | Iteration: 60 train on batch time: 2445.141 ms. 66 | Iteration: 61 train on batch time: 2447.166 ms. 67 | Iteration: 62 train on batch time: 2447.258 ms. 68 | Iteration: 63 train on batch time: 2445.542 ms. 69 | Iteration: 64 train on batch time: 2447.106 ms. 70 | Iteration: 65 train on batch time: 2446.711 ms. 71 | Iteration: 66 train on batch time: 2445.068 ms. 72 | Iteration: 67 train on batch time: 2444.910 ms. 73 | Iteration: 68 train on batch time: 2444.447 ms. 74 | Iteration: 69 train on batch time: 2445.627 ms. 75 | Iteration: 70 train on batch time: 2445.507 ms. 76 | Iteration: 71 train on batch time: 2448.621 ms. 77 | Iteration: 72 train on batch time: 2443.731 ms. 78 | Iteration: 73 train on batch time: 2447.027 ms. 79 | Iteration: 74 train on batch time: 2446.535 ms. 80 | Iteration: 75 train on batch time: 2444.343 ms. 81 | Iteration: 76 train on batch time: 2446.748 ms. 82 | Iteration: 77 train on batch time: 2446.792 ms. 83 | Iteration: 78 train on batch time: 2447.206 ms. 84 | Iteration: 79 train on batch time: 2444.551 ms. 85 | Iteration: 80 train on batch time: 2442.678 ms. 86 | Iteration: 81 train on batch time: 2442.499 ms. 87 | Iteration: 82 train on batch time: 2442.150 ms. 88 | Iteration: 83 train on batch time: 2450.410 ms. 89 | Iteration: 84 train on batch time: 2444.117 ms. 90 | Iteration: 85 train on batch time: 2444.983 ms. 91 | Iteration: 86 train on batch time: 2447.025 ms. 92 | Iteration: 87 train on batch time: 2445.948 ms. 93 | Iteration: 88 train on batch time: 2444.379 ms. 94 | Iteration: 89 train on batch time: 2444.040 ms. 95 | Iteration: 90 train on batch time: 2445.782 ms. 96 | Iteration: 91 train on batch time: 2445.015 ms. 97 | Iteration: 92 train on batch time: 2444.981 ms. 98 | Iteration: 93 train on batch time: 2445.117 ms. 99 | Iteration: 94 train on batch time: 2444.239 ms. 100 | Iteration: 95 train on batch time: 2445.872 ms. 101 | Iteration: 96 train on batch time: 2445.494 ms. 102 | Iteration: 97 train on batch time: 2446.789 ms. 103 | Iteration: 98 train on batch time: 2446.074 ms. 104 | Iteration: 99 train on batch time: 2450.346 ms. 
105 | Batch size: 16 106 | Iterations: 100 107 | Time per iteration: 2445.223 ms 108 | -------------------------------------------------------------------------------- /results/k520/benchmark_mxnet.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 17 22 0 0 0 0 324 324 4 | 0 17 22 0 0 0 0 324 324 5 | 0 17 22 0 0 0 0 324 324 6 | 0 17 22 0 0 0 0 324 324 7 | 0 17 22 0 0 0 0 324 324 8 | 0 17 22 0 0 0 0 324 324 9 | 0 17 22 0 0 0 0 324 324 10 | 0 17 22 0 0 0 0 324 324 11 | 0 17 22 0 0 0 0 324 324 12 | 0 17 22 0 0 0 0 324 324 13 | 0 17 22 0 0 0 0 324 324 14 | 0 17 22 0 0 0 0 324 324 15 | 0 17 22 0 0 0 0 324 324 16 | 0 17 22 0 0 0 0 324 324 17 | 0 17 22 0 0 0 0 324 324 18 | 0 17 22 0 0 0 0 324 324 19 | 0 17 22 0 0 0 0 324 324 20 | 0 17 22 0 0 0 0 324 324 21 | 0 17 22 0 0 0 0 324 324 22 | 0 17 22 0 0 0 0 324 324 23 | 0 17 22 0 0 0 0 324 324 24 | 0 17 22 0 0 0 0 324 324 25 | 0 17 22 0 0 0 0 324 324 26 | 0 17 22 0 0 0 0 324 324 27 | 0 17 22 0 0 0 0 324 324 28 | 0 17 22 0 0 0 0 324 324 29 | 0 17 22 0 0 0 0 324 324 30 | 0 17 22 0 0 0 0 324 324 31 | 0 17 22 0 0 0 0 324 324 32 | 0 17 22 0 0 0 0 324 324 33 | 0 17 22 0 0 0 0 324 324 34 | 0 17 22 0 0 0 0 324 324 35 | 0 17 22 0 0 0 0 324 324 36 | 0 17 22 0 0 0 0 324 324 37 | 0 17 22 0 0 0 0 324 324 38 | 0 17 22 0 0 0 0 324 324 39 | 0 17 22 0 0 0 0 324 324 40 | 0 44 23 0 0 0 0 2500 797 41 | 0 45 23 0 0 0 0 2500 797 42 | 0 45 24 0 0 0 0 2500 797 43 | 0 44 24 0 0 0 0 2500 797 44 | 0 44 24 0 0 0 0 2500 797 45 | 0 44 24 0 0 0 0 2500 797 46 | 0 44 24 0 0 0 0 2500 797 47 | # gpu pwr temp sm mem enc dec mclk pclk 48 | # Idx W C % % % % MHz MHz 49 | 0 45 25 0 0 0 0 2500 797 50 | 0 45 25 0 0 0 0 2500 797 51 | 0 44 25 0 0 0 0 2500 797 52 | 0 45 25 0 0 0 0 2500 797 53 | 0 44 25 0 0 0 0 2500 797 54 | 0 45 25 0 0 0 0 2500 797 55 | 0 45 26 0 0 0 0 2500 797 56 | 0 44 25 0 0 0 0 2500 797 57 | 0 44 26 0 0 0 0 2500 797 58 | 0 45 26 2 0 0 0 2500 797 59 | 0 45 26 0 0 0 0 2500 797 60 | 0 45 26 0 0 0 0 2500 797 61 | 0 45 26 0 0 0 0 2500 797 62 | 0 44 26 0 0 0 0 2500 797 63 | 0 44 26 0 0 0 0 2500 797 64 | 0 44 26 0 0 0 0 2500 797 65 | 0 45 27 0 0 0 0 2500 797 66 | 0 44 27 0 0 0 0 2500 797 67 | 0 44 27 0 0 0 0 2500 797 68 | 0 44 27 0 0 0 0 2500 797 69 | 0 44 27 0 0 0 0 2500 797 70 | 0 44 27 0 0 0 0 2500 797 71 | 0 44 27 0 0 0 0 2500 797 72 | 0 44 28 0 0 0 0 2500 797 73 | 0 44 27 0 0 0 0 2500 797 74 | 0 44 28 0 0 0 0 2500 797 75 | 0 44 28 0 0 0 0 2500 797 76 | 0 44 28 0 0 0 0 2500 797 77 | 0 44 28 0 0 0 0 2500 797 78 | 0 45 28 0 0 0 0 2500 797 79 | 0 44 28 0 0 0 0 2500 797 80 | 0 44 28 0 0 0 0 2500 797 81 | 0 45 28 0 0 0 0 2500 797 82 | -------------------------------------------------------------------------------- /results/k520/benchmark_mxnet.output: -------------------------------------------------------------------------------- 1 | [09:36:46] /home/ec2-user/src/mxnet/dmlc-core/include/dmlc/./logging.h:235: [09:36:46] src/storage/./pooled_storage_manager.h:79: cudaMalloc failed: out of memory 2 | [09:36:46] /home/ec2-user/src/mxnet/dmlc-core/include/dmlc/logging.h:235: [09:36:46] src/engine/./threaded_engine.h:306: [09:36:46] src/storage/./pooled_storage_manager.h:79: cudaMalloc failed: out of memory 3 | An fatal error occurred in asynchronous engine operation. If you do not know what caused this error, you can try set environment variable MXNET_ENGINE_TYPE to NaiveEngine and run with debugger (i.e. gdb). 
This will force all operations to be synchronous and backtrace will give you the series of calls that lead to this error. Remember to set MXNET_ENGINE_TYPE back to empty after debugging. 4 | terminate called after throwing an instance of 'dmlc::Error' 5 | what(): [09:36:46] src/engine/./threaded_engine.h:306: [09:36:46] src/storage/./pooled_storage_manager.h:79: cudaMalloc failed: out of memory 6 | An fatal error occurred in asynchronous engine operation. If you do not know what caused this error, you can try set environment variable MXNET_ENGINE_TYPE to NaiveEngine and run with debugger (i.e. gdb). This will force all operations to be synchronous and backtrace will give you the series of calls that lead to this error. Remember to set MXNET_ENGINE_TYPE back to empty after debugging. 7 | -------------------------------------------------------------------------------- /results/k520/benchmark_tensorflow.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 47 35 70 6 0 0 2500 797 4 | 0 44 34 0 0 0 0 2500 797 5 | 0 45 34 0 0 0 0 2500 797 6 | 0 54 35 62 45 0 0 2500 797 7 | 0 78 36 99 34 0 0 2500 797 8 | 0 108 37 99 23 0 0 2500 797 9 | 0 90 37 99 50 0 0 2500 797 10 | 0 78 37 99 36 0 0 2500 797 11 | 0 77 37 99 36 0 0 2500 797 12 | 0 78 38 99 35 0 0 2500 797 13 | 0 78 38 99 35 0 0 2500 797 14 | 0 78 38 99 36 0 0 2500 797 15 | 0 100 39 99 30 0 0 2500 797 16 | 0 86 39 99 35 0 0 2500 797 17 | 0 102 40 99 21 0 0 2500 797 18 | 0 100 40 99 26 0 0 2500 797 19 | 0 105 41 99 34 0 0 2500 797 20 | 0 99 40 99 26 0 0 2500 797 21 | 0 87 40 99 36 0 0 2500 797 22 | 0 101 40 99 32 0 0 2500 797 23 | 0 94 41 99 35 0 0 2500 797 24 | 0 97 41 99 35 0 0 2500 797 25 | 0 78 40 99 34 0 0 2500 797 26 | 0 91 42 99 8 0 0 2500 797 27 | 0 100 42 99 21 0 0 2500 797 28 | 0 96 42 99 36 0 0 2500 797 29 | 0 95 42 99 38 0 0 2500 797 30 | 0 91 42 99 34 0 0 2500 797 31 | 0 91 43 99 17 0 0 2500 797 32 | 0 89 43 99 34 0 0 2500 797 33 | 0 94 43 99 39 0 0 2500 797 34 | 0 92 44 99 12 0 0 2500 797 35 | 0 95 44 99 36 0 0 2500 797 36 | 0 98 44 99 39 0 0 2500 797 37 | 0 99 44 99 37 0 0 2500 797 38 | 0 93 44 99 29 0 0 2500 797 39 | 0 87 44 99 27 0 0 2500 797 40 | 0 87 45 99 43 0 0 2500 797 41 | 0 88 45 99 31 0 0 2500 797 42 | 0 97 45 99 40 0 0 2500 797 43 | 0 97 46 99 21 0 0 2500 797 44 | 0 100 45 99 36 0 0 2500 797 45 | 0 98 45 99 38 0 0 2500 797 46 | 0 89 46 99 33 0 0 2500 797 47 | # gpu pwr temp sm mem enc dec mclk pclk 48 | # Idx W C % % % % MHz MHz 49 | 0 91 46 99 17 0 0 2500 797 50 | 0 89 46 99 35 0 0 2500 797 51 | 0 88 46 99 39 0 0 2500 797 52 | 0 85 46 99 12 0 0 2500 797 53 | 0 95 46 99 36 0 0 2500 797 54 | 0 95 47 99 39 0 0 2500 797 55 | 0 95 47 99 37 0 0 2500 797 56 | 0 94 47 99 30 0 0 2500 797 57 | 0 94 47 99 25 0 0 2500 797 58 | 0 89 47 99 43 0 0 2500 797 59 | 0 87 48 99 32 0 0 2500 797 60 | 0 96 48 99 40 0 0 2500 797 61 | 0 87 48 99 20 0 0 2500 797 62 | 0 98 48 99 36 0 0 2500 797 63 | 0 93 48 99 38 0 0 2500 797 64 | 0 91 48 99 38 0 0 2500 797 65 | 0 92 48 99 35 0 0 2500 797 66 | 0 93 48 99 26 0 0 2500 797 67 | 0 87 48 99 41 0 0 2500 797 68 | 0 93 48 99 32 0 0 2500 797 69 | 0 96 49 99 41 0 0 2500 797 70 | 0 100 50 99 15 0 0 2500 797 71 | 0 95 49 99 36 0 0 2500 797 72 | 0 93 49 99 35 0 0 2500 797 73 | 0 89 49 99 34 0 0 2500 797 74 | 0 90 49 99 10 0 0 2500 797 75 | 0 89 49 99 30 0 0 2500 797 76 | 0 96 49 99 42 0 0 2500 797 77 | 0 90 50 99 15 0 0 2500 797 78 | 0 97 50 99 37 0 0 2500 797 79 | 0 96 50 99 38 0 0 2500 797 80 | 0 
97 50 99 38 0 0 2500 797 81 | 0 92 50 99 35 0 0 2500 797 82 | 0 89 50 99 26 0 0 2500 797 83 | 0 87 50 99 40 0 0 2500 797 84 | 0 92 50 99 32 0 0 2500 797 85 | 0 96 50 99 41 0 0 2500 797 86 | 0 95 51 99 14 0 0 2500 797 87 | 0 99 51 99 35 0 0 2500 797 88 | 0 91 51 99 36 0 0 2500 797 89 | 0 90 51 99 35 0 0 2500 797 90 | 0 90 51 99 9 0 0 2500 797 91 | 0 91 51 99 29 0 0 2500 797 92 | 0 86 51 99 43 0 0 2500 797 93 | # gpu pwr temp sm mem enc dec mclk pclk 94 | # Idx W C % % % % MHz MHz 95 | 0 86 51 99 16 0 0 2500 797 96 | 0 94 51 99 38 0 0 2500 797 97 | 0 97 52 99 39 0 0 2500 797 98 | 0 97 52 99 38 0 0 2500 797 99 | 0 94 52 99 36 0 0 2500 797 100 | 0 94 51 99 27 0 0 2500 797 101 | 0 90 52 99 39 0 0 2500 797 102 | 0 87 52 99 33 0 0 2500 797 103 | 0 96 52 99 41 0 0 2500 797 104 | 0 97 52 99 13 0 0 2500 797 105 | 0 97 52 99 36 0 0 2500 797 106 | 0 94 52 99 36 0 0 2500 797 107 | 0 91 52 99 35 0 0 2500 797 108 | 0 92 52 100 8 0 0 2500 797 109 | 0 91 52 99 28 0 0 2500 797 110 | 0 85 52 99 44 0 0 2500 797 111 | 0 94 52 99 16 0 0 2500 797 112 | 0 97 53 99 38 0 0 2500 797 113 | 0 98 53 99 39 0 0 2500 797 114 | 0 97 53 99 37 0 0 2500 797 115 | 0 94 53 99 37 0 0 2500 797 116 | 0 95 53 99 29 0 0 2500 797 117 | 0 91 53 99 11 0 0 2500 797 118 | 0 94 53 99 27 0 0 2500 797 119 | 0 96 53 99 44 0 0 2500 797 120 | 0 86 53 99 20 0 0 2500 797 121 | 0 97 53 99 39 0 0 2500 797 122 | 0 95 53 99 32 0 0 2500 797 123 | 0 95 53 99 36 0 0 2500 797 124 | 0 90 53 99 43 0 0 2500 797 125 | 0 93 54 99 32 0 0 2500 797 126 | 0 89 54 99 28 0 0 2500 797 127 | 0 93 53 99 36 0 0 2500 797 128 | 0 97 54 99 39 0 0 2500 797 129 | 0 96 54 99 9 0 0 2500 797 130 | 0 98 54 99 36 0 0 2500 797 131 | 0 94 53 99 37 0 0 2500 797 132 | 0 89 54 99 36 0 0 2500 797 133 | 0 90 54 99 13 0 0 2500 797 134 | 0 95 54 99 25 0 0 2500 797 135 | 0 87 54 99 44 0 0 2500 797 136 | 0 90 54 99 22 0 0 2500 797 137 | 0 93 54 99 39 0 0 2500 797 138 | 0 98 55 99 33 0 0 2500 797 139 | # gpu pwr temp sm mem enc dec mclk pclk 140 | # Idx W C % % % % MHz MHz 141 | 0 97 54 99 37 0 0 2500 797 142 | 0 91 54 99 44 0 0 2500 797 143 | 0 93 54 99 32 0 0 2500 797 144 | 0 91 54 99 28 0 0 2500 797 145 | 0 87 54 99 36 0 0 2500 797 146 | 0 95 55 99 39 0 0 2500 797 147 | 0 97 55 99 9 0 0 2500 797 148 | 0 98 55 99 35 0 0 2500 797 149 | 0 91 55 99 37 0 0 2500 797 150 | 0 93 55 99 36 0 0 2500 797 151 | 0 91 55 99 13 0 0 2500 797 152 | 0 89 55 99 24 0 0 2500 797 153 | 0 86 54 99 44 0 0 2500 797 154 | 0 91 55 99 22 0 0 2500 797 155 | 0 98 55 99 39 0 0 2500 797 156 | 0 98 56 99 33 0 0 2500 797 157 | 0 94 55 99 36 0 0 2500 797 158 | 0 96 55 99 44 0 0 2500 797 159 | 0 95 55 99 30 0 0 2500 797 160 | 0 91 55 99 28 0 0 2500 797 161 | 0 95 55 99 36 0 0 2500 797 162 | 0 94 55 99 39 0 0 2500 797 163 | 0 87 55 99 9 0 0 2500 797 164 | 0 96 55 99 36 0 0 2500 797 165 | 0 95 55 99 37 0 0 2500 797 166 | 0 90 55 99 36 0 0 2500 797 167 | 0 91 55 99 13 0 0 2500 797 168 | 0 93 55 99 28 0 0 2500 797 169 | 0 89 55 99 44 0 0 2500 797 170 | 0 94 55 99 21 0 0 2500 797 171 | 0 96 55 99 39 0 0 2500 797 172 | 0 98 56 99 8 0 0 2500 797 173 | 0 95 55 99 35 0 0 2500 797 174 | 0 93 56 99 35 0 0 2500 797 175 | 0 88 55 99 36 0 0 2500 797 176 | 0 91 55 99 19 0 0 2500 797 177 | 0 94 55 99 25 0 0 2500 797 178 | 0 96 55 99 43 0 0 2500 797 179 | 0 92 55 99 27 0 0 2500 797 180 | 0 91 55 99 40 0 0 2500 797 181 | 0 99 56 99 27 0 0 2500 797 182 | 0 93 56 99 36 0 0 2500 797 183 | 0 90 56 99 41 0 0 2500 797 184 | 0 93 55 99 34 0 0 2500 797 185 | # gpu pwr temp sm mem enc dec mclk pclk 186 | # Idx W C % % % % MHz MHz 187 | 0 91 56 99 23 
0 0 2500 797 188 | 0 89 56 99 34 0 0 2500 797 189 | 0 94 56 99 39 0 0 2500 797 190 | 0 96 56 99 9 0 0 2500 797 191 | 0 96 56 99 35 0 0 2500 797 192 | 0 94 56 99 34 0 0 2500 797 193 | 0 94 56 99 38 0 0 2500 797 194 | 0 92 56 99 20 0 0 2500 797 195 | 0 87 56 99 24 0 0 2500 797 196 | 0 87 55 99 43 0 0 2500 797 197 | 0 92 56 99 27 0 0 2500 797 198 | 0 90 56 99 40 0 0 2500 797 199 | 0 99 57 99 27 0 0 2500 797 200 | 0 95 56 99 36 0 0 2500 797 201 | 0 94 56 99 41 0 0 2500 797 202 | 0 91 56 99 34 0 0 2500 797 203 | 0 91 56 99 23 0 0 2500 797 204 | 0 85 56 99 34 0 0 2500 797 205 | 0 93 56 99 39 0 0 2500 797 206 | 0 88 56 99 8 0 0 2500 797 207 | 0 95 56 99 36 0 0 2500 797 208 | 0 94 56 99 35 0 0 2500 797 209 | 0 92 56 99 36 0 0 2500 797 210 | 0 92 56 99 20 0 0 2500 797 211 | 0 95 56 99 25 0 0 2500 797 212 | 0 89 56 99 43 0 0 2500 797 213 | 0 94 56 99 27 0 0 2500 797 214 | 0 98 56 99 40 0 0 2500 797 215 | 0 98 57 99 27 0 0 2500 797 216 | 0 95 56 99 36 0 0 2500 797 217 | 0 92 56 99 41 0 0 2500 797 218 | 0 91 56 99 34 0 0 2500 797 219 | 0 89 56 99 23 0 0 2500 797 220 | 0 97 56 99 34 0 0 2500 797 221 | 0 96 56 99 39 0 0 2500 797 222 | 0 87 56 99 9 0 0 2500 797 223 | 0 92 56 99 35 0 0 2500 797 224 | 0 99 57 99 35 0 0 2500 797 225 | 0 92 57 99 36 0 0 2500 797 226 | 0 91 57 99 39 0 0 2500 797 227 | 0 95 57 99 34 0 0 2500 797 228 | 0 87 57 99 18 0 0 2500 797 229 | 0 88 57 99 34 0 0 2500 797 230 | 0 94 57 99 39 0 0 2500 797 231 | # gpu pwr temp sm mem enc dec mclk pclk 232 | # Idx W C % % % % MHz MHz 233 | 0 95 57 99 10 0 0 2500 797 234 | 0 96 57 99 36 0 0 2500 797 235 | 0 93 57 99 38 0 0 2500 797 236 | 0 88 57 99 37 0 0 2500 797 237 | 0 92 57 99 28 0 0 2500 797 238 | 0 85 57 99 21 0 0 2500 797 239 | 0 88 56 99 42 0 0 2500 797 240 | 0 92 57 99 30 0 0 2500 797 241 | 0 93 57 99 40 0 0 2500 797 242 | 0 96 58 99 22 0 0 2500 797 243 | 0 95 57 99 36 0 0 2500 797 244 | 0 95 57 99 38 0 0 2500 797 245 | 0 92 57 99 34 0 0 2500 797 246 | 0 91 57 99 18 0 0 2500 797 247 | 0 89 57 99 34 0 0 2500 797 248 | 0 95 56 99 39 0 0 2500 797 249 | 0 88 57 99 11 0 0 2500 797 250 | 0 95 57 99 36 0 0 2500 797 251 | 0 95 57 99 39 0 0 2500 797 252 | 0 93 57 99 37 0 0 2500 797 253 | 0 92 57 99 28 0 0 2500 797 254 | 0 88 57 99 25 0 0 2500 797 255 | 0 87 57 99 43 0 0 2500 797 256 | 0 94 57 99 30 0 0 2500 797 257 | 0 100 57 99 40 0 0 2500 797 258 | 0 100 58 99 22 0 0 2500 797 259 | 0 96 57 99 36 0 0 2500 797 260 | 0 95 57 99 38 0 0 2500 797 261 | 0 89 57 99 34 0 0 2500 797 262 | 0 90 57 99 18 0 0 2500 797 263 | 0 93 57 99 34 0 0 2500 797 264 | 0 95 57 99 39 0 0 2500 797 265 | -------------------------------------------------------------------------------- /results/k520/benchmark_tensorflow.output: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 6 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 7 | I 
tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 8 | name: GRID K520 9 | major: 3 minor: 0 memoryClockRate (GHz) 0.797 10 | pciBusID 0000:00:03.0 11 | Total memory: 4.00GiB 12 | Free memory: 3.95GiB 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 15 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GRID K520, pci bus id: 0000:00:03.0) 16 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.52GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 17 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.29GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 18 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.19GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 19 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.19GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 20 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 21 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 22 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 23 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.15GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 24 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 25 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.15GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 26 | Iteration: 0 train on batch time: 2288.014 ms. 27 | Iteration: 1 train on batch time: 2290.354 ms. 28 | Iteration: 2 train on batch time: 2289.724 ms. 29 | Iteration: 3 train on batch time: 2290.773 ms. 30 | Iteration: 4 train on batch time: 2290.362 ms. 31 | Iteration: 5 train on batch time: 2291.452 ms. 32 | Iteration: 6 train on batch time: 2288.561 ms. 33 | Iteration: 7 train on batch time: 2292.294 ms. 34 | Iteration: 8 train on batch time: 2290.141 ms. 35 | Iteration: 9 train on batch time: 2289.341 ms. 36 | Iteration: 10 train on batch time: 2289.933 ms. 37 | Iteration: 11 train on batch time: 2290.004 ms. 38 | Iteration: 12 train on batch time: 2288.960 ms. 
39 | Iteration: 13 train on batch time: 2291.385 ms. 40 | Iteration: 14 train on batch time: 2289.223 ms. 41 | Iteration: 15 train on batch time: 2291.922 ms. 42 | Iteration: 16 train on batch time: 2290.300 ms. 43 | Iteration: 17 train on batch time: 2289.473 ms. 44 | Iteration: 18 train on batch time: 2290.086 ms. 45 | Iteration: 19 train on batch time: 2290.494 ms. 46 | Iteration: 20 train on batch time: 2291.832 ms. 47 | Iteration: 21 train on batch time: 2290.415 ms. 48 | Iteration: 22 train on batch time: 2289.739 ms. 49 | Iteration: 23 train on batch time: 2289.833 ms. 50 | Iteration: 24 train on batch time: 2289.924 ms. 51 | Iteration: 25 train on batch time: 2290.042 ms. 52 | Iteration: 26 train on batch time: 2290.865 ms. 53 | Iteration: 27 train on batch time: 2289.665 ms. 54 | Iteration: 28 train on batch time: 2291.388 ms. 55 | Iteration: 29 train on batch time: 2289.806 ms. 56 | Iteration: 30 train on batch time: 2290.459 ms. 57 | Iteration: 31 train on batch time: 2290.359 ms. 58 | Iteration: 32 train on batch time: 2290.220 ms. 59 | Iteration: 33 train on batch time: 2289.432 ms. 60 | Iteration: 34 train on batch time: 2289.253 ms. 61 | Iteration: 35 train on batch time: 2289.966 ms. 62 | Iteration: 36 train on batch time: 2290.815 ms. 63 | Iteration: 37 train on batch time: 2290.038 ms. 64 | Iteration: 38 train on batch time: 2289.975 ms. 65 | Iteration: 39 train on batch time: 2289.872 ms. 66 | Iteration: 40 train on batch time: 2290.819 ms. 67 | Iteration: 41 train on batch time: 2290.014 ms. 68 | Iteration: 42 train on batch time: 2290.018 ms. 69 | Iteration: 43 train on batch time: 2290.331 ms. 70 | Iteration: 44 train on batch time: 2291.564 ms. 71 | Iteration: 45 train on batch time: 2289.280 ms. 72 | Iteration: 46 train on batch time: 2290.379 ms. 73 | Iteration: 47 train on batch time: 2290.502 ms. 74 | Iteration: 48 train on batch time: 2291.871 ms. 75 | Iteration: 49 train on batch time: 2291.065 ms. 76 | Iteration: 50 train on batch time: 2289.859 ms. 77 | Iteration: 51 train on batch time: 2292.050 ms. 78 | Iteration: 52 train on batch time: 2290.174 ms. 79 | Iteration: 53 train on batch time: 2290.171 ms. 80 | Iteration: 54 train on batch time: 2291.706 ms. 81 | Iteration: 55 train on batch time: 2290.779 ms. 82 | Iteration: 56 train on batch time: 2290.103 ms. 83 | Iteration: 57 train on batch time: 2290.064 ms. 84 | Iteration: 58 train on batch time: 2291.862 ms. 85 | Iteration: 59 train on batch time: 2291.895 ms. 86 | Iteration: 60 train on batch time: 2291.257 ms. 87 | Iteration: 61 train on batch time: 2289.168 ms. 88 | Iteration: 62 train on batch time: 2288.342 ms. 89 | Iteration: 63 train on batch time: 2289.242 ms. 90 | Iteration: 64 train on batch time: 2289.082 ms. 91 | Iteration: 65 train on batch time: 2292.338 ms. 92 | Iteration: 66 train on batch time: 2292.939 ms. 93 | Iteration: 67 train on batch time: 2289.538 ms. 94 | Iteration: 68 train on batch time: 2291.592 ms. 95 | Iteration: 69 train on batch time: 2289.194 ms. 96 | Iteration: 70 train on batch time: 2291.248 ms. 97 | Iteration: 71 train on batch time: 2291.441 ms. 98 | Iteration: 72 train on batch time: 2290.788 ms. 99 | Iteration: 73 train on batch time: 2291.898 ms. 100 | Iteration: 74 train on batch time: 2290.556 ms. 101 | Iteration: 75 train on batch time: 2291.116 ms. 102 | Iteration: 76 train on batch time: 2291.694 ms. 103 | Iteration: 77 train on batch time: 2291.067 ms. 104 | Iteration: 78 train on batch time: 2291.529 ms. 
105 | Iteration: 79 train on batch time: 2289.126 ms. 106 | Iteration: 80 train on batch time: 2291.065 ms. 107 | Iteration: 81 train on batch time: 2290.981 ms. 108 | Iteration: 82 train on batch time: 2291.545 ms. 109 | Iteration: 83 train on batch time: 2289.720 ms. 110 | Iteration: 84 train on batch time: 2291.157 ms. 111 | Iteration: 85 train on batch time: 2291.588 ms. 112 | Iteration: 86 train on batch time: 2292.701 ms. 113 | Iteration: 87 train on batch time: 2288.769 ms. 114 | Iteration: 88 train on batch time: 2289.381 ms. 115 | Iteration: 89 train on batch time: 2292.850 ms. 116 | Iteration: 90 train on batch time: 2289.262 ms. 117 | Iteration: 91 train on batch time: 2291.040 ms. 118 | Iteration: 92 train on batch time: 2291.041 ms. 119 | Iteration: 93 train on batch time: 2291.432 ms. 120 | Iteration: 94 train on batch time: 2290.171 ms. 121 | Iteration: 95 train on batch time: 2291.687 ms. 122 | Iteration: 96 train on batch time: 2289.717 ms. 123 | Iteration: 97 train on batch time: 2291.111 ms. 124 | Iteration: 98 train on batch time: 2290.549 ms. 125 | Iteration: 99 train on batch time: 2289.340 ms. 126 | Batch size: 16 127 | Iterations: 100 128 | Time per iteration: 2290.514 ms 129 | -------------------------------------------------------------------------------- /results/k520/benchmark_tensorflow_slim.dmon: -------------------------------------------------------------------------------- 1 | # gpu pwr temp sm mem enc dec mclk pclk 2 | # Idx W C % % % % MHz MHz 3 | 0 45 55 35 12 0 0 2500 797 4 | 0 45 54 0 0 0 0 2500 797 5 | 0 45 54 0 0 0 0 2500 797 6 | 0 90 55 14 6 0 0 2500 797 7 | 0 78 55 99 35 0 0 2500 797 8 | 0 78 55 100 34 0 0 2500 797 9 | 0 87 56 99 34 0 0 2500 797 10 | 0 79 55 99 36 0 0 2500 797 11 | 0 79 55 99 35 0 0 2500 797 12 | 0 107 57 99 20 0 0 2500 797 13 | 0 95 56 99 25 0 0 2500 797 14 | 0 81 56 99 35 0 0 2500 797 15 | 0 86 55 99 35 0 0 2500 797 16 | 0 100 57 99 33 0 0 2500 797 17 | 0 97 56 99 34 0 0 2500 797 18 | 0 78 55 99 35 0 0 2500 797 19 | 0 100 56 99 35 0 0 2500 797 20 | 0 85 56 99 35 0 0 2500 797 21 | 0 79 55 100 35 0 0 2500 797 22 | 0 78 55 99 42 0 0 2500 797 23 | 0 78 55 99 34 0 0 2500 797 24 | 0 78 55 99 69 0 0 2500 797 25 | 0 78 56 99 28 0 0 2500 797 26 | 0 91 56 99 16 0 0 2500 797 27 | 0 96 56 95 45 0 0 2500 797 28 | 0 93 56 99 34 0 0 2500 797 29 | 0 91 56 99 28 0 0 2500 797 30 | 0 95 56 99 34 0 0 2500 797 31 | 0 92 56 99 43 0 0 2500 797 32 | 0 85 56 99 26 0 0 2500 797 33 | 0 97 56 99 40 0 0 2500 797 34 | 0 96 57 99 29 0 0 2500 797 35 | 0 95 56 99 36 0 0 2500 797 36 | 0 90 57 98 42 0 0 2500 797 37 | 0 96 57 99 33 0 0 2500 797 38 | 0 90 57 99 24 0 0 2500 797 39 | 0 90 56 99 36 0 0 2500 797 40 | 0 99 57 99 39 0 0 2500 797 41 | 0 93 57 99 8 0 0 2500 797 42 | 0 97 57 99 36 0 0 2500 797 43 | 0 93 57 99 35 0 0 2500 797 44 | 0 91 57 99 38 0 0 2500 797 45 | 0 90 56 99 17 0 0 2500 797 46 | 0 95 56 99 34 0 0 2500 797 47 | # gpu pwr temp sm mem enc dec mclk pclk 48 | # Idx W C % % % % MHz MHz 49 | 0 89 57 99 43 0 0 2500 797 50 | 0 90 57 99 26 0 0 2500 797 51 | 0 92 57 99 40 0 0 2500 797 52 | 0 99 57 99 29 0 0 2500 797 53 | 0 95 57 99 35 0 0 2500 797 54 | 0 90 57 98 42 0 0 2500 797 55 | 0 90 57 99 33 0 0 2500 797 56 | 0 91 57 99 24 0 0 2500 797 57 | 0 87 57 99 35 0 0 2500 797 58 | 0 95 57 99 39 0 0 2500 797 59 | 0 95 57 99 8 0 0 2500 797 60 | 0 98 57 99 36 0 0 2500 797 61 | 0 96 57 99 35 0 0 2500 797 62 | 0 98 57 99 36 0 0 2500 797 63 | 0 93 57 99 17 0 0 2500 797 64 | 0 85 57 99 34 0 0 2500 797 65 | 0 89 57 99 43 0 0 2500 797 66 | 0 91 57 99 25 0 0 2500 797 
67 | 0 100 57 99 40 0 0 2500 797 68 | 0 100 58 99 30 0 0 2500 797 69 | 0 99 58 99 36 0 0 2500 797 70 | 0 95 57 98 43 0 0 2500 797 71 | 0 90 57 99 34 0 0 2500 797 72 | 0 91 57 99 25 0 0 2500 797 73 | 0 95 57 99 36 0 0 2500 797 74 | 0 92 57 99 39 0 0 2500 797 75 | 0 85 58 99 9 0 0 2500 797 76 | 0 96 58 99 35 0 0 2500 797 77 | 0 95 58 99 36 0 0 2500 797 78 | 0 94 57 99 36 0 0 2500 797 79 | 0 91 57 99 16 0 0 2500 797 80 | 0 94 57 99 35 0 0 2500 797 81 | 0 89 57 99 43 0 0 2500 797 82 | 0 94 57 99 24 0 0 2500 797 83 | 0 98 58 99 40 0 0 2500 797 84 | 0 99 58 99 31 0 0 2500 797 85 | 0 98 58 99 36 0 0 2500 797 86 | 0 94 58 99 35 0 0 2500 797 87 | 0 89 57 99 36 0 0 2500 797 88 | 0 90 58 99 21 0 0 2500 797 89 | 0 94 58 99 33 0 0 2500 797 90 | 0 95 57 99 43 0 0 2500 797 91 | 0 92 57 99 27 0 0 2500 797 92 | 0 90 58 99 40 0 0 2500 797 93 | # gpu pwr temp sm mem enc dec mclk pclk 94 | # Idx W C % % % % MHz MHz 95 | 0 98 58 99 26 0 0 2500 797 96 | 0 96 58 99 36 0 0 2500 797 97 | 0 91 58 98 41 0 0 2500 797 98 | 0 92 58 99 34 0 0 2500 797 99 | 0 91 58 99 22 0 0 2500 797 100 | 0 86 58 99 34 0 0 2500 797 101 | 0 93 58 99 39 0 0 2500 797 102 | 0 95 58 99 8 0 0 2500 797 103 | 0 100 58 99 36 0 0 2500 797 104 | 0 95 58 99 34 0 0 2500 797 105 | 0 95 58 99 38 0 0 2500 797 106 | 0 92 58 99 19 0 0 2500 797 107 | 0 87 58 99 34 0 0 2500 797 108 | 0 87 58 99 43 0 0 2500 797 109 | 0 89 58 99 26 0 0 2500 797 110 | 0 93 58 99 40 0 0 2500 797 111 | 0 100 59 99 28 0 0 2500 797 112 | 0 97 58 99 36 0 0 2500 797 113 | 0 96 58 98 42 0 0 2500 797 114 | 0 93 58 99 34 0 0 2500 797 115 | 0 91 58 99 24 0 0 2500 797 116 | 0 86 58 99 36 0 0 2500 797 117 | 0 94 57 99 39 0 0 2500 797 118 | 0 86 58 99 8 0 0 2500 797 119 | 0 97 58 99 36 0 0 2500 797 120 | 0 94 58 99 36 0 0 2500 797 121 | 0 92 58 99 36 0 0 2500 797 122 | 0 92 58 99 17 0 0 2500 797 123 | 0 94 58 99 34 0 0 2500 797 124 | 0 89 58 99 43 0 0 2500 797 125 | 0 92 58 99 24 0 0 2500 797 126 | 0 97 58 99 39 0 0 2500 797 127 | 0 99 59 99 30 0 0 2500 797 128 | 0 96 58 99 37 0 0 2500 797 129 | 0 94 58 98 43 0 0 2500 797 130 | 0 91 58 99 34 0 0 2500 797 131 | 0 92 58 99 26 0 0 2500 797 132 | 0 91 58 99 37 0 0 2500 797 133 | 0 97 58 99 39 0 0 2500 797 134 | 0 90 58 99 9 0 0 2500 797 135 | 0 92 58 99 36 0 0 2500 797 136 | 0 97 59 99 38 0 0 2500 797 137 | 0 97 59 99 36 0 0 2500 797 138 | 0 92 58 99 13 0 0 2500 797 139 | # gpu pwr temp sm mem enc dec mclk pclk 140 | # Idx W C % % % % MHz MHz 141 | 0 87 59 99 34 0 0 2500 797 142 | 0 89 58 99 24 0 0 2500 797 143 | 0 90 58 99 36 0 0 2500 797 144 | 0 97 58 99 39 0 0 2500 797 145 | 0 98 59 99 9 0 0 2500 797 146 | 0 99 59 99 36 0 0 2500 797 147 | 0 97 58 99 36 0 0 2500 797 148 | 0 89 59 99 36 0 0 2500 797 149 | 0 92 58 99 17 0 0 2500 797 150 | 0 96 58 99 35 0 0 2500 797 151 | 0 89 58 99 43 0 0 2500 797 152 | 0 87 58 99 24 0 0 2500 797 153 | 0 92 59 99 39 0 0 2500 797 154 | 0 97 59 99 31 0 0 2500 797 155 | 0 95 59 99 37 0 0 2500 797 156 | 0 95 59 98 43 0 0 2500 797 157 | 0 98 58 99 33 0 0 2500 797 158 | 0 90 58 99 26 0 0 2500 797 159 | 0 95 58 99 37 0 0 2500 797 160 | 0 100 59 99 39 0 0 2500 797 161 | 0 91 59 99 9 0 0 2500 797 162 | 0 97 59 99 36 0 0 2500 797 163 | 0 94 59 99 37 0 0 2500 797 164 | 0 90 58 99 36 0 0 2500 797 165 | 0 90 58 99 14 0 0 2500 797 166 | 0 97 58 99 34 0 0 2500 797 167 | 0 89 58 99 44 0 0 2500 797 168 | 0 92 58 99 22 0 0 2500 797 169 | 0 94 59 99 39 0 0 2500 797 170 | 0 99 59 99 33 0 0 2500 797 171 | 0 93 59 99 36 0 0 2500 797 172 | 0 91 59 98 43 0 0 2500 797 173 | 0 91 58 99 34 0 0 2500 797 174 | 0 92 59 99 28 0 0 2500 797 
175 | 0 89 59 99 34 0 0 2500 797 176 | 0 96 59 99 39 0 0 2500 797 177 | 0 96 59 99 9 0 0 2500 797 178 | 0 96 59 99 36 0 0 2500 797 179 | 0 99 59 99 36 0 0 2500 797 180 | 0 93 59 99 35 0 0 2500 797 181 | 0 91 59 99 11 0 0 2500 797 182 | 0 88 59 99 35 0 0 2500 797 183 | 0 87 59 99 44 0 0 2500 797 184 | 0 92 59 99 17 0 0 2500 797 185 | # gpu pwr temp sm mem enc dec mclk pclk 186 | # Idx W C % % % % MHz MHz 187 | 0 100 59 99 39 0 0 2500 797 188 | 0 100 59 99 36 0 0 2500 797 189 | 0 95 59 99 37 0 0 2500 797 190 | 0 95 59 98 39 0 0 2500 797 191 | 0 92 59 99 34 0 0 2500 797 192 | 0 92 59 99 33 0 0 2500 797 193 | 0 97 59 99 33 0 0 2500 797 194 | 0 95 58 99 40 0 0 2500 797 195 | 0 84 59 99 21 0 0 2500 797 196 | 0 96 59 99 39 0 0 2500 797 197 | 0 94 59 99 33 0 0 2500 797 198 | 0 91 59 99 37 0 0 2500 797 199 | 0 92 59 98 43 0 0 2500 797 200 | 0 96 59 99 34 0 0 2500 797 201 | 0 89 59 99 28 0 0 2500 797 202 | 0 95 59 99 35 0 0 2500 797 203 | 0 97 59 99 39 0 0 2500 797 204 | 0 98 59 99 9 0 0 2500 797 205 | 0 97 59 99 36 0 0 2500 797 206 | 0 95 59 99 36 0 0 2500 797 207 | 0 88 58 99 35 0 0 2500 797 208 | 0 92 59 99 11 0 0 2500 797 209 | 0 95 59 99 34 0 0 2500 797 210 | 0 97 59 99 44 0 0 2500 797 211 | 0 88 58 99 18 0 0 2500 797 212 | 0 93 59 99 39 0 0 2500 797 213 | 0 96 59 99 35 0 0 2500 797 214 | 0 94 59 99 37 0 0 2500 797 215 | 0 92 59 98 40 0 0 2500 797 216 | 0 89 59 99 34 0 0 2500 797 217 | 0 91 59 99 32 0 0 2500 797 218 | 0 87 59 99 33 0 0 2500 797 219 | 0 95 59 99 39 0 0 2500 797 220 | 0 97 59 99 10 0 0 2500 797 221 | 0 98 59 99 35 0 0 2500 797 222 | 0 94 59 99 36 0 0 2500 797 223 | 0 89 59 99 35 0 0 2500 797 224 | 0 93 59 99 8 0 0 2500 797 225 | 0 93 59 99 34 0 0 2500 797 226 | 0 87 59 99 44 0 0 2500 797 227 | 0 88 59 99 17 0 0 2500 797 228 | 0 92 59 99 39 0 0 2500 797 229 | 0 98 59 99 38 0 0 2500 797 230 | 0 95 59 99 38 0 0 2500 797 231 | # gpu pwr temp sm mem enc dec mclk pclk 232 | # Idx W C % % % % MHz MHz 233 | 0 94 59 98 37 0 0 2500 797 234 | 0 96 59 99 34 0 0 2500 797 235 | 0 90 59 99 37 0 0 2500 797 236 | 0 91 59 99 34 0 0 2500 797 237 | 0 96 59 99 41 0 0 2500 797 238 | 0 88 59 99 12 0 0 2500 797 239 | 0 97 59 99 35 0 0 2500 797 240 | 0 94 59 99 36 0 0 2500 797 241 | 0 91 59 99 35 0 0 2500 797 242 | 0 92 59 99 8 0 0 2500 797 243 | 0 89 59 99 34 0 0 2500 797 244 | 0 89 59 99 44 0 0 2500 797 245 | 0 93 59 99 16 0 0 2500 797 246 | 0 96 59 99 38 0 0 2500 797 247 | 0 99 60 99 39 0 0 2500 797 248 | 0 94 59 99 38 0 0 2500 797 249 | 0 97 59 99 36 0 0 2500 797 250 | 0 90 59 99 35 0 0 2500 797 251 | 0 91 59 99 9 0 0 2500 797 252 | 0 89 59 99 35 0 0 2500 797 253 | 0 96 58 99 44 0 0 2500 797 254 | 0 93 59 99 17 0 0 2500 797 255 | 0 94 59 99 39 0 0 2500 797 256 | 0 97 59 99 38 0 0 2500 797 257 | 0 95 59 99 38 0 0 2500 797 258 | 0 93 59 98 37 0 0 2500 797 259 | 0 89 59 99 34 0 0 2500 797 260 | 0 89 59 99 37 0 0 2500 797 261 | 0 94 59 99 34 0 0 2500 797 262 | 0 99 59 99 41 0 0 2500 797 263 | -------------------------------------------------------------------------------- /results/k520/benchmark_tensorflow_slim.output: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library 
libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 6 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 7 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 8 | name: GRID K520 9 | major: 3 minor: 0 memoryClockRate (GHz) 0.797 10 | pciBusID 0000:00:03.0 11 | Total memory: 4.00GiB 12 | Free memory: 3.95GiB 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 15 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GRID K520, pci bus id: 0000:00:03.0) 16 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GRID K520, pci bus id: 0000:00:03.0) 17 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.52GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 18 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.29GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 19 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.19GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 20 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.19GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 21 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 22 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 23 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 24 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.15GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 25 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.20GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 26 | W tensorflow/core/common_runtime/bfc_allocator.cc:213] Ran out of memory trying to allocate 2.15GiB. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory is available. 
27 | Batch size: 16 28 | Iterations: 100 29 | Time per iteration: 2488.508 ms 30 | -------------------------------------------------------------------------------- /results/k80/INFO.md: -------------------------------------------------------------------------------- 1 | - Hardware: AWS EC2 p2.xlarge (half of NVIDIA K80) 2 | - OS: Deep Learning AMI for Amazon Linux, v1.5 3 | - CUDA: 7.5 4 | - CuDNN: 5.1 5 | - Caffe: 1.0.0-rc3 6 | - Keras: 1.0.8 7 | - Theano: 0.8.2.dev-127aa70c6d34421ba7808b23d8bea506dc878068 8 | - TensorFlow: 0.10.0 9 | - Neon: N/A 10 | - MXNet: 0.7.0 11 | - Python: 2.7.12 12 | -------------------------------------------------------------------------------- /results/k80/benchmark_keras_tensorflow.output: -------------------------------------------------------------------------------- 1 | Using TensorFlow backend. 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 6 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 7 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 8 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 9 | name: Tesla K80 10 | major: 3 minor: 7 memoryClockRate (GHz) 0.8235 11 | pciBusID 0000:00:1e.0 12 | Total memory: 11.25GiB 13 | Free memory: 11.13GiB 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 15 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 16 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0) 17 | Iteration: 0 train on batch time: 1017.236 ms. 18 | Iteration: 1 train on batch time: 1020.970 ms. 19 | Iteration: 2 train on batch time: 1018.325 ms. 20 | Iteration: 3 train on batch time: 1012.985 ms. 21 | Iteration: 4 train on batch time: 1020.162 ms. 22 | Iteration: 5 train on batch time: 1032.775 ms. 23 | Iteration: 6 train on batch time: 1014.535 ms. 24 | Iteration: 7 train on batch time: 1011.061 ms. 25 | Iteration: 8 train on batch time: 1105.583 ms. 26 | Iteration: 9 train on batch time: 1017.483 ms. 27 | Iteration: 10 train on batch time: 1018.831 ms. 28 | Iteration: 11 train on batch time: 1021.091 ms. 29 | Iteration: 12 train on batch time: 1026.713 ms. 30 | Iteration: 13 train on batch time: 1021.117 ms. 31 | Iteration: 14 train on batch time: 1013.455 ms. 32 | Iteration: 15 train on batch time: 1036.014 ms. 33 | Iteration: 16 train on batch time: 1023.874 ms. 34 | Iteration: 17 train on batch time: 1016.743 ms. 35 | Iteration: 18 train on batch time: 1019.208 ms. 36 | Iteration: 19 train on batch time: 1032.383 ms. 37 | Iteration: 20 train on batch time: 1016.591 ms. 38 | Iteration: 21 train on batch time: 1026.585 ms. 39 | Iteration: 22 train on batch time: 1039.200 ms. 40 | Iteration: 23 train on batch time: 1032.526 ms. 41 | Iteration: 24 train on batch time: 1026.185 ms. 42 | Iteration: 25 train on batch time: 1027.078 ms. 43 | Iteration: 26 train on batch time: 1026.375 ms. 
44 | Iteration: 27 train on batch time: 1030.565 ms. 45 | Iteration: 28 train on batch time: 1023.217 ms. 46 | Iteration: 29 train on batch time: 1038.791 ms. 47 | Iteration: 30 train on batch time: 1051.703 ms. 48 | Iteration: 31 train on batch time: 1028.012 ms. 49 | Iteration: 32 train on batch time: 1026.479 ms. 50 | Iteration: 33 train on batch time: 1025.240 ms. 51 | Iteration: 34 train on batch time: 1028.033 ms. 52 | Iteration: 35 train on batch time: 1024.030 ms. 53 | Iteration: 36 train on batch time: 1031.428 ms. 54 | Iteration: 37 train on batch time: 1047.575 ms. 55 | Iteration: 38 train on batch time: 1025.456 ms. 56 | Iteration: 39 train on batch time: 1024.458 ms. 57 | Iteration: 40 train on batch time: 1036.066 ms. 58 | Iteration: 41 train on batch time: 1030.012 ms. 59 | Iteration: 42 train on batch time: 1023.693 ms. 60 | Iteration: 43 train on batch time: 1034.144 ms. 61 | Iteration: 44 train on batch time: 1037.713 ms. 62 | Iteration: 45 train on batch time: 1023.031 ms. 63 | Iteration: 46 train on batch time: 1013.374 ms. 64 | Iteration: 47 train on batch time: 1040.580 ms. 65 | Iteration: 48 train on batch time: 1022.442 ms. 66 | Iteration: 49 train on batch time: 1017.749 ms. 67 | Iteration: 50 train on batch time: 1014.432 ms. 68 | Iteration: 51 train on batch time: 1027.870 ms. 69 | Iteration: 52 train on batch time: 1013.653 ms. 70 | Iteration: 53 train on batch time: 1013.838 ms. 71 | Iteration: 54 train on batch time: 1029.386 ms. 72 | Iteration: 55 train on batch time: 1013.163 ms. 73 | Iteration: 56 train on batch time: 1007.933 ms. 74 | Iteration: 57 train on batch time: 1009.042 ms. 75 | Iteration: 58 train on batch time: 1030.207 ms. 76 | Iteration: 59 train on batch time: 1013.684 ms. 77 | Iteration: 60 train on batch time: 1012.273 ms. 78 | Iteration: 61 train on batch time: 1124.486 ms. 79 | Iteration: 62 train on batch time: 1010.607 ms. 80 | Iteration: 63 train on batch time: 1007.886 ms. 81 | Iteration: 64 train on batch time: 1008.700 ms. 82 | Iteration: 65 train on batch time: 1022.307 ms. 83 | Iteration: 66 train on batch time: 1010.578 ms. 84 | Iteration: 67 train on batch time: 1006.925 ms. 85 | Iteration: 68 train on batch time: 1011.284 ms. 86 | Iteration: 69 train on batch time: 1021.331 ms. 87 | Iteration: 70 train on batch time: 1006.844 ms. 88 | Iteration: 71 train on batch time: 1008.393 ms. 89 | Iteration: 72 train on batch time: 1009.937 ms. 90 | Iteration: 73 train on batch time: 1008.786 ms. 91 | Iteration: 74 train on batch time: 1005.366 ms. 92 | Iteration: 75 train on batch time: 1011.294 ms. 93 | Iteration: 76 train on batch time: 1016.031 ms. 94 | Iteration: 77 train on batch time: 1009.339 ms. 95 | Iteration: 78 train on batch time: 1010.431 ms. 96 | Iteration: 79 train on batch time: 1095.084 ms. 97 | Iteration: 80 train on batch time: 1014.021 ms. 98 | Iteration: 81 train on batch time: 1007.499 ms. 99 | Iteration: 82 train on batch time: 1014.144 ms. 100 | Iteration: 83 train on batch time: 1010.009 ms. 101 | Iteration: 84 train on batch time: 1006.829 ms. 102 | Iteration: 85 train on batch time: 1004.976 ms. 103 | Iteration: 86 train on batch time: 1031.637 ms. 104 | Iteration: 87 train on batch time: 1008.206 ms. 105 | Iteration: 88 train on batch time: 1002.442 ms. 106 | Iteration: 89 train on batch time: 1005.437 ms. 107 | Iteration: 90 train on batch time: 1016.133 ms. 108 | Iteration: 91 train on batch time: 1007.179 ms. 109 | Iteration: 92 train on batch time: 1004.678 ms. 
110 | Iteration: 93 train on batch time: 1019.152 ms. 111 | Iteration: 94 train on batch time: 1010.556 ms. 112 | Iteration: 95 train on batch time: 1005.696 ms. 113 | Iteration: 96 train on batch time: 1006.815 ms. 114 | Iteration: 97 train on batch time: 1013.493 ms. 115 | Iteration: 98 train on batch time: 1011.643 ms. 116 | Iteration: 99 train on batch time: 1007.593 ms. 117 | Batch size: 16 118 | Iterations: 100 119 | Time per iteration: 1021.813 ms 120 | -------------------------------------------------------------------------------- /results/k80/benchmark_keras_theano.output: -------------------------------------------------------------------------------- 1 | Iteration: 0 train on batch time: 1156.279 ms. 2 | Iteration: 1 train on batch time: 1152.773 ms. 3 | Iteration: 2 train on batch time: 1143.487 ms. 4 | Iteration: 3 train on batch time: 1143.189 ms. 5 | Iteration: 4 train on batch time: 1156.002 ms. 6 | Iteration: 5 train on batch time: 1152.428 ms. 7 | Iteration: 6 train on batch time: 1142.399 ms. 8 | Iteration: 7 train on batch time: 1145.704 ms. 9 | Iteration: 8 train on batch time: 1138.460 ms. 10 | Iteration: 9 train on batch time: 1189.352 ms. 11 | Iteration: 10 train on batch time: 1143.049 ms. 12 | Iteration: 11 train on batch time: 1146.637 ms. 13 | Iteration: 12 train on batch time: 1167.725 ms. 14 | Iteration: 13 train on batch time: 1173.385 ms. 15 | Iteration: 14 train on batch time: 1145.851 ms. 16 | Iteration: 15 train on batch time: 1156.808 ms. 17 | Iteration: 16 train on batch time: 1160.264 ms. 18 | Iteration: 17 train on batch time: 1155.437 ms. 19 | Iteration: 18 train on batch time: 1136.610 ms. 20 | Iteration: 19 train on batch time: 1151.689 ms. 21 | Iteration: 20 train on batch time: 1148.285 ms. 22 | Iteration: 21 train on batch time: 1150.323 ms. 23 | Iteration: 22 train on batch time: 1136.111 ms. 24 | Iteration: 23 train on batch time: 1151.901 ms. 25 | Iteration: 24 train on batch time: 1138.016 ms. 26 | Iteration: 25 train on batch time: 1153.965 ms. 27 | Iteration: 26 train on batch time: 1127.675 ms. 28 | Iteration: 27 train on batch time: 1142.117 ms. 29 | Iteration: 28 train on batch time: 1151.639 ms. 30 | Iteration: 29 train on batch time: 1146.955 ms. 31 | Iteration: 30 train on batch time: 1130.169 ms. 32 | Iteration: 31 train on batch time: 1129.715 ms. 33 | Iteration: 32 train on batch time: 1142.414 ms. 34 | Iteration: 33 train on batch time: 1136.229 ms. 35 | Iteration: 34 train on batch time: 1134.486 ms. 36 | Iteration: 35 train on batch time: 1141.239 ms. 37 | Iteration: 36 train on batch time: 1132.407 ms. 38 | Iteration: 37 train on batch time: 1133.712 ms. 39 | Iteration: 38 train on batch time: 1207.620 ms. 40 | Iteration: 39 train on batch time: 1139.921 ms. 41 | Iteration: 40 train on batch time: 1127.063 ms. 42 | Iteration: 41 train on batch time: 1138.914 ms. 43 | Iteration: 42 train on batch time: 1121.130 ms. 44 | Iteration: 43 train on batch time: 1132.552 ms. 45 | Iteration: 44 train on batch time: 1140.805 ms. 46 | Iteration: 45 train on batch time: 1134.416 ms. 47 | Iteration: 46 train on batch time: 1119.847 ms. 48 | Iteration: 47 train on batch time: 1137.358 ms. 49 | Iteration: 48 train on batch time: 1143.902 ms. 50 | Iteration: 49 train on batch time: 1134.764 ms. 51 | Iteration: 50 train on batch time: 1119.605 ms. 52 | Iteration: 51 train on batch time: 1137.893 ms. 53 | Iteration: 52 train on batch time: 1124.636 ms. 54 | Iteration: 53 train on batch time: 1134.856 ms. 
55 | Iteration: 54 train on batch time: 1115.432 ms. 56 | Iteration: 55 train on batch time: 1129.295 ms. 57 | Iteration: 56 train on batch time: 1115.512 ms. 58 | Iteration: 57 train on batch time: 1149.882 ms. 59 | Iteration: 58 train on batch time: 1121.193 ms. 60 | Iteration: 59 train on batch time: 1124.862 ms. 61 | Iteration: 60 train on batch time: 1150.790 ms. 62 | Iteration: 61 train on batch time: 1128.271 ms. 63 | Iteration: 62 train on batch time: 1120.504 ms. 64 | Iteration: 63 train on batch time: 1135.152 ms. 65 | Iteration: 64 train on batch time: 1144.666 ms. 66 | Iteration: 65 train on batch time: 1132.896 ms. 67 | Iteration: 66 train on batch time: 1125.409 ms. 68 | Iteration: 67 train on batch time: 1138.033 ms. 69 | Iteration: 68 train on batch time: 1137.423 ms. 70 | Iteration: 69 train on batch time: 1141.092 ms. 71 | Iteration: 70 train on batch time: 1123.861 ms. 72 | Iteration: 71 train on batch time: 1132.350 ms. 73 | Iteration: 72 train on batch time: 1127.056 ms. 74 | Iteration: 73 train on batch time: 1141.376 ms. 75 | Iteration: 74 train on batch time: 1125.757 ms. 76 | Iteration: 75 train on batch time: 1131.033 ms. 77 | Iteration: 76 train on batch time: 1153.230 ms. 78 | Iteration: 77 train on batch time: 1135.386 ms. 79 | Iteration: 78 train on batch time: 1128.103 ms. 80 | Iteration: 79 train on batch time: 1158.865 ms. 81 | Iteration: 80 train on batch time: 1145.302 ms. 82 | Iteration: 81 train on batch time: 1140.393 ms. 83 | Iteration: 82 train on batch time: 1135.405 ms. 84 | Iteration: 83 train on batch time: 1157.408 ms. 85 | Iteration: 84 train on batch time: 1134.044 ms. 86 | Iteration: 85 train on batch time: 1145.173 ms. 87 | Iteration: 86 train on batch time: 1138.993 ms. 88 | Iteration: 87 train on batch time: 1140.910 ms. 89 | Iteration: 88 train on batch time: 1147.842 ms. 90 | Iteration: 89 train on batch time: 1144.074 ms. 91 | Iteration: 90 train on batch time: 1140.689 ms. 92 | Iteration: 91 train on batch time: 1142.731 ms. 93 | Iteration: 92 train on batch time: 1156.817 ms. 94 | Iteration: 93 train on batch time: 1145.158 ms. 95 | Iteration: 94 train on batch time: 1137.080 ms. 96 | Iteration: 95 train on batch time: 1164.166 ms. 97 | Iteration: 96 train on batch time: 1142.763 ms. 98 | Iteration: 97 train on batch time: 1140.657 ms. 99 | Iteration: 98 train on batch time: 1168.779 ms. 100 | Iteration: 99 train on batch time: 1161.280 ms. 
101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 1141.791 ms 104 | -------------------------------------------------------------------------------- /results/k80/benchmark_mxnet.output: -------------------------------------------------------------------------------- 1 | Batch size: 16 2 | Iterations: 400 3 | Time per iteration: 1247.469 ms 4 | -------------------------------------------------------------------------------- /results/k80/benchmark_neon.output: -------------------------------------------------------------------------------- 1 | Epoch 0 [Train |████████████████████| 1/1 batches, 6.91 cost, 130.02s] 2 | Epoch 1 [Train |████████████████████| 1/1 batches, 6.91 cost, 130.25s] 3 | Epoch 2 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.64s] 4 | Epoch 3 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.35s] 5 | Epoch 4 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.55s] 6 | Epoch 5 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.53s] 7 | Epoch 6 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.70s] 8 | Epoch 7 [Train |████████████████████| 1/1 batches, 6.91 cost, 129.68s] 9 | Epoch 8 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.45s] 10 | Epoch 9 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.41s] 11 | Epoch 10 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.48s] 12 | Epoch 11 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.21s] 13 | Epoch 12 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.64s] 14 | Epoch 13 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.41s] 15 | Epoch 14 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.79s] 16 | Epoch 15 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.61s] 17 | Epoch 16 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.91s] 18 | Epoch 17 [Train |████████████████████| 1/1 batches, 6.90 cost, 129.72s] 19 | -------------------------------------------------------------------------------- /results/k80/benchmark_tensorflow.output: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 6 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 7 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 8 | name: Tesla K80 9 | major: 3 minor: 7 memoryClockRate (GHz) 0.8235 10 | pciBusID 0000:00:1e.0 11 | Total memory: 11.25GiB 12 | Free memory: 11.13GiB 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 15 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0) 16 | Iteration: 0 train on batch time: 1085.995 ms. 17 | Iteration: 1 train on batch time: 1049.083 ms. 
18 | Iteration: 2 train on batch time: 1039.483 ms. 19 | Iteration: 3 train on batch time: 1051.917 ms. 20 | Iteration: 4 train on batch time: 1053.509 ms. 21 | Iteration: 5 train on batch time: 1046.565 ms. 22 | Iteration: 6 train on batch time: 1040.916 ms. 23 | Iteration: 7 train on batch time: 1063.469 ms. 24 | Iteration: 8 train on batch time: 1054.336 ms. 25 | Iteration: 9 train on batch time: 1048.346 ms. 26 | Iteration: 10 train on batch time: 1050.751 ms. 27 | Iteration: 11 train on batch time: 1052.788 ms. 28 | Iteration: 12 train on batch time: 1057.438 ms. 29 | Iteration: 13 train on batch time: 1054.737 ms. 30 | Iteration: 14 train on batch time: 1071.120 ms. 31 | Iteration: 15 train on batch time: 1061.348 ms. 32 | Iteration: 16 train on batch time: 1055.097 ms. 33 | Iteration: 17 train on batch time: 1082.748 ms. 34 | Iteration: 18 train on batch time: 1057.704 ms. 35 | Iteration: 19 train on batch time: 1050.902 ms. 36 | Iteration: 20 train on batch time: 1069.026 ms. 37 | Iteration: 21 train on batch time: 1064.463 ms. 38 | Iteration: 22 train on batch time: 1059.791 ms. 39 | Iteration: 23 train on batch time: 1063.129 ms. 40 | Iteration: 24 train on batch time: 1081.003 ms. 41 | Iteration: 25 train on batch time: 1065.987 ms. 42 | Iteration: 26 train on batch time: 1061.391 ms. 43 | Iteration: 27 train on batch time: 1075.312 ms. 44 | Iteration: 28 train on batch time: 1082.355 ms. 45 | Iteration: 29 train on batch time: 1066.022 ms. 46 | Iteration: 30 train on batch time: 1068.718 ms. 47 | Iteration: 31 train on batch time: 1084.667 ms. 48 | Iteration: 32 train on batch time: 1068.265 ms. 49 | Iteration: 33 train on batch time: 1065.237 ms. 50 | Iteration: 34 train on batch time: 1085.944 ms. 51 | Iteration: 35 train on batch time: 1071.550 ms. 52 | Iteration: 36 train on batch time: 1061.112 ms. 53 | Iteration: 37 train on batch time: 1072.375 ms. 54 | Iteration: 38 train on batch time: 1092.962 ms. 55 | Iteration: 39 train on batch time: 1081.078 ms. 56 | Iteration: 40 train on batch time: 1064.120 ms. 57 | Iteration: 41 train on batch time: 1103.913 ms. 58 | Iteration: 42 train on batch time: 1078.496 ms. 59 | Iteration: 43 train on batch time: 1065.931 ms. 60 | Iteration: 44 train on batch time: 1067.544 ms. 61 | Iteration: 45 train on batch time: 1091.250 ms. 62 | Iteration: 46 train on batch time: 1071.544 ms. 63 | Iteration: 47 train on batch time: 1060.247 ms. 64 | Iteration: 48 train on batch time: 1183.992 ms. 65 | Iteration: 49 train on batch time: 1064.452 ms. 66 | Iteration: 50 train on batch time: 1059.961 ms. 67 | Iteration: 51 train on batch time: 1064.271 ms. 68 | Iteration: 52 train on batch time: 1081.608 ms. 69 | Iteration: 53 train on batch time: 1052.056 ms. 70 | Iteration: 54 train on batch time: 1043.359 ms. 71 | Iteration: 55 train on batch time: 1065.859 ms. 72 | Iteration: 56 train on batch time: 1055.350 ms. 73 | Iteration: 57 train on batch time: 1047.651 ms. 74 | Iteration: 58 train on batch time: 1075.331 ms. 75 | Iteration: 59 train on batch time: 1046.515 ms. 76 | Iteration: 60 train on batch time: 1043.636 ms. 77 | Iteration: 61 train on batch time: 1049.837 ms. 78 | Iteration: 62 train on batch time: 1053.220 ms. 79 | Iteration: 63 train on batch time: 1056.382 ms. 80 | Iteration: 64 train on batch time: 1042.745 ms. 81 | Iteration: 65 train on batch time: 1051.990 ms. 82 | Iteration: 66 train on batch time: 1046.271 ms. 83 | Iteration: 67 train on batch time: 1037.513 ms. 84 | Iteration: 68 train on batch time: 1031.894 ms. 
85 | Iteration: 69 train on batch time: 1052.511 ms. 86 | Iteration: 70 train on batch time: 1050.446 ms. 87 | Iteration: 71 train on batch time: 1024.116 ms. 88 | Iteration: 72 train on batch time: 1147.200 ms. 89 | Iteration: 73 train on batch time: 1029.651 ms. 90 | Iteration: 74 train on batch time: 1029.550 ms. 91 | Iteration: 75 train on batch time: 1048.105 ms. 92 | Iteration: 76 train on batch time: 1050.782 ms. 93 | Iteration: 77 train on batch time: 1024.452 ms. 94 | Iteration: 78 train on batch time: 1027.308 ms. 95 | Iteration: 79 train on batch time: 1042.816 ms. 96 | Iteration: 80 train on batch time: 1043.986 ms. 97 | Iteration: 81 train on batch time: 1033.075 ms. 98 | Iteration: 82 train on batch time: 1048.731 ms. 99 | Iteration: 83 train on batch time: 1048.694 ms. 100 | Iteration: 84 train on batch time: 1021.870 ms. 101 | Iteration: 85 train on batch time: 1025.438 ms. 102 | Iteration: 86 train on batch time: 1051.195 ms. 103 | Iteration: 87 train on batch time: 1040.411 ms. 104 | Iteration: 88 train on batch time: 1023.682 ms. 105 | Iteration: 89 train on batch time: 1043.288 ms. 106 | Iteration: 90 train on batch time: 1042.701 ms. 107 | Iteration: 91 train on batch time: 1022.932 ms. 108 | Iteration: 92 train on batch time: 1045.106 ms. 109 | Iteration: 93 train on batch time: 1050.785 ms. 110 | Iteration: 94 train on batch time: 1037.357 ms. 111 | Iteration: 95 train on batch time: 1025.389 ms. 112 | Iteration: 96 train on batch time: 1038.959 ms. 113 | Iteration: 97 train on batch time: 1045.040 ms. 114 | Iteration: 98 train on batch time: 1033.745 ms. 115 | Iteration: 99 train on batch time: 1035.373 ms. 116 | Batch size: 16 117 | Iterations: 100 118 | Time per iteration: 1057.119 ms 119 | -------------------------------------------------------------------------------- /results/k80/benchmark_tensorflow_slim.output: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.so.7.5 locally 2 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.so.5 locally 3 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.so.7.5 locally 4 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcurand.so.7.5 locally 6 | I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 7 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 8 | name: Tesla K80 9 | major: 3 minor: 7 memoryClockRate (GHz) 0.8235 10 | pciBusID 0000:00:1e.0 11 | Total memory: 11.25GiB 12 | Free memory: 11.13GiB 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 14 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 15 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0) 16 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0) 17 | Batch size: 16 18 | Iterations: 100 19 | Time per iteration: 1126.704 ms 20 | -------------------------------------------------------------------------------- 
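The Maxwell Titan X results below were collected under thermal control: the fan was pinned to 100% and each test was started at a GPU temperature of 45 °C. A minimal sketch of how such a pre-run setup could be scripted is shown here; it is not part of this repo, and the `nvidia-settings` attribute names are assumptions that vary by driver version (manual fan control also requires an X server with the Coolbits option enabled):

```
# Hypothetical pre-run helper (not in this repo): pin the fan to 100%, then wait
# for the GPU to cool back down to the 45 C baseline before launching a benchmark.
nvidia-settings -a "[gpu:0]/GPUFanControlState=1" \
                -a "[fan:0]/GPUTargetFanSpeed=100"

# Poll the GPU temperature until it reaches the baseline.
while [ "$(nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits)" -gt 45 ]; do
  sleep 5
done
```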
/results/maxwell_titan_x/INFO.md: -------------------------------------------------------------------------------- 1 | ## Results Maxwell Titan X 2 | 3 | The minibatch size is 16. Because speed varied with GPU temperature, the fan speed was fixed at 100% and every test was started at a GPU temperature of 45 °C. 4 | 5 | | Framework | Time (per minibatch) | 6 | |:---|:---| 7 | | Neon | 207.406 ms | 8 | | Torch (1) | 273.542 ms | 9 | | Caffe (2) | 311.061 ms | 10 | | Keras (TensorFlow) | 360.753 ms | 11 | | Keras (Theano) | 317.298 ms | 12 | | TensorFlow | 332.27 ms | 13 | | TensorFlow (slim) (3) | 370.89 ms | 14 | | MXNet | 324.635 ms | 15 | 16 | (1) The code in https://github.com/jcjohnson/cnn-benchmarks was re-run with 100 iterations, 100% GPU fan speed, and a starting temperature of 45 °C. 17 | 18 | (2) The time is for a complete SGD step including parameter updates, not just the forward+backward time. 19 | 20 | (3) Uses the built-in slim training function, which likely adds some overhead. 21 | 22 | - Hardware: Titan X Maxwell 23 | - OS: Ubuntu 14.04.3 LTS 24 | - CUDA: 8.0 (cuda-repo-ubuntu1404-8-0-rc_8.0.27-1_amd64.deb) 25 | - CuDNN: 5.0 (cudnn-8.0-linux-x64-v5.0-ga.tgz) 26 | - Caffe: b2982c7 (from source) 27 | - Keras: 1.0.8 28 | - Theano: 0.9.0dev2.dev-338384adeabd2a56ccae22a9f1105a9f82ce9b8f 29 | - TensorFlow: 2a6d751 (from source) 30 | - Neon: 1.5.4 (485033c) 31 | - MXNet: 066373a (from source) 32 | - Python: 2.7.6 33 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_keras_tensorflow.output: -------------------------------------------------------------------------------- 1 | Iteration: 0 train on batch time: 361.193 ms. 2 | Iteration: 1 train on batch time: 362.416 ms. 3 | Iteration: 2 train on batch time: 358.185 ms. 4 | Iteration: 3 train on batch time: 376.177 ms. 5 | Iteration: 4 train on batch time: 382.268 ms. 6 | Iteration: 5 train on batch time: 356.856 ms. 7 | Iteration: 6 train on batch time: 374.421 ms. 8 | Iteration: 7 train on batch time: 362.809 ms. 9 | Iteration: 8 train on batch time: 355.884 ms. 10 | Iteration: 9 train on batch time: 361.131 ms. 11 | Iteration: 10 train on batch time: 387.957 ms. 12 | Iteration: 11 train on batch time: 379.649 ms. 13 | Iteration: 12 train on batch time: 357.392 ms. 14 | Iteration: 13 train on batch time: 357.279 ms. 15 | Iteration: 14 train on batch time: 358.012 ms. 16 | Iteration: 15 train on batch time: 356.508 ms. 17 | Iteration: 16 train on batch time: 357.140 ms. 18 | Iteration: 17 train on batch time: 358.781 ms. 19 | Iteration: 18 train on batch time: 358.696 ms. 20 | Iteration: 19 train on batch time: 358.779 ms. 21 | Iteration: 20 train on batch time: 357.467 ms. 22 | Iteration: 21 train on batch time: 357.330 ms. 23 | Iteration: 22 train on batch time: 356.411 ms. 24 | Iteration: 23 train on batch time: 358.103 ms. 25 | Iteration: 24 train on batch time: 355.624 ms. 26 | Iteration: 25 train on batch time: 354.981 ms. 27 | Iteration: 26 train on batch time: 357.243 ms. 28 | Iteration: 27 train on batch time: 357.966 ms. 29 | Iteration: 28 train on batch time: 358.989 ms. 30 | Iteration: 29 train on batch time: 356.666 ms. 31 | Iteration: 30 train on batch time: 356.416 ms. 32 | Iteration: 31 train on batch time: 356.045 ms. 33 | Iteration: 32 train on batch time: 358.270 ms. 34 | Iteration: 33 train on batch time: 357.858 ms. 35 | Iteration: 34 train on batch time: 358.622 ms. 36 | Iteration: 35 train on batch time: 356.075 ms.
37 | Iteration: 36 train on batch time: 356.267 ms. 38 | Iteration: 37 train on batch time: 357.824 ms. 39 | Iteration: 38 train on batch time: 357.307 ms. 40 | Iteration: 39 train on batch time: 357.824 ms. 41 | Iteration: 40 train on batch time: 355.557 ms. 42 | Iteration: 41 train on batch time: 357.795 ms. 43 | Iteration: 42 train on batch time: 355.751 ms. 44 | Iteration: 43 train on batch time: 356.666 ms. 45 | Iteration: 44 train on batch time: 356.345 ms. 46 | Iteration: 45 train on batch time: 356.759 ms. 47 | Iteration: 46 train on batch time: 357.225 ms. 48 | Iteration: 47 train on batch time: 358.491 ms. 49 | Iteration: 48 train on batch time: 356.588 ms. 50 | Iteration: 49 train on batch time: 356.175 ms. 51 | Iteration: 50 train on batch time: 358.338 ms. 52 | Iteration: 51 train on batch time: 357.325 ms. 53 | Iteration: 52 train on batch time: 357.810 ms. 54 | Iteration: 53 train on batch time: 357.431 ms. 55 | Iteration: 54 train on batch time: 357.332 ms. 56 | Iteration: 55 train on batch time: 358.000 ms. 57 | Iteration: 56 train on batch time: 357.014 ms. 58 | Iteration: 57 train on batch time: 359.220 ms. 59 | Iteration: 58 train on batch time: 355.329 ms. 60 | Iteration: 59 train on batch time: 356.514 ms. 61 | Iteration: 60 train on batch time: 356.627 ms. 62 | Iteration: 61 train on batch time: 358.517 ms. 63 | Iteration: 62 train on batch time: 356.074 ms. 64 | Iteration: 63 train on batch time: 356.139 ms. 65 | Iteration: 64 train on batch time: 356.988 ms. 66 | Iteration: 65 train on batch time: 355.899 ms. 67 | Iteration: 66 train on batch time: 357.607 ms. 68 | Iteration: 67 train on batch time: 358.560 ms. 69 | Iteration: 68 train on batch time: 357.264 ms. 70 | Iteration: 69 train on batch time: 358.806 ms. 71 | Iteration: 70 train on batch time: 357.964 ms. 72 | Iteration: 71 train on batch time: 357.494 ms. 73 | Iteration: 72 train on batch time: 356.260 ms. 74 | Iteration: 73 train on batch time: 357.511 ms. 75 | Iteration: 74 train on batch time: 357.707 ms. 76 | Iteration: 75 train on batch time: 358.153 ms. 77 | Iteration: 76 train on batch time: 357.256 ms. 78 | Iteration: 77 train on batch time: 357.494 ms. 79 | Iteration: 78 train on batch time: 358.091 ms. 80 | Iteration: 79 train on batch time: 356.644 ms. 81 | Iteration: 80 train on batch time: 357.613 ms. 82 | Iteration: 81 train on batch time: 357.086 ms. 83 | Iteration: 82 train on batch time: 358.372 ms. 84 | Iteration: 83 train on batch time: 361.066 ms. 85 | Iteration: 84 train on batch time: 367.126 ms. 86 | Iteration: 85 train on batch time: 388.391 ms. 87 | Iteration: 86 train on batch time: 366.225 ms. 88 | Iteration: 87 train on batch time: 364.328 ms. 89 | Iteration: 88 train on batch time: 357.749 ms. 90 | Iteration: 89 train on batch time: 361.414 ms. 91 | Iteration: 90 train on batch time: 381.775 ms. 92 | Iteration: 91 train on batch time: 378.080 ms. 93 | Iteration: 92 train on batch time: 382.169 ms. 94 | Iteration: 93 train on batch time: 367.850 ms. 95 | Iteration: 94 train on batch time: 381.222 ms. 96 | Iteration: 95 train on batch time: 358.042 ms. 97 | Iteration: 96 train on batch time: 361.979 ms. 98 | Iteration: 97 train on batch time: 358.980 ms. 99 | Iteration: 98 train on batch time: 376.588 ms. 100 | Iteration: 99 train on batch time: 373.197 ms. 
101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 360.753 ms 104 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_keras_theano.output: -------------------------------------------------------------------------------- 1 | Iteration: 0 train on batch time: 316.571 ms. 2 | Iteration: 1 train on batch time: 317.315 ms. 3 | Iteration: 2 train on batch time: 317.677 ms. 4 | Iteration: 3 train on batch time: 317.544 ms. 5 | Iteration: 4 train on batch time: 317.125 ms. 6 | Iteration: 5 train on batch time: 316.060 ms. 7 | Iteration: 6 train on batch time: 317.397 ms. 8 | Iteration: 7 train on batch time: 316.203 ms. 9 | Iteration: 8 train on batch time: 315.844 ms. 10 | Iteration: 9 train on batch time: 318.185 ms. 11 | Iteration: 10 train on batch time: 318.545 ms. 12 | Iteration: 11 train on batch time: 317.864 ms. 13 | Iteration: 12 train on batch time: 317.422 ms. 14 | Iteration: 13 train on batch time: 318.078 ms. 15 | Iteration: 14 train on batch time: 316.676 ms. 16 | Iteration: 15 train on batch time: 316.480 ms. 17 | Iteration: 16 train on batch time: 317.987 ms. 18 | Iteration: 17 train on batch time: 316.802 ms. 19 | Iteration: 18 train on batch time: 317.709 ms. 20 | Iteration: 19 train on batch time: 318.014 ms. 21 | Iteration: 20 train on batch time: 317.708 ms. 22 | Iteration: 21 train on batch time: 317.348 ms. 23 | Iteration: 22 train on batch time: 318.015 ms. 24 | Iteration: 23 train on batch time: 316.362 ms. 25 | Iteration: 24 train on batch time: 316.876 ms. 26 | Iteration: 25 train on batch time: 317.187 ms. 27 | Iteration: 26 train on batch time: 316.574 ms. 28 | Iteration: 27 train on batch time: 317.994 ms. 29 | Iteration: 28 train on batch time: 316.957 ms. 30 | Iteration: 29 train on batch time: 316.761 ms. 31 | Iteration: 30 train on batch time: 316.354 ms. 32 | Iteration: 31 train on batch time: 317.619 ms. 33 | Iteration: 32 train on batch time: 317.344 ms. 34 | Iteration: 33 train on batch time: 316.610 ms. 35 | Iteration: 34 train on batch time: 316.180 ms. 36 | Iteration: 35 train on batch time: 317.139 ms. 37 | Iteration: 36 train on batch time: 316.756 ms. 38 | Iteration: 37 train on batch time: 316.698 ms. 39 | Iteration: 38 train on batch time: 316.859 ms. 40 | Iteration: 39 train on batch time: 316.749 ms. 41 | Iteration: 40 train on batch time: 316.636 ms. 42 | Iteration: 41 train on batch time: 318.698 ms. 43 | Iteration: 42 train on batch time: 317.036 ms. 44 | Iteration: 43 train on batch time: 316.692 ms. 45 | Iteration: 44 train on batch time: 317.117 ms. 46 | Iteration: 45 train on batch time: 317.050 ms. 47 | Iteration: 46 train on batch time: 316.263 ms. 48 | Iteration: 47 train on batch time: 319.322 ms. 49 | Iteration: 48 train on batch time: 318.018 ms. 50 | Iteration: 49 train on batch time: 316.385 ms. 51 | Iteration: 50 train on batch time: 318.187 ms. 52 | Iteration: 51 train on batch time: 317.213 ms. 53 | Iteration: 52 train on batch time: 317.435 ms. 54 | Iteration: 53 train on batch time: 317.183 ms. 55 | Iteration: 54 train on batch time: 317.591 ms. 56 | Iteration: 55 train on batch time: 317.128 ms. 57 | Iteration: 56 train on batch time: 317.527 ms. 58 | Iteration: 57 train on batch time: 317.905 ms. 59 | Iteration: 58 train on batch time: 316.414 ms. 60 | Iteration: 59 train on batch time: 317.886 ms. 61 | Iteration: 60 train on batch time: 316.230 ms. 62 | Iteration: 61 train on batch time: 317.368 ms. 
63 | Iteration: 62 train on batch time: 316.102 ms. 64 | Iteration: 63 train on batch time: 317.860 ms. 65 | Iteration: 64 train on batch time: 316.927 ms. 66 | Iteration: 65 train on batch time: 317.274 ms. 67 | Iteration: 66 train on batch time: 316.880 ms. 68 | Iteration: 67 train on batch time: 317.347 ms. 69 | Iteration: 68 train on batch time: 317.733 ms. 70 | Iteration: 69 train on batch time: 318.467 ms. 71 | Iteration: 70 train on batch time: 318.170 ms. 72 | Iteration: 71 train on batch time: 316.283 ms. 73 | Iteration: 72 train on batch time: 319.012 ms. 74 | Iteration: 73 train on batch time: 317.762 ms. 75 | Iteration: 74 train on batch time: 316.997 ms. 76 | Iteration: 75 train on batch time: 316.797 ms. 77 | Iteration: 76 train on batch time: 317.759 ms. 78 | Iteration: 77 train on batch time: 316.534 ms. 79 | Iteration: 78 train on batch time: 316.722 ms. 80 | Iteration: 79 train on batch time: 316.897 ms. 81 | Iteration: 80 train on batch time: 316.676 ms. 82 | Iteration: 81 train on batch time: 317.529 ms. 83 | Iteration: 82 train on batch time: 318.468 ms. 84 | Iteration: 83 train on batch time: 317.519 ms. 85 | Iteration: 84 train on batch time: 317.008 ms. 86 | Iteration: 85 train on batch time: 318.626 ms. 87 | Iteration: 86 train on batch time: 316.383 ms. 88 | Iteration: 87 train on batch time: 317.032 ms. 89 | Iteration: 88 train on batch time: 317.510 ms. 90 | Iteration: 89 train on batch time: 317.324 ms. 91 | Iteration: 90 train on batch time: 317.124 ms. 92 | Iteration: 91 train on batch time: 318.425 ms. 93 | Iteration: 92 train on batch time: 316.791 ms. 94 | Iteration: 93 train on batch time: 317.963 ms. 95 | Iteration: 94 train on batch time: 317.323 ms. 96 | Iteration: 95 train on batch time: 318.413 ms. 97 | Iteration: 96 train on batch time: 317.258 ms. 98 | Iteration: 97 train on batch time: 316.941 ms. 99 | Iteration: 98 train on batch time: 317.517 ms. 100 | Iteration: 99 train on batch time: 318.043 ms. 
101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 317.298 ms 104 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_mxnet.output: -------------------------------------------------------------------------------- 1 | Batch size: 16 2 | Iterations: 400 3 | Time per iteration: 324.635 ms 4 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_neon.output: -------------------------------------------------------------------------------- 1 | Epoch 0 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.22s] 2 | Epoch 1 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.08s] 3 | Epoch 2 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.19s] 4 | Epoch 3 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.19s] 5 | Epoch 4 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.19s] 6 | Epoch 5 [Train |████████████████████| 1/1 batches, 6.91 cost, 0.19s] 7 | Epoch 6 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 8 | Epoch 7 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 9 | Epoch 8 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 10 | Epoch 9 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 11 | Epoch 10 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 12 | Epoch 11 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 13 | Epoch 12 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 14 | Epoch 13 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 15 | Epoch 14 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 16 | Epoch 15 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 17 | Epoch 16 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 18 | Epoch 17 [Train |████████████████████| 1/1 batches, 6.90 cost, 0.19s] 19 | Epoch 18 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 20 | Epoch 19 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 21 | Epoch 20 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 22 | Epoch 21 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 23 | Epoch 22 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 24 | Epoch 23 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 25 | Epoch 24 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 26 | Epoch 25 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 27 | Epoch 26 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 28 | Epoch 27 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 29 | Epoch 28 [Train |████████████████████| 1/1 batches, 6.89 cost, 0.19s] 30 | Epoch 29 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 31 | Epoch 30 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 32 | Epoch 31 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 33 | Epoch 32 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 34 | Epoch 33 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 35 | Epoch 34 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 36 | Epoch 35 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 37 | Epoch 36 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 38 | Epoch 37 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 39 | Epoch 38 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 40 | Epoch 39 [Train |████████████████████| 1/1 batches, 6.88 
cost, 0.19s] 41 | Epoch 40 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 42 | Epoch 41 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 43 | Epoch 42 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 44 | Epoch 43 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 45 | Epoch 44 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 46 | Epoch 45 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 47 | Epoch 46 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 48 | Epoch 47 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 49 | Epoch 48 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 50 | Epoch 49 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 51 | Epoch 50 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 52 | Epoch 51 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 53 | Epoch 52 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 54 | Epoch 53 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 55 | Epoch 54 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 56 | Epoch 55 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 57 | Epoch 56 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 58 | Epoch 57 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 59 | Epoch 58 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 60 | Epoch 59 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 61 | Epoch 60 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 62 | Epoch 61 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 63 | Epoch 62 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 64 | Epoch 63 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 65 | Epoch 64 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 66 | Epoch 65 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 67 | Epoch 66 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 68 | Epoch 67 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 69 | Epoch 68 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 70 | Epoch 69 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 71 | Epoch 70 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 72 | Epoch 71 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 73 | Epoch 72 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 74 | Epoch 73 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 75 | Epoch 74 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 76 | Epoch 75 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 77 | Epoch 76 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 78 | Epoch 77 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 79 | Epoch 78 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 80 | Epoch 79 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 81 | Epoch 80 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 82 | Epoch 81 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 83 | Epoch 82 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 84 | Epoch 83 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 85 | Epoch 84 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 86 | Epoch 85 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 87 | Epoch 86 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 88 | Epoch 87 
[Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 89 | Epoch 88 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.20s] 90 | Epoch 89 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 91 | Epoch 90 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 92 | Epoch 91 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 93 | Epoch 92 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 94 | Epoch 93 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 95 | Epoch 94 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.20s] 96 | Epoch 95 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 97 | Epoch 96 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 98 | Epoch 97 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 99 | Epoch 98 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 100 | Epoch 99 [Train |████████████████████| 1/1 batches, 6.88 cost, 0.19s] 101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 207.406 ms 104 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_tensorflow.output: -------------------------------------------------------------------------------- 1 | Iteration: 0 train on batch time: 328.818 ms. 2 | Iteration: 1 train on batch time: 333.407 ms. 3 | Iteration: 2 train on batch time: 333.047 ms. 4 | Iteration: 3 train on batch time: 332.079 ms. 5 | Iteration: 4 train on batch time: 333.472 ms. 6 | Iteration: 5 train on batch time: 334.534 ms. 7 | Iteration: 6 train on batch time: 332.017 ms. 8 | Iteration: 7 train on batch time: 333.512 ms. 9 | Iteration: 8 train on batch time: 334.184 ms. 10 | Iteration: 9 train on batch time: 330.417 ms. 11 | Iteration: 10 train on batch time: 333.059 ms. 12 | Iteration: 11 train on batch time: 333.217 ms. 13 | Iteration: 12 train on batch time: 330.912 ms. 14 | Iteration: 13 train on batch time: 333.879 ms. 15 | Iteration: 14 train on batch time: 333.876 ms. 16 | Iteration: 15 train on batch time: 331.909 ms. 17 | Iteration: 16 train on batch time: 332.823 ms. 18 | Iteration: 17 train on batch time: 333.824 ms. 19 | Iteration: 18 train on batch time: 332.093 ms. 20 | Iteration: 19 train on batch time: 332.637 ms. 21 | Iteration: 20 train on batch time: 333.547 ms. 22 | Iteration: 21 train on batch time: 330.950 ms. 23 | Iteration: 22 train on batch time: 332.907 ms. 24 | Iteration: 23 train on batch time: 333.391 ms. 25 | Iteration: 24 train on batch time: 332.391 ms. 26 | Iteration: 25 train on batch time: 334.819 ms. 27 | Iteration: 26 train on batch time: 333.753 ms. 28 | Iteration: 27 train on batch time: 332.266 ms. 29 | Iteration: 28 train on batch time: 334.893 ms. 30 | Iteration: 29 train on batch time: 334.298 ms. 31 | Iteration: 30 train on batch time: 333.276 ms. 32 | Iteration: 31 train on batch time: 335.117 ms. 33 | Iteration: 32 train on batch time: 333.548 ms. 34 | Iteration: 33 train on batch time: 331.597 ms. 35 | Iteration: 34 train on batch time: 333.420 ms. 36 | Iteration: 35 train on batch time: 333.415 ms. 37 | Iteration: 36 train on batch time: 332.857 ms. 38 | Iteration: 37 train on batch time: 334.195 ms. 39 | Iteration: 38 train on batch time: 334.632 ms. 40 | Iteration: 39 train on batch time: 333.794 ms. 41 | Iteration: 40 train on batch time: 334.871 ms. 42 | Iteration: 41 train on batch time: 334.118 ms. 43 | Iteration: 42 train on batch time: 333.610 ms. 44 | Iteration: 43 train on batch time: 335.168 ms. 
45 | Iteration: 44 train on batch time: 336.150 ms. 46 | Iteration: 45 train on batch time: 329.581 ms. 47 | Iteration: 46 train on batch time: 335.177 ms. 48 | Iteration: 47 train on batch time: 332.751 ms. 49 | Iteration: 48 train on batch time: 331.409 ms. 50 | Iteration: 49 train on batch time: 333.099 ms. 51 | Iteration: 50 train on batch time: 333.680 ms. 52 | Iteration: 51 train on batch time: 329.999 ms. 53 | Iteration: 52 train on batch time: 331.872 ms. 54 | Iteration: 53 train on batch time: 333.053 ms. 55 | Iteration: 54 train on batch time: 331.581 ms. 56 | Iteration: 55 train on batch time: 330.702 ms. 57 | Iteration: 56 train on batch time: 333.057 ms. 58 | Iteration: 57 train on batch time: 330.838 ms. 59 | Iteration: 58 train on batch time: 331.483 ms. 60 | Iteration: 59 train on batch time: 330.193 ms. 61 | Iteration: 60 train on batch time: 331.512 ms. 62 | Iteration: 61 train on batch time: 330.924 ms. 63 | Iteration: 62 train on batch time: 330.957 ms. 64 | Iteration: 63 train on batch time: 333.135 ms. 65 | Iteration: 64 train on batch time: 330.593 ms. 66 | Iteration: 65 train on batch time: 331.780 ms. 67 | Iteration: 66 train on batch time: 331.242 ms. 68 | Iteration: 67 train on batch time: 330.998 ms. 69 | Iteration: 68 train on batch time: 329.692 ms. 70 | Iteration: 69 train on batch time: 332.326 ms. 71 | Iteration: 70 train on batch time: 330.668 ms. 72 | Iteration: 71 train on batch time: 331.139 ms. 73 | Iteration: 72 train on batch time: 330.898 ms. 74 | Iteration: 73 train on batch time: 331.845 ms. 75 | Iteration: 74 train on batch time: 331.395 ms. 76 | Iteration: 75 train on batch time: 330.457 ms. 77 | Iteration: 76 train on batch time: 332.719 ms. 78 | Iteration: 77 train on batch time: 330.476 ms. 79 | Iteration: 78 train on batch time: 332.566 ms. 80 | Iteration: 79 train on batch time: 333.037 ms. 81 | Iteration: 80 train on batch time: 329.615 ms. 82 | Iteration: 81 train on batch time: 331.192 ms. 83 | Iteration: 82 train on batch time: 333.480 ms. 84 | Iteration: 83 train on batch time: 330.991 ms. 85 | Iteration: 84 train on batch time: 330.611 ms. 86 | Iteration: 85 train on batch time: 332.442 ms. 87 | Iteration: 86 train on batch time: 330.716 ms. 88 | Iteration: 87 train on batch time: 330.958 ms. 89 | Iteration: 88 train on batch time: 331.185 ms. 90 | Iteration: 89 train on batch time: 331.274 ms. 91 | Iteration: 90 train on batch time: 327.980 ms. 92 | Iteration: 91 train on batch time: 330.826 ms. 93 | Iteration: 92 train on batch time: 332.512 ms. 94 | Iteration: 93 train on batch time: 330.908 ms. 95 | Iteration: 94 train on batch time: 331.749 ms. 96 | Iteration: 95 train on batch time: 333.341 ms. 97 | Iteration: 96 train on batch time: 330.234 ms. 98 | Iteration: 97 train on batch time: 329.032 ms. 99 | Iteration: 98 train on batch time: 333.054 ms. 100 | Iteration: 99 train on batch time: 330.318 ms. 
101 | Batch size: 16 102 | Iterations: 100 103 | Time per iteration: 332.274 ms 104 | -------------------------------------------------------------------------------- /results/maxwell_titan_x/benchmark_tensorflow_slim.output: -------------------------------------------------------------------------------- 1 | Batch size: 16 2 | Iterations: 100 3 | Time per iteration: 370.890 ms 4 | -------------------------------------------------------------------------------- /results/v100/INFO.md: -------------------------------------------------------------------------------- 1 | ## Results V100 2 | 3 | The minibatch size is 16. 4 | 5 | | Framework | Time (per minibatch) | 6 | |:---|:---| 7 | | TensorFlow | 117.11 ms | 8 | | TensorFlow (slim) | 176.14 ms | 9 | | Keras (TensorFlow) | 149.85 ms | 10 | | MXNet | 81.22 ms | 11 | 12 | - Hardware: Tesla V100-SXM2-16GB (Amazon AWS EC2 p3.2xlarge) 13 | - OS: Ubuntu 16.04.3 LTS (NVIDIA Volta Deep Learning AMI ami-b933cac4) 14 | - Python: 2.7.6 15 | - CUDA: 9.0.333 16 | - CuDNN: ? 17 | - TensorFlow: 1.4.0 18 | - Keras: 2.0.8-tf 19 | - MXNet: 1.1.0 20 | - Caffe: n/a 21 | - Theano: n/a 22 | - Neon: n/a 23 | 24 | (1) TensorFlow and Keras were run in the NGC Docker container nvcr.io/nvidia/tensorflow:18.03-py3, and MXNet in nvcr.io/nvidia/mxnet:18.03-py3. 25 | (2) The time is for a complete SGD step including parameter updates, not just the forward+backward time. 26 | -------------------------------------------------------------------------------- /results/v100/benchmark_keras_tensorflow.output: -------------------------------------------------------------------------------- 1 | 2018-04-17 01:32:20.656147: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2 | 2018-04-17 01:32:20.656557: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 3 | name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53 4 | pciBusID: 0000:00:1e.0 5 | totalMemory: 15.77GiB freeMemory: 15.36GiB 6 | 2018-04-17 01:32:20.656581: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) 7 | Iteration: 0 train on batch time: 1405.454 ms. 8 | Iteration: 1 train on batch time: 915.364 ms. 9 | Iteration: 2 train on batch time: 914.906 ms. 10 | Iteration: 3 train on batch time: 476.989 ms. 11 | Iteration: 4 train on batch time: 474.223 ms. 12 | Iteration: 5 train on batch time: 419.132 ms. 13 | Iteration: 6 train on batch time: 303.623 ms. 14 | Iteration: 7 train on batch time: 196.285 ms. 15 | Iteration: 8 train on batch time: 195.734 ms. 16 | Iteration: 9 train on batch time: 196.247 ms. 17 | Iteration: 10 train on batch time: 196.194 ms. 18 | Iteration: 11 train on batch time: 126.764 ms. 19 | Iteration: 12 train on batch time: 126.625 ms. 20 | Iteration: 13 train on batch time: 127.042 ms. 21 | Iteration: 14 train on batch time: 127.317 ms. 22 | Iteration: 15 train on batch time: 127.124 ms. 23 | Iteration: 16 train on batch time: 127.184 ms. 24 | Iteration: 17 train on batch time: 126.979 ms. 25 | Iteration: 18 train on batch time: 127.601 ms. 26 | Iteration: 19 train on batch time: 126.789 ms. 27 | Iteration: 20 train on batch time: 127.373 ms. 28 | Iteration: 21 train on batch time: 126.867 ms. 29 | Iteration: 22 train on batch time: 126.948 ms.
30 | Iteration: 23 train on batch time: 126.527 ms. 31 | Iteration: 24 train on batch time: 126.791 ms. 32 | Iteration: 25 train on batch time: 99.899 ms. 33 | Iteration: 26 train on batch time: 100.132 ms. 34 | Iteration: 27 train on batch time: 100.119 ms. 35 | Iteration: 28 train on batch time: 100.211 ms. 36 | Iteration: 29 train on batch time: 100.334 ms. 37 | Iteration: 30 train on batch time: 100.300 ms. 38 | Iteration: 31 train on batch time: 100.119 ms. 39 | Iteration: 32 train on batch time: 100.117 ms. 40 | Iteration: 33 train on batch time: 100.148 ms. 41 | Iteration: 34 train on batch time: 100.061 ms. 42 | Iteration: 35 train on batch time: 100.078 ms. 43 | Iteration: 36 train on batch time: 100.421 ms. 44 | Iteration: 37 train on batch time: 100.193 ms. 45 | Iteration: 38 train on batch time: 99.708 ms. 46 | Iteration: 39 train on batch time: 100.033 ms. 47 | Iteration: 40 train on batch time: 100.316 ms. 48 | Iteration: 41 train on batch time: 100.313 ms. 49 | Iteration: 42 train on batch time: 100.267 ms. 50 | Iteration: 43 train on batch time: 100.075 ms. 51 | Iteration: 44 train on batch time: 99.874 ms. 52 | Iteration: 45 train on batch time: 100.243 ms. 53 | Iteration: 46 train on batch time: 100.165 ms. 54 | Iteration: 47 train on batch time: 100.256 ms. 55 | Iteration: 48 train on batch time: 100.179 ms. 56 | Iteration: 49 train on batch time: 99.802 ms. 57 | Iteration: 50 train on batch time: 100.371 ms. 58 | Iteration: 51 train on batch time: 100.073 ms. 59 | Iteration: 52 train on batch time: 100.186 ms. 60 | Iteration: 53 train on batch time: 100.362 ms. 61 | Iteration: 54 train on batch time: 100.454 ms. 62 | Iteration: 55 train on batch time: 100.064 ms. 63 | Iteration: 56 train on batch time: 100.130 ms. 64 | Iteration: 57 train on batch time: 100.151 ms. 65 | Iteration: 58 train on batch time: 100.061 ms. 66 | Iteration: 59 train on batch time: 100.596 ms. 67 | Iteration: 60 train on batch time: 100.262 ms. 68 | Iteration: 61 train on batch time: 100.859 ms. 69 | Iteration: 62 train on batch time: 99.812 ms. 70 | Iteration: 63 train on batch time: 100.214 ms. 71 | Iteration: 64 train on batch time: 100.019 ms. 72 | Iteration: 65 train on batch time: 99.767 ms. 73 | Iteration: 66 train on batch time: 100.262 ms. 74 | Iteration: 67 train on batch time: 100.151 ms. 75 | Iteration: 68 train on batch time: 100.474 ms. 76 | Iteration: 69 train on batch time: 99.996 ms. 77 | Iteration: 70 train on batch time: 100.012 ms. 78 | Iteration: 71 train on batch time: 100.011 ms. 79 | Iteration: 72 train on batch time: 100.111 ms. 80 | Iteration: 73 train on batch time: 100.047 ms. 81 | Iteration: 74 train on batch time: 100.210 ms. 82 | Iteration: 75 train on batch time: 100.140 ms. 83 | Iteration: 76 train on batch time: 100.271 ms. 84 | Iteration: 77 train on batch time: 100.267 ms. 85 | Iteration: 78 train on batch time: 100.216 ms. 86 | Iteration: 79 train on batch time: 100.512 ms. 87 | Iteration: 80 train on batch time: 100.171 ms. 88 | Iteration: 81 train on batch time: 100.132 ms. 89 | Iteration: 82 train on batch time: 100.168 ms. 90 | Iteration: 83 train on batch time: 100.218 ms. 91 | Iteration: 84 train on batch time: 100.365 ms. 92 | Iteration: 85 train on batch time: 99.833 ms. 93 | Iteration: 86 train on batch time: 100.144 ms. 94 | Iteration: 87 train on batch time: 100.059 ms. 95 | Iteration: 88 train on batch time: 100.269 ms. 96 | Iteration: 89 train on batch time: 100.168 ms. 97 | Iteration: 90 train on batch time: 100.107 ms. 
98 | Iteration: 91 train on batch time: 100.015 ms. 99 | Iteration: 92 train on batch time: 100.058 ms. 100 | Iteration: 93 train on batch time: 99.955 ms. 101 | Iteration: 94 train on batch time: 99.883 ms. 102 | Iteration: 95 train on batch time: 100.128 ms. 103 | Iteration: 96 train on batch time: 99.910 ms. 104 | Iteration: 97 train on batch time: 100.212 ms. 105 | Iteration: 98 train on batch time: 99.941 ms. 106 | Iteration: 99 train on batch time: 100.041 ms. 107 | Batch size: 16 108 | Iterations: 100 109 | Time per iteration: 149.847 ms 110 | -------------------------------------------------------------------------------- /results/v100/benchmark_mxnet.output: -------------------------------------------------------------------------------- 1 | [04:39:44] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:130: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable) 2 | /opt/mxnet/python/mxnet/optimizer.py:136: UserWarning: WARNING: New optimizer mxnet.optimizer.NAG is overriding existing optimizer mxnet.optimizer.NAG 3 | Optimizer.opt_registry[name].__name__)) 4 | benchmark_mxnet.py:88: DeprecationWarning: mxnet.model.FeedForward has been deprecated. Please use mxnet.mod.Module instead. 5 | initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), 6 | /opt/mxnet/python/mxnet/model.py:573: DeprecationWarning: Calling initializer with init(str, NDArray) has been deprecated.please use init(mx.init.InitDesc(...), NDArray) instead. 7 | self.initializer(k, v) 8 | Start 9 | Batch size: 16 10 | Iterations: 400 11 | Time per iteration: 81.217 ms 12 | -------------------------------------------------------------------------------- /results/v100/benchmark_tensorflow.output: -------------------------------------------------------------------------------- 1 | WARNING:tensorflow:From benchmark_tensorflow.py:78: softmax_cross_entropy (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 2 | Instructions for updating: 3 | Use tf.losses.softmax_cross_entropy instead. Note that the order of the logits and labels arguments has been changed. 4 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/losses/python/losses/loss_ops.py:398: compute_weighted_loss (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 5 | Instructions for updating: 6 | Use tf.losses.compute_weighted_loss instead. 7 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/losses/python/losses/loss_ops.py:151: add_arg_scope..func_with_args (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 8 | Instructions for updating: 9 | Use tf.losses.add_loss instead. 
10 | 2018-04-17 01:31:44.259351: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 11 | 2018-04-17 01:31:44.259751: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 12 | name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53 13 | pciBusID: 0000:00:1e.0 14 | totalMemory: 15.77GiB freeMemory: 15.36GiB 15 | 2018-04-17 01:31:44.259774: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) 16 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02. 17 | Instructions for updating: 18 | Use `tf.global_variables_initializer` instead. 19 | Iteration: 0 train on batch time: 1095.193 ms. 20 | Iteration: 1 train on batch time: 650.711 ms. 21 | Iteration: 2 train on batch time: 652.697 ms. 22 | Iteration: 3 train on batch time: 344.923 ms. 23 | Iteration: 4 train on batch time: 345.038 ms. 24 | Iteration: 5 train on batch time: 179.180 ms. 25 | Iteration: 6 train on batch time: 179.378 ms. 26 | Iteration: 7 train on batch time: 125.554 ms. 27 | Iteration: 8 train on batch time: 125.383 ms. 28 | Iteration: 9 train on batch time: 88.094 ms. 29 | Iteration: 10 train on batch time: 88.195 ms. 30 | Iteration: 11 train on batch time: 87.938 ms. 31 | Iteration: 12 train on batch time: 88.148 ms. 32 | Iteration: 13 train on batch time: 88.087 ms. 33 | Iteration: 14 train on batch time: 88.032 ms. 34 | Iteration: 15 train on batch time: 87.999 ms. 35 | Iteration: 16 train on batch time: 88.042 ms. 36 | Iteration: 17 train on batch time: 88.144 ms. 37 | Iteration: 18 train on batch time: 88.071 ms. 38 | Iteration: 19 train on batch time: 88.006 ms. 39 | Iteration: 20 train on batch time: 88.110 ms. 40 | Iteration: 21 train on batch time: 88.095 ms. 41 | Iteration: 22 train on batch time: 88.169 ms. 42 | Iteration: 23 train on batch time: 87.847 ms. 43 | Iteration: 24 train on batch time: 87.962 ms. 44 | Iteration: 25 train on batch time: 88.184 ms. 45 | Iteration: 26 train on batch time: 88.016 ms. 46 | Iteration: 27 train on batch time: 88.027 ms. 47 | Iteration: 28 train on batch time: 88.076 ms. 48 | Iteration: 29 train on batch time: 87.893 ms. 49 | Iteration: 30 train on batch time: 88.014 ms. 50 | Iteration: 31 train on batch time: 87.990 ms. 51 | Iteration: 32 train on batch time: 87.952 ms. 52 | Iteration: 33 train on batch time: 88.172 ms. 53 | Iteration: 34 train on batch time: 87.979 ms. 54 | Iteration: 35 train on batch time: 88.007 ms. 55 | Iteration: 36 train on batch time: 87.991 ms. 56 | Iteration: 37 train on batch time: 87.919 ms. 57 | Iteration: 38 train on batch time: 88.131 ms. 58 | Iteration: 39 train on batch time: 87.980 ms. 59 | Iteration: 40 train on batch time: 88.053 ms. 60 | Iteration: 41 train on batch time: 88.027 ms. 61 | Iteration: 42 train on batch time: 88.001 ms. 62 | Iteration: 43 train on batch time: 88.121 ms. 63 | Iteration: 44 train on batch time: 88.205 ms. 64 | Iteration: 45 train on batch time: 88.191 ms. 65 | Iteration: 46 train on batch time: 88.133 ms. 66 | Iteration: 47 train on batch time: 88.099 ms. 67 | Iteration: 48 train on batch time: 88.160 ms. 
68 | Iteration: 49 train on batch time: 88.120 ms. 69 | Iteration: 50 train on batch time: 88.138 ms. 70 | Iteration: 51 train on batch time: 88.142 ms. 71 | Iteration: 52 train on batch time: 88.001 ms. 72 | Iteration: 53 train on batch time: 87.923 ms. 73 | Iteration: 54 train on batch time: 88.105 ms. 74 | Iteration: 55 train on batch time: 88.137 ms. 75 | Iteration: 56 train on batch time: 88.028 ms. 76 | Iteration: 57 train on batch time: 87.955 ms. 77 | Iteration: 58 train on batch time: 88.229 ms. 78 | Iteration: 59 train on batch time: 88.140 ms. 79 | Iteration: 60 train on batch time: 88.050 ms. 80 | Iteration: 61 train on batch time: 88.076 ms. 81 | Iteration: 62 train on batch time: 87.965 ms. 82 | Iteration: 63 train on batch time: 88.112 ms. 83 | Iteration: 64 train on batch time: 87.935 ms. 84 | Iteration: 65 train on batch time: 88.025 ms. 85 | Iteration: 66 train on batch time: 88.126 ms. 86 | Iteration: 67 train on batch time: 88.147 ms. 87 | Iteration: 68 train on batch time: 88.022 ms. 88 | Iteration: 69 train on batch time: 88.084 ms. 89 | Iteration: 70 train on batch time: 88.057 ms. 90 | Iteration: 71 train on batch time: 88.007 ms. 91 | Iteration: 72 train on batch time: 88.055 ms. 92 | Iteration: 73 train on batch time: 87.965 ms. 93 | Iteration: 74 train on batch time: 88.014 ms. 94 | Iteration: 75 train on batch time: 88.025 ms. 95 | Iteration: 76 train on batch time: 87.869 ms. 96 | Iteration: 77 train on batch time: 88.068 ms. 97 | Iteration: 78 train on batch time: 87.859 ms. 98 | Iteration: 79 train on batch time: 88.062 ms. 99 | Iteration: 80 train on batch time: 87.958 ms. 100 | Iteration: 81 train on batch time: 87.940 ms. 101 | Iteration: 82 train on batch time: 87.945 ms. 102 | Iteration: 83 train on batch time: 88.151 ms. 103 | Iteration: 84 train on batch time: 88.000 ms. 104 | Iteration: 85 train on batch time: 88.052 ms. 105 | Iteration: 86 train on batch time: 87.935 ms. 106 | Iteration: 87 train on batch time: 87.997 ms. 107 | Iteration: 88 train on batch time: 87.870 ms. 108 | Iteration: 89 train on batch time: 88.095 ms. 109 | Iteration: 90 train on batch time: 87.870 ms. 110 | Iteration: 91 train on batch time: 88.105 ms. 111 | Iteration: 92 train on batch time: 87.972 ms. 112 | Iteration: 93 train on batch time: 87.895 ms. 113 | Iteration: 94 train on batch time: 87.955 ms. 114 | Iteration: 95 train on batch time: 87.976 ms. 115 | Iteration: 96 train on batch time: 87.909 ms. 116 | Iteration: 97 train on batch time: 87.986 ms. 117 | Iteration: 98 train on batch time: 87.991 ms. 118 | Iteration: 99 train on batch time: 87.980 ms. 
119 | Batch size: 16 120 | Iterations: 100 121 | Time per iteration: 117.105 ms 122 | -------------------------------------------------------------------------------- /results/v100/benchmark_tensorflow_slim.output: -------------------------------------------------------------------------------- 1 | 2018-04-17 01:32:00.668931: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2 | 2018-04-17 01:32:00.669350: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 3 | name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53 4 | pciBusID: 0000:00:1e.0 5 | totalMemory: 15.77GiB freeMemory: 15.36GiB 6 | 2018-04-17 01:32:00.669374: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) 7 | WARNING:tensorflow:From benchmark_tensorflow.py:44: softmax_cross_entropy (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 8 | Instructions for updating: 9 | Use tf.losses.softmax_cross_entropy instead. Note that the order of the logits and labels arguments has been changed. 10 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/losses/python/losses/loss_ops.py:398: compute_weighted_loss (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 11 | Instructions for updating: 12 | Use tf.losses.compute_weighted_loss instead. 13 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/losses/python/losses/loss_ops.py:151: add_arg_scope..func_with_args (from tensorflow.contrib.losses.python.losses.loss_ops) is deprecated and will be removed after 2016-12-30. 14 | Instructions for updating: 15 | Use tf.losses.add_loss instead. 16 | WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/training/python/training/training.py:412: get_or_create_global_step (from tensorflow.contrib.framework.python.ops.variables) is deprecated and will be removed in a future version. 17 | Instructions for updating: 18 | Please switch to tf.train.get_or_create_global_step 19 | 2018-04-17 01:32:01.530145: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) 20 | Batch size: 16 21 | Iterations: 100 22 | Time per iteration: 176.140 ms 23 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -d deep-learning-models ]; 4 | then 5 | cd deep-learning-models ; git pull ; cd ..
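# first run: the repo is not present yet, so the else branch below clones it fresh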
6 | else 7 | git clone https://github.com/fchollet/deep-learning-models 8 | fi 9 | 10 | nvidia-smi dmon > results/benchmark_caffe.dmon 2>&1 & 11 | python benchmark_caffe.py > results/benchmark_caffe.output 2>&1 12 | killall -9 nvidia-smi 13 | 14 | nvidia-smi dmon > results/benchmark_mxnet.dmon 2>&1 & 15 | python benchmark_mxnet.py > results/benchmark_mxnet.output 2>&1 16 | killall -9 nvidia-smi 17 | 18 | # PATH=/usr/local/cuda/bin/:$PATH python benchmark_neon.py > results/benchmark_neon.output 19 | 20 | nvidia-smi dmon > results/benchmark_tensorflow.dmon 2>&1 & 21 | python benchmark_tensorflow.py --train_schedule pure_tf > results/benchmark_tensorflow.output 2>&1 22 | killall -9 nvidia-smi 23 | 24 | nvidia-smi dmon > results/benchmark_tensorflow_slim.dmon 2>&1 & 25 | python benchmark_tensorflow.py --train_schedule slim > results/benchmark_tensorflow_slim.output 2>&1 26 | killall -9 nvidia-smi 27 | 28 | nvidia-smi dmon > results/benchmark_keras_theano.dmon 2>&1 & 29 | python benchmark_keras.py --backend theano > results/benchmark_keras_theano.output 2>&1 30 | killall -9 nvidia-smi 31 | 32 | nvidia-smi dmon > results/benchmark_keras_tensorflow.dmon 2>&1 & 33 | python benchmark_keras.py --backend tensorflow > results/benchmark_keras_tensorflow.output 2>&1 34 | killall -9 nvidia-smi 35 | 36 | 37 | -------------------------------------------------------------------------------- /vgg16_deploy.prototxt: -------------------------------------------------------------------------------- 1 | # From https://gist.githubusercontent.com/ksimonyan/211839e770f7b538e2d8/raw/ded9363bd93ec0c770134f4e387d8aaaaa2407ce/VGG_ILSVRC_16_layers_deploy.prototxt 2 | 3 | name: "VGG_ILSVRC_16_layers" 4 | input: "data" 5 | input_dim: 16 6 | input_dim: 3 7 | input_dim: 224 8 | input_dim: 224 9 | layers { 10 | bottom: "data" 11 | top: "conv1_1" 12 | name: "conv1_1" 13 | type: CONVOLUTION 14 | convolution_param { 15 | num_output: 64 16 | pad: 1 17 | kernel_size: 3 18 | } 19 | } 20 | layers { 21 | bottom: "conv1_1" 22 | top: "conv1_1" 23 | name: "relu1_1" 24 | type: RELU 25 | } 26 | layers { 27 | bottom: "conv1_1" 28 | top: "conv1_2" 29 | name: "conv1_2" 30 | type: CONVOLUTION 31 | convolution_param { 32 | num_output: 64 33 | pad: 1 34 | kernel_size: 3 35 | } 36 | } 37 | layers { 38 | bottom: "conv1_2" 39 | top: "conv1_2" 40 | name: "relu1_2" 41 | type: RELU 42 | } 43 | layers { 44 | bottom: "conv1_2" 45 | top: "pool1" 46 | name: "pool1" 47 | type: POOLING 48 | pooling_param { 49 | pool: MAX 50 | kernel_size: 2 51 | stride: 2 52 | } 53 | } 54 | layers { 55 | bottom: "pool1" 56 | top: "conv2_1" 57 | name: "conv2_1" 58 | type: CONVOLUTION 59 | convolution_param { 60 | num_output: 128 61 | pad: 1 62 | kernel_size: 3 63 | } 64 | } 65 | layers { 66 | bottom: "conv2_1" 67 | top: "conv2_1" 68 | name: "relu2_1" 69 | type: RELU 70 | } 71 | layers { 72 | bottom: "conv2_1" 73 | top: "conv2_2" 74 | name: "conv2_2" 75 | type: CONVOLUTION 76 | convolution_param { 77 | num_output: 128 78 | pad: 1 79 | kernel_size: 3 80 | } 81 | } 82 | layers { 83 | bottom: "conv2_2" 84 | top: "conv2_2" 85 | name: "relu2_2" 86 | type: RELU 87 | } 88 | layers { 89 | bottom: "conv2_2" 90 | top: "pool2" 91 | name: "pool2" 92 | type: POOLING 93 | pooling_param { 94 | pool: MAX 95 | kernel_size: 2 96 | stride: 2 97 | } 98 | } 99 | layers { 100 | bottom: "pool2" 101 | top: "conv3_1" 102 | name: "conv3_1" 103 | type: CONVOLUTION 104 | convolution_param { 105 | num_output: 256 106 | pad: 1 107 | kernel_size: 3 108 | } 109 | } 110 | layers { 111 | bottom: "conv3_1" 
112 | top: "conv3_1" 113 | name: "relu3_1" 114 | type: RELU 115 | } 116 | layers { 117 | bottom: "conv3_1" 118 | top: "conv3_2" 119 | name: "conv3_2" 120 | type: CONVOLUTION 121 | convolution_param { 122 | num_output: 256 123 | pad: 1 124 | kernel_size: 3 125 | } 126 | } 127 | layers { 128 | bottom: "conv3_2" 129 | top: "conv3_2" 130 | name: "relu3_2" 131 | type: RELU 132 | } 133 | layers { 134 | bottom: "conv3_2" 135 | top: "conv3_3" 136 | name: "conv3_3" 137 | type: CONVOLUTION 138 | convolution_param { 139 | num_output: 256 140 | pad: 1 141 | kernel_size: 3 142 | } 143 | } 144 | layers { 145 | bottom: "conv3_3" 146 | top: "conv3_3" 147 | name: "relu3_3" 148 | type: RELU 149 | } 150 | layers { 151 | bottom: "conv3_3" 152 | top: "pool3" 153 | name: "pool3" 154 | type: POOLING 155 | pooling_param { 156 | pool: MAX 157 | kernel_size: 2 158 | stride: 2 159 | } 160 | } 161 | layers { 162 | bottom: "pool3" 163 | top: "conv4_1" 164 | name: "conv4_1" 165 | type: CONVOLUTION 166 | convolution_param { 167 | num_output: 512 168 | pad: 1 169 | kernel_size: 3 170 | } 171 | } 172 | layers { 173 | bottom: "conv4_1" 174 | top: "conv4_1" 175 | name: "relu4_1" 176 | type: RELU 177 | } 178 | layers { 179 | bottom: "conv4_1" 180 | top: "conv4_2" 181 | name: "conv4_2" 182 | type: CONVOLUTION 183 | convolution_param { 184 | num_output: 512 185 | pad: 1 186 | kernel_size: 3 187 | } 188 | } 189 | layers { 190 | bottom: "conv4_2" 191 | top: "conv4_2" 192 | name: "relu4_2" 193 | type: RELU 194 | } 195 | layers { 196 | bottom: "conv4_2" 197 | top: "conv4_3" 198 | name: "conv4_3" 199 | type: CONVOLUTION 200 | convolution_param { 201 | num_output: 512 202 | pad: 1 203 | kernel_size: 3 204 | } 205 | } 206 | layers { 207 | bottom: "conv4_3" 208 | top: "conv4_3" 209 | name: "relu4_3" 210 | type: RELU 211 | } 212 | layers { 213 | bottom: "conv4_3" 214 | top: "pool4" 215 | name: "pool4" 216 | type: POOLING 217 | pooling_param { 218 | pool: MAX 219 | kernel_size: 2 220 | stride: 2 221 | } 222 | } 223 | layers { 224 | bottom: "pool4" 225 | top: "conv5_1" 226 | name: "conv5_1" 227 | type: CONVOLUTION 228 | convolution_param { 229 | num_output: 512 230 | pad: 1 231 | kernel_size: 3 232 | } 233 | } 234 | layers { 235 | bottom: "conv5_1" 236 | top: "conv5_1" 237 | name: "relu5_1" 238 | type: RELU 239 | } 240 | layers { 241 | bottom: "conv5_1" 242 | top: "conv5_2" 243 | name: "conv5_2" 244 | type: CONVOLUTION 245 | convolution_param { 246 | num_output: 512 247 | pad: 1 248 | kernel_size: 3 249 | } 250 | } 251 | layers { 252 | bottom: "conv5_2" 253 | top: "conv5_2" 254 | name: "relu5_2" 255 | type: RELU 256 | } 257 | layers { 258 | bottom: "conv5_2" 259 | top: "conv5_3" 260 | name: "conv5_3" 261 | type: CONVOLUTION 262 | convolution_param { 263 | num_output: 512 264 | pad: 1 265 | kernel_size: 3 266 | } 267 | } 268 | layers { 269 | bottom: "conv5_3" 270 | top: "conv5_3" 271 | name: "relu5_3" 272 | type: RELU 273 | } 274 | layers { 275 | bottom: "conv5_3" 276 | top: "pool5" 277 | name: "pool5" 278 | type: POOLING 279 | pooling_param { 280 | pool: MAX 281 | kernel_size: 2 282 | stride: 2 283 | } 284 | } 285 | layers { 286 | bottom: "pool5" 287 | top: "fc6" 288 | name: "fc6" 289 | type: INNER_PRODUCT 290 | inner_product_param { 291 | num_output: 4096 292 | } 293 | } 294 | layers { 295 | bottom: "fc6" 296 | top: "fc6" 297 | name: "relu6" 298 | type: RELU 299 | } 300 | layers { 301 | bottom: "fc6" 302 | top: "fc6" 303 | name: "drop6" 304 | type: DROPOUT 305 | dropout_param { 306 | dropout_ratio: 0.5 307 | } 308 | } 309 | layers { 310 
| bottom: "fc6" 311 | top: "fc7" 312 | name: "fc7" 313 | type: INNER_PRODUCT 314 | inner_product_param { 315 | num_output: 4096 316 | } 317 | } 318 | layers { 319 | bottom: "fc7" 320 | top: "fc7" 321 | name: "relu7" 322 | type: RELU 323 | } 324 | layers { 325 | bottom: "fc7" 326 | top: "fc7" 327 | name: "drop7" 328 | type: DROPOUT 329 | dropout_param { 330 | dropout_ratio: 0.5 331 | } 332 | } 333 | layers { 334 | bottom: "fc7" 335 | top: "fc8" 336 | name: "fc8" 337 | type: INNER_PRODUCT 338 | inner_product_param { 339 | num_output: 1000 340 | } 341 | } 342 | layers { 343 | bottom: "fc8" 344 | top: "prob" 345 | name: "prob" 346 | type: SOFTMAX 347 | } 348 | -------------------------------------------------------------------------------- /vgg16_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "vgg16_train_val.prototxt" 2 | test_iter: 1 3 | test_interval: 1000 4 | base_lr: 0.002 5 | stepsize: 50000 6 | lr_policy: "step" 7 | gamma: 0.1 8 | display: 1 9 | max_iter: 100 10 | momentum: 0.9 11 | weight_decay: 0.0005 12 | snapshot: 1000 13 | snapshot_prefix: "./" 14 | solver_mode: GPU 15 | -------------------------------------------------------------------------------- /vgg16_train_val.prototxt: -------------------------------------------------------------------------------- 1 | # from http://cs.stanford.edu/people/karpathy/vgg_train_val.prototxt 2 | name: "VGG_ILSVRC_16_layers" 3 | layers { 4 | name: "data" 5 | type: IMAGE_DATA 6 | top: "data" 7 | top: "label" 8 | include { 9 | phase: TRAIN 10 | } 11 | transform_param { 12 | crop_size: 224 13 | } 14 | image_data_param { 15 | source: "images/list.txt" 16 | batch_size: 16 17 | } 18 | } 19 | layers { 20 | name: "data" 21 | type: IMAGE_DATA 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST 26 | } 27 | transform_param { 28 | crop_size: 224 29 | } 30 | image_data_param { 31 | source: "images/list.txt" 32 | batch_size: 16 33 | } 34 | } 35 | layers { 36 | bottom: "data" 37 | top: "conv1_1" 38 | name: "conv1_1" 39 | type: CONVOLUTION 40 | convolution_param { 41 | num_output: 64 42 | pad: 1 43 | kernel_size: 3 44 | } 45 | blobs_lr: 0.001 46 | blobs_lr: 0.001 47 | } 48 | layers { 49 | bottom: "conv1_1" 50 | top: "conv1_1" 51 | name: "relu1_1" 52 | type: RELU 53 | } 54 | layers { 55 | bottom: "conv1_1" 56 | top: "conv1_2" 57 | name: "conv1_2" 58 | type: CONVOLUTION 59 | convolution_param { 60 | num_output: 64 61 | pad: 1 62 | kernel_size: 3 63 | } 64 | blobs_lr: 0.001 65 | blobs_lr: 0.001 66 | } 67 | layers { 68 | bottom: "conv1_2" 69 | top: "conv1_2" 70 | name: "relu1_2" 71 | type: RELU 72 | } 73 | layers { 74 | bottom: "conv1_2" 75 | top: "pool1" 76 | name: "pool1" 77 | type: POOLING 78 | pooling_param { 79 | pool: MAX 80 | kernel_size: 2 81 | stride: 2 82 | } 83 | } 84 | layers { 85 | bottom: "pool1" 86 | top: "conv2_1" 87 | name: "conv2_1" 88 | type: CONVOLUTION 89 | convolution_param { 90 | num_output: 128 91 | pad: 1 92 | kernel_size: 3 93 | } 94 | blobs_lr: 0.001 95 | blobs_lr: 0.001 96 | } 97 | layers { 98 | bottom: "conv2_1" 99 | top: "conv2_1" 100 | name: "relu2_1" 101 | type: RELU 102 | } 103 | layers { 104 | bottom: "conv2_1" 105 | top: "conv2_2" 106 | name: "conv2_2" 107 | type: CONVOLUTION 108 | convolution_param { 109 | num_output: 128 110 | pad: 1 111 | kernel_size: 3 112 | } 113 | blobs_lr: 0.001 114 | blobs_lr: 0.001 115 | } 116 | layers { 117 | bottom: "conv2_2" 118 | top: "conv2_2" 119 | name: "relu2_2" 120 | type: RELU 121 | } 122 | layers { 123 | bottom: "conv2_2" 
124 |   top: "pool2"
125 |   name: "pool2"
126 |   type: POOLING
127 |   pooling_param {
128 |     pool: MAX
129 |     kernel_size: 2
130 |     stride: 2
131 |   }
132 | }
133 | layers {
134 |   bottom: "pool2"
135 |   top: "conv3_1"
136 |   name: "conv3_1"
137 |   type: CONVOLUTION
138 |   convolution_param {
139 |     num_output: 256
140 |     pad: 1
141 |     kernel_size: 3
142 |   }
143 |   blobs_lr: 0.001
144 |   blobs_lr: 0.001
145 | }
146 | layers {
147 |   bottom: "conv3_1"
148 |   top: "conv3_1"
149 |   name: "relu3_1"
150 |   type: RELU
151 | }
152 | layers {
153 |   bottom: "conv3_1"
154 |   top: "conv3_2"
155 |   name: "conv3_2"
156 |   type: CONVOLUTION
157 |   convolution_param {
158 |     num_output: 256
159 |     pad: 1
160 |     kernel_size: 3
161 |   }
162 |   blobs_lr: 0.001
163 |   blobs_lr: 0.001
164 | }
165 | layers {
166 |   bottom: "conv3_2"
167 |   top: "conv3_2"
168 |   name: "relu3_2"
169 |   type: RELU
170 | }
171 | layers {
172 |   bottom: "conv3_2"
173 |   top: "conv3_3"
174 |   name: "conv3_3"
175 |   type: CONVOLUTION
176 |   convolution_param {
177 |     num_output: 256
178 |     pad: 1
179 |     kernel_size: 3
180 |   }
181 |   blobs_lr: 0.001
182 |   blobs_lr: 0.001
183 | }
184 | layers {
185 |   bottom: "conv3_3"
186 |   top: "conv3_3"
187 |   name: "relu3_3"
188 |   type: RELU
189 | }
190 | layers {
191 |   bottom: "conv3_3"
192 |   top: "pool3"
193 |   name: "pool3"
194 |   type: POOLING
195 |   pooling_param {
196 |     pool: MAX
197 |     kernel_size: 2
198 |     stride: 2
199 |   }
200 | }
201 | layers {
202 |   bottom: "pool3"
203 |   top: "conv4_1"
204 |   name: "conv4_1"
205 |   type: CONVOLUTION
206 |   convolution_param {
207 |     num_output: 512
208 |     pad: 1
209 |     kernel_size: 3
210 |   }
211 |   blobs_lr: 0.001
212 |   blobs_lr: 0.001
213 | }
214 | layers {
215 |   bottom: "conv4_1"
216 |   top: "conv4_1"
217 |   name: "relu4_1"
218 |   type: RELU
219 | }
220 | layers {
221 |   bottom: "conv4_1"
222 |   top: "conv4_2"
223 |   name: "conv4_2"
224 |   type: CONVOLUTION
225 |   convolution_param {
226 |     num_output: 512
227 |     pad: 1
228 |     kernel_size: 3
229 |   }
230 |   blobs_lr: 0.001
231 |   blobs_lr: 0.001
232 | }
233 | layers {
234 |   bottom: "conv4_2"
235 |   top: "conv4_2"
236 |   name: "relu4_2"
237 |   type: RELU
238 | }
239 | layers {
240 |   bottom: "conv4_2"
241 |   top: "conv4_3"
242 |   name: "conv4_3"
243 |   type: CONVOLUTION
244 |   convolution_param {
245 |     num_output: 512
246 |     pad: 1
247 |     kernel_size: 3
248 |   }
249 |   blobs_lr: 0.001
250 |   blobs_lr: 0.001
251 | }
252 | layers {
253 |   bottom: "conv4_3"
254 |   top: "conv4_3"
255 |   name: "relu4_3"
256 |   type: RELU
257 | }
258 | layers {
259 |   bottom: "conv4_3"
260 |   top: "pool4"
261 |   name: "pool4"
262 |   type: POOLING
263 |   pooling_param {
264 |     pool: MAX
265 |     kernel_size: 2
266 |     stride: 2
267 |   }
268 | }
269 | layers {
270 |   bottom: "pool4"
271 |   top: "conv5_1"
272 |   name: "conv5_1"
273 |   type: CONVOLUTION
274 |   convolution_param {
275 |     num_output: 512
276 |     pad: 1
277 |     kernel_size: 3
278 |   }
279 |   blobs_lr: 0.001
280 |   blobs_lr: 0.001
281 | }
282 | layers {
283 |   bottom: "conv5_1"
284 |   top: "conv5_1"
285 |   name: "relu5_1"
286 |   type: RELU
287 | }
288 | layers {
289 |   bottom: "conv5_1"
290 |   top: "conv5_2"
291 |   name: "conv5_2"
292 |   type: CONVOLUTION
293 |   convolution_param {
294 |     num_output: 512
295 |     pad: 1
296 |     kernel_size: 3
297 |   }
298 |   blobs_lr: 0.001
299 |   blobs_lr: 0.001
300 | }
301 | layers {
302 |   bottom: "conv5_2"
303 |   top: "conv5_2"
304 |   name: "relu5_2"
305 |   type: RELU
306 | }
307 | layers {
308 |   bottom: "conv5_2"
309 |   top: "conv5_3"
310 |   name: "conv5_3"
311 |   type: CONVOLUTION
312 |   convolution_param {
313 |     num_output: 512
314 |     pad: 1
315 |     kernel_size: 3
316 |   }
317 |   blobs_lr: 0.001
318 |   blobs_lr: 0.001
319 | }
320 | layers {
321 |   bottom: "conv5_3"
322 |   top: "conv5_3"
323 |   name: "relu5_3"
324 |   type: RELU
325 | }
326 | layers {
327 |   bottom: "conv5_3"
328 |   top: "pool5"
329 |   name: "pool5"
330 |   type: POOLING
331 |   pooling_param {
332 |     pool: MAX
333 |     kernel_size: 2
334 |     stride: 2
335 |   }
336 | }
337 | layers {
338 |   bottom: "pool5"
339 |   top: "fc6"
340 |   name: "fc6"
341 |   type: INNER_PRODUCT
342 |   inner_product_param {
343 |     num_output: 4096
344 |   }
345 |   blobs_lr: 0.001
346 |   blobs_lr: 0.001
347 | }
348 | layers {
349 |   bottom: "fc6"
350 |   top: "fc6"
351 |   name: "relu6"
352 |   type: RELU
353 | }
354 | layers {
355 |   bottom: "fc6"
356 |   top: "fc6"
357 |   name: "drop6"
358 |   type: DROPOUT
359 |   dropout_param {
360 |     dropout_ratio: 0.5
361 |   }
362 | }
363 | layers {
364 |   bottom: "fc6"
365 |   top: "fc7"
366 |   name: "fc7"
367 |   type: INNER_PRODUCT
368 |   inner_product_param {
369 |     num_output: 4096
370 |   }
371 |   blobs_lr: 0.001
372 |   blobs_lr: 0.001
373 | }
374 | layers {
375 |   bottom: "fc7"
376 |   top: "fc7"
377 |   name: "relu7"
378 |   type: RELU
379 | }
380 | layers {
381 |   bottom: "fc7"
382 |   top: "fc7"
383 |   name: "drop7"
384 |   type: DROPOUT
385 |   dropout_param {
386 |     dropout_ratio: 0.5
387 |   }
388 | }
389 | layers {
390 |   name: "fc8"
391 |   bottom: "fc7"
392 |   top: "fc8"
393 |   type: INNER_PRODUCT
394 |   inner_product_param {
395 |     num_output: 1000
396 |   }
397 |   blobs_lr: 0.001
398 |   blobs_lr: 0.001
399 | }
400 | layers {
401 |   name: "loss"
402 |   type: SOFTMAX_LOSS
403 |   bottom: "fc8"
404 |   bottom: "label"
405 |   top: "loss/loss"
406 | }
407 | layers {
408 |   name: "accuracy/top1"
409 |   type: ACCURACY
410 |   bottom: "fc8"
411 |   bottom: "label"
412 |   top: "accuracy@1"
413 |   include: { phase: TEST }
414 |   accuracy_param {
415 |     top_k: 1
416 |   }
417 | }
418 | layers {
419 |   name: "accuracy/top5"
420 |   type: ACCURACY
421 |   bottom: "fc8"
422 |   bottom: "label"
423 |   top: "accuracy@5"
424 |   include: { phase: TEST }
425 |   accuracy_param {
426 |     top_k: 5
427 |   }
428 | }
429 | 
--------------------------------------------------------------------------------
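A note on syntax: the prototxt files above use Caffe's legacy V1 schema (`layers` blocks with enum layer types such as `CONVOLUTION`; `vgg16_train_val.prototxt` additionally sets per-blob `blobs_lr` learning-rate multipliers), which matches the Caffe builds these benchmarks were run against. Newer Caffe releases expect the V2 schema and can convert V1 files automatically with the bundled `upgrade_net_proto_text` tool. As a rough sketch for reference only (not used by `run.sh`), the V2 form of the `conv1_1` layer would look like:

```
layer {
  name: "conv1_1"
  type: "Convolution"       # string type replaces the V1 CONVOLUTION enum
  bottom: "data"
  top: "conv1_1"
  param { lr_mult: 0.001 }  # weight LR multiplier, replaces the first blobs_lr
  param { lr_mult: 0.001 }  # bias LR multiplier, replaces the second blobs_lr
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
  }
}
```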