├── .gitignore ├── CUV └── README.md ├── LICENSE.MD ├── README.md ├── TorontoDeepLearning-convnet └── README.md ├── caffe ├── README.md ├── imagenet_winners │ ├── alexnet.prototxt │ ├── googlenet.prototxt │ ├── overfeat.prototxt │ └── vgg_a.prototxt ├── install.sh ├── output_alexnet.log ├── output_forceGradInput.log ├── output_googlenet.log ├── output_noGradInput.log ├── output_overfeat.log ├── output_vgg_a.log ├── proto_forceGradInput │ ├── conv1.prototxt │ ├── conv2.prototxt │ ├── conv3.prototxt │ ├── conv4.prototxt │ └── conv5.prototxt ├── proto_noGradInput │ ├── conv1.prototxt │ ├── conv2.prototxt │ ├── conv3.prototxt │ ├── conv4.prototxt │ └── conv5.prototxt ├── run_forcegradinput.sh ├── run_imagenet.sh └── run_nogradinput.sh ├── ccv ├── README.md ├── cwc-bench-runtime.cu ├── cwc-bench.c ├── lena-128.jpg ├── list.txt ├── makefile └── output.log ├── chainer ├── README.md ├── alex.py ├── googlenet.py ├── overfeat.py ├── run.sh ├── train_imagenet.py └── vgga.py ├── cltorch ├── README.md ├── imagenet.out.txt ├── imagenet_winners │ ├── alexnet.lua │ ├── benchmark.lua │ ├── googlenet.lua │ ├── overfeat.lua │ └── vgg_a.lua └── layerwise.out.txt ├── convnet.js ├── README.md ├── benchmark.js └── output.log ├── cuda-convnet2 ├── README.md └── benchmark.lua ├── cxxnet └── README.md ├── deepcl ├── README.md ├── deepcl_benchmark.py └── install.sh ├── eblearn └── README.md ├── glconv ├── README.md ├── output.log └── stest.lua ├── greentea ├── Makefile.config ├── README.md ├── imagenet_winners │ ├── alexnet.prototxt │ ├── googlenet.prototxt │ ├── overfeat.prototxt │ └── vgg_a.prototxt ├── install.sh ├── output_alexnet.log ├── output_forceGradInput.log ├── output_googlenet.log ├── output_overfeat.log ├── output_vgg_a.log ├── run_forcegradinput.sh ├── run_imagenet.sh └── run_nogradinput.sh ├── matlab-DeepLearnToolbox └── README.md ├── mxnet ├── README.md ├── alexnet.py └── gnetv1.py ├── nervana ├── README.md ├── convnet-benchmarks.py ├── output.log └── output_fp16.log ├── 
nnforge ├── INSTALL.md ├── README.md └── benchmark │ ├── Makefile │ └── benchmark.cpp ├── tensorflow ├── BUILD ├── README.md ├── benchmark_alexnet.py ├── benchmark_googlenet.py ├── benchmark_overfeat.py ├── benchmark_vgg.py ├── output_alexnet.log ├── output_googlenet.log ├── output_overfeat.log ├── output_vgga.log └── run.sh ├── theano ├── README.md ├── alexnet.py ├── benchmark_imagenet.py ├── googlenet.py ├── output.log ├── overfeat.py ├── pylearn2_benchmark.py └── vgg.py └── torch7 ├── README.md ├── imagenet_winners ├── alexnet.lua ├── benchmark.lua ├── googlenet.lua ├── output.log ├── output_cudnn_fp16.log ├── output_fbnn.log ├── output_raw.log ├── overfeat.lua └── vgg_a.lua └── layerwise_benchmarks ├── benchmark.lua ├── multigpu.lua └── output.log /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | 3 | deepcl/DeepCL/ 4 | caffe/caffe/ 5 | greentea/caffe/ 6 | ccv/ccv/ 7 | ccv/cwc-bench 8 | ccv/cwc-bench-runtime.o 9 | ccv/cwc-bench.o 10 | cxxnet/cxxnet/ 11 | glconv/libglconv/ 12 | nervana/maxas/ 13 | nervana/nervanagpu/ 14 | theano/Theano/ 15 | theano/pylearn2/ 16 | theano/scikits.cuda/ 17 | -------------------------------------------------------------------------------- /CUV/README.md: -------------------------------------------------------------------------------- 1 | Benchmark https://github.com/deeplearningais/CUV 2 | 3 | Uses cuda-convnet, so no new convnet kernels; we won't be benchmarking it.
4 | https://github.com/deeplearningais/CUV/tree/master/src/3rd_party 5 | -------------------------------------------------------------------------------- /LICENSE.MD: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Soumith Chintala 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /TorontoDeepLearning-convnet/README.md: -------------------------------------------------------------------------------- 1 | This library uses the CUDA kernels from Alex Krizhevsky's cuda-convnet, so there's no point in benchmarking it.
2 | 3 | Kernels located here: https://github.com/TorontoDeepLearning/convnet/blob/master/src/cudamat_conv_kernels.cuh 4 | -------------------------------------------------------------------------------- /caffe/README.md: -------------------------------------------------------------------------------- 1 | Install Caffe using the script: 2 | ```bash 3 | bash install.sh #actually go read the comments there, slight tweaks might be needed 4 | ``` 5 | 6 | Run the benchmark using: 7 | ```bash 8 | ./run_imagenet.sh 9 | ./run_nogradinput.sh 10 | ./run_forcegradinput.sh 11 | ``` 12 | -------------------------------------------------------------------------------- /caffe/imagenet_winners/alexnet.prototxt: -------------------------------------------------------------------------------- 1 | name: "alexnet" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 3 5 | input_dim: 224 6 | input_dim: 224 7 | force_backward: true 8 | layers { 9 | name: "conv1" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv1/11x11_s4" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | weight_decay: 1 16 | weight_decay: 0 17 | convolution_param { 18 | num_output: 64 19 | kernel_size: 11 20 | stride: 4 21 | pad: 2 22 | weight_filler { 23 | type: "xavier" 24 | std: 0.1 25 | } 26 | bias_filler { 27 | type: "constant" 28 | value: 0.2 29 | } 30 | } 31 | } 32 | layers { 33 | name: "conv1/relu" 34 | type: RELU 35 | bottom: "conv1/11x11_s4" 36 | top: "conv1/11x11_s4" 37 | } 38 | layers { 39 | name: "pool1/3x3_s2" 40 | type: POOLING 41 | bottom: "conv1/11x11_s4" 42 | top: "pool1/3x3_s2" 43 | pooling_param { 44 | pool: MAX 45 | kernel_size: 3 46 | stride: 2 47 | } 48 | } 49 | layers { 50 | name: "conv2/5x5_s1" 51 | type: CONVOLUTION 52 | bottom: "pool1/3x3_s2" 53 | top: "conv2/5x5_s1" 54 | blobs_lr: 1 55 | blobs_lr: 2 56 | weight_decay: 1 57 | weight_decay: 0 58 | convolution_param { 59 | num_output: 192 60 | kernel_size: 5 61 | stride: 1 62 | pad: 2 63 | weight_filler { 64 | type: "xavier" 65 | std: 0.1 66 | } 67 
| bias_filler { 68 | type: "constant" 69 | value: 0.2 70 | } 71 | } 72 | } 73 | layers { 74 | name: "conv2/relu" 75 | type: RELU 76 | bottom: "conv2/5x5_s1" 77 | top: "conv2/5x5_s1" 78 | } 79 | layers { 80 | name: "pool2/3x3_s2" 81 | type: POOLING 82 | bottom: "conv2/5x5_s1" 83 | top: "pool2/3x3_s2" 84 | pooling_param { 85 | pool: MAX 86 | kernel_size: 3 87 | stride: 2 88 | } 89 | } 90 | layers { 91 | name: "conv3/3x3_s1" 92 | type: CONVOLUTION 93 | bottom: "pool2/3x3_s2" 94 | top: "conv3/3x3_s1" 95 | blobs_lr: 1 96 | blobs_lr: 2 97 | weight_decay: 1 98 | weight_decay: 0 99 | convolution_param { 100 | num_output: 384 101 | kernel_size: 3 102 | stride: 1 103 | pad: 1 104 | weight_filler { 105 | type: "xavier" 106 | std: 0.1 107 | } 108 | bias_filler { 109 | type: "constant" 110 | value: 0.2 111 | } 112 | } 113 | } 114 | layers { 115 | name: "conv3/relu" 116 | type: RELU 117 | bottom: "conv3/3x3_s1" 118 | top: "conv3/3x3_s1" 119 | } 120 | layers { 121 | name: "conv4/3x3_s1" 122 | type: CONVOLUTION 123 | bottom: "conv3/3x3_s1" 124 | top: "conv4/3x3_s1" 125 | blobs_lr: 1 126 | blobs_lr: 2 127 | weight_decay: 1 128 | weight_decay: 0 129 | convolution_param { 130 | num_output: 256 131 | kernel_size: 3 132 | stride: 1 133 | pad: 1 134 | weight_filler { 135 | type: "xavier" 136 | std: 0.1 137 | } 138 | bias_filler { 139 | type: "constant" 140 | value: 0.2 141 | } 142 | } 143 | } 144 | layers { 145 | name: "conv4/relu" 146 | type: RELU 147 | bottom: "conv4/3x3_s1" 148 | top: "conv4/3x3_s1" 149 | } 150 | layers { 151 | name: "conv5/3x3_s1" 152 | type: CONVOLUTION 153 | bottom: "conv4/3x3_s1" 154 | top: "conv5/3x3_s1" 155 | blobs_lr: 1 156 | blobs_lr: 2 157 | weight_decay: 1 158 | weight_decay: 0 159 | convolution_param { 160 | num_output: 256 161 | kernel_size: 3 162 | stride: 1 163 | pad: 1 164 | weight_filler { 165 | type: "xavier" 166 | std: 0.1 167 | } 168 | bias_filler { 169 | type: "constant" 170 | value: 0.2 171 | } 172 | } 173 | } 174 | layers { 175 | name: 
"conv5/relu" 176 | type: RELU 177 | bottom: "conv5/3x3_s1" 178 | top: "conv5/3x3_s1" 179 | } 180 | layers { 181 | name: "pool5/3x3_s2" 182 | type: POOLING 183 | bottom: "conv5/3x3_s1" 184 | top: "pool5/3x3_s2" 185 | pooling_param { 186 | pool: MAX 187 | kernel_size: 3 188 | stride: 2 189 | } 190 | } 191 | layers { 192 | name: "fc6" 193 | type: INNER_PRODUCT 194 | bottom: "pool5/3x3_s2" 195 | top: "fc6" 196 | inner_product_param { 197 | num_output: 4096 198 | } 199 | } 200 | layers { 201 | name: "fc6/relu" 202 | type: RELU 203 | bottom: "fc6" 204 | top: "fc6" 205 | } 206 | layers { 207 | name: "fc7" 208 | type: INNER_PRODUCT 209 | bottom: "fc6" 210 | top: "fc7" 211 | inner_product_param { 212 | num_output: 4096 213 | } 214 | } 215 | layers { 216 | name: "fc7/relu" 217 | type: RELU 218 | bottom: "fc7" 219 | top: "fc7" 220 | } 221 | layers { 222 | name: "fc8" 223 | type: INNER_PRODUCT 224 | bottom: "fc7" 225 | top: "fc8" 226 | inner_product_param { 227 | num_output: 1000 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /caffe/imagenet_winners/overfeat.prototxt: -------------------------------------------------------------------------------- 1 | name: "overfeat" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 3 5 | input_dim: 231 6 | input_dim: 231 7 | force_backward: true 8 | layers { 9 | name: "conv1/11x11_s4" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv1/11x11_s4" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | weight_decay: 1 16 | weight_decay: 0 17 | convolution_param { 18 | num_output: 96 19 | kernel_size: 11 20 | stride: 4 21 | weight_filler { 22 | type: "xavier" 23 | std: 0.1 24 | } 25 | bias_filler { 26 | type: "constant" 27 | value: 0.2 28 | } 29 | } 30 | } 31 | layers { 32 | name: "conv1/relu" 33 | type: RELU 34 | bottom: "conv1/11x11_s4" 35 | top: "conv1/11x11_s4" 36 | } 37 | layers { 38 | name: "pool1/2x2_s2" 39 | type: POOLING 40 | bottom: "conv1/11x11_s4" 41 | top: "pool1/2x2_s2" 42 | 
pooling_param { 43 | pool: MAX 44 | kernel_size: 2 45 | stride: 2 46 | } 47 | } 48 | layers { 49 | name: "conv2/5x5_s1" 50 | type: CONVOLUTION 51 | bottom: "pool1/2x2_s2" 52 | top: "conv2/5x5_s1" 53 | blobs_lr: 1 54 | blobs_lr: 2 55 | weight_decay: 1 56 | weight_decay: 0 57 | convolution_param { 58 | num_output: 256 59 | kernel_size: 5 60 | stride: 1 61 | weight_filler { 62 | type: "xavier" 63 | std: 0.1 64 | } 65 | bias_filler { 66 | type: "constant" 67 | value: 0.2 68 | } 69 | } 70 | } 71 | layers { 72 | name: "conv2/relu" 73 | type: RELU 74 | bottom: "conv2/5x5_s1" 75 | top: "conv2/5x5_s1" 76 | } 77 | layers { 78 | name: "pool2/2x2_s2" 79 | type: POOLING 80 | bottom: "conv2/5x5_s1" 81 | top: "pool2/2x2_s2" 82 | pooling_param { 83 | pool: MAX 84 | kernel_size: 2 85 | stride: 2 86 | } 87 | } 88 | layers { 89 | name: "conv3/3x3_s1" 90 | type: CONVOLUTION 91 | bottom: "pool2/2x2_s2" 92 | top: "conv3/3x3_s1" 93 | blobs_lr: 1 94 | blobs_lr: 2 95 | weight_decay: 1 96 | weight_decay: 0 97 | convolution_param { 98 | num_output: 512 99 | kernel_size: 3 100 | stride: 1 101 | pad: 1 102 | weight_filler { 103 | type: "xavier" 104 | std: 0.1 105 | } 106 | bias_filler { 107 | type: "constant" 108 | value: 0.2 109 | } 110 | } 111 | } 112 | layers { 113 | name: "conv3/relu" 114 | type: RELU 115 | bottom: "conv3/3x3_s1" 116 | top: "conv3/3x3_s1" 117 | } 118 | layers { 119 | name: "conv4/3x3_s1" 120 | type: CONVOLUTION 121 | bottom: "conv3/3x3_s1" 122 | top: "conv4/3x3_s1" 123 | blobs_lr: 1 124 | blobs_lr: 2 125 | weight_decay: 1 126 | weight_decay: 0 127 | convolution_param { 128 | num_output: 1024 129 | kernel_size: 3 130 | stride: 1 131 | pad: 1 132 | weight_filler { 133 | type: "xavier" 134 | std: 0.1 135 | } 136 | bias_filler { 137 | type: "constant" 138 | value: 0.2 139 | } 140 | } 141 | } 142 | layers { 143 | name: "conv4/relu" 144 | type: RELU 145 | bottom: "conv4/3x3_s1" 146 | top: "conv4/3x3_s1" 147 | } 148 | layers { 149 | name: "conv5/3x3_s1" 150 | type: CONVOLUTION 
151 | bottom: "conv4/3x3_s1" 152 | top: "conv5/3x3_s1" 153 | blobs_lr: 1 154 | blobs_lr: 2 155 | weight_decay: 1 156 | weight_decay: 0 157 | convolution_param { 158 | num_output: 1024 159 | kernel_size: 3 160 | stride: 1 161 | pad: 1 162 | weight_filler { 163 | type: "xavier" 164 | std: 0.1 165 | } 166 | bias_filler { 167 | type: "constant" 168 | value: 0.2 169 | } 170 | } 171 | } 172 | layers { 173 | name: "conv5/relu" 174 | type: RELU 175 | bottom: "conv5/3x3_s1" 176 | top: "conv5/3x3_s1" 177 | } 178 | layers { 179 | name: "pool5/2x2_s2" 180 | type: POOLING 181 | bottom: "conv5/3x3_s1" 182 | top: "pool5/2x2_s2" 183 | pooling_param { 184 | pool: MAX 185 | kernel_size: 2 186 | stride: 2 187 | } 188 | } 189 | layers { 190 | name: "fc6" 191 | type: INNER_PRODUCT 192 | bottom: "pool5/2x2_s2" 193 | top: "fc6" 194 | inner_product_param { 195 | num_output: 3072 196 | } 197 | } 198 | layers { 199 | name: "fc7" 200 | type: INNER_PRODUCT 201 | bottom: "fc6" 202 | top: "fc7" 203 | inner_product_param { 204 | num_output: 4096 205 | } 206 | } 207 | layers { 208 | name: "fc8" 209 | type: INNER_PRODUCT 210 | bottom: "fc7" 211 | top: "fc8" 212 | inner_product_param { 213 | num_output: 1000 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /caffe/imagenet_winners/vgg_a.prototxt: -------------------------------------------------------------------------------- 1 | name: "vgg_a" 2 | input: "data" 3 | input_dim: 64 4 | input_dim: 3 5 | input_dim: 224 6 | input_dim: 224 7 | force_backward: true 8 | layers { 9 | name: "conv1/3x3_s1" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv1/3x3_s1" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | weight_decay: 1 16 | weight_decay: 0 17 | convolution_param { 18 | num_output: 64 19 | kernel_size: 3 20 | pad: 1 21 | stride: 1 22 | weight_filler { 23 | type: "xavier" 24 | std: 0.1 25 | } 26 | bias_filler { 27 | type: "constant" 28 | value: 0.2 29 | } 30 | } 31 | } 32 | layers { 33 | name: 
"conv1/relu" 34 | type: RELU 35 | bottom: "conv1/3x3_s1" 36 | top: "conv1/3x3_s1" 37 | } 38 | layers { 39 | name: "pool1/2x2_s2" 40 | type: POOLING 41 | bottom: "conv1/3x3_s1" 42 | top: "pool1/2x2_s2" 43 | pooling_param { 44 | pool: MAX 45 | kernel_size: 2 46 | stride: 2 47 | } 48 | } 49 | layers { 50 | name: "conv2/3x3_s1" 51 | type: CONVOLUTION 52 | bottom: "pool1/2x2_s2" 53 | top: "conv2/3x3_s1" 54 | blobs_lr: 1 55 | blobs_lr: 2 56 | weight_decay: 1 57 | weight_decay: 0 58 | convolution_param { 59 | num_output: 128 60 | pad: 1 61 | kernel_size: 3 62 | stride: 1 63 | weight_filler { 64 | type: "xavier" 65 | std: 0.1 66 | } 67 | bias_filler { 68 | type: "constant" 69 | value: 0.2 70 | } 71 | } 72 | } 73 | layers { 74 | name: "conv2/relu" 75 | type: RELU 76 | bottom: "conv2/3x3_s1" 77 | top: "conv2/3x3_s1" 78 | } 79 | layers { 80 | name: "pool2/2x2_s2" 81 | type: POOLING 82 | bottom: "conv2/3x3_s1" 83 | top: "pool2/2x2_s2" 84 | pooling_param { 85 | pool: MAX 86 | kernel_size: 2 87 | stride: 2 88 | } 89 | } 90 | layers { 91 | name: "conv3/3x3_s1" 92 | type: CONVOLUTION 93 | bottom: "pool2/2x2_s2" 94 | top: "conv3/3x3_s1" 95 | blobs_lr: 1 96 | blobs_lr: 2 97 | weight_decay: 1 98 | weight_decay: 0 99 | convolution_param { 100 | num_output: 256 101 | kernel_size: 3 102 | stride: 1 103 | pad: 1 104 | weight_filler { 105 | type: "xavier" 106 | std: 0.1 107 | } 108 | bias_filler { 109 | type: "constant" 110 | value: 0.2 111 | } 112 | } 113 | } 114 | layers { 115 | name: "conv3/relu" 116 | type: RELU 117 | bottom: "conv3/3x3_s1" 118 | top: "conv3/3x3_s1" 119 | } 120 | layers { 121 | name: "conv4/3x3_s1" 122 | type: CONVOLUTION 123 | bottom: "conv3/3x3_s1" 124 | top: "conv4/3x3_s1" 125 | blobs_lr: 1 126 | blobs_lr: 2 127 | weight_decay: 1 128 | weight_decay: 0 129 | convolution_param { 130 | num_output: 256 131 | kernel_size: 3 132 | stride: 1 133 | pad: 1 134 | weight_filler { 135 | type: "xavier" 136 | std: 0.1 137 | } 138 | bias_filler { 139 | type: "constant" 140 | 
value: 0.2 141 | } 142 | } 143 | } 144 | layers { 145 | name: "conv4/relu" 146 | type: RELU 147 | bottom: "conv4/3x3_s1" 148 | top: "conv4/3x3_s1" 149 | } 150 | layers { 151 | name: "pool3/2x2_s2" 152 | type: POOLING 153 | bottom: "conv4/3x3_s1" 154 | top: "pool3/2x2_s2" 155 | pooling_param { 156 | pool: MAX 157 | kernel_size: 2 158 | stride: 2 159 | } 160 | } 161 | layers { 162 | name: "conv5/3x3_s1" 163 | type: CONVOLUTION 164 | bottom: "pool3/2x2_s2" 165 | top: "conv5/3x3_s1" 166 | blobs_lr: 1 167 | blobs_lr: 2 168 | weight_decay: 1 169 | weight_decay: 0 170 | convolution_param { 171 | num_output: 512 172 | kernel_size: 3 173 | stride: 1 174 | pad: 1 175 | weight_filler { 176 | type: "xavier" 177 | std: 0.1 178 | } 179 | bias_filler { 180 | type: "constant" 181 | value: 0.2 182 | } 183 | } 184 | } 185 | layers { 186 | name: "conv5/relu" 187 | type: RELU 188 | bottom: "conv5/3x3_s1" 189 | top: "conv5/3x3_s1" 190 | } 191 | layers { 192 | name: "conv6/3x3_s1" 193 | type: CONVOLUTION 194 | bottom: "conv5/3x3_s1" 195 | top: "conv6/3x3_s1" 196 | blobs_lr: 1 197 | blobs_lr: 2 198 | weight_decay: 1 199 | weight_decay: 0 200 | convolution_param { 201 | num_output: 512 202 | kernel_size: 3 203 | stride: 1 204 | pad: 1 205 | weight_filler { 206 | type: "xavier" 207 | std: 0.1 208 | } 209 | bias_filler { 210 | type: "constant" 211 | value: 0.2 212 | } 213 | } 214 | } 215 | layers { 216 | name: "conv6/relu" 217 | type: RELU 218 | bottom: "conv6/3x3_s1" 219 | top: "conv6/3x3_s1" 220 | } 221 | layers { 222 | name: "pool4/2x2_s2" 223 | type: POOLING 224 | bottom: "conv6/3x3_s1" 225 | top: "pool4/2x2_s2" 226 | pooling_param { 227 | pool: MAX 228 | kernel_size: 2 229 | stride: 2 230 | } 231 | } 232 | layers { 233 | name: "conv7/3x3_s1" 234 | type: CONVOLUTION 235 | bottom: "pool4/2x2_s2" 236 | top: "conv7/3x3_s1" 237 | blobs_lr: 1 238 | blobs_lr: 2 239 | weight_decay: 1 240 | weight_decay: 0 241 | convolution_param { 242 | num_output: 512 243 | kernel_size: 3 244 | stride: 1 245 
| pad: 1 246 | weight_filler { 247 | type: "xavier" 248 | std: 0.1 249 | } 250 | bias_filler { 251 | type: "constant" 252 | value: 0.2 253 | } 254 | } 255 | } 256 | layers { 257 | name: "conv7/relu" 258 | type: RELU 259 | bottom: "conv7/3x3_s1" 260 | top: "conv7/3x3_s1" 261 | } 262 | layers { 263 | name: "conv8/3x3_s1" 264 | type: CONVOLUTION 265 | bottom: "conv7/3x3_s1" 266 | top: "conv8/3x3_s1" 267 | blobs_lr: 1 268 | blobs_lr: 2 269 | weight_decay: 1 270 | weight_decay: 0 271 | convolution_param { 272 | num_output: 512 273 | kernel_size: 3 274 | stride: 1 275 | pad: 1 276 | weight_filler { 277 | type: "xavier" 278 | std: 0.1 279 | } 280 | bias_filler { 281 | type: "constant" 282 | value: 0.2 283 | } 284 | } 285 | } 286 | layers { 287 | name: "conv8/relu" 288 | type: RELU 289 | bottom: "conv8/3x3_s1" 290 | top: "conv8/3x3_s1" 291 | } 292 | layers { 293 | name: "pool5/2x2_s2" 294 | type: POOLING 295 | bottom: "conv8/3x3_s1" 296 | top: "pool5/2x2_s2" 297 | pooling_param { 298 | pool: MAX 299 | kernel_size: 2 300 | stride: 2 301 | } 302 | } 303 | layers { 304 | name: "fc6" 305 | type: INNER_PRODUCT 306 | bottom: "pool5/2x2_s2" 307 | top: "fc6" 308 | inner_product_param { 309 | num_output: 4096 310 | } 311 | } 312 | layers { 313 | name: "fc7" 314 | type: INNER_PRODUCT 315 | bottom: "fc6" 316 | top: "fc7" 317 | inner_product_param { 318 | num_output: 4096 319 | } 320 | } 321 | layers { 322 | name: "fc8" 323 | type: INNER_PRODUCT 324 | bottom: "fc7" 325 | top: "fc8" 326 | inner_product_param { 327 | num_output: 1000 328 | } 329 | } 330 | -------------------------------------------------------------------------------- /caffe/install.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/BVLC/caffe.git 2 | cd caffe 3 | 4 | # Dependencies 5 | sudo apt-get install -y libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libboost-all-dev libhdf5-serial-dev 6 | sudo apt-get install -y protobuf-compiler 
gfortran libjpeg62 libfreeimage-dev libatlas-base-dev git python-dev python-pip 7 | sudo apt-get install -y libgoogle-glog-dev libbz2-dev libxml2-dev libxslt-dev libffi-dev libssl-dev libgflags-dev liblmdb-dev python-yaml 8 | sudo easy_install pillow 9 | 10 | # Compile Caffe 11 | cp Makefile.config.example Makefile.config 12 | # For some reason, I was getting an HDF5 header "not found" error, so I had to manually edit /usr/include/H5public.h and disable PARALLEL support (by adding #undef H5_HAVE_PARALLEL ) 13 | # Also, I disabled CUDNN (because these are pure-caffe benchmarks) 14 | make all 15 | make test 16 | make runtest 17 | -------------------------------------------------------------------------------- /caffe/proto_forceGradInput/conv1.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_3x96x11x11" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 3 5 | input_dim: 128 6 | input_dim: 128 7 | force_backward: true 8 | layers { 9 | name: "conv1" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv1" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | convolution_param { 16 | num_output: 96 17 | kernel_size: 11 18 | stride: 1 19 | weight_filler { 20 | type: "xavier" 21 | } 22 | bias_filler { 23 | type: "constant" 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /caffe/proto_forceGradInput/conv2.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_64x128x9x9" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 64 5 | input_dim: 64 6 | input_dim: 64 7 | force_backward: true 8 | layers { 9 | name: "conv2" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv2" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | convolution_param { 16 | num_output: 128 17 | kernel_size: 9 18 | stride: 1 19 | weight_filler { 20 | type: "xavier" 21 | } 22 | bias_filler { 23 | type: "constant" 24 | } 25 | } 26 | } 27 | 
-------------------------------------------------------------------------------- /caffe/proto_forceGradInput/conv3.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_128x128x9x9" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 128 5 | input_dim: 32 6 | input_dim: 32 7 | force_backward: true 8 | layers { 9 | name: "conv3" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv3" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | convolution_param { 16 | num_output: 128 17 | kernel_size: 9 18 | stride: 1 19 | weight_filler { 20 | type: "xavier" 21 | } 22 | bias_filler { 23 | type: "constant" 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /caffe/proto_forceGradInput/conv4.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_128x128x7x7" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 128 5 | input_dim: 16 6 | input_dim: 16 7 | force_backward: true 8 | layers { 9 | name: "conv4" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv4" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | convolution_param { 16 | num_output: 128 17 | kernel_size: 7 18 | stride: 1 19 | weight_filler { 20 | type: "xavier" 21 | } 22 | bias_filler { 23 | type: "constant" 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /caffe/proto_forceGradInput/conv5.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_384x384x3x3" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 384 5 | input_dim: 13 6 | input_dim: 13 7 | force_backward: true 8 | layers { 9 | name: "conv5" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv5" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | convolution_param { 16 | num_output: 384 17 | kernel_size: 3 18 | stride: 1 19 | weight_filler { 20 | type: "xavier" 21 | } 22 | bias_filler { 23 | type: 
"constant" 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /caffe/proto_noGradInput/conv1.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_3x96x11x11" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 3 5 | input_dim: 128 6 | input_dim: 128 7 | layers { 8 | name: "conv1" 9 | type: CONVOLUTION 10 | bottom: "data" 11 | top: "conv1" 12 | blobs_lr: 1 13 | blobs_lr: 2 14 | convolution_param { 15 | num_output: 96 16 | kernel_size: 11 17 | stride: 1 18 | weight_filler { 19 | type: "xavier" 20 | } 21 | bias_filler { 22 | type: "constant" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /caffe/proto_noGradInput/conv2.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_64x128x9x9" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 64 5 | input_dim: 64 6 | input_dim: 64 7 | layers { 8 | name: "conv2" 9 | type: CONVOLUTION 10 | bottom: "data" 11 | top: "conv2" 12 | blobs_lr: 1 13 | blobs_lr: 2 14 | convolution_param { 15 | num_output: 128 16 | kernel_size: 9 17 | stride: 1 18 | weight_filler { 19 | type: "xavier" 20 | } 21 | bias_filler { 22 | type: "constant" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /caffe/proto_noGradInput/conv3.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_128x128x9x9" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 128 5 | input_dim: 32 6 | input_dim: 32 7 | layers { 8 | name: "conv3" 9 | type: CONVOLUTION 10 | bottom: "data" 11 | top: "conv3" 12 | blobs_lr: 1 13 | blobs_lr: 2 14 | convolution_param { 15 | num_output: 128 16 | kernel_size: 9 17 | stride: 1 18 | weight_filler { 19 | type: "xavier" 20 | } 21 | bias_filler { 22 | type: "constant" 23 | } 24 | } 25 | } 26 | 
-------------------------------------------------------------------------------- /caffe/proto_noGradInput/conv4.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_128x128x7x7" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 128 5 | input_dim: 16 6 | input_dim: 16 7 | layers { 8 | name: "conv4" 9 | type: CONVOLUTION 10 | bottom: "data" 11 | top: "conv4" 12 | blobs_lr: 1 13 | blobs_lr: 2 14 | convolution_param { 15 | num_output: 128 16 | kernel_size: 7 17 | stride: 1 18 | weight_filler { 19 | type: "xavier" 20 | } 21 | bias_filler { 22 | type: "constant" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /caffe/proto_noGradInput/conv5.prototxt: -------------------------------------------------------------------------------- 1 | name: "ConvLayer_384x384x3x3" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 384 5 | input_dim: 13 6 | input_dim: 13 7 | layers { 8 | name: "conv5" 9 | type: CONVOLUTION 10 | bottom: "data" 11 | top: "conv5" 12 | blobs_lr: 1 13 | blobs_lr: 2 14 | convolution_param { 15 | num_output: 384 16 | kernel_size: 3 17 | stride: 1 18 | weight_filler { 19 | type: "xavier" 20 | } 21 | bias_filler { 22 | type: "constant" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /caffe/run_forcegradinput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./caffe/build/tools/caffe time --model=proto_forceGradInput/conv1.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 4 | ./caffe/build/tools/caffe time --model=proto_forceGradInput/conv2.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 5 | ./caffe/build/tools/caffe time --model=proto_forceGradInput/conv3.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 6 | ./caffe/build/tools/caffe 
time --model=proto_forceGradInput/conv4.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 7 | ./caffe/build/tools/caffe time --model=proto_forceGradInput/conv5.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 8 | -------------------------------------------------------------------------------- /caffe/run_imagenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./caffe/build/tools/caffe time --model=./imagenet_winners/alexnet.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_alexnet.log 2>&1 4 | ./caffe/build/tools/caffe time --model=./imagenet_winners/overfeat.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_overfeat.log 2>&1 5 | ./caffe/build/tools/caffe time --model=./imagenet_winners/vgg_a.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_vgg_a.log 2>&1 6 | ./caffe/build/tools/caffe time --model=./imagenet_winners/googlenet.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_googlenet.log 2>&1 7 | 8 | -------------------------------------------------------------------------------- /caffe/run_nogradinput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./caffe/build/tools/caffe time --model=proto_noGradInput/conv1.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 4 | ./caffe/build/tools/caffe time --model=proto_noGradInput/conv2.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 5 | ./caffe/build/tools/caffe time --model=proto_noGradInput/conv3.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 6 | ./caffe/build/tools/caffe time --model=proto_noGradInput/conv4.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 7 | ./caffe/build/tools/caffe time --model=proto_noGradInput/conv5.prototxt --iterations=10 --gpu 0 --logtostderr=1 
>>output_noGradInput.log 2>&1 8 | 9 | -------------------------------------------------------------------------------- /ccv/README.md: -------------------------------------------------------------------------------- 1 | Build ccv 2 | -------- 3 | ``` 4 | sudo apt-get install libgsl0-dev 5 | 6 | git clone https://github.com/liuliu/ccv.git 7 | cd ccv 8 | cd lib 9 | ./configure 10 | cd ../bin/cuda 11 | make 12 | cd ../../../ 13 | ``` 14 | 15 | Run the modified cwc-bench 16 | ------------- 17 | ``` 18 | make 19 | ./cwc-bench list.txt 20 | ``` 21 | -------------------------------------------------------------------------------- /ccv/cwc-bench.c: -------------------------------------------------------------------------------- 1 | #include "ccv.h" 2 | #include <ctype.h> /* for isspace(); bracketed header name was stripped in this export */ 3 | 4 | void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_convnet_train_param_t params); 5 | 6 | int main(int argc, char** argv) 7 | { 8 | ccv_enable_default_cache(); 9 | assert(argc == 2); 10 | FILE *r = fopen(argv[1], "r"); 11 | char* file = (char*)malloc(1024); 12 | ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), 64, 0); 13 | size_t len = 1024; 14 | ssize_t read; 15 | while ((read = getline(&file, &len, r)) != -1) 16 | { 17 | while(read > 1 && isspace(file[read - 1])) 18 | read--; 19 | file[read] = 0; 20 | ccv_file_info_t input; 21 | input.filename = (char*)ccmalloc(1024); 22 | strncpy(input.filename, file, 1024); 23 | ccv_categorized_t categorized = ccv_categorized(0, 0, &input); 24 | ccv_array_push(categorizeds, &categorized); 25 | } 26 | fclose(r); 27 | free(file); 28 | /* convnet-benchmarks parameters */ 29 | ccv_convnet_layer_param_t params[] = { 30 | // first layer (convolutional => max pool => rnorm) 31 | { 32 | .type = CCV_CONVNET_CONVOLUTIONAL, 33 | .bias = 0, 34 | .glorot = sqrtf(2), 35 | .input = { 36 | .matrix = { 37 | .rows = 128, 38 | .cols = 128, 39 | .channels = 3, 40 | .partition = 1, 41 | }, 42 | }, 43 | .output = { 44 | .convolutional = { 45 | 
.count = 96, 46 | .strides = 1, 47 | .border = 0, 48 | .rows = 11, 49 | .cols = 11, 50 | .channels = 3, 51 | .partition = 1, 52 | }, 53 | }, 54 | }, 55 | // second layer (convolutional => max pool => rnorm) 56 | { 57 | .type = CCV_CONVNET_CONVOLUTIONAL, 58 | .bias = 1, 59 | .glorot = sqrtf(2), 60 | .input = { 61 | .matrix = { 62 | .rows = 64, 63 | .cols = 64, 64 | .channels = 64, 65 | .partition = 1, 66 | }, 67 | }, 68 | .output = { 69 | .convolutional = { 70 | .count = 128, 71 | .strides = 1, 72 | .border = 0, 73 | .rows = 9, 74 | .cols = 9, 75 | .channels = 64, 76 | .partition = 1, 77 | }, 78 | }, 79 | }, 80 | // third layer (convolutional) 81 | { 82 | .type = CCV_CONVNET_CONVOLUTIONAL, 83 | .bias = 0, 84 | .glorot = sqrtf(2), 85 | .input = { 86 | .matrix = { 87 | .rows = 13, 88 | .cols = 13, 89 | .channels = 256, 90 | .partition = 1, 91 | }, 92 | }, 93 | .output = { 94 | .convolutional = { 95 | .count = 128, 96 | .strides = 1, 97 | .border = 0, 98 | .rows = 9, 99 | .cols = 9, 100 | .channels = 128, 101 | .partition = 1, 102 | }, 103 | }, 104 | }, 105 | // fourth layer (convolutional) 106 | { 107 | .type = CCV_CONVNET_CONVOLUTIONAL, 108 | .bias = 1, 109 | .glorot = sqrtf(2), 110 | .input = { 111 | .matrix = { 112 | .rows = 16, 113 | .cols = 16, 114 | .channels = 128, 115 | .partition = 1, 116 | }, 117 | }, 118 | .output = { 119 | .convolutional = { 120 | .count = 128, 121 | .strides = 1, 122 | .border = 0, 123 | .rows = 7, 124 | .cols = 7, 125 | .channels = 128, 126 | .partition = 1, 127 | }, 128 | }, 129 | }, 130 | // fifth layer (convolutional => max pool) 131 | { 132 | .type = CCV_CONVNET_CONVOLUTIONAL, 133 | .bias = 1, 134 | .glorot = sqrtf(2), 135 | .input = { 136 | .matrix = { 137 | .rows = 13, 138 | .cols = 13, 139 | .channels = 384, 140 | .partition = 1, 141 | }, 142 | }, 143 | .output = { 144 | .convolutional = { 145 | .count = 384, 146 | .strides = 1, 147 | .border = 0, 148 | .rows = 3, 149 | .cols = 3, 150 | .channels = 384, 151 | .partition = 1, 152 | 
}, 153 | }, 154 | }, 155 | }; 156 | ccv_convnet_t* convnet = ccv_convnet_new(1, ccv_size(128, 128), params, sizeof(params) / sizeof(ccv_convnet_layer_param_t)); 157 | ccv_convnet_layer_train_param_t layer_params[13]; 158 | memset(layer_params, 0, sizeof(layer_params)); 159 | int i; 160 | for (i = 0; i < 13; i++) 161 | { 162 | layer_params[i].w.decay = 0.005; 163 | layer_params[i].w.learn_rate = 0.0005; 164 | layer_params[i].w.momentum = 0.9; 165 | layer_params[i].bias.decay = 0; 166 | layer_params[i].bias.learn_rate = 0.001; 167 | layer_params[i].bias.momentum = 0.9; 168 | } 169 | ccv_convnet_train_param_t train_params = { 170 | .max_epoch = 100, 171 | .mini_batch = 128, 172 | .device_count = 1, 173 | .layer_params = layer_params, 174 | }; 175 | for (i = 0; i < 128; i++) 176 | { 177 | ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, i); 178 | ccv_dense_matrix_t* image = 0; 179 | ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); 180 | ccv_dense_matrix_t* b = 0; 181 | if (image->rows > 128 && image->cols > 128) 182 | ccv_resample(image, &b, 0, ccv_max(128, (int)(image->rows * 128.0 / image->cols + 0.5)), ccv_max(128, (int)(image->cols * 128.0 / image->rows + 0.5)), CCV_INTER_AREA); 183 | else if (image->rows < 128 || image->cols < 128) 184 | ccv_resample(image, &b, 0, ccv_max(128, (int)(image->rows * 128.0 / image->cols + 0.5)), ccv_max(128, (int)(image->cols * 128.0 / image->rows + 0.5)), CCV_INTER_CUBIC); 185 | else 186 | b = image; 187 | if (b != image) 188 | ccv_matrix_free(image); 189 | ccv_dense_matrix_t* c = 0; 190 | ccv_slice(b, (ccv_matrix_t**)&c, CCV_32F, 0, 0, 128, 128); 191 | ccv_matrix_free(b); 192 | categorized->type = CCV_CATEGORIZED_DENSE_MATRIX; 193 | categorized->matrix = c; 194 | } 195 | cwc_bench_runtime(convnet, categorizeds, train_params); 196 | ccv_disable_cache(); 197 | return 0; 198 | } 199 | -------------------------------------------------------------------------------- 
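The five layer definitions above all use `strides = 1` and `border = 0`, so each layer's output spatial size follows the standard convolution formula `(in + 2*border - kernel)/stride + 1`. A quick sanity-check sketch of the expected sizes (illustrative only; the helper name `conv_out` is made up and is not part of cwc-bench):

```python
def conv_out(in_size, kernel, stride=1, border=0):
    # Standard convolution output-size formula for square inputs/kernels.
    return (in_size + 2 * border - kernel) // stride + 1

# (input size, kernel size) for the five benchmark layers above,
# all with strides = 1 and border = 0.
layers = [(128, 11), (64, 9), (13, 9), (16, 7), (13, 3)]
for i, (n, k) in enumerate(layers, 1):
    out = conv_out(n, k)
    print("layer %d: %dx%d input, %dx%d kernel -> %dx%d output" % (i, n, n, k, k, out, out))
```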
/ccv/lena-128.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soumith/convnet-benchmarks/b458aab61c0ac2257c0990119b5de15c1e886f02/ccv/lena-128.jpg -------------------------------------------------------------------------------- /ccv/list.txt: -------------------------------------------------------------------------------- 1 | ./lena-128.jpg 2 | ./lena-128.jpg 3 | ./lena-128.jpg 4 | ./lena-128.jpg 5 | ./lena-128.jpg 6 | ./lena-128.jpg 7 | ./lena-128.jpg 8 | ./lena-128.jpg 9 | ./lena-128.jpg 10 | ./lena-128.jpg 11 | ./lena-128.jpg 12 | ./lena-128.jpg 13 | ./lena-128.jpg 14 | ./lena-128.jpg 15 | ./lena-128.jpg 16 | ./lena-128.jpg 17 | ./lena-128.jpg 18 | ./lena-128.jpg 19 | ./lena-128.jpg 20 | ./lena-128.jpg 21 | ./lena-128.jpg 22 | ./lena-128.jpg 23 | ./lena-128.jpg 24 | ./lena-128.jpg 25 | ./lena-128.jpg 26 | ./lena-128.jpg 27 | ./lena-128.jpg 28 | ./lena-128.jpg 29 | ./lena-128.jpg 30 | ./lena-128.jpg 31 | ./lena-128.jpg 32 | ./lena-128.jpg 33 | ./lena-128.jpg 34 | ./lena-128.jpg 35 | ./lena-128.jpg 36 | ./lena-128.jpg 37 | ./lena-128.jpg 38 | ./lena-128.jpg 39 | ./lena-128.jpg 40 | ./lena-128.jpg 41 | ./lena-128.jpg 42 | ./lena-128.jpg 43 | ./lena-128.jpg 44 | ./lena-128.jpg 45 | ./lena-128.jpg 46 | ./lena-128.jpg 47 | ./lena-128.jpg 48 | ./lena-128.jpg 49 | ./lena-128.jpg 50 | ./lena-128.jpg 51 | ./lena-128.jpg 52 | ./lena-128.jpg 53 | ./lena-128.jpg 54 | ./lena-128.jpg 55 | ./lena-128.jpg 56 | ./lena-128.jpg 57 | ./lena-128.jpg 58 | ./lena-128.jpg 59 | ./lena-128.jpg 60 | ./lena-128.jpg 61 | ./lena-128.jpg 62 | ./lena-128.jpg 63 | ./lena-128.jpg 64 | ./lena-128.jpg 65 | ./lena-128.jpg 66 | ./lena-128.jpg 67 | ./lena-128.jpg 68 | ./lena-128.jpg 69 | ./lena-128.jpg 70 | ./lena-128.jpg 71 | ./lena-128.jpg 72 | ./lena-128.jpg 73 | ./lena-128.jpg 74 | ./lena-128.jpg 75 | ./lena-128.jpg 76 | ./lena-128.jpg 77 | ./lena-128.jpg 78 | ./lena-128.jpg 79 | ./lena-128.jpg 80 | ./lena-128.jpg 81 | 
./lena-128.jpg 82 | ./lena-128.jpg 83 | ./lena-128.jpg 84 | ./lena-128.jpg 85 | ./lena-128.jpg 86 | ./lena-128.jpg 87 | ./lena-128.jpg 88 | ./lena-128.jpg 89 | ./lena-128.jpg 90 | ./lena-128.jpg 91 | ./lena-128.jpg 92 | ./lena-128.jpg 93 | ./lena-128.jpg 94 | ./lena-128.jpg 95 | ./lena-128.jpg 96 | ./lena-128.jpg 97 | ./lena-128.jpg 98 | ./lena-128.jpg 99 | ./lena-128.jpg 100 | ./lena-128.jpg 101 | ./lena-128.jpg 102 | ./lena-128.jpg 103 | ./lena-128.jpg 104 | ./lena-128.jpg 105 | ./lena-128.jpg 106 | ./lena-128.jpg 107 | ./lena-128.jpg 108 | ./lena-128.jpg 109 | ./lena-128.jpg 110 | ./lena-128.jpg 111 | ./lena-128.jpg 112 | ./lena-128.jpg 113 | ./lena-128.jpg 114 | ./lena-128.jpg 115 | ./lena-128.jpg 116 | ./lena-128.jpg 117 | ./lena-128.jpg 118 | ./lena-128.jpg 119 | ./lena-128.jpg 120 | ./lena-128.jpg 121 | ./lena-128.jpg 122 | ./lena-128.jpg 123 | ./lena-128.jpg 124 | ./lena-128.jpg 125 | ./lena-128.jpg 126 | ./lena-128.jpg 127 | ./lena-128.jpg 128 | ./lena-128.jpg 129 | -------------------------------------------------------------------------------- /ccv/makefile: -------------------------------------------------------------------------------- 1 | include ccv/lib/config.mk 2 | 3 | LDFLAGS := -L"ccv/lib" -lccv $(LDFLAGS) 4 | CFLAGS := -O3 -Wall -I"ccv//lib" $(CFLAGS) 5 | NVFLAGS := -O3 -I"ccv/lib" -lineinfo $(NVFLAGS) 6 | 7 | all: libccv.a cwc-bench 8 | 9 | clean: 10 | ${MAKE} clean -C ccv/lib ; rm -f *.o $(TARGETS) 11 | 12 | cwc-bench: %: %.o cwc-bench-runtime.o libccv.a 13 | $(CC) -o $@ cwc-bench-runtime.o $< $(LDFLAGS) 14 | 15 | libccv.a: 16 | ${MAKE} -C ccv/lib 17 | 18 | %.o: %.c ccv/lib/ccv.h 19 | $(CC) $< -o $@ -c $(CFLAGS) 20 | 21 | %.o: %.cu ccv/lib/ccv.h ccv/lib/cuda/*.h ccv/lib/cuda/*.cu 22 | $(NVCC) $< -o $@ -c $(NVFLAGS) 23 | -------------------------------------------------------------------------------- /ccv/output.log: -------------------------------------------------------------------------------- 1 | -> ./cwc-bench list.txt 2 | 4 8 32, elapsed 
time for layer 1 fprop: 121.213440 milliseconds 3 | 4 8 32, elapsed time for layer 2 fprop: 437.119415 milliseconds 4 | 4 8 32, elapsed time for layer 3 fprop: 182.529343 milliseconds 5 | 4 8 32, elapsed time for layer 4 fprop: 23.569664 milliseconds 6 | 4 8 32, elapsed time for layer 5 fprop: 44.416992 milliseconds 7 | -------------------------------------------------------------------------------- /chainer/README.md: -------------------------------------------------------------------------------- 1 | # Prepare 2 | 3 | ``` 4 | pip install cupy 5 | pip install chainer 6 | ``` 7 | 8 | # Sources 9 | 10 | - [CuPy](https://github.com/cupy/cupy) 11 | - [Chainer](https://github.com/chainer/chainer) 12 | 13 | -------------------------------------------------------------------------------- /chainer/alex.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | 6 | class Alex(chainer.Chain): 7 | insize = 224 8 | 9 | def __init__(self): 10 | super(Alex, self).__init__() 11 | with self.init_scope(): 12 | self.conv1 = L.Convolution2D(3, 64, 11, stride=4, pad=2) 13 | self.conv2 = L.Convolution2D(64, 192, 5, pad=2) 14 | self.conv3 = L.Convolution2D(192, 384, 3, pad=1) 15 | self.conv4 = L.Convolution2D(384, 256, 3, pad=1) 16 | self.conv5 = L.Convolution2D(256, 256, 3, pad=1) 17 | self.fc6 = L.Linear(256 * 6 * 6, 4096) 18 | self.fc7 = L.Linear(4096, 4096) 19 | self.fc8 = L.Linear(4096, 1000) 20 | 21 | def forward(self, x): 22 | h = F.max_pooling_2d(F.relu(self.conv1(x)), 3, stride=2) 23 | h = F.max_pooling_2d(F.relu(self.conv2(h)), 3, stride=2) 24 | h = F.relu(self.conv3(h)) 25 | h = F.relu(self.conv4(h)) 26 | h = F.max_pooling_2d(F.relu(self.conv5(h)), 3, stride=2) 27 | h = F.relu(self.fc6(h)) 28 | h = F.relu(self.fc7(h)) 29 | return self.fc8(h) 30 | -------------------------------------------------------------------------------- /chainer/googlenet.py: 
-------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | 6 | class GoogLeNet(chainer.Chain): 7 | 8 | insize = 224 9 | 10 | def __init__(self): 11 | super(GoogLeNet, self).__init__() 12 | with self.init_scope(): 13 | self.conv1 = L.Convolution2D(3, 64, 7, stride=2, pad=3) 14 | self.conv2_reduce = L.Convolution2D(64, 64, 1) 15 | self.conv2 = L.Convolution2D(64, 192, 3, stride=1, pad=1) 16 | self.inc3a = L.Inception(192, 64, 96, 128, 16, 32, 32) 17 | self.inc3b = L.Inception(256, 128, 128, 192, 32, 96, 64) 18 | self.inc4a = L.Inception(480, 192, 96, 208, 16, 48, 64) 19 | self.inc4b = L.Inception(512, 160, 112, 224, 24, 64, 64) 20 | self.inc4c = L.Inception(512, 128, 128, 256, 24, 64, 64) 21 | self.inc4d = L.Inception(512, 112, 144, 288, 32, 64, 64) 22 | self.inc4e = L.Inception(528, 256, 160, 320, 32, 128, 128) 23 | self.inc5a = L.Inception(832, 256, 160, 320, 32, 128, 128) 24 | self.inc5b = L.Inception(832, 384, 192, 384, 48, 128, 128) 25 | self.loss3_fc = L.Linear(1024, 1000) 26 | 27 | self.loss1_conv = L.Convolution2D(512, 128, 1) 28 | self.loss1_fc1 = L.Linear(4 * 4 * 128, 1024) 29 | self.loss1_fc2 = L.Linear(1024, 1000) 30 | 31 | self.loss2_conv = L.Convolution2D(528, 128, 1) 32 | self.loss2_fc1 = L.Linear(4 * 4 * 128, 1024) 33 | self.loss2_fc2 = L.Linear(1024, 1000) 34 | 35 | def forward(self, x): 36 | h = F.relu(self.conv1(x)) 37 | h = F.local_response_normalization( 38 | F.max_pooling_2d(h, 3, stride=2), n=5) 39 | 40 | h = F.relu(self.conv2_reduce(h)) 41 | h = F.relu(self.conv2(h)) 42 | h = F.max_pooling_2d( 43 | F.local_response_normalization(h, n=5), 3, stride=2) 44 | 45 | h = self.inc3a(h) 46 | h = self.inc3b(h) 47 | h = F.max_pooling_2d(h, 3, stride=2) 48 | h = self.inc4a(h) 49 | 50 | if chainer.config.train: 51 | out1 = F.average_pooling_2d(h, 5, stride=3) 52 | out1 = F.relu(self.loss1_conv(out1)) 53 | out1 = F.relu(self.loss1_fc1(out1)) 54 | 
out1 = self.loss1_fc2(out1) 55 | 56 | h = self.inc4b(h) 57 | h = self.inc4c(h) 58 | h = self.inc4d(h) 59 | 60 | if chainer.config.train: 61 | out2 = F.average_pooling_2d(h, 5, stride=3) 62 | out2 = F.relu(self.loss2_conv(out2)) 63 | out2 = F.relu(self.loss2_fc1(out2)) 64 | out2 = self.loss2_fc2(out2) 65 | 66 | h = self.inc4e(h) 67 | h = F.max_pooling_2d(h, 3, stride=2) 68 | h = self.inc5a(h) 69 | h = self.inc5b(h) 70 | 71 | h = F.dropout(F.average_pooling_2d(h, 7, stride=1), 0.4) 72 | out3 = self.loss3_fc(h) 73 | return out1, out2, out3 74 | -------------------------------------------------------------------------------- /chainer/overfeat.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | 6 | class overfeat(chainer.Chain): 7 | insize = 231 8 | 9 | def __init__(self): 10 | super(overfeat, self).__init__() 11 | with self.init_scope(): 12 | self.conv1 = L.Convolution2D( 3, 96, 11, stride=4) 13 | self.conv2 = L.Convolution2D( 96, 256, 5, pad=0) 14 | self.conv3 = L.Convolution2D( 256, 512, 3, pad=1) 15 | self.conv4 = L.Convolution2D( 512, 1024, 3, pad=1) 16 | self.conv5 = L.Convolution2D(1024, 1024, 3, pad=1) 17 | self.fc6 = L.Linear(1024 * 6 * 6, 3072) 18 | self.fc7 = L.Linear(3072, 4096) 19 | self.fc8 = L.Linear(4096, 1000) 20 | 21 | def forward(self, x): 22 | h = F.max_pooling_2d(F.relu(self.conv1(x)), 2, stride=2) 23 | h = F.max_pooling_2d(F.relu(self.conv2(h)), 2, stride=2) 24 | h = F.relu(self.conv3(h)) 25 | h = F.relu(self.conv4(h)) 26 | h = F.max_pooling_2d(F.relu(self.conv5(h)), 3, stride=2) 27 | h = F.relu(self.fc6(h)) 28 | h = F.relu(self.fc7(h)) 29 | return self.fc8(h) 30 | -------------------------------------------------------------------------------- /chainer/run.sh: -------------------------------------------------------------------------------- 1 | ./train_imagenet.py --arch alexnet --batchsize 128 | tee out_alexnet.log 2 | 
./train_imagenet.py --arch googlenet --batchsize 128 | tee out_googlenet.log 3 | ./train_imagenet.py --arch vgga --batchsize 64 | tee out_vgga.log 4 | ./train_imagenet.py --arch overfeat --batchsize 128 | tee out_overfeat.log 5 | -------------------------------------------------------------------------------- /chainer/train_imagenet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import time 4 | 5 | import numpy as np 6 | 7 | from chainer import cuda 8 | from chainer import optimizers 9 | 10 | 11 | parser = argparse.ArgumentParser( 12 | description=' convnet benchmarks on imagenet') 13 | parser.add_argument('--arch', '-a', default='alexnet', 14 | help='Convnet architecture \ 15 | (alex, googlenet, vgga, overfeat)') 16 | parser.add_argument('--batchsize', '-B', type=int, default=128, 17 | help='minibatch size') 18 | parser.add_argument('--gpu', '-g', default=0, type=int, 19 | help='GPU ID (negative value indicates CPU)') 20 | 21 | args = parser.parse_args() 22 | xp = cuda.cupy if args.gpu >= 0 else np 23 | 24 | # Prepare model 25 | print(args.arch) 26 | if args.arch == 'alexnet': 27 | import alex 28 | model = alex.Alex() 29 | elif args.arch == 'googlenet': 30 | import googlenet 31 | model = googlenet.GoogLeNet() 32 | elif args.arch == 'vgga': 33 | import vgga 34 | model = vgga.vgga() 35 | elif args.arch == 'overfeat': 36 | import overfeat 37 | model = overfeat.overfeat() 38 | else: 39 | raise ValueError('Invalid architecture name') 40 | 41 | if args.gpu >= 0: 42 | cuda.get_device(args.gpu).use() 43 | model.to_gpu() 44 | 45 | # Setup optimizer 46 | optimizer = optimizers.SGD(lr=0.01) 47 | optimizer.setup(model) 48 | 49 | workspace_size = int(1 * 2**30) 50 | import chainer 51 | 52 | chainer.cuda.set_max_workspace_size(workspace_size) 53 | 54 | chainer.config.train = True 55 | chainer.config.use_cudnn = 'always' 56 | 57 | 58 | class Timer(): 59 | def preprocess(self): 60 | if xp == 
np: 61 | self.start = time.time() 62 | else: 63 | self.start = xp.cuda.Event() 64 | self.end = xp.cuda.Event() 65 | self.start.record() 66 | 67 | def postprocess(self): 68 | if xp == np: 69 | self.end = time.time() 70 | else: 71 | self.end.record() 72 | self.end.synchronize() 73 | 74 | def getElapseTime(self): 75 | if xp == np: 76 | return (self.end - self.start) * 1000 77 | else: 78 | return xp.cuda.get_elapsed_time(self.start, self.end) 79 | 80 | 81 | def train_loop(): 82 | # Trainer 83 | data = np.ndarray((args.batchsize, 3, model.insize, 84 | model.insize), dtype=np.float32) 85 | data.fill(33333) 86 | total_forward = 0 87 | total_backward = 0 88 | niter = 13 89 | n_dry = 3 90 | 91 | label = np.ndarray((args.batchsize), dtype=np.int32) 92 | label.fill(1) 93 | count = 0 94 | timer = Timer() 95 | for i in range(niter): 96 | x = xp.asarray(data) 97 | y = xp.asarray(label) 98 | 99 | if args.arch == 'googlenet': 100 | timer.preprocess() 101 | out1, out2, out3 = model.forward(x) 102 | timer.postprocess() 103 | time_ = timer.getElapseTime() 104 | if i > n_dry - 1: 105 | count += 1 106 | total_forward += time_ 107 | out = out1 + out2 + out3 108 | else: 109 | timer.preprocess() 110 | out = model.forward(x) 111 | timer.postprocess() 112 | time_ = timer.getElapseTime() 113 | if i > n_dry - 1: 114 | count += 1 115 | total_forward += time_ 116 | 117 | out.zerograd() 118 | out.grad.fill(3) 119 | model.cleargrads() 120 | if xp != np: 121 | xp.cuda.Stream(null=True) 122 | timer.preprocess() 123 | out.backward() 124 | timer.postprocess() 125 | time_ = timer.getElapseTime() 126 | if i > n_dry - 1: 127 | total_backward += time_ 128 | model.cleargrads() 129 | 130 | del out, x, y 131 | if args.arch == 'googlenet': 132 | del out1, out2, out3 133 | print("Average Forward: ", total_forward / count, " ms") 134 | print("Average Backward: ", total_backward / count, " ms") 135 | print("Average Total: ", (total_forward + total_backward) / count, " ms") 136 | print("") 137 | 138 |
139 | train_loop() 140 | -------------------------------------------------------------------------------- /chainer/vgga.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | 6 | class vgga(chainer.Chain): 7 | insize = 224 8 | 9 | def __init__(self): 10 | super(vgga, self).__init__() 11 | with self.init_scope(): 12 | self.conv1 = L.Convolution2D( 3, 64, 3, stride=1, pad=1) 13 | self.conv2 = L.Convolution2D( 64, 128, 3, stride=1, pad=1) 14 | self.conv3 = L.Convolution2D(128, 256, 3, stride=1, pad=1) 15 | self.conv4 = L.Convolution2D(256, 256, 3, stride=1, pad=1) 16 | self.conv5 = L.Convolution2D(256, 512, 3, stride=1, pad=1) 17 | self.conv6 = L.Convolution2D(512, 512, 3, stride=1, pad=1) 18 | self.conv7 = L.Convolution2D(512, 512, 3, stride=1, pad=1) 19 | self.conv8 = L.Convolution2D(512, 512, 3, stride=1, pad=1) 20 | self.fc6 = L.Linear(512 * 7 * 7, 4096) 21 | self.fc7 = L.Linear(4096, 4096) 22 | self.fc8 = L.Linear(4096, 1000) 23 | 24 | def forward(self, x): 25 | h = F.max_pooling_2d(F.relu(self.conv1(x)), 2, stride=2) 26 | h = F.max_pooling_2d(F.relu(self.conv2(h)), 2, stride=2) 27 | h = F.relu(self.conv3(h)) 28 | h = F.max_pooling_2d(F.relu(self.conv4(h)), 2, stride=2) 29 | h = F.relu(self.conv5(h)) 30 | h = F.max_pooling_2d(F.relu(self.conv6(h)), 2, stride=2) 31 | h = F.relu(self.conv7(h)) 32 | h = F.max_pooling_2d(F.relu(self.conv8(h)), 2, stride=2) 33 | h = F.relu(self.fc6(h)) 34 | h = F.relu(self.fc7(h)) 35 | return self.fc8(h) 36 | -------------------------------------------------------------------------------- /cltorch/README.md: -------------------------------------------------------------------------------- 1 | Install cltorch using the script: 2 | ```bash 3 | curl -s https://raw.githubusercontent.com/torch/ezinstall/master/install-all | bash 4 | luarocks install cltorch # to install cltorch matrix layer 5 | luarocks install clnn # to 
install clnn network layer 6 | ``` 7 | 8 | For imagenet_winners benchmarks (alexnet, overfeat, vgg and googlenet) 9 | Run the benchmark using: 10 | ```bash 11 | th imagenet_winners/benchmark.lua 12 | ``` 13 | 14 | For layerwise benchmarks (table in frontpage with L1,L2,L3,L4,L5) 15 | Run the benchmark using: 16 | ```bash 17 | git clone --recursive https://github.com/hughperkins/clnn 18 | cd clnn 19 | th test/test-perf.lua 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /cltorch/imagenet.out.txt: -------------------------------------------------------------------------------- 1 | Running on device: GeForce GTX TITAN X 2 | Using NVIDIA Corporation , OpenCL platform: NVIDIA CUDA 3 | Using OpenCL device: GeForce GTX TITAN X 4 | ModelType: OverFeat[fast] Kernels: clnn Input shape: 128x3x231x231 5 | clnn :updateOutput(): 1180.92 6 | clnn :updateGradInput(): 354.65 7 | clnn :accGradParameters(): 1725.63 8 | clnn :Forward: 1180.92 9 | clnn :Backward: 2080.28 10 | clnn :TOTAL: 3261.19 11 | 12 | ModelType: AlexNet Kernels: clnn Input shape: 128x3x224x224 13 | clnn :updateOutput(): 388.69 14 | clnn :updateGradInput(): 204.12 15 | clnn :accGradParameters(): 370.30 16 | clnn :Forward: 388.69 17 | clnn :Backward: 574.42 18 | clnn :TOTAL: 963.11 19 | 20 | ModelType: VGG Model-A Kernels: clnn Input shape: 64x3x224x224 21 | clnn :updateOutput(): 875.65 22 | clnn :updateGradInput(): 479.20 23 | clnn :accGradParameters(): 2083.00 24 | clnn :Forward: 875.65 25 | clnn :Backward: 2562.20 26 | clnn :TOTAL: 3437.85 27 | 28 | ModelType: GoogleNet Kernels: clnn Input shape: 128x3x224x224 29 | clnn :updateOutput(): 3027.72 30 | clnn :updateGradInput(): 1554.68 31 | clnn :accGradParameters(): 2433.74 32 | clnn :Forward: 3027.72 33 | clnn :Backward: 3988.43 34 | clnn :TOTAL: 7016.15 35 | 36 | 37 | -------------------------------------------------------------------------------- /cltorch/imagenet_winners/alexnet.lua: 
-------------------------------------------------------------------------------- 1 | ../../torch7/imagenet_winners/alexnet.lua -------------------------------------------------------------------------------- /cltorch/imagenet_winners/benchmark.lua: -------------------------------------------------------------------------------- 1 | require 'sys' 2 | require 'cltorch' 3 | require 'clnn' 4 | 5 | local nets = {} 6 | nets[#nets+1] = require 'imagenet_winners/overfeat' 7 | nets[#nets+1] = require 'imagenet_winners/alexnet' 8 | nets[#nets+1] = require 'imagenet_winners/vgg_a' 9 | nets[#nets+1] = require 'imagenet_winners/googlenet' 10 | 11 | local libs = {} 12 | libs[#libs+1] = {nn.SpatialConvolutionMM, nn.SpatialMaxPooling, nn.ReLU, 'BDHW', 'clnn'} 13 | 14 | print('Running on device: ' .. cltorch.getDeviceProperties(cltorch.getDevice()).deviceName) 15 | 16 | steps = 10 -- nb of steps in loop to average perf 17 | 18 | function makeInput(config, size) 19 | local layout = config[4] 20 | local osize 21 | if layout == 'BDHW' then 22 | osize = size 23 | elseif layout == 'DHWB' then 24 | osize = {size[2],size[3],size[4],size[1]} 25 | elseif layout == 'BHWD' then 26 | osize = {size[1], size[3], size[4], size[2]} 27 | end 28 | return torch.randn(torch.LongStorage(osize)) 29 | end 30 | 31 | for i=1,#nets do 32 | for j=1,#libs do 33 | collectgarbage() 34 | local model,model_name,size = nets[i](libs[j]) 35 | model=model:cl() 36 | local input = makeInput(libs[j],size):cl() 37 | local lib_name = libs[j][5] 38 | print('ModelType: ' .. model_name, 'Kernels: ' .. lib_name, 39 | 'Input shape: ' .. input:size(1) .. 'x' .. input:size(2) .. 40 | 'x' .. input:size(3) .. 'x' .. 
input:size(4)) 41 | 42 | -- dry-run 43 | model:zeroGradParameters() 44 | local output = model:updateOutput(input) 45 | local gradInput = model:updateGradInput(input, output) 46 | model:accGradParameters(input, output) 47 | cltorch.synchronize() 48 | collectgarbage() 49 | 50 | local tmf, tmbi, tmbg 51 | sys.tic() 52 | for t = 1,steps do 53 | output = model:updateOutput(input) 54 | end 55 | cltorch.synchronize() 56 | tmf = sys.toc()/steps 57 | print(string.format("%-30s %25s %10.2f", lib_name, ':updateOutput():', tmf*1000)) 58 | 59 | collectgarbage() 60 | sys.tic() 61 | for t = 1,steps do 62 | model:updateGradInput(input, output) 63 | end 64 | cltorch.synchronize() 65 | tmbi = sys.toc()/steps 66 | print(string.format("%-30s %25s %10.2f", lib_name, ':updateGradInput():', tmbi*1000)) 67 | 68 | collectgarbage() 69 | sys.tic() 70 | local ok = 1 71 | for t = 1,steps do 72 | ok = pcall(function() model:accGradParameters(input, output) end) 73 | end 74 | cltorch.synchronize() 75 | tmbg = sys.toc()/steps 76 | if not ok then 77 | print(string.format("%-30s %25s %s", lib_name, ':accGradParameters():', 'FAILED!')) 78 | else 79 | print(string.format("%-30s %25s %10.2f", lib_name, ':accGradParameters():', tmbg*1000)) 80 | end 81 | print(string.format("%-30s %25s %10.2f", lib_name, ':Forward:', (tmf)*1000)) 82 | print(string.format("%-30s %25s %10.2f", lib_name, ':Backward:', (tmbi+tmbg)*1000)) 83 | print(string.format("%-30s %25s %10.2f", lib_name, ':TOTAL:', (tmf+tmbi+tmbg)*1000)) 84 | print() 85 | end 86 | end 87 | 88 | print('') 89 | -------------------------------------------------------------------------------- /cltorch/imagenet_winners/googlenet.lua: -------------------------------------------------------------------------------- 1 | ../../torch7/imagenet_winners/googlenet.lua -------------------------------------------------------------------------------- /cltorch/imagenet_winners/overfeat.lua: 
-------------------------------------------------------------------------------- 1 | ../../torch7/imagenet_winners/overfeat.lua -------------------------------------------------------------------------------- /cltorch/imagenet_winners/vgg_a.lua: -------------------------------------------------------------------------------- 1 | ../../torch7/imagenet_winners/vgg_a.lua -------------------------------------------------------------------------------- /cltorch/layerwise.out.txt: -------------------------------------------------------------------------------- 1 | 1 l1 2 | Using NVIDIA Corporation , OpenCL platform: NVIDIA CUDA 3 | Using OpenCL device: GeForce GTX TITAN X 4 |  updateOutput sys.toc() 0.1102499961853 5 |  updateOutput sys.toc() 0.10596203804016 6 |  updateOutput sys.toc() 0.10403418540955 7 |  updateGradInput sys.toc() 0.21330809593201 8 |  updateGradInput sys.toc() 0.21355485916138 9 |  updateGradInput sys.toc() 0.21345400810242 10 |  accGradParameters sys.toc() 0.21531891822815 11 |  accGradParameters sys.toc() 0.21532011032104 12 |  accGradParameters sys.toc() 0.21628308296204 13 |  backwards 0.42862701416016 14 |  backwards 0.42887496948242 15 |  backwards 0.42973709106445 16 | 2 l3 17 |  updateOutput sys.toc() 0.55879998207092 18 |  updateOutput sys.toc() 0.55877995491028 19 |  updateOutput sys.toc() 0.55866694450378 20 |  updateGradInput sys.toc() 0.094888925552368 21 |  updateGradInput sys.toc() 0.094869136810303 22 |  updateGradInput sys.toc() 0.094879150390625 23 |  accGradParameters sys.toc() 0.38145899772644 24 |  accGradParameters sys.toc() 0.38126707077026 25 |  accGradParameters sys.toc() 0.38134407997131 26 |  backwards 0.47634792327881 27 |  backwards 0.47613620758057 28 |  backwards 0.47622323036194 29 | 3 l4 30 |  updateOutput sys.toc() 0.17146182060242 31 |  updateOutput sys.toc() 0.17142510414124 32 |  updateOutput sys.toc() 0.17141604423523 33 |  updateGradInput sys.toc() 0.027024984359741 34 |  updateGradInput sys.toc() 
0.027138948440552 35 |  updateGradInput sys.toc() 0.02699613571167 36 |  accGradParameters sys.toc() 0.040662050247192 37 |  accGradParameters sys.toc() 0.040784120559692 38 |  accGradParameters sys.toc() 0.040785074234009 39 |  backwards 0.067687034606934 40 |  backwards 0.067923069000244 41 |  backwards 0.067781209945679 42 | 4 l5 43 |  updateOutput sys.toc() 0.11678600311279 44 |  updateOutput sys.toc() 0.11677193641663 45 |  updateOutput sys.toc() 0.11677384376526 46 |  updateGradInput sys.toc() 0.058609008789062 47 |  updateGradInput sys.toc() 0.058629035949707 48 |  updateGradInput sys.toc() 0.058593034744263 49 |  accGradParameters sys.toc() 0.06651496887207 50 |  accGradParameters sys.toc() 0.066493988037109 51 |  accGradParameters sys.toc() 0.06651496887207 52 |  backwards 0.12512397766113 53 |  backwards 0.12512302398682 54 |  backwards 0.12510800361633 55 | -------------------------------------------------------------------------------- /convnet.js/README.md: -------------------------------------------------------------------------------- 1 | sudo apt-get install nodejs 2 | 3 | git clone https://github.com/karpathy/convnetjs.git 4 | 5 | node benchmark.js 6 | 7 | -------------------------------------------------------------------------------- /convnet.js/benchmark.js: -------------------------------------------------------------------------------- 1 | var convnetjs = require("./convnetjs/build/convnet.js"); 2 | 3 | // L1 Conv Layer definition 4 | var opt = { in_sx:128, in_sy:128, in_depth:3, sx:11, filters:96, stride: 1, pad: 0 }; 5 | var layer = new convnetjs.ConvLayer(opt); 6 | 7 | // create a random input volume 8 | var x = new convnetjs.Vol(128, 128, 3); 9 | 10 | // run it through batch_size number of times 11 | var batch_size = 128; 12 | var dtall = 0; 13 | for(var i=0;i 1: 158 | # allow specifying the runs on command line, 1-indexed (i.e., 1 2 5) 159 | runs = [runs[int(r) - 1] for r in sys.argv[1:] if r[0] != 'i'] 160 | # allow specifying custom 
configurations on command line (e.g., i3x80x15,k32x3x7,b256) 161 | runs.extend([parse_custom_config(r) for r in sys.argv[1:] if r[0] == 'i']) 162 | 163 | go(runs) 164 | 165 | -------------------------------------------------------------------------------- /deepcl/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get install -y python2.7 python2.7-dev python-virtualenv cmake cmake-curses-gui make g++ gcc git 4 | git clone --recursive https://github.com/hughperkins/DeepCL.git -b soumith-benchmarks 5 | cd DeepCL 6 | mkdir build 7 | cd build 8 | cmake .. 9 | make -j 4 install 10 | cd ../python 11 | virtualenv ../env 12 | source ../env/bin/activate 13 | source ../dist/bin/activate.sh 14 | python setup.py install 15 | cd ../.. 16 | 17 | -------------------------------------------------------------------------------- /eblearn/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soumith/convnet-benchmarks/b458aab61c0ac2257c0990119b5de15c1e886f02/eblearn/README.md -------------------------------------------------------------------------------- /glconv/README.md: -------------------------------------------------------------------------------- 1 | Lua library for spatial convolutions with OpenGL 2 | =============================================== 3 | Install torch, cunn 4 | 5 | git clone git@github.com:mvitez/libglconv.git 6 | 7 | cd libglconv 8 | 9 | make 10 | 11 | th ../stest.lua | tee ../output.log 12 | -------------------------------------------------------------------------------- /glconv/stest.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'sys' 3 | require 'cutorch' 4 | print('Running on device: ' ..
cutorch.getDeviceProperties(cutorch.getDevice()).name) 5 | 6 | gl = require 'libglconv' 7 | gl.logging(1) 8 | gl.useintp(1) 9 | --gl.precision(1) -- required on NVidia 10 | precision = 1 11 | torch.setdefaulttensortype('torch.FloatTensor') 12 | 13 | steps = 10 -- nb of steps in loop to average perf 14 | 15 | runs = { 16 | 17 | { 18 | -- first layer 19 | ni = 128, 20 | no = 128, 21 | kw = 3, 22 | kh = 3, 23 | iw = 128, 24 | ih = 128, 25 | bs = 128, 26 | dh = 1, 27 | dw = 1 28 | }, 29 | } 30 | 31 | function nn.SpatialConvolutionMM:updateOutput(input) 32 | gl.precision(precision) 33 | if self.weight:dim() == 2 then 34 | self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW) 35 | end 36 | gl.conv(input, self.weight, self.output, self.bias) 37 | return self.output 38 | end 39 | 40 | for i,run in ipairs(runs) do 41 | -- params for run: 42 | local ni,no,kw,kh,bs,iw,ih,dw,dh = run.ni,run.no,run.kw,run.kh,run.bs,run.iw,run.ih,run.dw,run.dh 43 | print('') 44 | print('CONFIG: input = ' .. ni..'x'..iw..'x'..ih..' * ker = ' .. ni..'x'..no..'x'..kw..'x'..kh 45 | .. ' (bs = '..bs..', stride = ' .. dw .. ')') 46 | collectgarbage() 47 | local input = torch.randn(bs,ni,ih,iw) 48 | local network = nn.SpatialConvolutionMM(ni, no, kw, kh) 49 | local output = network:forward(input) 50 | 51 | sys.tic() 52 | for t = 1,steps do 53 | output = network:forward(input) 54 | end 55 | local tmf = sys.toc()/steps 56 | print(string.format("%-30s %25s %10.2f", 'glconv', ':updateOutput():', tmf*1000)) 57 | end 58 | -------------------------------------------------------------------------------- /greentea/Makefile.config: -------------------------------------------------------------------------------- 1 | ## Refer to http://caffe.berkeleyvision.org/installation.html 2 | # Contributions simplifying and improving our build system are welcome! 
3 | 4 | # GreenTea (ViennaCL/OpenCL) backend switch 5 | 6 | # Enable the CUDA backend 7 | USE_CUDA := 0 8 | 9 | # Enable the OpenCL/Greentea backend 10 | USE_GREENTEA := 1 11 | 12 | # Folder of the ViennaCL header-only library 13 | VIENNACL_DIR = ../ViennaCL 14 | 15 | # Either set clBLAS to 1 or it will use ViennaclBLAS. 16 | # CLBLAS should be faster, especially on AMD cards. 17 | USE_CLBLAS := 0 18 | 19 | # cuDNN acceleration switch (uncomment to build with cuDNN). 20 | # USE_CUDNN := 1 21 | 22 | # CPU-only switch (uncomment to build without GPU support). 23 | # CPU_ONLY := 1 24 | 25 | # To customize your choice of compiler, uncomment and set the following. 26 | # N.B. the default for Linux is g++ and the default for OSX is clang++ 27 | # CUSTOM_CXX := /usr/local/gcc/4.9.2/bin/g++ 28 | 29 | # CUDA directory contains bin/ and lib/ directories that we need. 30 | CUDA_DIR := /usr/local/cuda 31 | # On Ubuntu 14.04, if cuda tools are installed via 32 | # "sudo apt-get install nvidia-cuda-toolkit" then use this instead: 33 | # CUDA_DIR := /usr 34 | 35 | # CUDA architecture setting: going with all of them. 36 | # For CUDA < 6.0, comment the *_50 lines for compatibility. 37 | CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ 38 | -gencode arch=compute_20,code=sm_21 \ 39 | -gencode arch=compute_30,code=sm_30 \ 40 | -gencode arch=compute_35,code=sm_35 \ 41 | -gencode arch=compute_50,code=sm_50 \ 42 | -gencode arch=compute_50,code=compute_50 43 | 44 | # BLAS choice: 45 | # atlas for ATLAS (default) 46 | # mkl for MKL 47 | # open for OpenBlas 48 | BLAS := open 49 | # Custom (MKL/ATLAS/OpenBLAS) include and lib directories. 50 | # Leave commented to accept the defaults for your choice of BLAS 51 | # (which should work)! 52 | # BLAS_INCLUDE := /path/to/your/blas 53 | # BLAS_LIB := /path/to/your/blas 54 | 55 | # This is required only if you will compile the matlab interface. 56 | # MATLAB directory should contain the mex binary in /bin. 
57 | # MATLAB_DIR := /usr/local 58 | # MATLAB_DIR := /Applications/MATLAB_R2012b.app 59 | 60 | # NOTE: this is required only if you will compile the python interface. 61 | # We need to be able to find Python.h and numpy/arrayobject.h. 62 | PYTHON_INCLUDE := /usr/include/python2.7 \ 63 | /usr/lib/python2.7/dist-packages/numpy/core/include 64 | # Anaconda Python distribution is quite popular. Include path: 65 | # Verify anaconda location, sometimes it's in root. 66 | # ANACONDA_HOME := $(HOME)/anaconda 67 | # PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ 68 | # $(ANACONDA_HOME)/include/python2.7 \ 69 | # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ 70 | 71 | # We need to be able to find libpythonX.X.so or .dylib. 72 | PYTHON_LIB := /usr/lib 73 | # PYTHON_LIB := $(ANACONDA_HOME)/lib 74 | 75 | # Uncomment to support layers written in Python (will link against Python libs) 76 | # WITH_PYTHON_LAYER := 1 77 | 78 | # Whatever else you find you need goes here. 79 | INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include 80 | LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib 81 | 82 | # Uncomment to use `pkg-config` to specify OpenCV library paths. 83 | # (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) 84 | # USE_PKG_CONFIG := 1 85 | 86 | BUILD_DIR := build 87 | DISTRIBUTE_DIR := distribute 88 | 89 | # Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171 90 | # DEBUG := 1 91 | # VIENNACL_DEBUG := 0 92 | 93 | # The ID of the GPU that 'make runtest' will use to run unit tests. 
94 | TEST_GPUID := 0 95 | 96 | # enable pretty build (comment to see full commands) 97 | Q ?= @ 98 | -------------------------------------------------------------------------------- /greentea/README.md: -------------------------------------------------------------------------------- 1 | Download and extract ViennaCL: 2 | wget -c 'http://downloads.sourceforge.net/project/viennacl/1.7.x/ViennaCL-1.7.0.tar.gz?r=http%3A%2F%2Fviennacl.sourceforge.net%2Fviennacl-download.html&ts=1440274908&use_mirror=superb-dca2' -O ViennaCL-1.7.0.tar.gz 3 | tar -xvf ViennaCL-1.7.0.tar.gz 4 | 5 | 6 | Install Caffe using the script: 7 | ```bash 8 | bash install.sh 9 | ``` 10 | (If something fails, check your ViennaCL (>= 1.5) and OpenCL installation. There can be issues with the libOpenCL.so provided by NVIDIA; if so, install a second OpenCL implementation from Intel or AMD.) 11 | 12 | Run the benchmark using: 13 | ```bash 14 | ./run_imagenet.sh 15 | ./run_nogradinput.sh 16 | ./run_forcegradinput.sh 17 | ``` 18 | 19 | Requires at least one device with a valid OpenCL driver installed. 20 | The default compile settings (Makefile.config): 21 | - Use ViennaCL for BLAS calls. 22 | - Disable the CUDA backend.
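The benchmark runs above leave their timings in the `output_*.log` files written by `caffe time`. A small helper can pull the averages back out of those logs; this is a sketch assuming the usual glog-style `Average Forward pass: X ms.` lines — the exact message text may differ between Caffe versions, so treat the pattern as an assumption to adjust.

```python
import re

def parse_caffe_time_log(text):
    """Extract average timings (in ms) from `caffe time` log output.

    Assumes lines of the form 'Average Forward pass: 123.45 ms.' --
    a hypothetical but typical format; adjust if your build logs differently.
    """
    pattern = re.compile(
        r"Average (Forward pass|Backward pass|Forward-Backward): ([\d.]+) ms")
    return {phase: float(ms) for phase, ms in pattern.findall(text)}

# toy log fragment in the assumed format
sample = """
I0101 00:00:00.000000 1 caffe.cpp:412] Average Forward pass: 326.76 ms.
I0101 00:00:00.000000 1 caffe.cpp:414] Average Backward pass: 410.11 ms.
I0101 00:00:00.000000 1 caffe.cpp:416] Average Forward-Backward: 737.05 ms.
"""
print(parse_caffe_time_log(sample))
```

Running this over each `output_*.log` gives a quick table of per-network forward/backward averages without re-running the benchmarks.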
23 | 24 | -------------------------------------------------------------------------------- /greentea/imagenet_winners/alexnet.prototxt: -------------------------------------------------------------------------------- 1 | name: "alexnet" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 3 5 | input_dim: 224 6 | input_dim: 224 7 | force_backward: true 8 | layers { 9 | name: "conv1" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv1/11x11_s4" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | weight_decay: 1 16 | weight_decay: 0 17 | convolution_param { 18 | num_output: 64 19 | kernel_size: 11 20 | stride: 4 21 | pad: 2 22 | weight_filler { 23 | type: "xavier" 24 | std: 0.1 25 | } 26 | bias_filler { 27 | type: "constant" 28 | value: 0.2 29 | } 30 | } 31 | } 32 | layers { 33 | name: "conv1/relu" 34 | type: RELU 35 | bottom: "conv1/11x11_s4" 36 | top: "conv1/11x11_s4" 37 | } 38 | layers { 39 | name: "pool1/3x3_s2" 40 | type: POOLING 41 | bottom: "conv1/11x11_s4" 42 | top: "pool1/3x3_s2" 43 | pooling_param { 44 | pool: MAX 45 | kernel_size: 3 46 | stride: 2 47 | } 48 | } 49 | layers { 50 | name: "conv2/5x5_s1" 51 | type: CONVOLUTION 52 | bottom: "pool1/3x3_s2" 53 | top: "conv2/5x5_s1" 54 | blobs_lr: 1 55 | blobs_lr: 2 56 | weight_decay: 1 57 | weight_decay: 0 58 | convolution_param { 59 | num_output: 192 60 | kernel_size: 5 61 | stride: 1 62 | pad: 2 63 | weight_filler { 64 | type: "xavier" 65 | std: 0.1 66 | } 67 | bias_filler { 68 | type: "constant" 69 | value: 0.2 70 | } 71 | } 72 | } 73 | layers { 74 | name: "conv2/relu" 75 | type: RELU 76 | bottom: "conv2/5x5_s1" 77 | top: "conv2/5x5_s1" 78 | } 79 | layers { 80 | name: "pool2/3x3_s2" 81 | type: POOLING 82 | bottom: "conv2/5x5_s1" 83 | top: "pool2/3x3_s2" 84 | pooling_param { 85 | pool: MAX 86 | kernel_size: 3 87 | stride: 2 88 | } 89 | } 90 | layers { 91 | name: "conv3/3x3_s1" 92 | type: CONVOLUTION 93 | bottom: "pool2/3x3_s2" 94 | top: "conv3/3x3_s1" 95 | blobs_lr: 1 96 | blobs_lr: 2 97 | weight_decay: 1 98 |
weight_decay: 0 99 | convolution_param { 100 | num_output: 384 101 | kernel_size: 3 102 | stride: 1 103 | pad: 1 104 | weight_filler { 105 | type: "xavier" 106 | std: 0.1 107 | } 108 | bias_filler { 109 | type: "constant" 110 | value: 0.2 111 | } 112 | } 113 | } 114 | layers { 115 | name: "conv3/relu" 116 | type: RELU 117 | bottom: "conv3/3x3_s1" 118 | top: "conv3/3x3_s1" 119 | } 120 | layers { 121 | name: "conv4/3x3_s1" 122 | type: CONVOLUTION 123 | bottom: "conv3/3x3_s1" 124 | top: "conv4/3x3_s1" 125 | blobs_lr: 1 126 | blobs_lr: 2 127 | weight_decay: 1 128 | weight_decay: 0 129 | convolution_param { 130 | num_output: 256 131 | kernel_size: 3 132 | stride: 1 133 | pad: 1 134 | weight_filler { 135 | type: "xavier" 136 | std: 0.1 137 | } 138 | bias_filler { 139 | type: "constant" 140 | value: 0.2 141 | } 142 | } 143 | } 144 | layers { 145 | name: "conv4/relu" 146 | type: RELU 147 | bottom: "conv4/3x3_s1" 148 | top: "conv4/3x3_s1" 149 | } 150 | layers { 151 | name: "conv5/3x3_s1" 152 | type: CONVOLUTION 153 | bottom: "conv4/3x3_s1" 154 | top: "conv5/3x3_s1" 155 | blobs_lr: 1 156 | blobs_lr: 2 157 | weight_decay: 1 158 | weight_decay: 0 159 | convolution_param { 160 | num_output: 256 161 | kernel_size: 3 162 | stride: 1 163 | pad: 1 164 | weight_filler { 165 | type: "xavier" 166 | std: 0.1 167 | } 168 | bias_filler { 169 | type: "constant" 170 | value: 0.2 171 | } 172 | } 173 | } 174 | layers { 175 | name: "conv5/relu" 176 | type: RELU 177 | bottom: "conv5/3x3_s1" 178 | top: "conv5/3x3_s1" 179 | } 180 | layers { 181 | name: "pool5/3x3_s2" 182 | type: POOLING 183 | bottom: "conv5/3x3_s1" 184 | top: "pool5/3x3_s2" 185 | pooling_param { 186 | pool: MAX 187 | kernel_size: 3 188 | stride: 2 189 | } 190 | } 191 | layers { 192 | name: "fc6" 193 | type: INNER_PRODUCT 194 | bottom: "pool5/3x3_s2" 195 | top: "fc6" 196 | inner_product_param { 197 | num_output: 4096 198 | } 199 | } 200 | layers { 201 | name: "fc7" 202 | type: INNER_PRODUCT 203 | bottom: "fc6" 204 | top: "fc7" 
205 | inner_product_param { 206 | num_output: 4096 207 | } 208 | } 209 | layers { 210 | name: "fc8" 211 | type: INNER_PRODUCT 212 | bottom: "fc7" 213 | top: "fc8" 214 | inner_product_param { 215 | num_output: 1000 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /greentea/imagenet_winners/overfeat.prototxt: -------------------------------------------------------------------------------- 1 | name: "overfeat" 2 | input: "data" 3 | input_dim: 128 4 | input_dim: 3 5 | input_dim: 231 6 | input_dim: 231 7 | force_backward: true 8 | layers { 9 | name: "conv1/11x11_s4" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv1/11x11_s4" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | weight_decay: 1 16 | weight_decay: 0 17 | convolution_param { 18 | num_output: 96 19 | kernel_size: 11 20 | stride: 4 21 | weight_filler { 22 | type: "xavier" 23 | std: 0.1 24 | } 25 | bias_filler { 26 | type: "constant" 27 | value: 0.2 28 | } 29 | } 30 | } 31 | layers { 32 | name: "conv1/relu" 33 | type: RELU 34 | bottom: "conv1/11x11_s4" 35 | top: "conv1/11x11_s4" 36 | } 37 | layers { 38 | name: "pool1/2x2_s2" 39 | type: POOLING 40 | bottom: "conv1/11x11_s4" 41 | top: "pool1/2x2_s2" 42 | pooling_param { 43 | pool: MAX 44 | kernel_size: 2 45 | stride: 2 46 | } 47 | } 48 | layers { 49 | name: "conv2/5x5_s1" 50 | type: CONVOLUTION 51 | bottom: "pool1/2x2_s2" 52 | top: "conv2/5x5_s1" 53 | blobs_lr: 1 54 | blobs_lr: 2 55 | weight_decay: 1 56 | weight_decay: 0 57 | convolution_param { 58 | num_output: 256 59 | kernel_size: 5 60 | stride: 1 61 | weight_filler { 62 | type: "xavier" 63 | std: 0.1 64 | } 65 | bias_filler { 66 | type: "constant" 67 | value: 0.2 68 | } 69 | } 70 | } 71 | layers { 72 | name: "conv2/relu" 73 | type: RELU 74 | bottom: "conv2/5x5_s1" 75 | top: "conv2/5x5_s1" 76 | } 77 | layers { 78 | name: "pool2/2x2_s2" 79 | type: POOLING 80 | bottom: "conv2/5x5_s1" 81 | top: "pool2/2x2_s2" 82 | pooling_param { 83 | pool: MAX 84 | kernel_size: 
2 85 | stride: 2 86 | } 87 | } 88 | layers { 89 | name: "conv3/3x3_s1" 90 | type: CONVOLUTION 91 | bottom: "pool2/2x2_s2" 92 | top: "conv3/3x3_s1" 93 | blobs_lr: 1 94 | blobs_lr: 2 95 | weight_decay: 1 96 | weight_decay: 0 97 | convolution_param { 98 | num_output: 512 99 | kernel_size: 3 100 | stride: 1 101 | pad: 1 102 | weight_filler { 103 | type: "xavier" 104 | std: 0.1 105 | } 106 | bias_filler { 107 | type: "constant" 108 | value: 0.2 109 | } 110 | } 111 | } 112 | layers { 113 | name: "conv3/relu" 114 | type: RELU 115 | bottom: "conv3/3x3_s1" 116 | top: "conv3/3x3_s1" 117 | } 118 | layers { 119 | name: "conv4/3x3_s1" 120 | type: CONVOLUTION 121 | bottom: "conv3/3x3_s1" 122 | top: "conv4/3x3_s1" 123 | blobs_lr: 1 124 | blobs_lr: 2 125 | weight_decay: 1 126 | weight_decay: 0 127 | convolution_param { 128 | num_output: 1024 129 | kernel_size: 3 130 | stride: 1 131 | pad: 1 132 | weight_filler { 133 | type: "xavier" 134 | std: 0.1 135 | } 136 | bias_filler { 137 | type: "constant" 138 | value: 0.2 139 | } 140 | } 141 | } 142 | layers { 143 | name: "conv4/relu" 144 | type: RELU 145 | bottom: "conv4/3x3_s1" 146 | top: "conv4/3x3_s1" 147 | } 148 | layers { 149 | name: "conv5/3x3_s1" 150 | type: CONVOLUTION 151 | bottom: "conv4/3x3_s1" 152 | top: "conv5/3x3_s1" 153 | blobs_lr: 1 154 | blobs_lr: 2 155 | weight_decay: 1 156 | weight_decay: 0 157 | convolution_param { 158 | num_output: 1024 159 | kernel_size: 3 160 | stride: 1 161 | pad: 1 162 | weight_filler { 163 | type: "xavier" 164 | std: 0.1 165 | } 166 | bias_filler { 167 | type: "constant" 168 | value: 0.2 169 | } 170 | } 171 | } 172 | layers { 173 | name: "conv5/relu" 174 | type: RELU 175 | bottom: "conv5/3x3_s1" 176 | top: "conv5/3x3_s1" 177 | } 178 | layers { 179 | name: "pool5/2x2_s2" 180 | type: POOLING 181 | bottom: "conv5/3x3_s1" 182 | top: "pool5/2x2_s2" 183 | pooling_param { 184 | pool: MAX 185 | kernel_size: 2 186 | stride: 2 187 | } 188 | } 189 | layers { 190 | name: "fc6" 191 | type: INNER_PRODUCT 192 
| bottom: "pool5/2x2_s2" 193 | top: "fc6" 194 | inner_product_param { 195 | num_output: 3072 196 | } 197 | } 198 | layers { 199 | name: "fc7" 200 | type: INNER_PRODUCT 201 | bottom: "fc6" 202 | top: "fc7" 203 | inner_product_param { 204 | num_output: 4096 205 | } 206 | } 207 | layers { 208 | name: "fc8" 209 | type: INNER_PRODUCT 210 | bottom: "fc7" 211 | top: "fc8" 212 | inner_product_param { 213 | num_output: 1000 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /greentea/imagenet_winners/vgg_a.prototxt: -------------------------------------------------------------------------------- 1 | name: "vgg_a" 2 | input: "data" 3 | input_dim: 64 4 | input_dim: 3 5 | input_dim: 224 6 | input_dim: 224 7 | force_backward: true 8 | layers { 9 | name: "conv1/3x3_s1" 10 | type: CONVOLUTION 11 | bottom: "data" 12 | top: "conv1/3x3_s1" 13 | blobs_lr: 1 14 | blobs_lr: 2 15 | weight_decay: 1 16 | weight_decay: 0 17 | convolution_param { 18 | num_output: 64 19 | kernel_size: 3 20 | stride: 1 21 | weight_filler { 22 | type: "xavier" 23 | std: 0.1 24 | } 25 | bias_filler { 26 | type: "constant" 27 | value: 0.2 28 | } 29 | } 30 | } 31 | layers { 32 | name: "conv1/relu" 33 | type: RELU 34 | bottom: "conv1/3x3_s1" 35 | top: "conv1/3x3_s1" 36 | } 37 | layers { 38 | name: "pool1/2x2_s2" 39 | type: POOLING 40 | bottom: "conv1/3x3_s1" 41 | top: "pool1/2x2_s2" 42 | pooling_param { 43 | pool: MAX 44 | kernel_size: 2 45 | stride: 2 46 | } 47 | } 48 | layers { 49 | name: "conv2/3x3_s1" 50 | type: CONVOLUTION 51 | bottom: "pool1/2x2_s2" 52 | top: "conv2/3x3_s1" 53 | blobs_lr: 1 54 | blobs_lr: 2 55 | weight_decay: 1 56 | weight_decay: 0 57 | convolution_param { 58 | num_output: 128 59 | pad: 1 60 | kernel_size: 3 61 | stride: 1 62 | weight_filler { 63 | type: "xavier" 64 | std: 0.1 65 | } 66 | bias_filler { 67 | type: "constant" 68 | value: 0.2 69 | } 70 | } 71 | } 72 | layers { 73 | name: "conv2/relu" 74 | type: RELU 75 | bottom: "conv2/3x3_s1" 
76 | top: "conv2/3x3_s1" 77 | } 78 | layers { 79 | name: "pool2/2x2_s2" 80 | type: POOLING 81 | bottom: "conv2/3x3_s1" 82 | top: "pool2/2x2_s2" 83 | pooling_param { 84 | pool: MAX 85 | kernel_size: 2 86 | stride: 2 87 | } 88 | } 89 | layers { 90 | name: "conv3/3x3_s1" 91 | type: CONVOLUTION 92 | bottom: "pool2/2x2_s2" 93 | top: "conv3/3x3_s1" 94 | blobs_lr: 1 95 | blobs_lr: 2 96 | weight_decay: 1 97 | weight_decay: 0 98 | convolution_param { 99 | num_output: 256 100 | kernel_size: 3 101 | stride: 1 102 | pad: 1 103 | weight_filler { 104 | type: "xavier" 105 | std: 0.1 106 | } 107 | bias_filler { 108 | type: "constant" 109 | value: 0.2 110 | } 111 | } 112 | } 113 | layers { 114 | name: "conv3/relu" 115 | type: RELU 116 | bottom: "conv3/3x3_s1" 117 | top: "conv3/3x3_s1" 118 | } 119 | layers { 120 | name: "conv4/3x3_s1" 121 | type: CONVOLUTION 122 | bottom: "conv3/3x3_s1" 123 | top: "conv4/3x3_s1" 124 | blobs_lr: 1 125 | blobs_lr: 2 126 | weight_decay: 1 127 | weight_decay: 0 128 | convolution_param { 129 | num_output: 256 130 | kernel_size: 3 131 | stride: 1 132 | pad: 1 133 | weight_filler { 134 | type: "xavier" 135 | std: 0.1 136 | } 137 | bias_filler { 138 | type: "constant" 139 | value: 0.2 140 | } 141 | } 142 | } 143 | layers { 144 | name: "conv4/relu" 145 | type: RELU 146 | bottom: "conv4/3x3_s1" 147 | top: "conv4/3x3_s1" 148 | } 149 | layers { 150 | name: "pool3/2x2_s2" 151 | type: POOLING 152 | bottom: "conv4/3x3_s1" 153 | top: "pool3/2x2_s2" 154 | pooling_param { 155 | pool: MAX 156 | kernel_size: 2 157 | stride: 2 158 | } 159 | } 160 | layers { 161 | name: "conv5/3x3_s1" 162 | type: CONVOLUTION 163 | bottom: "pool3/2x2_s2" 164 | top: "conv5/3x3_s1" 165 | blobs_lr: 1 166 | blobs_lr: 2 167 | weight_decay: 1 168 | weight_decay: 0 169 | convolution_param { 170 | num_output: 512 171 | kernel_size: 3 172 | stride: 1 173 | pad: 1 174 | weight_filler { 175 | type: "xavier" 176 | std: 0.1 177 | } 178 | bias_filler { 179 | type: "constant" 180 | value: 0.2 181 | } 
182 | } 183 | } 184 | layers { 185 | name: "conv5/relu" 186 | type: RELU 187 | bottom: "conv5/3x3_s1" 188 | top: "conv5/3x3_s1" 189 | } 190 | layers { 191 | name: "conv6/3x3_s1" 192 | type: CONVOLUTION 193 | bottom: "conv5/3x3_s1" 194 | top: "conv6/3x3_s1" 195 | blobs_lr: 1 196 | blobs_lr: 2 197 | weight_decay: 1 198 | weight_decay: 0 199 | convolution_param { 200 | num_output: 512 201 | kernel_size: 3 202 | stride: 1 203 | pad: 1 204 | weight_filler { 205 | type: "xavier" 206 | std: 0.1 207 | } 208 | bias_filler { 209 | type: "constant" 210 | value: 0.2 211 | } 212 | } 213 | } 214 | layers { 215 | name: "conv6/relu" 216 | type: RELU 217 | bottom: "conv6/3x3_s1" 218 | top: "conv6/3x3_s1" 219 | } 220 | layers { 221 | name: "pool4/2x2_s2" 222 | type: POOLING 223 | bottom: "conv6/3x3_s1" 224 | top: "pool4/2x2_s2" 225 | pooling_param { 226 | pool: MAX 227 | kernel_size: 2 228 | stride: 2 229 | } 230 | } 231 | layers { 232 | name: "conv7/3x3_s1" 233 | type: CONVOLUTION 234 | bottom: "pool4/2x2_s2" 235 | top: "conv7/3x3_s1" 236 | blobs_lr: 1 237 | blobs_lr: 2 238 | weight_decay: 1 239 | weight_decay: 0 240 | convolution_param { 241 | num_output: 512 242 | kernel_size: 3 243 | stride: 1 244 | pad: 1 245 | weight_filler { 246 | type: "xavier" 247 | std: 0.1 248 | } 249 | bias_filler { 250 | type: "constant" 251 | value: 0.2 252 | } 253 | } 254 | } 255 | layers { 256 | name: "conv7/relu" 257 | type: RELU 258 | bottom: "conv7/3x3_s1" 259 | top: "conv7/3x3_s1" 260 | } 261 | layers { 262 | name: "conv8/3x3_s1" 263 | type: CONVOLUTION 264 | bottom: "conv7/3x3_s1" 265 | top: "conv8/3x3_s1" 266 | blobs_lr: 1 267 | blobs_lr: 2 268 | weight_decay: 1 269 | weight_decay: 0 270 | convolution_param { 271 | num_output: 512 272 | kernel_size: 3 273 | stride: 1 274 | pad: 1 275 | weight_filler { 276 | type: "xavier" 277 | std: 0.1 278 | } 279 | bias_filler { 280 | type: "constant" 281 | value: 0.2 282 | } 283 | } 284 | } 285 | layers { 286 | name: "conv8/relu" 287 | type: RELU 288 | 
bottom: "conv8/3x3_s1" 289 | top: "conv8/3x3_s1" 290 | } 291 | layers { 292 | name: "pool5/2x2_s2" 293 | type: POOLING 294 | bottom: "conv8/3x3_s1" 295 | top: "pool5/2x2_s2" 296 | pooling_param { 297 | pool: MAX 298 | kernel_size: 2 299 | stride: 2 300 | } 301 | } 302 | layers { 303 | name: "fc6" 304 | type: INNER_PRODUCT 305 | bottom: "pool5/2x2_s2" 306 | top: "fc6" 307 | inner_product_param { 308 | num_output: 4096 309 | } 310 | } 311 | layers { 312 | name: "fc7" 313 | type: INNER_PRODUCT 314 | bottom: "fc6" 315 | top: "fc7" 316 | inner_product_param { 317 | num_output: 4096 318 | } 319 | } 320 | layers { 321 | name: "fc8" 322 | type: INNER_PRODUCT 323 | bottom: "fc7" 324 | top: "fc8" 325 | inner_product_param { 326 | num_output: 1000 327 | } 328 | } 329 | -------------------------------------------------------------------------------- /greentea/install.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/naibaf7/caffe.git 2 | cd caffe 3 | git checkout master 4 | 5 | # Dependencies 6 | sudo apt-get install -y libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libboost-all-dev libhdf5-serial-dev 7 | sudo apt-get install -y protobuf-compiler gfortran libjpeg62 libfreeimage-dev libatlas-base-dev git python-dev python-pip 8 | sudo apt-get install -y libgoogle-glog-dev libbz2-dev libxml2-dev libxslt-dev libffi-dev libssl-dev libgflags-dev liblmdb-dev python-yaml 9 | sudo apt-get install -y libviennacl-dev opencl-headers libopenblas-base libopenblas-dev 10 | easy_install pillow #conda python 11 | 12 | # Compile Caffe 13 | cp ../Makefile.config Makefile.config 14 | 15 | cores=`grep -c ^processor /proc/cpuinfo` 16 | 17 | make all -j$cores VIENNACL_DIR=../ViennaCL-1.7.0/ 18 | make test -j$cores VIENNACL_DIR=../ViennaCL-1.7.0/ 19 | make runtest -j$cores VIENNACL_DIR=../ViennaCL-1.7.0/ 20 | -------------------------------------------------------------------------------- /greentea/run_forcegradinput.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm output_forceGradInput.log 4 | 5 | ./caffe/build/tools/caffe time --model=../caffe/proto_forceGradInput/conv1.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 6 | ./caffe/build/tools/caffe time --model=../caffe/proto_forceGradInput/conv2.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 7 | ./caffe/build/tools/caffe time --model=../caffe/proto_forceGradInput/conv3.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 8 | ./caffe/build/tools/caffe time --model=../caffe/proto_forceGradInput/conv4.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 9 | ./caffe/build/tools/caffe time --model=../caffe/proto_forceGradInput/conv5.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_forceGradInput.log 2>&1 10 | -------------------------------------------------------------------------------- /greentea/run_imagenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm output_*.log 3 | ./caffe/build/tools/caffe time --model=../caffe/imagenet_winners/alexnet.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_alexnet.log 2>&1 4 | ./caffe/build/tools/caffe time --model=../caffe/imagenet_winners/overfeat.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_overfeat.log 2>&1 5 | ./caffe/build/tools/caffe time --model=../caffe/imagenet_winners/vgg_a.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_vgg_a.log 2>&1 6 | ./caffe/build/tools/caffe time --model=./imagenet_winners/googlenet.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_googlenet.log 2>&1 7 | 8 | -------------------------------------------------------------------------------- /greentea/run_nogradinput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm 
output_noGradInput.log 3 | ./caffe/build/tools/caffe time --model=../caffe/proto_noGradInput/conv1.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 4 | ./caffe/build/tools/caffe time --model=../caffe/proto_noGradInput/conv2.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 5 | ./caffe/build/tools/caffe time --model=../caffe/proto_noGradInput/conv3.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 6 | ./caffe/build/tools/caffe time --model=../caffe/proto_noGradInput/conv4.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 7 | ./caffe/build/tools/caffe time --model=../caffe/proto_noGradInput/conv5.prototxt --iterations=10 --gpu 0 --logtostderr=1 >>output_noGradInput.log 2>&1 8 | 9 | -------------------------------------------------------------------------------- /matlab-DeepLearnToolbox/README.md: -------------------------------------------------------------------------------- 1 | git clone https://github.com/rasmusbergpalm/DeepLearnToolbox.git 2 | 3 | In Octave: 4 | 5 | addpath(genpath('DeepLearnToolbox')); 6 | 7 | -------------------------------------------------------------------------------- /mxnet/README.md: -------------------------------------------------------------------------------- 1 | ### Install via: 2 | 3 | 4 | ``` 5 | sudo apt-get update 6 | sudo apt-get install -y build-essential git libatlas-base-dev libopencv-dev 7 | 8 | git clone --recursive https://github.com/dmlc/mxnet 9 | 10 | cd mxnet; make -j12 USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda 11 | cd python; python setup.py install 12 | cd ../../ 13 | ``` 14 | 15 | ### Run benchmarks 16 | 17 | ``` 18 | CUDA_VISIBLE_DEVICES=2 MXNET_GPU_WORKER_NTHREADS=2 MXNET_EXEC_NUM_TEMP=1 python alexnet.py | tee out_alexnet.log 19 | 20 | ``` 21 | 22 | ### Notes from antinucleon 23 | 24 | We chose to block the dynamic thread engine to get a fair result; there will definitely be some cost, 25 | and I do think it is
not worth it for these microseconds, but the most important thing is to learn the spirit 26 | of each tool. 27 | The first epoch incurs some delay due to lazy allocation, but we think this is not a problem. 28 | There is also a magic 4 GB threshold for dynamic memory recycling; 29 | we didn't change it, although dynamic memory recycling can hurt performance considerably. 30 | 31 | 32 | One important thing about MXNet is choosing the parallelism level. 33 | Basically, less parallelism means lower memory cost. 34 | For example, on a Titan X with 12 GB of memory, training GoogLeNet v1, 35 | 36 | ```MXNET_GPU_WORKER_NTHREADS=2 MXNET_EXEC_NUM_TEMP=1 python3 gnet.py``` allows training with a batch size of 256, 37 | 38 | but 39 | 40 | ```MXNET_GPU_WORKER_NTHREADS=4 MXNET_EXEC_NUM_TEMP=4 python3 gnet.py``` 41 | 42 | will run out of memory at a batch size of 256 (we guess it still saves a little more memory than other libraries, but this is untested). 43 | 44 | 45 | Various settings can be found at: https://mxnet.readthedocs.org/en/latest/env_var.html 46 | 47 | Our feeling is that because hardware is currently the bottleneck, the dynamic dataflow engine and 48 | multi-execution don't show their advantage on a single card, but in the multi-GPU 49 | and distributed cases they make the problem much easier. 50 | 51 | 52 | BTW, do you have plans to benchmark multi-GPU or distributed convolution nets? 53 | We have collected some results already.
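The parallelism/memory trade-off above can also be pinned in the script itself rather than on the command line. A minimal sketch, assuming (per the linked env_var documentation) that the engine reads these variables once at import time, so they must be set before `import mxnet`:

```python
import os

# Set the engine knobs before importing mxnet -- the values mirror the
# low-memory configuration discussed above; raising them increases
# parallelism at the cost of temporary-workspace memory.
os.environ["MXNET_GPU_WORKER_NTHREADS"] = "2"
os.environ["MXNET_EXEC_NUM_TEMP"] = "1"

# import mxnet as mx  # safe to import once the variables are in place
print(os.environ["MXNET_GPU_WORKER_NTHREADS"],
      os.environ["MXNET_EXEC_NUM_TEMP"])
```

Setting them in code keeps the benchmark configuration reproducible even when the script is launched without the environment-variable prefix.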
54 | 55 | https://github.com/dmlc/mxnet/tree/master/example/distributed-training 56 | 57 | 58 | -------------------------------------------------------------------------------- /mxnet/alexnet.py: -------------------------------------------------------------------------------- 1 | # In[1]: 2 | 3 | import mxnet as mx 4 | import numpy as np 5 | import time 6 | 7 | 8 | # In[2]: 9 | 10 | # Basic Info 11 | dev = mx.gpu() 12 | batch_size = 128 13 | dshape = (batch_size, 3, 224, 224) 14 | lshape = (batch_size) 15 | num_epoch = 100 16 | 17 | # Mock data iterator 18 | tmp_data = np.random.uniform(-1, 1, dshape).astype("float32") 19 | 20 | train_iter = mx.io.NDArrayIter(data=tmp_data, batch_size=batch_size, shuffle=False, last_batch_handle='pad') 21 | 22 | 23 | 24 | # In[5]: 25 | 26 | def get_alexnet_symbol(): 27 | ## define alexnet 28 | input_data = mx.symbol.Variable(name="data") 29 | # stage 1 30 | conv1 = mx.symbol.Convolution( 31 | data=input_data, kernel=(11, 11), stride=(4, 4), num_filter=64) 32 | relu1 = mx.symbol.Activation(data=conv1, act_type="relu") 33 | pool1 = mx.symbol.Pooling( 34 | data=relu1, pool_type="max", kernel=(3, 3), stride=(2,2)) 35 | # lrn1 = mx.symbol.LRN(data=pool1, alpha=0.0001, beta=0.75, knorm=1, nsize=5) 36 | # stage 2 37 | conv2 = mx.symbol.Convolution( 38 | data=pool1, kernel=(5, 5), pad=(2, 2), num_filter=192) 39 | relu2 = mx.symbol.Activation(data=conv2, act_type="relu") 40 | pool2 = mx.symbol.Pooling(data=relu2, kernel=(3, 3), stride=(2, 2), pool_type="max") 41 | # lrn2 = mx.symbol.LRN(data=pool2, alpha=0.0001, beta=0.75, knorm=1, nsize=5) 42 | # stage 3 43 | conv3 = mx.symbol.Convolution( 44 | data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=384) 45 | relu3 = mx.symbol.Activation(data=conv3, act_type="relu") 46 | conv4 = mx.symbol.Convolution( 47 | data=relu3, kernel=(3, 3), pad=(1, 1), num_filter=256) 48 | relu4 = mx.symbol.Activation(data=conv4, act_type="relu") 49 | conv5 = mx.symbol.Convolution( 50 | data=relu4, kernel=(3, 3), 
pad=(1, 1), num_filter=256) 51 | relu5 = mx.symbol.Activation(data=conv5, act_type="relu") 52 | pool3 = mx.symbol.Pooling(data=relu5, kernel=(3, 3), stride=(2, 2), pool_type="max") 53 | # stage 4 54 | flatten = mx.symbol.Flatten(data=pool3) 55 | fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096) 56 | relu6 = mx.symbol.Activation(data=fc1, act_type="relu") 57 | # stage 5 58 | fc2 = mx.symbol.FullyConnected(data=relu6, num_hidden=4096) 59 | relu7 = mx.symbol.Activation(data=fc2, act_type="relu") 60 | # stage 6 61 | fc3 = mx.symbol.FullyConnected(data=relu7, num_hidden=1000) 62 | return fc3 63 | 64 | # In[6]: 65 | 66 | # bind to get executor 67 | # This is what happened behind mx.model.Feedforward 68 | fc3 = get_alexnet_symbol() 69 | alex_exec = fc3.simple_bind(ctx=dev, grad_req="write", data=dshape) 70 | print("Temp space: ", alex_exec.debug_str().split('\n')[-3]) 71 | # Find where to set data 72 | 73 | 74 | # In[7]: 75 | 76 | # some useful structure 77 | # data structues 78 | arg_names = fc3.list_arguments() 79 | arg_map = dict(zip(arg_names, alex_exec.arg_arrays)) 80 | grad_map = dict(zip(arg_names, alex_exec.grad_arrays)) 81 | 82 | 83 | param_blocks = [(i, arg_map[arg_names[i]], grad_map[arg_names[i]]) for i in range(len(arg_names)) if grad_map[arg_names[i]] != None] 84 | input_ndarray = arg_map["data"] 85 | grad = mx.nd.zeros((batch_size, 1000), ctx=mx.gpu()) 86 | param_len = len(param_blocks) 87 | 88 | 89 | # In[8]: 90 | 91 | #init 92 | for i in range(param_len): 93 | param_blocks[i][1][:] = mx.rnd.uniform(-0.01, 0.01, param_blocks[i][1].shape) 94 | param_blocks[i][2][:] = 0. 
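The benchmarking pattern used throughout this file — warm up, run a fixed number of iterations, force a single blocking synchronization, and report the mean — is framework-independent. A plain-Python sketch; `run_batch` and `synchronize` are hypothetical stand-ins for calls like `model.forward(...)` and `wait_to_read()`/`mx.nd.waitall()`:

```python
import time

def avg_time_per_batch(run_batch, synchronize, iters=100, warmup=5):
    """Average wall-clock time per batch over `iters` runs.

    `run_batch` may enqueue work asynchronously; `synchronize` must block
    until all queued work has finished, otherwise the timing is meaningless.
    """
    for _ in range(warmup):      # warm-up covers lazy allocation, autotuning, etc.
        run_batch()
    synchronize()                # drain any queued warm-up work before timing
    tic = time.time()
    for _ in range(iters):
        run_batch()
    synchronize()                # one blocking sync, as in the code in this file
    return (time.time() - tic) / iters

# toy usage: "work" is a small CPU spin, sync is a no-op
print(avg_time_per_batch(lambda: sum(range(1000)), lambda: None,
                         iters=10, warmup=2))
```

Synchronizing once outside the loop (rather than per iteration) keeps the engine's pipelining intact while still charging all queued work to the measurement.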
95 | # Set data 96 | train_iter.reset() 97 | dbatch = train_iter.next() 98 | dbatch.data[0].copyto(input_ndarray) 99 | # block until all pending async operations complete 100 | mx.nd.waitall() 101 | 102 | 103 | # In[12]: 104 | 105 | # Test forward 106 | def test_forward(model, epoch): 107 | tic = time.time() 108 | for i in range(epoch): 109 | model.forward(is_train=True) 110 | # Note: this call forces the thread engine to block, which hurts performance a lot, 111 | # but removing it would bias the timing in favor of parallelism 112 | # model.outputs[0].wait_to_read() 113 | model.outputs[0].wait_to_read() 114 | toc = time.time() 115 | return (toc - tic) / epoch 116 | 117 | print("Avg forward per batch: ", test_forward(alex_exec, num_epoch)) 118 | 119 | 120 | # In[13]: 121 | 122 | # Test full path 123 | def test_full(model, epoch): 124 | tic = time.time() 125 | for i in range(epoch): 126 | model.forward(is_train=True) 127 | model.backward([grad]) 128 | #model.outputs[0].wait_to_read() 129 | # mx.nd.waitall() 130 | # mock update 131 | for i in range(param_len): 132 | param_blocks[i][1][:] -= 0.0 * param_blocks[i][2][:] 133 | # Note: this call forces the thread engine to block, which hurts performance a lot 134 | mx.nd.waitall() 135 | toc = time.time() 136 | return (toc - tic) / epoch 137 | 138 | print("Avg fullpath per batch: ", test_full(alex_exec, num_epoch)) 139 | 140 | 141 | # In[ ]: 142 | 143 | -------------------------------------------------------------------------------- /mxnet/gnetv1.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # # Before start 4 | # 5 | # There are many important [environment variables](https://mxnet.readthedocs.org/en/latest/env_var.html) which influence performance. Changing these variables changes the parallelism and memory cost.
6 | # 7 | # sample command: 8 | # ``` 9 | # MXNET_GPU_WORKER_NTHREADS=4 MXNET_EXEC_NUM_TEMP=4 python3 googlenet.py 10 | # ``` 11 | # 12 | # Speed and memory cost may change due to different level of parallelism 13 | 14 | # In[1]: 15 | 16 | import mxnet as mx 17 | import numpy as np 18 | import time 19 | 20 | 21 | # In[2]: 22 | 23 | # Basic Info 24 | dev = mx.gpu() 25 | batch_size = 128 26 | dshape = (batch_size, 3, 224, 224) 27 | lshape = (batch_size) 28 | num_epoch = 100 29 | 30 | # Mock data iterator 31 | tmp_data = np.random.uniform(-128, 128, dshape).astype("float32") 32 | tmp_label = np.random.uniform(0, 1000, lshape).astype("int").astype("float32") 33 | 34 | train_iter = mx.io.NDArrayIter(data=tmp_data, label=tmp_label, batch_size=batch_size, shuffle=False, last_batch_handle='pad') 35 | 36 | 37 | 38 | # GoogLeNet V1: Converted from [Caffe](https://github.com/BVLC/caffe/blob/master/models/bvlc_googlenet/deploy.prototxt) directly 39 | 40 | def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''): 41 | conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) 42 | act = mx.symbol.Activation(data=conv, act_type='relu', name='relu_%s%s' %(name, suffix)) 43 | return act 44 | 45 | def InceptionFactory(data, num_1x1, num_3x3red, num_3x3, num_d5x5red, num_d5x5, pool, proj, name): 46 | # 1x1 47 | c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name)) 48 | # 3x3 reduce + 3x3 49 | c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') 50 | c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name)) 51 | # double 3x3 reduce + double 3x3 52 | cd5x5r = ConvFactory(data=data, num_filter=num_d5x5red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') 53 | cd5x5 = ConvFactory(data=cd5x5r, num_filter=num_d5x5, kernel=(5, 5), 
pad=(2, 2), name=('%s_double_3x3_1' % name)) 54 | # pool + proj 55 | pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) 56 | cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name)) 57 | # concat 58 | concat = mx.symbol.Concat(*[c1x1, c3x3, cd5x5, cproj], name='ch_concat_%s_chconcat' % name) 59 | return concat 60 | 61 | data = mx.sym.Variable("data") 62 | conv1 = ConvFactory(data, 64, kernel=(7, 7), stride=(2,2), pad=(3, 3)) 63 | pool1 = mx.sym.Pooling(conv1, kernel=(3, 3), stride=(2, 2), pool_type="max") 64 | conv2 = ConvFactory(pool1, 64, kernel=(1, 1), stride=(1,1)) 65 | conv3 = ConvFactory(conv2, 192, kernel=(3, 3), stride=(1, 1), pad=(1,1)) 66 | pool3 = mx.sym.Pooling(conv3, kernel=(3, 3), stride=(2, 2), pool_type="max") 67 | 68 | in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, "max", 32, name="in3a") 69 | in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, "max", 64, name="in3b") 70 | pool4 = mx.sym.Pooling(in3b, kernel=(3, 3), stride=(2, 2), pool_type="max") 71 | in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, "max", 64, name="in4a") 72 | in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, "max", 64, name="in4b") 73 | in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, "max", 64, name="in4c") 74 | in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, "max", 64, name="in4d") 75 | in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, "max", 128, name="in4e") 76 | pool5 = mx.sym.Pooling(in4e, kernel=(3, 3), stride=(2, 2), pool_type="max") 77 | in5a = InceptionFactory(pool5, 256, 160, 320, 32, 128, "max", 128, name="in5a") 78 | in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, "max", 128, name="in5b") 79 | pool6 = mx.sym.Pooling(in5b, kernel=(7, 7), stride=(1,1), pool_type="avg") 80 | flatten = mx.sym.Flatten(data=pool6) 81 | loss3_classifier = mx.sym.FullyConnected(data=flatten, num_hidden=1000) 82 | 83 | 84 | # In[4]: 85 | 86 
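Each InceptionFactory call above concatenates its four branches along the channel axis, so a module's output depth is simply num_1x1 + num_3x3 + num_d5x5 + proj. A quick sanity check of the depths produced by the stack above (pure Python, no mxnet required; the per-module numbers are copied from the InceptionFactory calls in this file):

```python
def inception_out_channels(num_1x1, num_3x3, num_d5x5, proj):
    # mx.symbol.Concat joins the branch outputs along the channel axis,
    # so the branch depths simply add up.
    return num_1x1 + num_3x3 + num_d5x5 + proj

# (num_1x1, num_3x3, num_d5x5, proj) for each module defined above;
# the reduce widths do not affect the output depth.
modules = {
    "in3a": (64, 128, 32, 32),
    "in3b": (128, 192, 96, 64),
    "in4a": (192, 208, 48, 64),
    "in4b": (160, 224, 64, 64),
    "in4c": (128, 256, 64, 64),
    "in4d": (112, 288, 64, 64),
    "in4e": (256, 320, 128, 128),
    "in5a": (256, 320, 128, 128),
    "in5b": (384, 384, 128, 128),
}
depths = {name: inception_out_channels(*args) for name, args in modules.items()}
print(depths["in3a"], depths["in5b"])  # 256 1024
```

These match the canonical GoogLeNet depths (256 after inception 3a, 1024 going into the classifier), which is a cheap way to confirm the Caffe-to-mxnet conversion above.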
| # bind to get executor 87 | # This is what happens behind mx.model.FeedForward 88 | g_exec = loss3_classifier.simple_bind(ctx=dev, grad_req="write", data=dshape) 89 | print("Temp Space: ", g_exec.debug_str().split('\n')[-3]) 90 | # Find where to set data 91 | 92 | 93 | # In[5]: 94 | 95 | # data structures 96 | arg_names = loss3_classifier.list_arguments() 97 | arg_map = dict(zip(arg_names, g_exec.arg_arrays)) 98 | grad_map = dict(zip(arg_names, g_exec.grad_arrays)) 99 | 100 | 101 | param_blocks = [(i, arg_map[arg_names[i]], grad_map[arg_names[i]]) for i in range(len(arg_names)) if grad_map[arg_names[i]] is not None] 102 | input_ndarray = arg_map["data"] 103 | #label_ndarray = arg_map["prob_label"] 104 | grad = mx.nd.zeros((batch_size, 1000), ctx=mx.gpu()) 105 | param_len = len(param_blocks) 106 | 107 | 108 | # In[6]: 109 | 110 | # init 111 | for i in range(param_len): 112 | param_blocks[i][1][:] = mx.rnd.uniform(-0.01, 0.01, param_blocks[i][1].shape) 113 | param_blocks[i][2][:] = 0. 114 | # Set data 115 | train_iter.reset() 116 | dbatch = train_iter.next() 117 | dbatch.data[0].copyto(input_ndarray) 118 | #dbatch.label[0].copyto(label_ndarray) 119 | # block until all pending async operations finish 120 | mx.nd.waitall() 121 | 122 | 123 | # In[ ]: 124 | 125 | # Test forward 126 | def test_forward(model, epoch): 127 | tic = time.time() 128 | for i in range(epoch): 129 | model.forward(is_train=True) 130 | # Note: this call forces the thread engine to block, which hurts performance a lot, 131 | # but removing it would let queued work escape the timed region and bias the result 132 | model.outputs[0].wait_to_read() 133 | toc = time.time() 134 | return (toc - tic) / epoch 135 | 136 | print("Avg forward per batch: ", test_forward(g_exec, num_epoch)) 137 | 138 | 139 | # In[ ]: 140 | 141 | # Test full path 142 | def test_full(model, epoch): 143 | tic = time.time() 144 | for i in range(epoch): 145 | model.forward(is_train=True) 146 | model.backward([grad]) 147 | # mock update, prevents NaN 148 | for i in range(param_len): 149 | param_blocks[i][1][:]
-= 0.0 * param_blocks[i][2] 150 | # Note: This command will force thread engine block, which hurts performance a lot 151 | mx.nd.waitall() 152 | toc = time.time() 153 | return (toc - tic) / epoch 154 | 155 | print("Avg fullpath per batch: ", test_full(g_exec, num_epoch)) 156 | 157 | -------------------------------------------------------------------------------- /nervana/README.md: -------------------------------------------------------------------------------- 1 | Nervana Systems provided me with limited-release beta kernels for benchmarking (and correctness-checking). 2 | 3 | They are open to releasing them publicly, but just dont have the bandwidth to support the open-release. 4 | So, if you would like to have the kernels, email them: http://www.nervanasys.com/ 5 | 6 | The kernels come similarly packaged to their https://github.com/NervanaSystems/nervana-lib-gpu-performance-preview 7 | 8 | 9 | SETUP 10 | ===== 11 | ``` 12 | git clone git@github.com:NervanaSystems/maxas.git 13 | cd maxas 14 | perl Makefile.PL 15 | make 16 | sudo make install 17 | 18 | cd .. 19 | git clone -b convnew git@github.com:NervanaSystems/nervanagpu.git 20 | cd nervanagpu 21 | make kernels 22 | make python 23 | 24 | cd .. 25 | python nervanagpu/benchmarks/convnet-benchmarks.py |tee output.log 26 | ``` 27 | 28 | I am copying over the convnet-benchmarks.py to this folder for public viewing, just to make sure that there's nothing funny in the benchmarking itself and that it's clear that the benchmarking is done properly. 
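The Nervana logs that follow summarize each timed function as mean/median/min/max in milliseconds. For reference, a generic harness that produces the same kind of summary for any callable; this is plain Python's `statistics` module and assumes nothing about Nervana's internal benchmarking code:

```python
import time
import statistics

def benchmark(fn, repeat=10):
    """Time `fn` `repeat` times and summarize the samples in msec."""
    samples = []
    for _ in range(repeat):
        tic = time.perf_counter()
        fn()
        samples.append((time.perf_counter() - tic) * 1000.0)  # msec
    return {
        "mean": statistics.mean(samples),
        "median": statistics.median(samples),
        "min": min(samples),
        "max": max(samples),
    }

# Example: time a cheap CPU-bound workload standing in for fprop/bprop.
stats = benchmark(lambda: sum(range(100000)))
print({k: round(v, 3) for k, v in stats.items()})
```

Reporting the median alongside the mean, as the logs below do, is useful because a single slow outlier (e.g. a stray context switch) skews the mean but barely moves the median.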
29 | 30 | -------------------------------------------------------------------------------- /nervana/output.log: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------- 2 | | Func | Mean | Median | Min | Max | Units | 3 | ------------------------------------------------------------------------------------- 4 | | fprop | 28.419 | 28.351 | 27.964 | 28.898 | msec | 5 | | bprop | 58.95 | 58.862 | 58.014 | 59.568 | msec | 6 | | iteration | 87.369 | 87.244 | 86.211 | 88.159 | msec | 7 | ------------------------------------------------------------------------------------- 8 | ------------------------------------------------------------------------------------- 9 | | Func | Mean | Median | Min | Max | Units | 10 | ------------------------------------------------------------------------------------- 11 | | fprop | 69.369 | 69.53 | 67.769 | 70.495 | msec | 12 | | bprop | 141.77 | 141.8 | 139.1 | 144.12 | msec | 13 | | iteration | 211.14 | 211.35 | 207.34 | 213.94 | msec | 14 | ------------------------------------------------------------------------------------- 15 | ------------------------------------------------------------------------------------- 16 | | Func | Mean | Median | Min | Max | Units | 17 | ------------------------------------------------------------------------------------- 18 | | fprop | 103.35 | 103.37 | 101.71 | 106.44 | msec | 19 | | bprop | 217.43 | 217.35 | 214.4 | 220.68 | msec | 20 | | iteration | 320.78 | 320.9 | 317.45 | 325.21 | msec | 21 | ------------------------------------------------------------------------------------- 22 | ------------------------------------------------------------------------------------- 23 | | Func | Mean | Median | Min | Max | Units | 24 | ------------------------------------------------------------------------------------- 25 | | fprop | 84.649 | 84.714 | 84.049 | 85.111 | msec | 26 | | bprop | 186.01 | 
185.99 | 185.5 | 186.74 | msec | 27 | | iteration | 270.66 | 270.67 | 269.88 | 271.64 | msec | 28 | ------------------------------------------------------------------------------------- 29 | -------------------------------------------------------------------------------- /nervana/output_fp16.log: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------- 2 | | Func | Mean | Median | Min | Max | Units | 3 | ------------------------------------------------------------------------------------- 4 | | fprop | 25.633 | 25.634 | 25.565 | 25.695 | msec | 5 | | bprop | 52.985 | 52.967 | 52.84 | 53.2 | msec | 6 | | iteration | 78.618 | 78.608 | 78.423 | 78.839 | msec | 7 | ------------------------------------------------------------------------------------- 8 | ------------------------------------------------------------------------------------- 9 | | Func | Mean | Median | Min | Max | Units | 10 | ------------------------------------------------------------------------------------- 11 | | fprop | 58.682 | 58.747 | 57.911 | 59.675 | msec | 12 | | bprop | 118.17 | 118.25 | 116.78 | 120.05 | msec | 13 | | iteration | 176.85 | 176.98 | 174.75 | 179.37 | msec | 14 | ------------------------------------------------------------------------------------- 15 | ------------------------------------------------------------------------------------- 16 | | Func | Mean | Median | Min | Max | Units | 17 | ------------------------------------------------------------------------------------- 18 | | fprop | 82.345 | 82.291 | 81.031 | 83.502 | msec | 19 | | bprop | 171.82 | 171.82 | 169.8 | 173.32 | msec | 20 | | iteration | 254.16 | 254.22 | 251.99 | 256.33 | msec | 21 | ------------------------------------------------------------------------------------- 22 | ------------------------------------------------------------------------------------- 23 | | Func | Mean | Median | Min | Max 
| Units | 24 | ------------------------------------------------------------------------------------- 25 | | fprop | 72.969 | 72.972 | 72.858 | 73.074 | msec | 26 | | bprop | 157.08 | 157.07 | 156.89 | 157.37 | msec | 27 | | iteration | 230.05 | 230.04 | 229.88 | 230.39 | msec | 28 | ------------------------------------------------------------------------------------- 29 | -------------------------------------------------------------------------------- /nnforge/INSTALL.md: -------------------------------------------------------------------------------- 1 | 2 | Clone nnForge 3 | ------------- 4 | ``` 5 | git clone https://github.com/milakov/nnForge.git 6 | cd nnForge 7 | ``` 8 | 9 | Modify Settings.mk 10 | ------------------ 11 | ``` 12 | diff --git a/Settings.mk b/Settings.mk 13 | index 3b8b945..96f030f 100644 14 | --- a/Settings.mk 15 | +++ b/Settings.mk 16 | @@ -1,20 +1,20 @@ 17 | BUILD_MODE=release 18 | ENABLE_CUDA_BACKEND=yes 19 | -ENABLE_CUDA_PROFILING=no 20 | +ENABLE_CUDA_PROFILING=yes 21 | CPP11COMPILER=no 22 | -BOOST_PATH=/usr/local 23 | +BOOST_PATH=/usr/ 24 | OPENCV_PATH=/usr/local 25 | -NETCDF_INSTALLED=yes 26 | +NETCDF_INSTALLED=no 27 | NETCDF_PATH= 28 | -MATIO_INSTALLED=yes 29 | +MATIO_INSTALLED=no 30 | MATIO_PATH= 31 | CUDA_PATH=/usr/local/cuda 32 | NVCC=nvcc 33 | NNFORGE_PATH=../.. 
34 | -NNFORGE_INPUT_DATA_PATH=/home/max/nnforge/input_data 35 | -NNFORGE_WORKING_DATA_PATH=/home/max/nnforge/working_data 36 | +NNFORGE_INPUT_DATA_PATH=./nnforge/input_data 37 | +NNFORGE_WORKING_DATA_PATH=./nnforge/working_data 38 | 39 | -BOOST_LIBS=-lboost_thread-mt -lboost_regex-mt -lboost_chrono-mt -lboost_filesystem-mt -lboost_program_options-mt -lboost_random-mt -lboost_system-mt -lboost_date_time-mt 40 | +BOOST_LIBS=-lboost_thread -lboost_regex -lboost_chrono -lboost_filesystem -lboost_program_options -lboost_random -lboost_system -lboost_date_time 41 | OPENCV_LIBS=-lopencv_highgui -lopencv_imgproc -lopencv_core 42 | NETCDF_LIBS=-lnetcdf 43 | MATIO_LIBS=-lmatio 44 | ``` 45 | 46 | Compile nnForge 47 | --------------- 48 | ``` 49 | ./make_all.sh 50 | ``` 51 | -------------------------------------------------------------------------------- /nnforge/README.md: -------------------------------------------------------------------------------- 1 | For compiling nnForge, look at INSTALL.md 2 | 3 | nnForge Convolution kernel (kepler) is at: https://github.com/milakov/nnForge/blob/master/nnforge/cuda/convolution_layer_updater_cuda_kepler.cuh 4 | 5 | Quote from Maxim: 6 | > I am sorry to say that it is not an easy task at all. The actual convolution is done in https://github.com/milakov/nnForge/blob/master/nnforge/cuda/convolution_layer_tester_cuda_kepler.cuh but you cannot just call enqueue_test and sync on the stream, you would need to do a lot of preparation calls. 7 | 8 | So, I guess I'll tackle this last, seems like a complicated task. 
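The benchmark.cpp below queries nnForge for the flop counts of a 3→96, 11x11 convolution on a 128x128 input. The raw forward count can be estimated independently with the usual dense-convolution formula; note this is a generic sketch (counting a multiply-add as 2 flops), not nnForge's own accounting, which may use a different convention:

```python
def conv_forward_flops(in_c, out_c, k, in_hw):
    # Valid convolution: each output pixel, per output channel, needs
    # in_c * k * k multiply-adds; count a multiply-add as 2 flops.
    out_hw = in_hw - k + 1
    return 2 * out_c * in_c * k * k * out_hw * out_hw

flops = conv_forward_flops(in_c=3, out_c=96, k=11, in_hw=128)
print(flops / 1e9)  # ~0.97 GFLOP per image
```

So the forward pass of this single layer is roughly one gigaflop per image, which gives a sense of scale when reading nnForge's per-layer numbers.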
9 | -------------------------------------------------------------------------------- /nnforge/benchmark/Makefile: -------------------------------------------------------------------------------- 1 | USE_BOOST=yes 2 | USE_OPENCV=yes 3 | USE_OPENMP=yes 4 | USE_CUDA=yes 5 | USE_NNFORGE=yes 6 | 7 | include ../../Settings.mk 8 | include ../../Main.mk 9 | 10 | include ../Example.mk 11 | 12 | -------------------------------------------------------------------------------- /nnforge/benchmark/benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <vector> 3 | 4 | #include <nnforge/nnforge.h> 5 | 6 | #include <nnforge/cuda/cuda.h> 7 | 8 | int main(int argc, char* argv[]) 9 | { 10 | try 11 | { 12 | nnforge::cuda::cuda::init(); 13 | nnforge::convolution_layer layer(std::vector<unsigned int>(2, 11), 3, 96); 14 | 15 | nnforge::layer_configuration_specific input_configuration; 16 | input_configuration.feature_map_count = 3; 17 | input_configuration.dimension_sizes.push_back(128); 18 | input_configuration.dimension_sizes.push_back(128); 19 | 20 | float fflops = layer.get_forward_flops(input_configuration); 21 | float bflops = layer.get_backward_flops(input_configuration); 22 | float bbflops = layer.get_backward_flops_2nd(input_configuration); 23 | std::cout << "convolution_layer 3->96 11x11" << std::endl; 24 | std::cout << ":forward gflops: " << fflops/1000000000 << std::endl; 25 | std::cout << ":backward gflops: " << bflops/1000000000 << std::endl; 26 | std::cout << ":hessian gflops: " << bbflops/1000000000 << std::endl; 27 | 28 | } 29 | catch (const std::exception& e) 30 | { 31 | std::cout << "Exception caught: " << e.what() << std::endl; 32 | return 1; 33 | } 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /tensorflow/BUILD: -------------------------------------------------------------------------------- 1 | py_binary( 2 | name = "benchmark_alexnet", 3 | srcs = [ 4 | "benchmark_alexnet.py", 5 | ], 6 |
srcs_version = "PY2AND3", 7 | deps = [ 8 | "//tensorflow:tensorflow_py", 9 | ], 10 | ) 11 | 12 | py_binary( 13 | name = "benchmark_overfeat", 14 | srcs = [ 15 | "benchmark_overfeat.py", 16 | ], 17 | srcs_version = "PY2AND3", 18 | deps = [ 19 | "//tensorflow:tensorflow_py", 20 | ], 21 | ) 22 | 23 | py_binary( 24 | name = "benchmark_vgg", 25 | srcs = [ 26 | "benchmark_vgg.py", 27 | ], 28 | srcs_version = "PY2AND3", 29 | deps = [ 30 | "//tensorflow:tensorflow_py", 31 | ], 32 | ) 33 | 34 | py_binary( 35 | name = "benchmark_googlenet", 36 | srcs = [ 37 | "benchmark_googlenet.py", 38 | ], 39 | srcs_version = "PY2AND3", 40 | deps = [ 41 | "//tensorflow:tensorflow_py", 42 | ], 43 | ) 44 | 45 | filegroup( 46 | name = "all_files", 47 | srcs = glob( 48 | ["**/*"], 49 | exclude = [ 50 | "**/METADATA", 51 | "**/OWNERS", 52 | ], 53 | ), 54 | visibility = ["//tensorflow:__subpackages__"], 55 | ) 56 | -------------------------------------------------------------------------------- /tensorflow/README.md: -------------------------------------------------------------------------------- 1 | - Downgrade CUDA to 7.0. 
It installs to: /usr/local/cuda-7.0 2 | - Download CuDNN R2 and extract it to: $HOME/Downloads/cudnn-6.5-linux-x64-v2 3 | - Install TensorFlow: 4 | 5 | pip install https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.5.0-cp27-none-linux_x86_64.whl 6 | 7 | -------------------------------------------------------------------------------- /tensorflow/benchmark_alexnet.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | from collections import namedtuple 3 | from datetime import datetime 4 | import csv 5 | import math 6 | import time 7 | 8 | import tensorflow.python.platform 9 | import tensorflow as tf 10 | 11 | FLAGS = tf.app.flags.FLAGS 12 | 13 | tf.app.flags.DEFINE_integer('batch_size', 128, 14 | """Batch size.""") 15 | tf.app.flags.DEFINE_integer('num_batches', 100, 16 | """Number of batches to run.""") 17 | tf.app.flags.DEFINE_boolean('forward_only', False, 18 | """Only run the forward pass.""") 19 | tf.app.flags.DEFINE_boolean('forward_backward_only', False, 20 | """Only run the forward-backward pass.""") 21 | tf.app.flags.DEFINE_string('data_format', 'NCHW', 22 | """The data format for Convnet operations. 23 | Can be either NHWC or NCHW. 24 | """) 25 | tf.app.flags.DEFINE_string('csv_file', '', 26 | """File to output timing information to in csv 27 | format. If no file is passed in, a csv file will 28 | not be created.
29 | """) 30 | 31 | 32 | parameters = [] 33 | 34 | conv_counter = 1 35 | pool_counter = 1 36 | affine_counter = 1 37 | 38 | TimingEntry = namedtuple( 39 | 'TimingEntry', ['info_string', 'timestamp', 'num_batches', 'mean', 'sd']) 40 | 41 | def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType): 42 | global conv_counter 43 | global parameters 44 | name = 'conv' + str(conv_counter) 45 | conv_counter += 1 46 | with tf.name_scope(name) as scope: 47 | kernel = tf.Variable(tf.truncated_normal([kH, kW, nIn, nOut], 48 | dtype=tf.float32, 49 | stddev=1e-1), name='weights') 50 | if FLAGS.data_format == 'NCHW': 51 | strides = [1, 1, dH, dW] 52 | else: 53 | strides = [1, dH, dW, 1] 54 | conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, 55 | data_format=FLAGS.data_format) 56 | biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), 57 | trainable=True, name='biases') 58 | bias = tf.reshape(tf.nn.bias_add(conv, biases, 59 | data_format=FLAGS.data_format), 60 | conv.get_shape()) 61 | conv1 = tf.nn.relu(bias, name=scope) 62 | parameters += [kernel, biases] 63 | return conv1 64 | 65 | def _affine(inpOp, nIn, nOut): 66 | global affine_counter 67 | global parameters 68 | name = 'affine' + str(affine_counter) 69 | affine_counter += 1 70 | with tf.name_scope(name) as scope: 71 | kernel = tf.Variable(tf.truncated_normal([nIn, nOut], 72 | dtype=tf.float32, 73 | stddev=1e-1), name='weights') 74 | biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), 75 | trainable=True, name='biases') 76 | affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) 77 | parameters += [kernel, biases] 78 | return affine1 79 | 80 | def _mpool(inpOp, kH, kW, dH, dW): 81 | global pool_counter 82 | global parameters 83 | name = 'pool' + str(pool_counter) 84 | pool_counter += 1 85 | if FLAGS.data_format == 'NCHW': 86 | ksize = [1, 1, kH, kW] 87 | strides = [1, 1, dH, dW] 88 | else: 89 | ksize = [1, kH, kW, 1] 90 | strides = [1, dH, dW, 1] 91 | return 
tf.nn.max_pool(inpOp, 92 | ksize=ksize, 93 | strides=strides, 94 | padding='VALID', 95 | data_format=FLAGS.data_format, 96 | name=name) 97 | 98 | def loss(logits, labels): 99 | batch_size = tf.size(labels) 100 | labels = tf.expand_dims(labels, 1) 101 | indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) 102 | concated = tf.concat([indices, labels], 1) 103 | onehot_labels = tf.sparse_to_dense( 104 | concated, tf.stack([batch_size, 1000]), 1.0, 0.0) 105 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits( 106 | logits=logits, labels=onehot_labels, name='xentropy') 107 | loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') 108 | return loss 109 | 110 | def inference(images): 111 | conv1 = _conv (images, 3, 64, 11, 11, 4, 4, 'VALID') 112 | pool1 = _mpool(conv1, 3, 3, 2, 2) 113 | conv2 = _conv (pool1, 64, 192, 5, 5, 1, 1, 'SAME') 114 | pool2 = _mpool(conv2, 3, 3, 2, 2) 115 | conv3 = _conv (pool2, 192, 384, 3, 3, 1, 1, 'SAME') 116 | conv4 = _conv (conv3, 384, 256, 3, 3, 1, 1, 'SAME') 117 | conv5 = _conv (conv4, 256, 256, 3, 3, 1, 1, 'SAME') 118 | pool5 = _mpool(conv5, 3, 3, 2, 2) 119 | resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6]) 120 | affn1 = _affine(resh1, 256 * 6 * 6, 4096) 121 | affn2 = _affine(affn1, 4096, 4096) 122 | affn3 = _affine(affn2, 4096, 1000) 123 | 124 | return affn3 125 | 126 | 127 | def time_tensorflow_run(session, target, info_string): 128 | num_steps_burn_in = 10 129 | total_duration = 0.0 130 | total_duration_squared = 0.0 131 | if not isinstance(target, list): 132 | target = [target] 133 | target_op = tf.group(*target) 134 | for i in range(FLAGS.num_batches + num_steps_burn_in): 135 | start_time = time.time() 136 | _ = session.run(target_op) 137 | duration = time.time() - start_time 138 | if i >= num_steps_burn_in:  # >= so that exactly num_batches durations are accumulated 139 | if not i % 10: 140 | print ('%s: step %d, duration = %.3f' % 141 | (datetime.now(), i - num_steps_burn_in, duration)) 142 | total_duration += duration 143 | total_duration_squared += duration * duration 144 | mn =
total_duration / FLAGS.num_batches 145 | vr = total_duration_squared / FLAGS.num_batches - mn * mn 146 | sd = math.sqrt(vr) 147 | print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % 148 | (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) 149 | return TimingEntry(info_string, datetime.now(), FLAGS.num_batches, mn, sd) 150 | 151 | def store_data_in_csv(timing_entries): 152 | with open(FLAGS.csv_file, 'wb') as csvfile: 153 | writer = csv.writer(csvfile) 154 | for timing_entry in timing_entries: 155 | writer.writerow( 156 | [timing_entry.info_string, timing_entry.timestamp, 157 | timing_entry.num_batches, timing_entry.mean, timing_entry.sd]) 158 | 159 | def run_benchmark(): 160 | global parameters 161 | timing_entries = [] 162 | with tf.Graph().as_default(): 163 | # Generate some dummy images. 164 | image_size = 224 165 | # Note that our padding definition is slightly different the cuda-convnet. 166 | # In order to force the model to start with the same activations sizes, 167 | # we add 3 to the image_size and employ VALID padding above. 168 | if FLAGS.data_format == 'NCHW': 169 | image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3] 170 | else: 171 | image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3] 172 | images = tf.Variable(tf.random_normal(image_shape, 173 | dtype=tf.float32, 174 | stddev=1e-1)) 175 | 176 | labels = tf.Variable(tf.ones([FLAGS.batch_size], 177 | dtype=tf.int32)) 178 | 179 | # Build a Graph that computes the logits predictions from the 180 | # inference model. 181 | last_layer = inference(images) 182 | 183 | # Build an initialization operation. 184 | init = tf.global_variables_initializer() 185 | 186 | # Start running operations on the Graph. 
187 | sess = tf.Session('') 188 | sess.run(init) 189 | 190 | run_forward = True 191 | run_forward_backward = True 192 | if FLAGS.forward_only and FLAGS.forward_backward_only: 193 | raise ValueError("Cannot specify --forward_only and " 194 | "--forward_backward_only at the same time.") 195 | if FLAGS.forward_only: 196 | run_forward_backward = False 197 | elif FLAGS.forward_backward_only: 198 | run_forward = False 199 | 200 | if run_forward: 201 | # Run the forward benchmark. 202 | timing_entries.append(time_tensorflow_run(sess, last_layer, "Forward")) 203 | 204 | if run_forward_backward: 205 | # Add a simple objective so we can calculate the backward pass. 206 | objective = loss(last_layer, labels) 207 | # Compute the gradient with respect to all the parameters. 208 | grad = tf.gradients(objective, parameters) 209 | # Run the backward benchmark. 210 | timing_entries.append(time_tensorflow_run(sess, grad, "Forward-backward")) 211 | 212 | if FLAGS.csv_file: 213 | store_data_in_csv(timing_entries) 214 | 215 | 216 | def main(_): 217 | run_benchmark() 218 | 219 | 220 | if __name__ == '__main__': 221 | tf.app.run() 222 | -------------------------------------------------------------------------------- /tensorflow/benchmark_overfeat.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | from collections import namedtuple 3 | from datetime import datetime 4 | import csv 5 | import math 6 | import time 7 | 8 | import tensorflow.python.platform 9 | import tensorflow as tf 10 | 11 | FLAGS = tf.app.flags.FLAGS 12 | 13 | tf.app.flags.DEFINE_integer('batch_size', 128, 14 | """Batch size.""") 15 | tf.app.flags.DEFINE_integer('num_batches', 100, 16 | """Number of batches to run.""") 17 | tf.app.flags.DEFINE_boolean('forward_only', False, 18 | """Only run the forward pass.""") 19 | tf.app.flags.DEFINE_boolean('forward_backward_only', False, 20 | """Only run the forward-forward pass.""") 21 | 
tf.app.flags.DEFINE_string('data_format', 'NCHW', 22 | """The data format for Convnet operations. 23 | Can be either NHWC or NCHW. 24 | """) 25 | tf.app.flags.DEFINE_string('csv_file', '', 26 | """File to output timing information to in csv 27 | format. If not file is passed in, csv file will 28 | not be cteated. 29 | """) 30 | 31 | parameters = [] 32 | 33 | conv_counter = 1 34 | pool_counter = 1 35 | affine_counter = 1 36 | 37 | TimingEntry = namedtuple( 38 | 'TimingEntry', ['info_string', 'timestamp', 'num_batches', 'mean', 'sd']) 39 | 40 | def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType): 41 | global conv_counter 42 | global parameters 43 | name = 'conv' + str(conv_counter) 44 | conv_counter += 1 45 | with tf.name_scope(name) as scope: 46 | kernel = tf.Variable(tf.truncated_normal([kH, kW, nIn, nOut], 47 | dtype=tf.float32, 48 | stddev=1e-1), name='weights') 49 | if FLAGS.data_format == 'NCHW': 50 | strides = [1, 1, dH, dW] 51 | else: 52 | strides = [1, dH, dW, 1] 53 | conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, 54 | data_format=FLAGS.data_format) 55 | biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), 56 | trainable=True, name='biases') 57 | bias = tf.reshape(tf.nn.bias_add(conv, biases, 58 | data_format=FLAGS.data_format), 59 | conv.get_shape()) 60 | conv1 = tf.nn.relu(bias, name=scope) 61 | parameters += [kernel, biases] 62 | return conv1 63 | 64 | def _affine(inpOp, nIn, nOut): 65 | global affine_counter 66 | global parameters 67 | name = 'affine' + str(affine_counter) 68 | affine_counter += 1 69 | with tf.name_scope(name) as scope: 70 | kernel = tf.Variable(tf.truncated_normal([nIn, nOut], 71 | dtype=tf.float32, 72 | stddev=1e-1), name='weights') 73 | biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), 74 | trainable=True, name='biases') 75 | affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) 76 | parameters += [kernel, biases] 77 | return affine1 78 | 79 | def _mpool(inpOp, kH, kW, dH, 
dW): 80 | global pool_counter 81 | global parameters 82 | name = 'pool' + str(pool_counter) 83 | pool_counter += 1 84 | if FLAGS.data_format == 'NCHW': 85 | ksize = [1, 1, kH, kW] 86 | strides = [1, 1, dH, dW] 87 | else: 88 | ksize = [1, kH, kW, 1] 89 | strides = [1, dH, dW, 1] 90 | return tf.nn.max_pool(inpOp, 91 | ksize=ksize, 92 | strides=strides, 93 | padding='VALID', 94 | data_format=FLAGS.data_format, 95 | name=name) 96 | 97 | def loss(logits, labels): 98 | batch_size = tf.size(labels) 99 | labels = tf.expand_dims(labels, 1) 100 | indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) 101 | concated = tf.concat([indices, labels], 1) 102 | onehot_labels = tf.sparse_to_dense( 103 | concated, tf.stack([batch_size, 1000]), 1.0, 0.0) 104 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits( 105 | logits=logits, labels=onehot_labels, name='xentropy') 106 | loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') 107 | return loss 108 | 109 | def inference(images): 110 | conv1 = _conv (images, 3, 96, 11, 11, 4, 4, 'VALID') 111 | pool1 = _mpool(conv1, 2, 2, 2, 2) 112 | conv2 = _conv(pool1, 96, 256, 5, 5, 1, 1, 'VALID') 113 | pool2 = _mpool(conv2, 2, 2, 2, 2) 114 | conv3 = _conv (pool2, 256, 512, 3, 3, 1, 1, 'SAME') 115 | conv4 = _conv (conv3, 512, 1024, 3, 3, 1, 1, 'SAME') 116 | conv5 = _conv (conv4, 1024, 1024, 3, 3, 1, 1, 'SAME') 117 | pool5 = _mpool(conv5, 2, 2, 2, 2) 118 | resh1 = tf.reshape(pool5, [-1, 1024 * 6 * 6]) 119 | affn1 = _affine(resh1, 1024 * 6 * 6, 3072) 120 | affn2 = _affine(affn1, 3072, 4096) 121 | affn3 = _affine(affn2, 4096, 1000) 122 | 123 | return affn3 124 | 125 | 126 | def time_tensorflow_run(session, target, info_string): 127 | num_steps_burn_in = 10 128 | total_duration = 0.0 129 | total_duration_squared = 0.0 130 | if not isinstance(target, list): 131 | target = [target] 132 | target_op = tf.group(*target) 133 | for i in range(FLAGS.num_batches + num_steps_burn_in): 134 | start_time = time.time() 135 | _ = session.run(target_op) 136 
| duration = time.time() - start_time 137 | if i > num_steps_burn_in: 138 | if not i % 10: 139 | print ('%s: step %d, duration = %.3f' % 140 | (datetime.now(), i - num_steps_burn_in, duration)) 141 | total_duration += duration 142 | total_duration_squared += duration * duration 143 | mn = total_duration / FLAGS.num_batches 144 | vr = total_duration_squared / FLAGS.num_batches - mn * mn 145 | sd = math.sqrt(vr) 146 | print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % 147 | (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) 148 | return TimingEntry(info_string, datetime.now(), FLAGS.num_batches, mn, sd) 149 | 150 | def store_data_in_csv(timing_entries): 151 | with open(FLAGS.csv_file, 'wb') as csvfile: 152 | writer = csv.writer(csvfile) 153 | for timing_entry in timing_entries: 154 | writer.writerow( 155 | [timing_entry.info_string, timing_entry.timestamp, 156 | timing_entry.num_batches, timing_entry.mean, timing_entry.sd]) 157 | 158 | def run_benchmark(): 159 | global parameters 160 | timing_entries = [] 161 | with tf.Graph().as_default(): 162 | # Generate some dummy images. 163 | image_size = 231 164 | # Note that our padding definition is slightly different the cuda-convnet. 165 | # In order to force the model to start with the same activations sizes, 166 | # we add 3 to the image_size and employ VALID padding above. 167 | if FLAGS.data_format == 'NCHW': 168 | image_shape = [FLAGS.batch_size, 3, image_size, image_size] 169 | else: 170 | image_shape = [FLAGS.batch_size, image_size, image_size, 3] 171 | images = tf.Variable(tf.random_normal(image_shape, 172 | dtype=tf.float32, 173 | stddev=1e-1)) 174 | 175 | labels = tf.Variable(tf.ones([FLAGS.batch_size], 176 | dtype=tf.int32)) 177 | 178 | # Build a Graph that computes the logits predictions from the 179 | # inference model. 180 | last_layer = inference(images) 181 | 182 | # Build an initialization operation. 
183 | init = tf.global_variables_initializer() 184 | 185 | # Start running operations on the Graph. 186 | sess = tf.Session('') 187 | sess.run(init) 188 | 189 | run_forward = True 190 | run_forward_backward = True 191 | if FLAGS.forward_only and FLAGS.forward_backward_only: 192 | raise ValueError("Cannot specify --forward_only and " 193 | "--forward_backward_only at the same time.") 194 | if FLAGS.forward_only: 195 | run_forward_backward = False 196 | elif FLAGS.forward_backward_only: 197 | run_forward = False 198 | 199 | if run_forward: 200 | # Run the forward benchmark. 201 | timing_entries.append(time_tensorflow_run(sess, last_layer, "Forward")) 202 | 203 | if run_forward_backward: 204 | # Add a simple objective so we can calculate the backward pass. 205 | objective = loss(last_layer, labels) 206 | # Compute the gradient with respect to all the parameters. 207 | grad = tf.gradients(objective, parameters) 208 | # Run the backward benchmark. 209 | timing_entries.append(time_tensorflow_run(sess, grad, "Forward-backward")) 210 | 211 | if FLAGS.csv_file: 212 | store_data_in_csv(timing_entries) 213 | 214 | 215 | def main(_): 216 | run_benchmark() 217 | 218 | 219 | if __name__ == '__main__': 220 | tf.app.run() 221 | -------------------------------------------------------------------------------- /tensorflow/benchmark_vgg.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | from collections import namedtuple 3 | from datetime import datetime 4 | import csv 5 | import math 6 | import time 7 | 8 | import tensorflow.python.platform 9 | import tensorflow as tf 10 | 11 | FLAGS = tf.app.flags.FLAGS 12 | 13 | # TODO: why is batch size 64 going OOM? 
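A rough answer to the TODO above: at batch 64 the conv-layer activations alone already reach the multi-gigabyte range before weights, weight gradients, and cuDNN workspace are counted. A back-of-the-envelope estimate, assuming the standard VGG-A conv output shapes (this approximates neither TensorFlow's allocator nor any workspace use):

```python
# Back-of-the-envelope activation memory for VGG-A conv layers at batch 64.
batch = 64
convs = [  # (output channels, spatial side) of each conv layer in VGG-A
    (64, 224),
    (128, 112),
    (256, 56), (256, 56),
    (512, 28), (512, 28),
    (512, 14), (512, 14),
]
floats_per_image = sum(c * s * s for c, s in convs)
fwd_bytes = batch * floats_per_image * 4   # float32 activations, forward only
train_bytes = 2 * fwd_bytes                # + gradients w.r.t. activations
print(round(train_bytes / 2**30, 2), "GiB")
```

The result is on the order of 3.5 GiB, so once parameters (~133M floats, times extra copies for gradients) and workspace are added, batch 64 can plausibly exhaust a mid-2010s GPU's memory.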
14 | tf.app.flags.DEFINE_integer('batch_size', 64, 15 | """Batch size.""") 16 | tf.app.flags.DEFINE_integer('num_batches', 100, 17 | """Number of batches to run.""") 18 | tf.app.flags.DEFINE_boolean('forward_only', False, 19 | """Only run the forward pass.""") 20 | tf.app.flags.DEFINE_boolean('forward_backward_only', False, 21 | """Only run the forward-backward pass.""") 22 | tf.app.flags.DEFINE_string('data_format', 'NCHW', 23 | """The data format for Convnet operations. 24 | Can be either NHWC or NCHW. 25 | """) 26 | tf.app.flags.DEFINE_string('csv_file', '', 27 | """File to output timing information to in csv 28 | format. If no file is passed in, the csv file will 29 | not be created. 30 | """) 31 | 32 | parameters = [] 33 | 34 | conv_counter = 1 35 | pool_counter = 1 36 | affine_counter = 1 37 | 38 | TimingEntry = namedtuple( 39 | 'TimingEntry', ['info_string', 'timestamp', 'num_batches', 'mean', 'sd']) 40 | 41 | def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType): 42 | global conv_counter 43 | global parameters 44 | name = 'conv' + str(conv_counter) 45 | conv_counter += 1 46 | with tf.name_scope(name) as scope: 47 | kernel = tf.Variable(tf.truncated_normal([kH, kW, nIn, nOut], 48 | dtype=tf.float32, 49 | stddev=1e-1), name='weights') 50 | if FLAGS.data_format == 'NCHW': 51 | strides = [1, 1, dH, dW] 52 | else: 53 | strides = [1, dH, dW, 1] 54 | conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, 55 | data_format=FLAGS.data_format) 56 | biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), 57 | trainable=True, name='biases') 58 | bias = tf.reshape(tf.nn.bias_add(conv, biases, 59 | data_format=FLAGS.data_format), 60 | conv.get_shape()) 61 | conv1 = tf.nn.relu(bias, name=scope) 62 | parameters += [kernel, biases] 63 | return conv1 64 | 65 | def _affine(inpOp, nIn, nOut): 66 | global affine_counter 67 | global parameters 68 | name = 'affine' + str(affine_counter) 69 | affine_counter += 1 70 | with tf.name_scope(name) as scope: 71 | kernel =
tf.Variable(tf.truncated_normal([nIn, nOut], 72 | dtype=tf.float32, 73 | stddev=1e-1), name='weights') 74 | biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), 75 | trainable=True, name='biases') 76 | affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) 77 | parameters += [kernel, biases] 78 | return affine1 79 | 80 | def _mpool(inpOp, kH, kW, dH, dW): 81 | global pool_counter 82 | global parameters 83 | name = 'pool' + str(pool_counter) 84 | pool_counter += 1 85 | if FLAGS.data_format == 'NCHW': 86 | ksize = [1, 1, kH, kW] 87 | strides = [1, 1, dH, dW] 88 | else: 89 | ksize = [1, kH, kW, 1] 90 | strides = [1, dH, dW, 1] 91 | return tf.nn.max_pool(inpOp, 92 | ksize=ksize, 93 | strides=strides, 94 | padding='VALID', 95 | data_format=FLAGS.data_format, 96 | name=name) 97 | 98 | def loss(logits, labels): 99 | batch_size = tf.size(labels) 100 | labels = tf.expand_dims(labels, 1) 101 | indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) 102 | concated = tf.concat([indices, labels], 1) 103 | onehot_labels = tf.sparse_to_dense( 104 | concated, tf.stack([batch_size, 1000]), 1.0, 0.0) 105 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits( 106 | logits=logits, labels=onehot_labels, name='xentropy') 107 | loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') 108 | return loss 109 | 110 | def inference(images): 111 | conv1 = _conv (images, 3, 64, 3, 3, 1, 1, 'SAME') 112 | pool1 = _mpool(conv1, 2, 2, 2, 2) 113 | conv2 = _conv (pool1, 64, 128, 3, 3, 1, 1, 'SAME') 114 | pool2 = _mpool(conv2, 2, 2, 2, 2) 115 | conv3 = _conv (pool2, 128, 256, 3, 3, 1, 1, 'SAME') 116 | conv4 = _conv (conv3, 256, 256, 3, 3, 1, 1, 'SAME') 117 | pool4 = _mpool(conv4, 2, 2, 2, 2) 118 | conv5 = _conv (pool4, 256, 512, 3, 3, 1, 1, 'SAME') 119 | conv6 = _conv (conv5, 512, 512, 3, 3, 1, 1, 'SAME') 120 | pool6 = _mpool(conv6, 2, 2, 2, 2) 121 | conv7 = _conv (pool6, 512, 512, 3, 3, 1, 1, 'SAME') 122 | conv8 = _conv (conv7, 512, 512, 3, 3, 1, 1, 'SAME') 123 | pool8 = 
_mpool(conv8, 2, 2, 2, 2) 124 | resh1 = tf.reshape(pool8, [-1, 512 * 7 * 7]) 125 | affn1 = _affine(resh1, 512 * 7 * 7, 4096) 126 | affn2 = _affine(affn1, 4096, 4096) 127 | affn3 = _affine(affn2, 4096, 1000) 128 | 129 | return affn3 130 | 131 | 132 | def time_tensorflow_run(session, target, info_string): 133 | num_steps_burn_in = 10 134 | total_duration = 0.0 135 | total_duration_squared = 0.0 136 | if not isinstance(target, list): 137 | target = [target] 138 | target_op = tf.group(*target) 139 | for i in range(FLAGS.num_batches + num_steps_burn_in): 140 | start_time = time.time() 141 | _ = session.run(target_op) 142 | duration = time.time() - start_time 143 | if i >= num_steps_burn_in: 144 | if not i % 10: 145 | print ('%s: step %d, duration = %.3f' % 146 | (datetime.now(), i - num_steps_burn_in, duration)) 147 | total_duration += duration 148 | total_duration_squared += duration * duration 149 | mn = total_duration / FLAGS.num_batches 150 | vr = total_duration_squared / FLAGS.num_batches - mn * mn 151 | sd = math.sqrt(vr) 152 | print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % 153 | (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) 154 | return TimingEntry(info_string, datetime.now(), FLAGS.num_batches, mn, sd) 155 | 156 | def store_data_in_csv(timing_entries): 157 | with open(FLAGS.csv_file, 'wb') as csvfile: 158 | writer = csv.writer(csvfile) 159 | for timing_entry in timing_entries: 160 | writer.writerow( 161 | [timing_entry.info_string, timing_entry.timestamp, 162 | timing_entry.num_batches, timing_entry.mean, timing_entry.sd]) 163 | 164 | def run_benchmark(): 165 | global parameters 166 | timing_entries = [] 167 | with tf.Graph().as_default(): 168 | # Generate some dummy images. 
169 | image_size = 224 170 | if FLAGS.data_format == 'NCHW': 171 | image_shape = [FLAGS.batch_size, 3, image_size, image_size] 172 | else: 173 | image_shape = [FLAGS.batch_size, image_size, image_size, 3] 174 | images = tf.Variable(tf.ones(image_shape, dtype=tf.float32)) 175 | 176 | labels = tf.Variable(tf.ones([FLAGS.batch_size], 177 | dtype=tf.int32)) 178 | 179 | # Build a Graph that computes the logits predictions from the 180 | # inference model. 181 | last_layer = inference(images) 182 | 183 | # Build an initialization operation. 184 | init = tf.global_variables_initializer() 185 | 186 | # Start running operations on the Graph. 187 | sess = tf.Session('') 188 | sess.run(init) 189 | 190 | run_forward = True 191 | run_forward_backward = True 192 | if FLAGS.forward_only and FLAGS.forward_backward_only: 193 | raise ValueError("Cannot specify --forward_only and " 194 | "--forward_backward_only at the same time.") 195 | if FLAGS.forward_only: 196 | run_forward_backward = False 197 | elif FLAGS.forward_backward_only: 198 | run_forward = False 199 | 200 | if run_forward: 201 | # Run the forward benchmark. 202 | timing_entries.append(time_tensorflow_run(sess, last_layer, "Forward")) 203 | 204 | if run_forward_backward: 205 | # Add a simple objective so we can calculate the backward pass. 206 | objective = loss(last_layer, labels) 207 | # Compute the gradient with respect to all the parameters. 208 | grad = tf.gradients(objective, parameters) 209 | # Run the backward benchmark. 
210 | timing_entries.append(time_tensorflow_run(sess, grad, "Forward-backward")) 211 | 212 | if FLAGS.csv_file: 213 | store_data_in_csv(timing_entries) 214 | 215 | 216 | def main(_): 217 | run_benchmark() 218 | 219 | 220 | if __name__ == '__main__': 221 | tf.app.run() 222 | -------------------------------------------------------------------------------- /tensorflow/output_alexnet.log: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcublas.so locally 2 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcudnn.so locally 3 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcufft.so locally 4 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcurand.so locally 6 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 7 | name: GeForce GTX TITAN X 8 | major: 5 minor: 2 memoryClockRate (GHz) 1.076 9 | pciBusID 0000:06:00.0 10 | Total memory: 12.00GiB 11 | Free memory: 11.87GiB 12 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 14 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:06:00.0) 15 | 2016-04-25 13:46:58.813401: step 10, duration = 0.027 16 | 2016-04-25 13:46:59.079705: step 20, duration = 0.027 17 | 2016-04-25 13:46:59.345377: step 30, duration = 0.026 18 | 2016-04-25 13:46:59.612196: step 40, duration = 0.027 19 | 2016-04-25 13:46:59.878704: step 50, duration = 0.027 20 | 2016-04-25 13:47:00.145232: step 60, duration = 0.027 21 | 2016-04-25 13:47:00.411876: step 70, duration = 0.027 22 | 2016-04-25 13:47:00.678707: step 80, 
duration = 0.027 23 | 2016-04-25 13:47:00.945364: step 90, duration = 0.027 24 | 2016-04-25 13:47:01.185530: Forward across 100 steps, 0.026 +/- 0.003 sec / batch 25 | 2016-04-25 13:47:02.941281: step 10, duration = 0.082 26 | 2016-04-25 13:47:03.761663: step 20, duration = 0.082 27 | 2016-04-25 13:47:04.582690: step 30, duration = 0.082 28 | 2016-04-25 13:47:05.404722: step 40, duration = 0.082 29 | 2016-04-25 13:47:06.223997: step 50, duration = 0.082 30 | 2016-04-25 13:47:07.050684: step 60, duration = 0.082 31 | 2016-04-25 13:47:07.873504: step 70, duration = 0.082 32 | 2016-04-25 13:47:08.693525: step 80, duration = 0.082 33 | 2016-04-25 13:47:09.513074: step 90, duration = 0.082 34 | 2016-04-25 13:47:10.252648: Forward-backward across 100 steps, 0.081 +/- 0.008 sec / batch 35 | -------------------------------------------------------------------------------- /tensorflow/output_googlenet.log: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcublas.so locally 2 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcudnn.so locally 3 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcufft.so locally 4 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcurand.so locally 6 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 7 | name: GeForce GTX TITAN X 8 | major: 5 minor: 2 memoryClockRate (GHz) 1.076 9 | pciBusID 0000:06:00.0 10 | Total memory: 12.00GiB 11 | Free memory: 11.87GiB 12 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 14 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:0) -> 
(device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:06:00.0) 15 | I tensorflow/core/common_runtime/gpu/pool_allocator.cc:244] PoolAllocator: After 1968 get requests, put_count=1888 evicted_count=1000 eviction_rate=0.529661 and unsatisfied allocation rate=0.599593 16 | I tensorflow/core/common_runtime/gpu/pool_allocator.cc:256] Raising pool_size_limit_ from 100 to 110 17 | 2016-04-25 13:49:16.617654: step 10, duration = 0.136 18 | 2016-04-25 13:49:17.979970: step 20, duration = 0.137 19 | 2016-04-25 13:49:19.343556: step 30, duration = 0.136 20 | 2016-04-25 13:49:20.707242: step 40, duration = 0.136 21 | 2016-04-25 13:49:22.072121: step 50, duration = 0.136 22 | 2016-04-25 13:49:23.435953: step 60, duration = 0.136 23 | 2016-04-25 13:49:24.798779: step 70, duration = 0.136 24 | 2016-04-25 13:49:26.162925: step 80, duration = 0.136 25 | 2016-04-25 13:49:27.527266: step 90, duration = 0.136 26 | 2016-04-25 13:49:28.755830: Forward across 100 steps, 0.135 +/- 0.014 sec / batch 27 | 2016-04-25 13:49:38.388811: step 10, duration = 0.450 28 | 2016-04-25 13:49:42.876702: step 20, duration = 0.450 29 | 2016-04-25 13:49:47.370660: step 30, duration = 0.451 30 | 2016-04-25 13:49:51.866057: step 40, duration = 0.448 31 | 2016-04-25 13:49:56.356868: step 50, duration = 0.447 32 | 2016-04-25 13:50:00.850748: step 60, duration = 0.447 33 | 2016-04-25 13:50:05.339067: step 70, duration = 0.450 34 | 2016-04-25 13:50:09.834336: step 80, duration = 0.448 35 | 2016-04-25 13:50:14.328572: step 90, duration = 0.447 36 | 2016-04-25 13:50:18.378832: Forward-backward across 100 steps, 0.445 +/- 0.045 sec / batch 37 | -------------------------------------------------------------------------------- /tensorflow/output_overfeat.log: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcublas.so locally 2 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened 
CUDA library libcudnn.so locally 3 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcufft.so locally 4 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcurand.so locally 6 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 7 | name: GeForce GTX TITAN X 8 | major: 5 minor: 2 memoryClockRate (GHz) 1.076 9 | pciBusID 0000:06:00.0 10 | Total memory: 12.00GiB 11 | Free memory: 11.87GiB 12 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 14 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:06:00.0) 15 | 2016-04-25 13:47:13.718646: step 10, duration = 0.090 16 | 2016-04-25 13:47:14.631825: step 20, duration = 0.091 17 | 2016-04-25 13:47:15.541794: step 30, duration = 0.090 18 | 2016-04-25 13:47:16.452997: step 40, duration = 0.091 19 | 2016-04-25 13:47:17.364615: step 50, duration = 0.091 20 | 2016-04-25 13:47:18.275613: step 60, duration = 0.092 21 | 2016-04-25 13:47:19.185519: step 70, duration = 0.092 22 | 2016-04-25 13:47:20.099216: step 80, duration = 0.092 23 | 2016-04-25 13:47:21.010124: step 90, duration = 0.091 24 | 2016-04-25 13:47:21.834772: Forward across 100 steps, 0.090 +/- 0.009 sec / batch 25 | 2016-04-25 13:47:27.718206: step 10, duration = 0.278 26 | 2016-04-25 13:47:30.516497: step 20, duration = 0.281 27 | 2016-04-25 13:47:33.318644: step 30, duration = 0.279 28 | 2016-04-25 13:47:36.132312: step 40, duration = 0.281 29 | 2016-04-25 13:47:38.941096: step 50, duration = 0.281 30 | 2016-04-25 13:47:41.762675: step 60, duration = 0.282 31 | 2016-04-25 13:47:44.588236: step 70, duration = 0.283 32 | 2016-04-25 13:47:47.419856: step 80, duration = 0.285 33 | 
2016-04-25 13:47:50.257761: step 90, duration = 0.283 34 | 2016-04-25 13:47:52.818085: Forward-backward across 100 steps, 0.279 +/- 0.028 sec / batch 35 | -------------------------------------------------------------------------------- /tensorflow/output_vgga.log: -------------------------------------------------------------------------------- 1 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcublas.so locally 2 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcudnn.so locally 3 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcufft.so locally 4 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcuda.so.1 locally 5 | I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcurand.so locally 6 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 7 | name: GeForce GTX TITAN X 8 | major: 5 minor: 2 memoryClockRate (GHz) 1.076 9 | pciBusID 0000:06:00.0 10 | Total memory: 12.00GiB 11 | Free memory: 11.87GiB 12 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 13 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y 14 | I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:06:00.0) 15 | 2016-04-25 13:47:57.753471: step 10, duration = 0.159 16 | 2016-04-25 13:47:59.342040: step 20, duration = 0.160 17 | 2016-04-25 13:48:00.932890: step 30, duration = 0.159 18 | 2016-04-25 13:48:02.523590: step 40, duration = 0.159 19 | 2016-04-25 13:48:04.117070: step 50, duration = 0.160 20 | 2016-04-25 13:48:05.710548: step 60, duration = 0.160 21 | 2016-04-25 13:48:07.303214: step 70, duration = 0.159 22 | 2016-04-25 13:48:08.894066: step 80, duration = 0.159 23 | 2016-04-25 13:48:10.490060: step 90, duration = 0.159 24 | 2016-04-25 13:48:11.925424: Forward across 
100 steps, 0.158 +/- 0.016 sec / batch 25 | 2016-04-25 13:48:23.395210: step 10, duration = 0.544 26 | 2016-04-25 13:48:28.839546: step 20, duration = 0.545 27 | 2016-04-25 13:48:34.287994: step 30, duration = 0.546 28 | 2016-04-25 13:48:39.734013: step 40, duration = 0.542 29 | 2016-04-25 13:48:45.185569: step 50, duration = 0.546 30 | 2016-04-25 13:48:50.636499: step 60, duration = 0.545 31 | 2016-04-25 13:48:56.088808: step 70, duration = 0.545 32 | 2016-04-25 13:49:01.539024: step 80, duration = 0.544 33 | 2016-04-25 13:49:06.995601: step 90, duration = 0.545 34 | 2016-04-25 13:49:11.909986: Forward-backward across 100 steps, 0.540 +/- 0.054 sec / batch 35 | -------------------------------------------------------------------------------- /tensorflow/run.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda-7.0/lib:/usr/local/cuda-7.0/lib64:$LD_LIBRARY_PATH 2 | export LD_LIBRARY_PATH=$HOME/Downloads/cudnn-6.5-linux-x64-v2:$LD_LIBRARY_PATH 3 | export PATH="$HOME/code/bazel/bazel-bin/src":$PATH 4 | export CUDA_VISIBLE_DEVICES=2 5 | 6 | rm ~/code/tensorflow/tensorflow/models/convnetbenchmarks 7 | ln -s ~/code/convnet-benchmarks/tensorflow ~/code/tensorflow/tensorflow/models/convnetbenchmarks 8 | 9 | cd ~/code/tensorflow 10 | 11 | bazel build -c opt --config=cuda //tensorflow/models/convnetbenchmarks:benchmark_alexnet 12 | bazel build -c opt --config=cuda //tensorflow/models/convnetbenchmarks:benchmark_overfeat 13 | bazel build -c opt --config=cuda //tensorflow/models/convnetbenchmarks:benchmark_vgg 14 | bazel build -c opt --config=cuda //tensorflow/models/convnetbenchmarks:benchmark_googlenet 15 | 16 | bazel-bin/tensorflow/models/convnetbenchmarks/benchmark_alexnet 2>&1 | tee ~/code/convnet-benchmarks/tensorflow/output_alexnet.log 17 | bazel-bin/tensorflow/models/convnetbenchmarks/benchmark_overfeat 2>&1 | tee ~/code/convnet-benchmarks/tensorflow/output_overfeat.log 18 | 
bazel-bin/tensorflow/models/convnetbenchmarks/benchmark_vgg 2>&1 | tee ~/code/convnet-benchmarks/tensorflow/output_vgga.log 19 | bazel-bin/tensorflow/models/convnetbenchmarks/benchmark_googlenet 2>&1 | tee ~/code/convnet-benchmarks/tensorflow/output_googlenet.log 20 | 21 | cd ~/code/convnet-benchmarks/tensorflow 22 | -------------------------------------------------------------------------------- /theano/README.md: -------------------------------------------------------------------------------- 1 | Install Theano: 2 | ``` 3 | git clone git://github.com/Theano/Theano.git 4 | cd Theano 5 | python setup.py develop 6 | # To install into your home directory instead: 7 | # python setup.py develop --prefix=~/.local 8 | ``` 9 | 10 | Install pylearn2: 11 | ``` 12 | git clone git://github.com/lisa-lab/pylearn2.git 13 | cd pylearn2 14 | python setup.py develop 15 | # To install into your home directory instead: 16 | # python setup.py develop --prefix=~/.local 17 | ``` 18 | 19 | Install pycuda: 20 | ``` 21 | wget -c https://pypi.python.org/packages/source/p/pycuda/pycuda-2013.1.1.tar.gz#md5=acf9319ab2970d9700ed6486aa87b708 22 | tar -xvf pycuda-2013.1.1.tar.gz 23 | cd pycuda-2013.1.1 24 | ./configure.py 25 | python setup.py install 26 | # To install into your home directory instead: 27 | # python setup.py install --user 28 | ``` 29 | 30 | Install scikits.cuda: 31 | ``` 32 | git clone https://github.com/lebedov/scikits.cuda.git 33 | cd scikits.cuda 34 | python setup.py install 35 | # To install into your home directory instead: 36 | # python setup.py install --user 37 | ``` 38 | 39 | Launch the script: 40 | ``` 41 | SKIP=legacy python pylearn2_benchmark.py 42 | ``` 43 | -------------------------------------------------------------------------------- /theano/alexnet.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | from lasagne.layers import InputLayer, DenseLayer, Conv2DLayer,\ 3 | MaxPool2DLayer 4 | 5 | 
image_sz = 227 6 | 7 | 8 | def build_model(batch_size=128): 9 | x = T.tensor4('input') 10 | layer = InputLayer((batch_size, 3, image_sz, image_sz), input_var=x) 11 | 12 | layer = Conv2DLayer(layer, 64, 11, stride=4, pad='valid') 13 | layer = MaxPool2DLayer(layer, 3, stride=2) 14 | 15 | layer = Conv2DLayer(layer, 192, 5, pad='same') 16 | layer = MaxPool2DLayer(layer, 3, stride=2) 17 | 18 | layer = Conv2DLayer(layer, 384, 3, pad='same') 19 | layer = Conv2DLayer(layer, 256, 3, pad='same') 20 | layer = Conv2DLayer(layer, 256, 3, pad='same') 21 | layer = MaxPool2DLayer(layer, 3, stride=2) 22 | 23 | layer = DenseLayer(layer, 4096) 24 | layer = DenseLayer(layer, 4096) 25 | layer = DenseLayer(layer, 1000, nonlinearity=None) 26 | 27 | return layer, x 28 | -------------------------------------------------------------------------------- /theano/benchmark_imagenet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | from lasagne.layers import get_output, get_all_params 5 | import time 6 | from datetime import datetime 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser( 10 | description=' convnet benchmarks on imagenet') 11 | parser.add_argument('--arch', '-a', default='alexnet', 12 | help='Convnet architecture \ 13 | (alexnet, googlenet, vgg, overfeat)') 14 | parser.add_argument('--batch_size', '-B', type=int, default=128, 15 | help='minibatch size') 16 | parser.add_argument('--num_batches', '-n', type=int, default=100, 17 | help='number of minibatches') 18 | 19 | args = parser.parse_args() 20 | 21 | if args.arch == 'alexnet': 22 | from alexnet import build_model, image_sz 23 | elif args.arch == 'googlenet': 24 | from googlenet import build_model, image_sz 25 | elif args.arch == 'vgg': 26 | from vgg import build_model, image_sz 27 | elif args.arch == 'overfeat': 28 | from overfeat import build_model, image_sz 29 | else: 30 | raise ValueError('Invalid 
architecture name') 31 | 32 | 33 | def time_theano_run(func, fargs, info_string): 34 | num_batches = args.num_batches 35 | num_steps_burn_in = 10 36 | durations = [] 37 | for i in range(num_batches + num_steps_burn_in): 38 | start_time = time.time() 39 | _ = func(*fargs) 40 | duration = time.time() - start_time 41 | if i >= num_steps_burn_in: 42 | if not i % 10: 43 | print('%s: step %d, duration = %.3f' % 44 | (datetime.now(), i - num_steps_burn_in, duration)) 45 | durations.append(duration) 46 | durations = np.array(durations) 47 | print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % 48 | (datetime.now(), info_string, num_batches, 49 | durations.mean(), durations.std())) 50 | 51 | 52 | def main(): 53 | batch_size = args.batch_size 54 | print('Building model...') 55 | layer, input_var = build_model(batch_size=batch_size) 56 | labels_var = T.ivector('labels') 57 | output = get_output(layer) 58 | loss = T.nnet.categorical_crossentropy( 59 | T.nnet.softmax(output), labels_var).mean( 60 | dtype=theano.config.floatX) 61 | gradient = T.grad(loss, get_all_params(layer)) 62 | 63 | print('Compiling theano functions...') 64 | forward_func = theano.function([input_var], output) 65 | full_func = theano.function([input_var, labels_var], gradient) 66 | print('Functions are compiled') 67 | 68 | images = np.random.rand(batch_size, 3, image_sz, image_sz).astype(np.float32) 69 | labels = np.random.randint(0, 1000, size=batch_size).astype(np.int32) 70 | 71 | time_theano_run(forward_func, [images], 'Forward') 72 | time_theano_run(full_func, [images, labels], 'Forward-Backward') 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /theano/googlenet.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | from lasagne.layers import InputLayer, DenseLayer, Conv2DLayer,\ 3 | MaxPool2DLayer, Pool2DLayer, ConcatLayer 4 | 5 | 6 | image_sz = 224
7 | 8 | 9 | def _inception(inp, o1s, o2s1, o2s2, o3s1, o3s2, o4s): 10 | conv1 = Conv2DLayer(inp, o1s, 1) 11 | 12 | conv3_ = Conv2DLayer(inp, o2s1, 1) 13 | conv3 = Conv2DLayer(conv3_, o2s2, 3, pad='same') 14 | 15 | conv5_ = Conv2DLayer(inp, o3s1, 1) 16 | conv5 = Conv2DLayer(conv5_, o3s2, 5, pad='same') 17 | 18 | pool_ = MaxPool2DLayer(inp, 3, stride=1, pad=1) 19 | pool = Conv2DLayer(pool_, o4s, 1) 20 | 21 | return ConcatLayer([conv1, conv3, conv5, pool]) 22 | 23 | 24 | def build_model(batch_size=128): 25 | x = T.tensor4('input') 26 | layer = InputLayer((batch_size, 3, image_sz, image_sz), input_var=x) 27 | 28 | conv1 = Conv2DLayer(layer, 64, 7, stride=2, pad='same') 29 | pool1 = MaxPool2DLayer(conv1, 3, stride=2, pad=1) 30 | 31 | conv2 = Conv2DLayer(pool1, 64, 1, pad='same') 32 | conv3 = Conv2DLayer(conv2, 192, 3, pad='same') 33 | pool3 = MaxPool2DLayer(conv3, 3, stride=2, pad=1) 34 | 35 | incept3a = _inception(pool3, 64, 96, 128, 16, 32, 32) 36 | incept3b = _inception(incept3a, 128, 128, 192, 32, 96, 64) 37 | pool4 = MaxPool2DLayer(incept3b, 3, stride=2, pad=1) 38 | incept4a = _inception(pool4, 192, 96, 208, 16, 48, 64) 39 | incept4b = _inception(incept4a, 160, 112, 224, 24, 64, 64) 40 | incept4c = _inception(incept4b, 128, 128, 256, 24, 64, 64) 41 | incept4d = _inception(incept4c, 112, 144, 288, 32, 64, 64) 42 | incept4e = _inception(incept4d, 256, 160, 320, 32, 128, 128) 43 | pool5 = MaxPool2DLayer(incept4e, 3, stride=2, pad=1) 44 | 45 | incept5a = _inception(pool5, 256, 160, 320, 32, 128, 128) 46 | incept5b = _inception(incept5a, 384, 192, 384, 48, 128, 128) 47 | pool6 = Pool2DLayer(incept5b, 7, stride=1, mode='average_exc_pad') 48 | 49 | layer = DenseLayer(pool6, 1024) 50 | layer = DenseLayer(layer, 1000, nonlinearity=None) 51 | 52 | return layer, x 53 | 54 | -------------------------------------------------------------------------------- /theano/output.log: -------------------------------------------------------------------------------- 1 | Using gpu device 0: 
Graphics Device 2 | Note: cuDNN not available 3 | 4 | CONFIG: input = 3 x 128 x 128 * ker = 3 x 96 x 11 x 11 ( bs = 128 , stride = 1 ) 5 | (experimental) meta-optimizer ==> fprop ==> 91 6 | (experimental) meta-optimizer ==> bprop inputs ==> 92 7 | (experimental) meta-optimizer ==> bprop weights ==> 87 8 | 9 | theano.sandbox.cuda.fftconv.conv2d_fft fprop: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 10 | theano.sandbox.cuda.fftconv.conv2d_fft bprop inputs: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 11 | theano.sandbox.cuda.fftconv.conv2d_fft bprop weights: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 12 | 13 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 91 14 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 92 15 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 87 16 | 17 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 90 18 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 89 19 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 85 20 | 21 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> fprop ==> 38 22 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop inputs ==> 53 23 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop weights ==> 339 24 | 25 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> fprop ==> 38 26 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop inputs ==> 53 27 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop weights ==> 83 28 | 29 | 30 | CONFIG: input = 64 x 64 x 64 * ker = 64 x 128 x 9 x 9 ( bs = 128 , stride = 1 ) 31 | (experimental) 
meta-optimizer ==> fprop ==> 149 32 | (experimental) meta-optimizer ==> bprop inputs ==> 173 33 | (experimental) meta-optimizer ==> bprop weights ==> 239 34 | 35 | theano.sandbox.cuda.fftconv.conv2d_fft fprop: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 36 | theano.sandbox.cuda.fftconv.conv2d_fft bprop inputs: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 37 | theano.sandbox.cuda.fftconv.conv2d_fft bprop weights: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 38 | 39 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 149 40 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 173 41 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 244 42 | 43 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 143 44 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 163 45 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 242 46 | 47 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> fprop ==> 184 48 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop inputs ==> 226 49 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop weights ==> 207 50 | 51 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> fprop ==> 183 52 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop inputs ==> 227 53 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop weights ==> 441 54 | 55 | 56 | CONFIG: input = 128 x 32 x 32 * ker = 128 x 128 x 9 x 9 ( bs = 128 , stride = 1 ) 57 | (experimental) meta-optimizer ==> fprop ==> 123 58 | (experimental) meta-optimizer ==> bprop inputs ==> 91 59 | (experimental) meta-optimizer ==> bprop 
weights ==> 86 60 | 61 | theano.sandbox.cuda.fftconv.conv2d_fft fprop: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 62 | theano.sandbox.cuda.fftconv.conv2d_fft bprop inputs: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 63 | theano.sandbox.cuda.fftconv.conv2d_fft bprop weights: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 64 | 65 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 123 66 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 90 67 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 86 68 | 69 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 121 70 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 88 71 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 86 72 | 73 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> fprop ==> 68 74 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop inputs ==> 77 75 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop weights ==> 70 76 | 77 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> fprop ==> 67 78 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop inputs ==> 78 79 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop weights ==> 95 80 | 81 | 82 | CONFIG: input = 128 x 16 x 16 * ker = 128 x 128 x 7 x 7 ( bs = 128 , stride = 1 ) 83 | (experimental) meta-optimizer ==> fprop ==> 25 84 | (experimental) meta-optimizer ==> bprop inputs ==> 18 85 | (experimental) meta-optimizer ==> bprop weights ==> 11 86 | 87 | theano.sandbox.cuda.fftconv.conv2d_fft fprop: FAILED ('The following error happened while compiling the node', 
CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 88 | theano.sandbox.cuda.fftconv.conv2d_fft bprop inputs: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 89 | theano.sandbox.cuda.fftconv.conv2d_fft bprop weights: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 90 | 91 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 24 92 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 18 93 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 11 94 | 95 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 24 96 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 18 97 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 11 98 | 99 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> fprop ==> 7 100 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop inputs ==> 7 101 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop weights ==> 8 102 | 103 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> fprop ==> 7 104 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop inputs ==> 7 105 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop weights ==> 9 106 | 107 | 108 | CONFIG: input = 384 x 13 x 13 * ker = 384 x 384 x 3 x 3 ( bs = 128 , stride = 1 ) 109 | (experimental) meta-optimizer ==> fprop ==> 28 110 | (experimental) meta-optimizer ==> bprop inputs ==> 18 111 | (experimental) meta-optimizer ==> bprop weights ==> 14 112 | 113 | theano.sandbox.cuda.fftconv.conv2d_fft fprop: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 114 | 
theano.sandbox.cuda.fftconv.conv2d_fft bprop inputs: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 115 | theano.sandbox.cuda.fftconv.conv2d_fft bprop weights: FAILED ('The following error happened while compiling the node', CuFFTOp(GpuContiguous.0), '\n', 'scikits.cuda is needed for all GPU fft implementation, including fftconv.') 116 | 117 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 28 118 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 17 119 | (auto) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 14 120 | 121 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> fprop ==> 28 122 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop inputs ==> 17 123 | (manual) theano.sandbox.cuda.blas.GpuCorrMM ==> bprop weights ==> 14 124 | 125 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> fprop ==> 16 126 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop inputs ==> 15 127 | pylearn2.sandbox.cuda_convnet(partial_sum=None) ==> bprop weights ==> 19 128 | 129 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> fprop ==> 17 130 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop inputs ==> 15 131 | pylearn2.sandbox.cuda_convnet(partial_sum=1) ==> bprop weights ==> 23 132 | 133 | -------------------------------------------------------------------------------- /theano/overfeat.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | from lasagne.layers import InputLayer, DenseLayer, Conv2DLayer,\ 3 | MaxPool2DLayer 4 | 5 | image_sz = 231 6 | 7 | 8 | def build_model(batch_size=128): 9 | x = T.tensor4('input') 10 | layer = InputLayer((batch_size, 3, image_sz, image_sz), input_var=x) 11 | 12 | layer = Conv2DLayer(layer, 96, 11, stride=4, pad='valid') 13 | layer = MaxPool2DLayer(layer, 2) 14 | 15 | layer = Conv2DLayer(layer, 256, 5, 
pad='valid') 16 | layer = MaxPool2DLayer(layer, 2) 17 | 18 | layer = Conv2DLayer(layer, 512, 3, pad='same') 19 | layer = Conv2DLayer(layer, 1024, 3, pad='same') 20 | layer = Conv2DLayer(layer, 1024, 3, pad='same') 21 | layer = MaxPool2DLayer(layer, 2) 22 | 23 | layer = DenseLayer(layer, 3072) 24 | layer = DenseLayer(layer, 4096) 25 | layer = DenseLayer(layer, 1000, nonlinearity=None) 26 | 27 | return layer, x 28 | -------------------------------------------------------------------------------- /theano/vgg.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | from lasagne.layers import InputLayer, DenseLayer, Conv2DLayer,\ 3 | MaxPool2DLayer 4 | 5 | image_sz = 224 6 | 7 | 8 | def build_model(batch_size=64): 9 | x = T.tensor4('input') 10 | layer = InputLayer((batch_size, 3, image_sz, image_sz), input_var=x) 11 | 12 | layer = Conv2DLayer(layer, 64, 3, pad='same') 13 | layer = MaxPool2DLayer(layer, 2) 14 | 15 | layer = Conv2DLayer(layer, 128, 3, pad='same') 16 | layer = MaxPool2DLayer(layer, 2) 17 | 18 | layer = Conv2DLayer(layer, 256, 3, pad='same') 19 | layer = Conv2DLayer(layer, 256, 3, pad='same') 20 | layer = MaxPool2DLayer(layer, 2) 21 | 22 | layer = Conv2DLayer(layer, 512, 3, pad='same') 23 | layer = Conv2DLayer(layer, 512, 3, pad='same') 24 | layer = MaxPool2DLayer(layer, 2) 25 | 26 | layer = Conv2DLayer(layer, 512, 3, pad='same') 27 | layer = Conv2DLayer(layer, 512, 3, pad='same') 28 | layer = MaxPool2DLayer(layer, 2) 29 | 30 | layer = DenseLayer(layer, 4096) 31 | layer = DenseLayer(layer, 4096) 32 | layer = DenseLayer(layer, 1000, nonlinearity=None) 33 | 34 | return layer, x 35 | -------------------------------------------------------------------------------- /torch7/README.md: -------------------------------------------------------------------------------- 1 | Install torch-7 using the instructions at: http://torch.ch/docs/getting-started.html#_ 2 | Then install cudaconv2, cudnn 3 | ```bash 4 | 
luarocks install ccn2 # to do cuda-convnet2 benchmarks using torch wrappers 5 | luarocks install cudnn # to do NVIDIA CuDNN benchmarks; you must also have CuDNN installed on your machine 6 | #luarocks install https://raw.githubusercontent.com/qassemoquab/nnbhwd/master/nnbhwd-scm-1.rockspec # to do nnBHWD benchmarks 7 | ``` 8 | 9 | Also install fbcunn's sync branch (I just synced from internal, pending cleanup): https://github.com/facebook/fbcunn/blob/master/INSTALL.md 10 | 11 | For the layerwise benchmarks (the front-page table with L1,L2,L3,L4,L5), 12 | run the benchmark using: 13 | ```bash 14 | th layerwise_benchmarks/benchmark.lua 15 | ``` 16 | 17 | For the imagenet-winners benchmarks, 18 | run the benchmark using: 19 | ```bash 20 | th imagenet_winners/benchmark.lua 21 | ``` 22 | 23 | For imagenet-winners benchmarks of cudaconv2, look at the cudaconv2 branch: https://github.com/soumith/convnet-benchmarks/tree/cudaconv2 -------------------------------------------------------------------------------- /torch7/imagenet_winners/alexnet.lua: -------------------------------------------------------------------------------- 1 | function alexnet(lib) 2 | local SpatialConvolution = lib[1] 3 | local SpatialMaxPooling = lib[2] 4 | local ReLU = lib[3] 5 | local SpatialZeroPadding = nn.SpatialZeroPadding 6 | local padding = true 7 | local stride1only = false 8 | 9 | -- from https://code.google.com/p/cuda-convnet2/source/browse/layers/layers-imagenet-1gpu.cfg 10 | -- this is the AlexNet that was presented in the One Weird Trick paper.
http://arxiv.org/abs/1404.5997 11 | local features = nn.Sequential() 12 | features:add(SpatialConvolution(3,64,11,11,4,4,2,2)) -- 224 -> 55 13 | features:add(ReLU(true)) 14 | features:add(SpatialMaxPooling(3,3,2,2)) -- 55 -> 27 15 | features:add(SpatialConvolution(64,192,5,5,1,1,2,2)) -- 27 -> 27 16 | features:add(ReLU(true)) 17 | features:add(SpatialMaxPooling(3,3,2,2)) -- 27 -> 13 18 | features:add(SpatialConvolution(192,384,3,3,1,1,1,1)) -- 13 -> 13 19 | features:add(ReLU(true)) 20 | features:add(SpatialConvolution(384,256,3,3,1,1,1,1)) -- 13 -> 13 21 | features:add(ReLU(true)) 22 | features:add(SpatialConvolution(256,256,3,3,1,1,1,1)) -- 13 -> 13 23 | features:add(ReLU(true)) 24 | features:add(SpatialMaxPooling(3,3,2,2)) -- 13 -> 6 25 | 26 | local classifier = nn.Sequential() 27 | classifier:add(nn.View(256*6*6)) 28 | -- classifier:add(nn.Dropout(0.5)) 29 | classifier:add(nn.Linear(256*6*6, 4096)) 30 | classifier:add(nn.Threshold(0, 1e-6)) 31 | -- classifier:add(nn.Dropout(0.5)) 32 | classifier:add(nn.Linear(4096, 4096)) 33 | classifier:add(nn.Threshold(0, 1e-6)) 34 | classifier:add(nn.Linear(4096, 1000)) 35 | -- classifier:add(nn.LogSoftMax()) 36 | 37 | features:get(1).gradInput = nil 38 | 39 | local model = nn.Sequential() 40 | model:add(features):add(classifier) 41 | 42 | return model,'AlexNet',{128,3,224,224} 43 | end 44 | 45 | return alexnet 46 | -------------------------------------------------------------------------------- /torch7/imagenet_winners/benchmark.lua: -------------------------------------------------------------------------------- 1 | require 'sys' 2 | 3 | require 'cunn' 4 | 5 | require 'cudnn' 6 | cudnn.benchmark = true -- run manual auto-tuner provided by cudnn 7 | cudnn.verbose = false 8 | 9 | -- require 'fbcunn' 10 | -- require 'nnbhwd' -- not compiling anymore, file an issue 11 | local nets = {} 12 | nets[#nets+1] = require 'alexnet' 13 | nets[#nets+1] = require 'overfeat' 14 | nets[#nets+1] = require 'vgg_a' 15 | nets[#nets+1] = require 
'googlenet' 16 | 17 | local libs = {} 18 | libs[#libs+1] = {cudnn.SpatialConvolution, cudnn.SpatialMaxPooling, cudnn.ReLU, 'BDHW', 'cudnn'} 19 | -- libs[#libs+1] = {fbnn.SpatialConvolution, cudnn.SpatialMaxPooling, cudnn.ReLU, 'BDHW', 'fbnn'} 20 | -- libs[#libs+1] = {nn.SpatialConvolutionMM, nn.SpatialMaxPooling, nn.ReLU, 'BDHW', 'nn'} 21 | -- libs[#libs+1] = {nn.SpatialConvolutionBHWD, nn.SpatialMaxPoolingBHWD, nn.ReLU, 'BHWD', 'nnBHWD'} 22 | 23 | print('Running on device: ' .. cutorch.getDeviceProperties(cutorch.getDevice()).name) 24 | 25 | steps = 10 -- nb of steps in loop to average perf 26 | nDryRuns = 10 27 | 28 | function makeInput(config, size) 29 | local layout = config[4] 30 | local osize 31 | if layout == 'BDHW' then 32 | osize = size 33 | elseif layout == 'DHWB' then 34 | osize = {size[2],size[3],size[4],size[1]} 35 | elseif layout == 'BHWD' then 36 | osize = {size[1], size[3], size[4], size[2]} 37 | end 38 | return torch.randn(torch.LongStorage(osize)) 39 | end 40 | 41 | for i=1,#nets do 42 | for j=1,#libs do 43 | collectgarbage() 44 | local model,model_name,size = nets[i](libs[j]) 45 | model=model:cuda() 46 | local input = makeInput(libs[j],size):cuda() 47 | local lib_name = libs[j][5] 48 | print('ModelType: ' .. model_name, 'Kernels: ' .. lib_name, 49 | 'Input shape: ' .. input:size(1) .. 'x' .. input:size(2) .. 50 | 'x' .. input:size(3) .. 'x' .. 
input:size(4)) 51 | 52 | -- dry-run 53 | for i=1,nDryRuns do 54 | model:zeroGradParameters() 55 | local output = model:updateOutput(input) 56 | local gradInput = model:updateGradInput(input, output) 57 | model:accGradParameters(input, output) 58 | cutorch.synchronize() 59 | collectgarbage() 60 | end 61 | 62 | local tmf, tmbi, tmbg 63 | sys.tic() 64 | for t = 1,steps do 65 | output = model:updateOutput(input) 66 | end 67 | cutorch.synchronize() 68 | tmf = sys.toc()/steps 69 | print(string.format("%-30s %25s %10.2f", lib_name, ':updateOutput():', tmf*1000)) 70 | 71 | collectgarbage() 72 | sys.tic() 73 | for t = 1,steps do 74 | model:updateGradInput(input, output) 75 | end 76 | cutorch.synchronize() 77 | tmbi = sys.toc()/steps 78 | print(string.format("%-30s %25s %10.2f", lib_name, ':updateGradInput():', tmbi*1000)) 79 | 80 | collectgarbage() 81 | sys.tic() 82 | local ok = 1 83 | for t = 1,steps do 84 | ok = pcall(function() model:accGradParameters(input, output) end) 85 | end 86 | cutorch.synchronize() 87 | tmbg = sys.toc()/steps 88 | if not ok then 89 | print(string.format("%-30s %25s %s", lib_name, ':accGradParameters():', 'FAILED!')) 90 | else 91 | print(string.format("%-30s %25s %10.2f", lib_name, ':accGradParameters():', tmbg*1000)) 92 | end 93 | print(string.format("%-30s %25s %10.2f", lib_name, ':Forward:', (tmf)*1000)) 94 | print(string.format("%-30s %25s %10.2f", lib_name, ':Backward:', (tmbi+tmbg)*1000)) 95 | print(string.format("%-30s %25s %10.2f", lib_name, ':TOTAL:', (tmf+tmbi+tmbg)*1000)) 96 | print() 97 | end 98 | end 99 | 100 | print('') 101 | -------------------------------------------------------------------------------- /torch7/imagenet_winners/googlenet.lua: -------------------------------------------------------------------------------- 1 | -- adapted from nagadomi's CIFAR attempt: https://github.com/nagadomi/kaggle-cifar10-torch7/blob/cuda-convnet2/inception_model.lua 2 | local function inception(depth_dim, input_size, config, lib) 3 | local 
SpatialConvolution = lib[1] 4 | local SpatialMaxPooling = lib[2] 5 | local ReLU = lib[3] 6 | 7 | local depth_concat = nn.Concat(depth_dim) 8 | local conv1 = nn.Sequential() 9 | conv1:add(SpatialConvolution(input_size, config[1][1], 1, 1)):add(ReLU(true)) 10 | depth_concat:add(conv1) 11 | 12 | local conv3 = nn.Sequential() 13 | conv3:add(SpatialConvolution(input_size, config[2][1], 1, 1)):add(ReLU(true)) 14 | conv3:add(SpatialConvolution(config[2][1], config[2][2], 3, 3, 1, 1, 1, 1)):add(ReLU(true)) 15 | depth_concat:add(conv3) 16 | 17 | local conv5 = nn.Sequential() 18 | conv5:add(SpatialConvolution(input_size, config[3][1], 1, 1)):add(ReLU(true)) 19 | conv5:add(SpatialConvolution(config[3][1], config[3][2], 5, 5, 1, 1, 2, 2)):add(ReLU(true)) 20 | depth_concat:add(conv5) 21 | 22 | local pool = nn.Sequential() 23 | pool:add(SpatialMaxPooling(config[4][1], config[4][1], 1, 1, 1, 1)) 24 | pool:add(SpatialConvolution(input_size, config[4][2], 1, 1)):add(ReLU(true)) 25 | depth_concat:add(pool) 26 | 27 | return depth_concat 28 | end 29 | 30 | local function googlenet(lib) 31 | local SpatialConvolution = lib[1] 32 | local SpatialMaxPooling = lib[2] 33 | local SpatialAveragePooling = torch.type(lib[2]) == 'nn.SpatialMaxPooling' and nn.SpatialAveragePooling or cudnn.SpatialAveragePooling 34 | local ReLU = lib[3] 35 | local model = nn.Sequential() 36 | model:add(SpatialConvolution(3,64,7,7,2,2,3,3)):add(ReLU(true)) 37 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 38 | -- LRN (not added for now) 39 | model:add(SpatialConvolution(64,64,1,1,1,1,0,0)):add(ReLU(true)) 40 | model:add(SpatialConvolution(64,192,3,3,1,1,1,1)):add(ReLU(true)) 41 | -- LRN (not added for now) 42 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 43 | model:add(inception(2, 192, {{ 64}, { 96,128}, {16, 32}, {3, 32}},lib)) -- 256 44 | model:add(inception(2, 256, {{128}, {128,192}, {32, 96}, {3, 64}},lib)) -- 480 45 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 46 | model:add(inception(2, 480, {{192}, { 96,208}, {16, 
48}, {3, 64}},lib)) -- 4(a) 47 | model:add(inception(2, 512, {{160}, {112,224}, {24, 64}, {3, 64}},lib)) -- 4(b) 48 | model:add(inception(2, 512, {{128}, {128,256}, {24, 64}, {3, 64}},lib)) -- 4(c) 49 | model:add(inception(2, 512, {{112}, {144,288}, {32, 64}, {3, 64}},lib)) -- 4(d) 50 | model:add(inception(2, 528, {{256}, {160,320}, {32,128}, {3,128}},lib)) -- 4(e) (14x14x832) 51 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 52 | model:add(inception(2, 832, {{256}, {160,320}, {32,128}, {3,128}},lib)) -- 5(a) 53 | model:add(inception(2, 832, {{384}, {192,384}, {48,128}, {3,128}},lib)) -- 5(b) 54 | model:add(SpatialAveragePooling(7,7,1,1)) 55 | model:add(nn.View(1024):setNumInputDims(3)) 56 | -- model:add(nn.Dropout(0.4)) 57 | model:add(nn.Linear(1024,1000)):add(nn.ReLU(true)) 58 | -- model:add(nn.LogSoftMax()) 59 | model:get(1).gradInput = nil 60 | return model,'GoogleNet', {128,3,224,224} 61 | end 62 | 63 | return googlenet 64 | -------------------------------------------------------------------------------- /torch7/imagenet_winners/output.log: -------------------------------------------------------------------------------- 1 | Running on device: GeForce GTX TITAN X 2 | ModelType: AlexNet Kernels: cudnn Input shape: 128x3x224x224 3 | cudnn :updateOutput(): 27.65 4 | cudnn :updateGradInput(): 24.32 5 | cudnn :accGradParameters(): 28.99 6 | cudnn :Forward: 27.65 7 | cudnn :Backward: 53.31 8 | cudnn :TOTAL: 80.96 9 | ModelType: OverFeat[fast] Kernels: cudnn Input shape: 128x3x231x231 10 | cudnn :updateOutput(): 94.28 11 | cudnn :updateGradInput(): 81.17 12 | cudnn :accGradParameters(): 93.07 13 | cudnn :Forward: 94.28 14 | cudnn :Backward: 174.24 15 | cudnn :TOTAL: 268.52 16 | ModelType: VGG Model-A Kernels: cudnn Input shape: 64x3x224x224 17 | cudnn :updateOutput(): 162.74 18 | cudnn :updateGradInput(): 167.05 19 | cudnn :accGradParameters(): 199.49 20 | cudnn :Forward: 162.74 21 | cudnn :Backward: 366.54 22 | cudnn :TOTAL: 529.29 23 | ModelType: GoogleNet Kernels: 
cudnn Input shape: 128x3x224x224 24 | cudnn :updateOutput(): 130.76 25 | cudnn :updateGradInput(): 197.86 26 | cudnn :accGradParameters(): 142.15 27 | cudnn :Forward: 130.76 28 | cudnn :Backward: 340.01 29 | cudnn :TOTAL: 470.77 30 | 31 | -------------------------------------------------------------------------------- /torch7/imagenet_winners/output_cudnn_fp16.log: -------------------------------------------------------------------------------- 1 | Running on device: GeForce GTX TITAN X 2 | ModelType: AlexNet Kernels: cudnn Input shape: 128x3x224x224 3 | cudnn :updateOutput(): 24.87 4 | cudnn :updateGradInput(): 21.15 5 | cudnn :accGradParameters(): 25.64 6 | cudnn :Forward: 24.87 7 | cudnn :Backward: 46.79 8 | cudnn :TOTAL: 71.66 9 | ModelType: OverFeat[fast] Kernels: cudnn Input shape: 128x3x231x231 10 | cudnn :updateOutput(): 86.15 11 | cudnn :updateGradInput(): 73.20 12 | cudnn :accGradParameters(): 83.29 13 | cudnn :Forward: 86.15 14 | cudnn :Backward: 156.50 15 | cudnn :TOTAL: 242.64 16 | ModelType: VGG Model-A Kernels: cudnn Input shape: 64x3x224x224 17 | cudnn :updateOutput(): 140.33 18 | cudnn :updateGradInput(): 144.58 19 | cudnn :accGradParameters(): 186.94 20 | cudnn :Forward: 140.33 21 | cudnn :Backward: 331.52 22 | cudnn :TOTAL: 471.85 23 | ModelType: GoogleNet Kernels: cudnn Input shape: 128x3x224x224 24 | cudnn :updateOutput(): 112.51 25 | cudnn :updateGradInput(): 223.20 26 | cudnn :accGradParameters(): 126.51 27 | cudnn :Forward: 112.51 28 | cudnn :Backward: 349.71 29 | cudnn :TOTAL: 462.22 30 | 31 | -------------------------------------------------------------------------------- /torch7/imagenet_winners/output_fbnn.log: -------------------------------------------------------------------------------- 1 | Running on device: GeForce GTX TITAN X 2 | ModelType: AlexNet Kernels: fbnn Input shape: 128x3x224x224 3 | fbnn :updateOutput(): 31.93 4 | fbnn :updateGradInput(): 41.59 5 | fbnn :accGradParameters(): 30.59 6 | fbnn :Forward: 31.93 7 | fbnn 
:Backward: 72.18 8 | fbnn :TOTAL: 104.11 9 | 10 | Running on device: GeForce GTX TITAN X 11 | ModelType: OverFeat[fast] Kernels: fbnn Input shape: 128x3x231x231 12 | fbnn :updateOutput(): 114.36 13 | fbnn :updateGradInput(): 116.35 14 | fbnn :accGradParameters(): 111.56 15 | fbnn :Forward: 114.36 16 | fbnn :Backward: 227.91 17 | fbnn :TOTAL: 342.27 -------------------------------------------------------------------------------- /torch7/imagenet_winners/overfeat.lua: -------------------------------------------------------------------------------- 1 | function overfeat_fast(lib) 2 | local SpatialConvolution = lib[1] 3 | local SpatialMaxPooling = lib[2] 4 | local ReLU = lib[3] 5 | local SpatialZeroPadding = nn.SpatialZeroPadding 6 | 7 | local features = nn.Sequential() 8 | features:add(SpatialConvolution(3, 96, 11, 11, 4, 4)) 9 | features:add(ReLU(true)) 10 | features:add(SpatialMaxPooling(2, 2, 2, 2)) 11 | features:add(SpatialConvolution(96, 256, 5, 5, 1, 1)) 12 | features:add(ReLU(true)) 13 | features:add(SpatialMaxPooling(2, 2, 2, 2)) 14 | features:add(SpatialConvolution(256, 512, 3, 3, 1, 1, 1, 1)) 15 | features:add(ReLU(true)) 16 | features:add(SpatialConvolution(512, 1024, 3, 3, 1, 1, 1, 1)) 17 | features:add(ReLU(true)) 18 | features:add(SpatialConvolution(1024, 1024, 3, 3, 1, 1, 1, 1)) 19 | features:add(ReLU(true)) 20 | features:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 21 | 22 | features:get(1).gradInput = nil 23 | 24 | local classifier = nn.Sequential() 25 | classifier:add(nn.View(1024*6*6)) 26 | -- classifier:add(nn.Dropout(0.5)) 27 | classifier:add(nn.Linear(1024*6*6, 3072)) 28 | classifier:add(nn.Threshold(0, 1e-6)) 29 | -- classifier:add(nn.Dropout(0.5)) 30 | classifier:add(nn.Linear(3072, 4096)) 31 | classifier:add(nn.Threshold(0, 1e-6)) 32 | classifier:add(nn.Linear(4096, 1000)) 33 | -- classifier:add(nn.LogSoftMax()) 34 | 35 | local model = nn.Sequential() 36 | model:add(features):add(classifier) 37 | 38 | return model,'OverFeat[fast]',{128,3,231,231} 39 
| end 40 | 41 | return overfeat_fast 42 | -------------------------------------------------------------------------------- /torch7/imagenet_winners/vgg_a.lua: -------------------------------------------------------------------------------- 1 | function vgg_a(lib) 2 | local SpatialConvolution = lib[1] 3 | local SpatialMaxPooling = lib[2] 4 | local ReLU = lib[3] 5 | local SpatialZeroPadding = nn.SpatialZeroPadding 6 | local padding = true 7 | local stride1only = false 8 | if lib[5] == 'fbfft' then 9 | padding = false -- fbfft does not support implicit zero padding 10 | stride1only = true -- fbfft does not support convolutions that are not stride-1 11 | end 12 | 13 | local modelType = 'A' -- on a titan black, B/D/E run out of memory even for batch-size 32 14 | 15 | -- Create tables describing VGG configurations A, B, D, E 16 | local cfg = {} 17 | if modelType == 'A' then 18 | cfg = {64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'} 19 | elseif modelType == 'B' then 20 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'} 21 | elseif modelType == 'D' then 22 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'} 23 | elseif modelType == 'E' then 24 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'} 25 | else 26 | error('Unknown model type: ' .. modelType .. 
' | Please specify a modelType A or B or D or E') 27 | end 28 | 29 | local features = nn.Sequential() 30 | do 31 | local iChannels = 3; 32 | for k,v in ipairs(cfg) do 33 | if v == 'M' then 34 | features:add(SpatialMaxPooling(2,2,2,2)) 35 | else 36 | local oChannels = v; 37 | features:add(SpatialConvolution(iChannels,oChannels,3,3,1,1,1,1)) 38 | features:add(ReLU(true)) 39 | iChannels = oChannels; 40 | end 41 | end 42 | end 43 | 44 | features:get(1).gradInput = nil 45 | 46 | local classifier = nn.Sequential() 47 | classifier:add(nn.View(512*7*7)) 48 | classifier:add(nn.Linear(512*7*7, 4096)) 49 | classifier:add(nn.Threshold(0, 1e-6)) 50 | -- classifier:add(nn.Dropout(0.5)) 51 | classifier:add(nn.Linear(4096, 4096)) 52 | classifier:add(nn.Threshold(0, 1e-6)) 53 | -- classifier:add(nn.Dropout(0.5)) 54 | classifier:add(nn.Linear(4096, 1000)) 55 | -- classifier:add(nn.LogSoftMax()) 56 | 57 | local model = nn.Sequential() 58 | model:add(features):add(classifier) 59 | 60 | return model,'VGG Model-' .. modelType, {64,3,224,224} 61 | end 62 | 63 | return vgg_a 64 | -------------------------------------------------------------------------------- /torch7/layerwise_benchmarks/benchmark.lua: -------------------------------------------------------------------------------- 1 | require 'sys' 2 | require 'cunn' 3 | require 'ccn2' 4 | require 'cudnn' 5 | cudnn.benchmark = true -- run manual auto-tuner provided by cudnn 6 | cudnn.verbose = false 7 | -- require 'fbcunn' 8 | -- require 'nnbhwd' 9 | 10 | print('Running on device: ' .. 
cutorch.getDeviceProperties(cutorch.getDevice()).name) 11 | 12 | steps = 10 -- nb of steps in loop to average perf 13 | 14 | runs = { 15 | 16 | { 17 | -- first layer 18 | ni = 3, 19 | no = 96, 20 | kw = 11, 21 | kh = 11, 22 | iw = 128, 23 | ih = 128, 24 | bs = 128, 25 | dw = 1, 26 | dh = 1, 27 | }, 28 | { 29 | -- second layer 30 | ni = 64, 31 | no = 128, 32 | kw = 9, 33 | kh = 9, 34 | iw = 64, 35 | ih = 64, 36 | bs = 128, 37 | dw = 1, 38 | dh = 1, 39 | }, 40 | { 41 | -- third layer 42 | ni = 128, 43 | no = 128, 44 | kw = 9, 45 | kh = 9, 46 | iw = 32, 47 | ih = 32, 48 | bs = 128, 49 | dw = 1, 50 | dh = 1, 51 | }, 52 | { 53 | -- fourth layer 54 | ni = 128, 55 | no = 128, 56 | kw = 7, 57 | kh = 7, 58 | iw = 16, 59 | ih = 16, 60 | bs = 128, 61 | dw = 1, 62 | dh = 1, 63 | }, 64 | { -- layers with small inputs/kernels, seen at the lower ends of the network 65 | ni = 384, 66 | no = 384, 67 | kw = 3, 68 | kh = 3, 69 | iw = 13, 70 | ih = 13, 71 | bs = 128, 72 | dw = 1, 73 | dh = 1, 74 | }, 75 | } 76 | 77 | 78 | for i,run in ipairs(runs) do 79 | -- params for run: 80 | local ni,no,kw,kh,bs,iw,ih,dw,dh = run.ni,run.no,run.kw,run.kh,run.bs,run.iw,run.ih,run.dw,run.dh 81 | print('') 82 | print('CONFIG: input = ' .. ni..'x'..iw..'x'..ih..' * ker = ' .. ni..'x'..no..'x'..kw..'x'..kh .. ' (bs = '..bs..', stride = ' .. dw .. 
')') 83 | local mods = {} 84 | mods[1] = cudnn.SpatialConvolution(ni,no,kw,kh,dw,dh):cuda() 85 | mods[2] = nn.SpatialConvolutionMM(ni,no,kw,kh,dw,dh):cuda() 86 | -- mods[3] = ccn2.SpatialConvolution(ni,no,kw,dw,0,1,4):cuda() 87 | -- mods[4] = nn.SpatialConvolutionCuFFT(ni,no,kw,kh,dw,dh):cuda() 88 | -- mods[4] = nn.SpatialConvolutionBHWD(ni,no,kw,kh,dw,dh):cuda() 89 | for j=1,#mods do 90 | local tmf, tmbi, tmbg 91 | collectgarbage() 92 | if torch.typename(mods[j]) == 'ccn2.SpatialConvolution' then 93 | i1 = torch.randn(ni, ih, iw, bs):cuda(); 94 | elseif torch.typename(mods[j]) == 'nn.SpatialConvolutionBHWD' then 95 | i1 = torch.randn(bs, ih, iw, ni):cuda(); 96 | else 97 | i1 = torch.randn(bs, ni, ih, iw):cuda() 98 | end 99 | collectgarbage() 100 | local o1 = mods[j]:forward(i1) 101 | cutorch.synchronize() 102 | collectgarbage() 103 | sys.tic() 104 | for t = 1,steps do 105 | o1 = mods[j]:updateOutput(i1) 106 | end 107 | cutorch.synchronize() 108 | tmf = sys.toc()/steps 109 | print(string.format("%-30s %25s %10.2f", torch.typename(mods[j]), ':updateOutput():', tmf*1000)) 110 | 111 | cutorch.synchronize() 112 | collectgarbage() 113 | sys.tic() 114 | for t = 1,steps do 115 | mods[j]:updateGradInput(i1, o1) 116 | end 117 | cutorch.synchronize() 118 | tmbi = sys.toc()/steps 119 | print(string.format("%-30s %25s %10.2f", torch.typename(mods[j]), ':updateGradInput():', tmbi*1000)) 120 | 121 | cutorch.synchronize() 122 | collectgarbage() 123 | sys.tic() 124 | local ok = 1 125 | for t = 1,steps do 126 | ok = pcall(function() mods[j]:accGradParameters(i1, o1) end) 127 | end 128 | cutorch.synchronize() 129 | tmbg = sys.toc()/steps 130 | if not ok then 131 | print(string.format("%-30s %25s %s", torch.typename(mods[j]), ':accGradParameters():', 'FAILED!')) 132 | else 133 | print(string.format("%-30s %25s %10.2f", torch.typename(mods[j]), ':accGradParameters():', tmbg*1000)) 134 | end 135 | print(string.format("%-30s %25s %10.2f", torch.typename(mods[j]), ':TOTAL:', 
(tmf+tmbi+tmbg)*1000)) 136 | print() 137 | end 138 | end 139 | 140 | print('') 141 | -------------------------------------------------------------------------------- /torch7/layerwise_benchmarks/multigpu.lua: -------------------------------------------------------------------------------- 1 | require 'cutorch' 2 | require 'nn' 3 | require 'cunn' 4 | require 'cudnn' 5 | require 'ccn2' 6 | 7 | --cutorch.setDevice(4) 8 | 9 | fSize = {3, 96, 128, 128, 384} 10 | inputSize = {3, 64, 64} 11 | batchSize = 128 12 | 13 | nettype = 'cudnn' 14 | 15 | if nettype == 'cudnn' then 16 | model = nn.Sequential() 17 | model:add(cudnn.SpatialConvolution(fSize[1], fSize[2], 9, 9)) 18 | model:add(cudnn.ReLU()) 19 | model:add(cudnn.SpatialMaxPooling(2,2,2,2)) 20 | model:add(cudnn.SpatialConvolution(fSize[2], fSize[3], 5, 5)) 21 | model:add(cudnn.ReLU()) 22 | model:add(cudnn.SpatialMaxPooling(2,2,2,2)) 23 | model:add(cudnn.SpatialConvolution(fSize[3], fSize[4], 4, 4)) 24 | model:add(cudnn.ReLU()) 25 | model:add(cudnn.SpatialConvolution(fSize[4], fSize[5], 3, 3)) 26 | model:add(cudnn.ReLU()) 27 | model:add(cudnn.SpatialMaxPooling(2,2,2,2)) 28 | model:add(cudnn.SpatialConvolution(fSize[5], fSize[5], 3, 3)) 29 | --model:add(nn.Reshape(fSize[5])) 30 | --model:add(nn.Linear(fSize[5],1)) 31 | elseif nettype == 'ccn2' then 32 | model = nn.Sequential() 33 | model:add(nn.Transpose({1,4},{1,3},{1,2})) 34 | model:add(ccn2.SpatialConvolution(fSize[1], fSize[2], 9)) 35 | model:add(nn.ReLU()) 36 | model:add(ccn2.SpatialMaxPooling(2,2)) 37 | model:add(ccn2.SpatialConvolution(fSize[2], fSize[3], 5)) 38 | model:add(nn.ReLU()) 39 | model:add(ccn2.SpatialMaxPooling(2,2)) 40 | model:add(ccn2.SpatialConvolution(fSize[3], fSize[4], 5)) 41 | model:add(nn.ReLU()) 42 | model:add(ccn2.SpatialConvolution(fSize[4], fSize[5], 3)) 43 | model:add(nn.ReLU()) 44 | model:add(ccn2.SpatialMaxPooling(2,2)) 45 | model:add(ccn2.SpatialConvolution(fSize[5], fSize[5], 3)) 46 | elseif nettype == 'MM' then 47 | model = 
nn.Sequential() 48 | model:add(nn.SpatialConvolutionMM(fSize[1], fSize[2], 9, 9)) 49 | model:add(nn.ReLU()) 50 | model:add(nn.SpatialMaxPooling(2,2,2,2)) 51 | model:add(nn.SpatialConvolutionMM(fSize[2], fSize[3], 5, 5)) 52 | model:add(nn.ReLU()) 53 | model:add(nn.SpatialMaxPooling(2,2,2,2)) 54 | model:add(nn.SpatialConvolutionMM(fSize[3], fSize[4], 4, 4)) 55 | model:add(nn.ReLU()) 56 | model:add(nn.SpatialConvolutionMM(fSize[4], fSize[5], 3, 3)) 57 | model:add(nn.ReLU()) 58 | model:add(nn.SpatialMaxPooling(2,2,2,2)) 59 | model:add(nn.SpatialConvolutionMM(fSize[5], fSize[5], 3, 3)) 60 | model:add(nn.Reshape(fSize[5])) 61 | end 62 | 63 | model = model:cuda() 64 | 65 | input = torch.rand(batchSize, inputSize[1], inputSize[2], inputSize[3]):cuda() 66 | 67 | -- first run 68 | --print(model:forward(input):size()) 69 | output = model:forward(input) 70 | cutorch.synchronize() 71 | 72 | a = torch.Timer() 73 | output = model:forward(input) 74 | print('FORWARD free run time:', a:time().real) 75 | 76 | cutorch.synchronize() 77 | a:reset() 78 | output = model:forward(input) 79 | cutorch.synchronize() 80 | print('FORWARD sync time:', a:time().real) 81 | 82 | cutorch.synchronize() 83 | a:reset() 84 | model:backward(input, output) 85 | print('BACKWARD free run time:', a:time().real) 86 | 87 | cutorch.synchronize() 88 | a:reset() 89 | model:backward(input, output) 90 | cutorch.synchronize() 91 | print('BACKWARD sync time:', a:time().real) 92 | -------------------------------------------------------------------------------- /torch7/layerwise_benchmarks/output.log: -------------------------------------------------------------------------------- 1 | Running on device: GeForce GTX TITAN X 2 | 3 | CONFIG: input = 3x128x128 * ker = 3x96x11x11 (bs = 128, stride = 1) 4 | cudnn.SpatialConvolution :updateOutput(): 32.65 5 | cudnn.SpatialConvolution :updateGradInput(): 69.05 6 | cudnn.SpatialConvolution :accGradParameters(): 35.09 7 | cudnn.SpatialConvolution :TOTAL: 136.79 8 | 9 | 
nn.SpatialConvolutionMM :updateOutput(): 93.27 10 | nn.SpatialConvolutionMM :updateGradInput(): 89.15 11 | nn.SpatialConvolutionMM :accGradParameters(): 113.82 12 | nn.SpatialConvolutionMM :TOTAL: 296.25 13 | 14 | 15 | CONFIG: input = 64x64x64 * ker = 64x128x9x9 (bs = 128, stride = 1) 16 | cudnn.SpatialConvolution :updateOutput(): 13.75 17 | cudnn.SpatialConvolution :updateGradInput(): 100.08 18 | cudnn.SpatialConvolution :accGradParameters(): 19.74 19 | cudnn.SpatialConvolution :TOTAL: 133.57 20 | 21 | nn.SpatialConvolutionMM :updateOutput(): 146.45 22 | nn.SpatialConvolutionMM :updateGradInput(): 165.18 23 | nn.SpatialConvolutionMM :accGradParameters(): 278.77 24 | nn.SpatialConvolutionMM :TOTAL: 590.40 25 | 26 | 27 | CONFIG: input = 128x32x32 * ker = 128x128x9x9 (bs = 128, stride = 1) 28 | cudnn.SpatialConvolution :updateOutput(): 5.56 29 | cudnn.SpatialConvolution :updateGradInput(): 17.12 30 | cudnn.SpatialConvolution :accGradParameters(): 6.86 31 | cudnn.SpatialConvolution :TOTAL: 29.54 32 | 33 | nn.SpatialConvolutionMM :updateOutput(): 122.67 34 | nn.SpatialConvolutionMM :updateGradInput(): 85.94 35 | nn.SpatialConvolutionMM :accGradParameters(): 93.75 36 | nn.SpatialConvolutionMM :TOTAL: 302.35 37 | 38 | 39 | CONFIG: input = 128x16x16 * ker = 128x128x7x7 (bs = 128, stride = 1) 40 | cudnn.SpatialConvolution :updateOutput(): 1.42 41 | cudnn.SpatialConvolution :updateGradInput(): 4.57 42 | cudnn.SpatialConvolution :accGradParameters(): 1.52 43 | cudnn.SpatialConvolution :TOTAL: 7.51 44 | 45 | nn.SpatialConvolutionMM :updateOutput(): 25.41 46 | nn.SpatialConvolutionMM :updateGradInput(): 16.84 47 | nn.SpatialConvolutionMM :accGradParameters(): 12.51 48 | nn.SpatialConvolutionMM :TOTAL: 54.76 49 | 50 | 51 | CONFIG: input = 384x13x13 * ker = 384x384x3x3 (bs = 128, stride = 1) 52 | cudnn.SpatialConvolution :updateOutput(): 7.57 53 | cudnn.SpatialConvolution :updateGradInput(): 7.53 54 | cudnn.SpatialConvolution :accGradParameters(): 7.81 55 | 
cudnn.SpatialConvolution :TOTAL: 22.91 56 | 57 | nn.SpatialConvolutionMM :updateOutput(): 29.69 58 | nn.SpatialConvolutionMM :updateGradInput(): 17.36 59 | nn.SpatialConvolutionMM :accGradParameters(): 15.38 60 | nn.SpatialConvolutionMM :TOTAL: 62.43 61 | 62 | 63 | --------------------------------------------------------------------------------
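The benchmark scripts above all follow the same measurement pattern: a handful of untimed dry runs first (warm-up, and letting auto-tuners such as `cudnn.benchmark` settle), a device synchronize, then the mean wall-clock time over `steps` timed iterations. A framework-agnostic sketch of that pattern in Python — `fn` and `sync` here are placeholders, not functions from this repo:

```python
import time

def benchmark(fn, steps=10, n_dry_runs=10, sync=lambda: None):
    """Return the mean wall-clock time of fn() in milliseconds.

    Mirrors the pattern used by the benchmark scripts in this repo:
    untimed dry runs first (warm-up / auto-tuner settling), then a
    synchronize before starting and after stopping the clock so that
    asynchronously queued GPU work is fully accounted for.
    """
    for _ in range(n_dry_runs):   # dry runs, deliberately not timed
        fn()
    sync()                        # drain any pending work before starting the clock
    start = time.time()
    for _ in range(steps):
        fn()
    sync()                        # wait for all queued work before reading the clock
    return (time.time() - start) / steps * 1000.0
```

In the Torch scripts above, `sync` corresponds to `cutorch.synchronize()` and `fn` to a closure around `model:updateOutput(input)` (or the backward calls); without the final synchronize, asynchronous kernel launches would make the timings meaninglessly small.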