├── .gitignore ├── README.md ├── RESULTS.md ├── models ├── ENet-encoder.lua ├── ENet.lua ├── alexnetowt.lua ├── googlenet.lua ├── inception-v3.lua ├── resnet.lua └── vgg.lua ├── opts.lua ├── profile-model.lua └── src ├── modelTimer.lua └── profiler.lua /.gitignore: -------------------------------------------------------------------------------- 1 | # Files 2 | *.sw* 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Torch7-Network Profiler 2 | 3 | Repository to calculate feed-forward time and number of operations taken by a neural network. 4 | 5 | 6 | ## Running the application 7 | 8 | The application can profile either a written definition of a network or an already trained network (saved as either `ascii` or `binary`). Pass in the location of the model using the `-m` or its long equivalent `--model`. 9 | 10 | To profile the written definition of the model it must be defined in a specially formatted table saved to a file with the `.lua` extension. Examples can be found in the [models](models) directory. 11 | 12 | ``` 13 | th profile-model.lua --model <'path/filename.lua'> --res 1x3x231x231 14 | ``` 15 | 16 | To profile the already trained network, pass in the path and file name by again using the `-m/--model` flag. If the network file has standard extensions the application will auto detect if the network is saved as either an `ascii` or `binary` network and load appropriately. 17 | 18 | Profiling the network speed on different platforms is also possible. Currently the default platform is `cpu` but if available the profiler can be targeted to run the networks using `cuda`. 19 | 20 | ``` 21 | th profile-model.lua --model <'path/filename.lua'> --platform <'cpu'|'cuda'> 22 | ``` 23 | 24 | ### License 25 | 26 | This software is released under a creative commons license which allows for personal and research use only. 
For a commercial license please contact the authors. You can view a license summary here: http://creativecommons.org/licenses/by-nc/4.0/ 27 | -------------------------------------------------------------------------------- /RESULTS.md: -------------------------------------------------------------------------------- 1 | # Torch7 Profiling 2 | # RESULTS TABLE 3 | 4 | # ResNet 18 5 | 6 | 720p (720x1280) 7 | 8 | 67.7 Gops/frame 9 | 10 | ### Titan X Pascal 11 | 12 | 20.51 ms 13 | 14 | ### TitanX 15 | 16 | 35.40 ms 17 | 18 | ### GTX 1080 19 | 20 | 25.48 ms 21 | 22 | # AlexNet OWT 23 | operations: 1.43 G 24 | 25 | image size: 224 x 224 26 | 27 | All results are averaged over 100 runs unless otherwise mentioned 28 | 29 | ### Macbook Pro 15in Late 2013 CPU intel i7 30 | 31.90 ms 31 | 32 | ### Intel Core i7 4710HQ (Gigabyte P35x V4) 33 | 25.30 ms (4C8T) 34 | 35 | ### Macbook Pro 15in Late 2013 GPU GT 750M 36 | 25.18 ms 37 | 38 | ### Intel(R) Xeon(R) CPU E5-1620 0 @ 3.60GHz (GPU2) 39 | 462.37 ms (1-core) 40 | 41 | ### nVidia GeForce GTX 980M (Gigabyte P35X v4) 42 | 3.74 ms 43 | 44 | ### nVidia GeForce GTX 980 (GPU2) 45 | 2.99 ms 46 | 47 | ### nVidia GeForce GTX Titan X (GPU3) 48 | 2.57 ms 49 | 50 | ### nVidia GeForce GTX 1080 (GPU1) 51 | 2.00 ms 52 | 53 | ### Titan X Pascal (GPU4) 54 | 1.96 ms 55 | 56 | ### nVidia TX1 CPU 57 | 114.66 ms 58 | 59 | ### nVidia TX1 GPU 32 bits 60 | 25.73 ms 61 | 62 | ### nVidia TX1 CUDNN 4, FP32 thnets: 63 | 64 | | Batch Size | 1 | 2 | 4 | 8 | 16 | 32* | 65 | |:-------------------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:| 66 | | Time (ms per batch) | 54 | 57 | 69 | 93 | 137 | 216 | 67 | | Time (ms per frame) | 54 | 28 | 17 | 12 | 8 | 7 | 68 | 69 | *batch > 32 gets worse 70 | 71 | ### nVidia TX1 CUDNN 4, FP16 thnets: 72 | 73 | 74 | | Batch Size | 1 | 2 | 4 | 8 | 16 | 32 | 75 | |:-------------------:|:-----:|:-----:|:----:|:-----:|:-----:|:-----:| 76 | | Time (ms per batch) | 28 | 33 | 40 | 70 | 135 | 593 | 77 | | Time (ms per frame) | 28 | 16 
| 10 | 9 | 8 | 18 | 78 | 79 | 80 | ### nVidia TX1 CPU thnets: 81 | 82 | batch 1 31.6170 ms 83 | 84 | (batch > 1 is not better in performance) 85 | 86 | ### nVidia TX1 nVidia TX1 thnets cudnn 4 87 | 88 | | Input Resolution | Perf. CPU FP32* (ms) | Perf. GPU FP32 (ms) | Perf. GPU FP16 (ms) | 89 | |:----------------:|:--------------------:|:-------------------:|:-------------------:| 90 | | VGA (640x480) | 1272 | 95 | 58 | 91 | | WXGA (1280x720) | 4406 | 308 | 203 | 92 | | FHD (1920x1080) | 11237 | 673 | 434 | 93 | 94 | *CPU results averaged over 10 runs 95 | -------------------------------------------------------------------------------- /models/ENet-encoder.lua: -------------------------------------------------------------------------------- 1 | local classes = 30 2 | local model = nn.Sequential() 3 | 4 | local ct = 0 5 | function _bottleneck(internal_scale, use_relu, asymetric, dilated, input, output, downsample) 6 | local internal = output / internal_scale 7 | local input_stride = downsample and 2 or 1 8 | 9 | local sum = nn.ConcatTable() 10 | 11 | local main = nn.Sequential() 12 | local other = nn.Sequential() 13 | sum:add(main):add(other) 14 | 15 | main:add(nn.SpatialConvolution(input, internal, input_stride, input_stride, input_stride, input_stride, 0, 0):noBias()) 16 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 17 | if use_relu then main:add(nn.PReLU(internal)) end 18 | if not asymetric and not dilated then 19 | main:add(nn.SpatialConvolution(internal, internal, 3, 3, 1, 1, 1, 1)) 20 | elseif asymetric then 21 | local pad = (asymetric-1) / 2 22 | main:add(nn.SpatialConvolution(internal, internal, asymetric, 1, 1, 1, pad, 0):noBias()) 23 | main:add(nn.SpatialConvolution(internal, internal, 1, asymetric, 1, 1, 0, pad)) 24 | elseif dilated then 25 | main:add(nn.SpatialDilatedConvolution(internal, internal, 3, 3, 1, 1, dilated, dilated, dilated, dilated)) 26 | else 27 | assert(false, 'You shouldn\'t be here') 28 | end 29 | 
main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 30 | if use_relu then main:add(nn.PReLU(internal)) end 31 | main:add(nn.SpatialConvolution(internal, output, 1, 1, 1, 1, 0, 0):noBias()) 32 | main:add(nn.SpatialBatchNormalization(output, 1e-3)) 33 | main:add(nn.SpatialDropout((ct < 5) and 0.01 or 0.1)) 34 | ct = ct + 1 35 | 36 | other:add(nn.Identity()) 37 | if downsample then 38 | other:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 39 | end 40 | if input ~= output then 41 | other:add(nn.Padding(1, output-input, 3)) 42 | end 43 | 44 | return nn.Sequential():add(sum):add(nn.CAddTable()):add(nn.PReLU(output)) 45 | end 46 | 47 | local _ = require 'moses' 48 | local bottleneck = _.bindn(_bottleneck, 4, true, false, false) 49 | local cbottleneck = _.bindn(_bottleneck, 4, true, false, false) 50 | local xbottleneck = _.bindn(_bottleneck, 4, true, 7, false) 51 | local wbottleneck = _.bindn(_bottleneck, 4, true, 5, false) 52 | local dbottleneck = _.bindn(_bottleneck, 4, true, false, 2) 53 | local xdbottleneck = _.bindn(_bottleneck, 4, true, false, 4) 54 | local xxdbottleneck = _.bindn(_bottleneck, 4, true, false, 8) 55 | local xxxdbottleneck = _.bindn(_bottleneck, 4, true, false, 16) 56 | local xxxxdbottleneck = _.bindn(_bottleneck, 4, true, false, 32) 57 | 58 | local initial_block = nn.ConcatTable(2) 59 | initial_block:add(nn.SpatialConvolution(3, 13, 3, 3, 2, 2, 1, 1)) 60 | initial_block:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 61 | 62 | model:add(initial_block) -- 128x256 63 | model:add(nn.JoinTable(2)) -- can't use Concat, because SpatialConvolution needs contiguous gradOutput 64 | model:add(nn.SpatialBatchNormalization(16, 1e-3)) 65 | model:add(nn.PReLU(16)) 66 | model:add(bottleneck(16, 64, true)) -- 64x128 67 | for i = 1,4 do 68 | model:add(bottleneck(64, 64)) 69 | end 70 | model:add(bottleneck(64, 128, true)) -- 32x64 71 | for i = 1,2 do 72 | model:add(cbottleneck(128, 128)) 73 | model:add(dbottleneck(128, 128)) 74 | model:add(wbottleneck(128, 128)) 75 | 
model:add(xdbottleneck(128, 128)) 76 | model:add(cbottleneck(128, 128)) 77 | model:add(xxdbottleneck(128, 128)) 78 | model:add(wbottleneck(128, 128)) 79 | model:add(xxxdbottleneck(128, 128)) 80 | end 81 | model:add(nn.SpatialConvolution(128, classes, 1, 1)) 82 | return model 83 | -------------------------------------------------------------------------------- /models/ENet.lua: -------------------------------------------------------------------------------- 1 | local function getEncoder() 2 | local model = nn.Sequential() 3 | 4 | local ct = 0 5 | function _bottleneck(internal_scale, use_relu, asymetric, dilated, input, output, downsample) 6 | local internal = output / internal_scale 7 | local input_stride = downsample and 2 or 1 8 | 9 | local sum = nn.ConcatTable() 10 | 11 | local main = nn.Sequential() 12 | local other = nn.Sequential() 13 | sum:add(main):add(other) 14 | 15 | main:add(nn.SpatialConvolution(input, internal, input_stride, input_stride, input_stride, input_stride, 0, 0):noBias()) 16 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 17 | if use_relu then main:add(nn.PReLU(internal)) end 18 | if not asymetric and not dilated then 19 | main:add(nn.SpatialConvolution(internal, internal, 3, 3, 1, 1, 1, 1)) 20 | elseif asymetric then 21 | local pad = (asymetric-1) / 2 22 | main:add(nn.SpatialConvolution(internal, internal, asymetric, 1, 1, 1, pad, 0):noBias()) 23 | main:add(nn.SpatialConvolution(internal, internal, 1, asymetric, 1, 1, 0, pad)) 24 | elseif dilated then 25 | main:add(nn.SpatialDilatedConvolution(internal, internal, 3, 3, 1, 1, dilated, dilated, dilated, dilated)) 26 | else 27 | assert(false, 'You shouldn\'t be here') 28 | end 29 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 30 | if use_relu then main:add(nn.PReLU(internal)) end 31 | main:add(nn.SpatialConvolution(internal, output, 1, 1, 1, 1, 0, 0):noBias()) 32 | main:add(nn.SpatialBatchNormalization(output, 1e-3)) 33 | main:add(nn.SpatialDropout((ct < 5) and 0.01 or 0.1)) 
34 | ct = ct + 1 35 | 36 | other:add(nn.Identity()) 37 | if downsample then 38 | other:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 39 | end 40 | if input ~= output then 41 | other:add(nn.Padding(1, output-input, 3)) 42 | end 43 | 44 | return nn.Sequential():add(sum):add(nn.CAddTable()):add(nn.PReLU(output)) 45 | end 46 | 47 | local _ = require 'moses' 48 | local bottleneck = _.bindn(_bottleneck, 4, true, false, false) 49 | local cbottleneck = _.bindn(_bottleneck, 4, true, false, false) 50 | local xbottleneck = _.bindn(_bottleneck, 4, true, 7, false) 51 | local wbottleneck = _.bindn(_bottleneck, 4, true, 5, false) 52 | local dbottleneck = _.bindn(_bottleneck, 4, true, false, 2) 53 | local xdbottleneck = _.bindn(_bottleneck, 4, true, false, 4) 54 | local xxdbottleneck = _.bindn(_bottleneck, 4, true, false, 8) 55 | local xxxdbottleneck = _.bindn(_bottleneck, 4, true, false, 16) 56 | local xxxxdbottleneck = _.bindn(_bottleneck, 4, true, false, 32) 57 | 58 | local initial_block = nn.ConcatTable(2) 59 | initial_block:add(nn.SpatialConvolution(3, 13, 3, 3, 2, 2, 1, 1)) 60 | initial_block:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 61 | 62 | model:add(initial_block) -- 128x256 63 | model:add(nn.JoinTable(2)) -- can't use Concat, because SpatialConvolution needs contiguous gradOutput 64 | model:add(nn.SpatialBatchNormalization(16, 1e-3)) 65 | model:add(nn.PReLU(16)) 66 | model:add(bottleneck(16, 64, true)) -- 64x128 67 | for i = 1,4 do 68 | model:add(bottleneck(64, 64)) 69 | end 70 | model:add(bottleneck(64, 128, true)) -- 32x64 71 | for i = 1,2 do 72 | model:add(cbottleneck(128, 128)) 73 | model:add(dbottleneck(128, 128)) 74 | model:add(wbottleneck(128, 128)) 75 | model:add(xdbottleneck(128, 128)) 76 | model:add(cbottleneck(128, 128)) 77 | model:add(xxdbottleneck(128, 128)) 78 | model:add(wbottleneck(128, 128)) 79 | model:add(xxxdbottleneck(128, 128)) 80 | end 81 | --model:add(nn.SpatialConvolution(128, classes, 1, 1)) 82 | return model 83 | end 84 | 85 | 
-------------------------------------------------------------------------------- 86 | -- Model definition starts here 87 | -------------------------------------------------------------------------------- 88 | 89 | local classes = 30 90 | local model = getEncoder() 91 | -- SpatialMaxUnpooling requires nn modules... 92 | model:apply(function(module) 93 | if module.modules then 94 | for i,submodule in ipairs(module.modules) do 95 | if torch.typename(submodule):match('nn.SpatialMaxPooling') then 96 | module.modules[i] = nn.SpatialMaxPooling(2, 2, 2, 2) -- TODO: make more flexible 97 | end 98 | end 99 | end 100 | end) 101 | 102 | -- find pooling modules 103 | local pooling_modules = {} 104 | model:apply(function(module) 105 | if torch.typename(module):match('nn.SpatialMaxPooling') then 106 | table.insert(pooling_modules, module) 107 | end 108 | end) 109 | assert(#pooling_modules == 3, 'There should be 3 pooling modules') 110 | 111 | function bottleneck(input, output, upsample, reverse_module) 112 | local internal = output / 4 113 | local input_stride = upsample and 2 or 1 114 | 115 | local module = nn.Sequential() 116 | local sum = nn.ConcatTable() 117 | local main = nn.Sequential() 118 | local other = nn.Sequential() 119 | sum:add(main):add(other) 120 | 121 | main:add(nn.SpatialConvolution(input, internal, 1, 1, 1, 1, 0, 0):noBias()) 122 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 123 | main:add(nn.ReLU(true)) 124 | if not upsample then 125 | main:add(nn.SpatialConvolution(internal, internal, 3, 3, 1, 1, 1, 1)) 126 | else 127 | main:add(nn.SpatialFullConvolution(internal, internal, 3, 3, 2, 2, 1, 1, 1, 1)) 128 | end 129 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 130 | main:add(nn.ReLU(true)) 131 | main:add(nn.SpatialConvolution(internal, output, 1, 1, 1, 1, 0, 0):noBias()) 132 | main:add(nn.SpatialBatchNormalization(output, 1e-3)) 133 | 134 | other:add(nn.Identity()) 135 | if input ~= output or upsample then 136 | 
other:add(nn.SpatialConvolution(input, output, 1, 1, 1, 1, 0, 0):noBias()) 137 | other:add(nn.SpatialBatchNormalization(output, 1e-3)) 138 | if upsample and reverse_module then 139 | other:add(nn.SpatialMaxUnpooling(reverse_module)) 140 | end 141 | end 142 | 143 | if upsample and not reverse_module then 144 | main:remove(#main.modules) -- remove BN 145 | return main 146 | end 147 | return module:add(sum):add(nn.CAddTable()):add(nn.ReLU(true)) 148 | end 149 | 150 | --model:add(bottleneck(128, 128)) 151 | model:add(bottleneck(128, 64, true, pooling_modules[3])) -- 32x64 152 | model:add(bottleneck(64, 64)) 153 | model:add(bottleneck(64, 64)) 154 | model:add(bottleneck(64, 16, true, pooling_modules[2])) -- 64x128 155 | model:add(bottleneck(16, 16)) 156 | model:add(nn.SpatialFullConvolution(16, classes, 2, 2, 2, 2)) 157 | return model 158 | -------------------------------------------------------------------------------- /models/alexnetowt.lua: -------------------------------------------------------------------------------- 1 | -- from https://code.google.com/p/cuda-convnet2/source/browse/layers/layers-imagenet-1gpu.cfg 2 | -- this is AlexNet that was presented in the One Weird Trick paper. 
http://arxiv.org/abs/1404.5997 3 | local features = nn.Sequential() 4 | features:add(nn.SpatialConvolution(3,64,11,11,4,4,2,2)) -- 224 -> 55 5 | features:add(nn.ReLU(true)) 6 | features:add(nn.SpatialMaxPooling(3,3,2,2)) -- 55 -> 27 7 | features:add(nn.SpatialConvolution(64,192,5,5,1,1,2,2)) -- 27 -> 27 8 | features:add(nn.ReLU(true)) 9 | features:add(nn.SpatialMaxPooling(3,3,2,2)) -- 27 -> 13 10 | features:add(nn.SpatialConvolution(192,384,3,3,1,1,1,1)) -- 13 -> 13 11 | features:add(nn.ReLU(true)) 12 | features:add(nn.SpatialConvolution(384,256,3,3,1,1,1,1)) -- 13 -> 13 13 | features:add(nn.ReLU(true)) 14 | features:add(nn.SpatialConvolution(256,256,3,3,1,1,1,1)) -- 13 -> 13 15 | features:add(nn.ReLU(true)) 16 | features:add(nn.SpatialMaxPooling(3,3,2,2)) -- 13 -> 6 17 | 18 | --features:cuda() 19 | --features = makeDataParallel(features, nGPU) -- defined in util.lua 20 | 21 | local classifier = nn.Sequential() 22 | classifier:add(nn.View(256*6*6)) 23 | 24 | classifier:add(nn.Dropout(0.5)) 25 | classifier:add(nn.Linear(256*6*6, 4096)) 26 | classifier:add(nn.ReLU()) 27 | 28 | classifier:add(nn.Dropout(0.5)) 29 | classifier:add(nn.Linear(4096, 4096)) 30 | classifier:add(nn.ReLU()) 31 | 32 | classifier:add(nn.Linear(4096, 1000)) 33 | classifier:add(nn.LogSoftMax()) 34 | 35 | --classifier:cuda() 36 | 37 | local model = nn.Sequential():add(features):add(classifier) 38 | model.imageSize = 256 39 | model.imageCrop = 224 40 | 41 | return model 42 | -------------------------------------------------------------------------------- /models/googlenet.lua: -------------------------------------------------------------------------------- 1 | -- adapted from nagadomi's CIFAR attempt: https://github.com/nagadomi/kaggle-cifar10-torch7/blob/cuda-convnet2/inception_model.lua 2 | 3 | -- Adapted and taken from Soumith's convnet-benchmarks repo: 4 | -- https://github.com/soumith/convnet-benchmarks 5 | 6 | --The MIT License (MIT) 7 | -- 8 | --Copyright (c) 2016 Soumith Chintala 9 | -- 10 | 
--Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 11 | -- 12 | --The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 13 | -- 14 | --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | 16 | local function inception(depth_dim, input_size, config) 17 | local SpatialConvolution = nn.SpatialConvolution 18 | local SpatialMaxPooling = nn.SpatialMaxPooling 19 | local ReLU = nn.ReLU 20 | 21 | local depth_concat = nn.Concat(depth_dim) 22 | local conv1 = nn.Sequential() 23 | conv1:add(SpatialConvolution(input_size, config[1][1], 1, 1)):add(ReLU(true)) 24 | depth_concat:add(conv1) 25 | 26 | local conv3 = nn.Sequential() 27 | conv3:add(SpatialConvolution(input_size, config[2][1], 1, 1)):add(ReLU(true)) 28 | conv3:add(SpatialConvolution(config[2][1], config[2][2], 3, 3, 1, 1, 1, 1)):add(ReLU(true)) 29 | depth_concat:add(conv3) 30 | 31 | local conv5 = nn.Sequential() 32 | conv5:add(SpatialConvolution(input_size, config[3][1], 1, 1)):add(ReLU(true)) 33 | conv5:add(SpatialConvolution(config[3][1], config[3][2], 5, 5, 1, 1, 2, 2)):add(ReLU(true)) 34 | depth_concat:add(conv5) 35 | 36 | local pool = nn.Sequential() 37 | pool:add(SpatialMaxPooling(config[4][1], config[4][1], 1, 1, 1, 1)) 38 | pool:add(SpatialConvolution(input_size, config[4][2], 1, 1)):add(ReLU(true)) 39 | depth_concat:add(pool) 40 | 41 | return depth_concat 42 | end 43 | 44 | local SpatialConvolution = nn.SpatialConvolution 45 | local SpatialMaxPooling = nn.SpatialMaxPooling 46 | local SpatialAveragePooling = nn.SpatialAveragePooling 47 | local ReLU = nn.ReLU 48 | local model = nn.Sequential() 49 | model:add(SpatialConvolution(3,64,7,7,2,2,3,3)):add(ReLU(true)) 50 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 51 | -- LRN (not added for now) 52 | model:add(SpatialConvolution(64,64,1,1,1,1,0,0)):add(ReLU(true)) 53 | model:add(SpatialConvolution(64,192,3,3,1,1,1,1)):add(ReLU(true)) 54 | -- LRN (not added for now) 55 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 56 | model:add(inception(2, 192, {{ 64}, { 96,128}, {16, 32}, {3, 32}})) -- 256 57 | model:add(inception(2, 256, {{128}, {128,192}, {32, 96}, {3, 64}})) -- 480 58 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 59 | model:add(inception(2, 480, 
{{192}, { 96,208}, {16, 48}, {3, 64}})) -- 4(a) 60 | model:add(inception(2, 512, {{160}, {112,224}, {24, 64}, {3, 64}})) -- 4(b) 61 | model:add(inception(2, 512, {{128}, {128,256}, {24, 64}, {3, 64}})) -- 4(c) 62 | model:add(inception(2, 512, {{112}, {144,288}, {32, 64}, {3, 64}})) -- 4(d) 63 | model:add(inception(2, 528, {{256}, {160,320}, {32,128}, {3,128}})) -- 4(e) (14x14x832) 64 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 65 | model:add(inception(2, 832, {{256}, {160,320}, {32,128}, {3,128}})) -- 5(a) 66 | model:add(inception(2, 832, {{384}, {192,384}, {48,128}, {3,128}})) -- 5(b) 67 | model:add(SpatialAveragePooling(7,7,1,1)) 68 | model:add(nn.View(1024):setNumInputDims(3)) 69 | ---- model:add(nn.Dropout(0.4)) 70 | model:add(nn.Linear(1024,1000)):add(nn.ReLU(true)) 71 | -- model:add(nn.LogSoftMax()) 72 | model:get(1).gradInput = nil 73 | return model 74 | -------------------------------------------------------------------------------- /models/inception-v3.lua: -------------------------------------------------------------------------------- 1 | 2 | -- Inception-V3 from this paper - 3 | -- https://arxiv.org/pdf/1512.00567v3.pdf 4 | -- and as visualized by http://dgschwend.github.io/netscope/#/preset/inceptionv3 5 | -- Inception uses a 3x299x299 input 6 | 7 | -- This module is from Figure 5 of Inception-V3 paper 8 | local function inception_duplicate() 9 | local SpatialConvolution = nn.SpatialConvolution 10 | local SpatialMaxPooling = nn.SpatialMaxPooling 11 | local SpatialAveragePooling = nn.SpatialAveragePooling 12 | local ReLU = nn.ReLU 13 | 14 | local depth_concat = nn.Concat(2) 15 | local path1 = nn.Sequential() 16 | path1:add(SpatialConvolution(288,64,1,1,1,1,0,0)):add(ReLU(true)) 17 | path1:add(SpatialConvolution(64,96,3,3,1,1,1,1)):add(ReLU(true)) 18 | path1:add(SpatialConvolution(96,96,3,3,1,1,1,1)):add(ReLU(true)) 19 | depth_concat:add(path1) 20 | 21 | local path2 = nn.Sequential() 22 | path2:add(SpatialConvolution(288,48,1,1,1,1,0,0)):add(ReLU(true)) 
23 | path2:add(SpatialConvolution(48,64,3,3,1,1,1,1)):add(ReLU(true)) 24 | depth_concat:add(path2) 25 | 26 | local path3 = nn.Sequential() 27 | path3:add(SpatialAveragePooling(3,3,1,1,1,1)) 28 | path3:add(SpatialConvolution(288,64,1,1,1,1,0,0)) 29 | depth_concat:add(path3) 30 | 31 | local path4 = nn.Sequential() 32 | path4:add(SpatialConvolution(288,64,1,1,1,1,0,0)):add(ReLU(true)) 33 | depth_concat:add(path4) 34 | 35 | return depth_concat 36 | end 37 | 38 | -- This module is from Figure 6 of Inception-V3 paper 39 | local function inception_asymmetric() 40 | local SpatialConvolution = nn.SpatialConvolution 41 | local SpatialMaxPooling = nn.SpatialMaxPooling 42 | local SpatialAveragePooling = nn.SpatialAveragePooling 43 | local ReLU = nn.ReLU 44 | 45 | local depth_concat = nn.Concat(2) 46 | local path1 = nn.Sequential() 47 | path1:add(SpatialConvolution(768,128,1,1,1,1,0,0)):add(ReLU(true)) 48 | path1:add(SpatialConvolution(128,128,1,7,1,1,0,3)):add(ReLU(true)) 49 | path1:add(SpatialConvolution(128,128,7,1,1,1,3,0)):add(ReLU(true)) 50 | path1:add(SpatialConvolution(128,128,1,7,1,1,0,3)):add(ReLU(true)) 51 | path1:add(SpatialConvolution(128,192,7,1,1,1,3,0)):add(ReLU(true)) 52 | depth_concat:add(path1) 53 | 54 | local path2 = nn.Sequential() 55 | path2:add(SpatialConvolution(768,128,1,1,1,1,0,0)):add(ReLU(true)) 56 | path2:add(SpatialConvolution(128,128,1,7,1,1,0,3)):add(ReLU(true)) 57 | path2:add(SpatialConvolution(128,192,7,1,1,1,3,0)):add(ReLU(true)) 58 | depth_concat:add(path2) 59 | 60 | local path3 = nn.Sequential() 61 | path3:add(SpatialAveragePooling(3,3,1,1,1,1)) 62 | path3:add(SpatialConvolution(768,192,1,1,1,1,0,0)):add(ReLU(true)) 63 | depth_concat:add(path3) 64 | 65 | local path4 = nn.Sequential() 66 | path4:add(SpatialConvolution(768,192,1,1,1,1,0,0)):add(ReLU(true)) 67 | depth_concat:add(path4) 68 | 69 | return depth_concat 70 | 71 | end 72 | 73 | -- This and expanded2 are from Figure 7 of Inception-V3 paper 74 | local function 
inception_asymmetric_expanded1() 75 | local SpatialConvolution = nn.SpatialConvolution 76 | local SpatialMaxPooling = nn.SpatialMaxPooling 77 | local SpatialAveragePooling = nn.SpatialAveragePooling 78 | local ReLU = nn.ReLU 79 | 80 | local depth_concat = nn.Concat(2) 81 | local path1 = nn.Sequential() 82 | path1:add(SpatialConvolution(1280,448,1,1,1,1,0,0)):add(ReLU(true)) 83 | path1:add(SpatialConvolution(448,384,3,3,1,1,1,1)):add(ReLU(true)) 84 | local path1_depth_concat = nn.Concat(2) 85 | local path1_1 = nn.Sequential() 86 | path1_1:add(SpatialConvolution(384,384,1,3,1,1,0,1)):add(ReLU(true)) 87 | path1_depth_concat:add(path1_1) 88 | local path1_2 = nn.Sequential() 89 | path1_2:add(SpatialConvolution(384,384,3,1,1,1,1,0)):add(ReLU(true)) 90 | path1_depth_concat:add(path1_2) 91 | path1:add(path1_depth_concat) 92 | depth_concat:add(path1) 93 | 94 | local path2 = nn.Sequential() 95 | path2:add(SpatialConvolution(1280,384,1,1,1,1,0,0)):add(ReLU(true)) 96 | local path2_depth_concat = nn.Concat(2) 97 | local path2_1 = nn.Sequential() 98 | path2_1:add(SpatialConvolution(384,384,1,3,1,1,0,1)):add(ReLU(true)) 99 | path2_depth_concat:add(path2_1) 100 | local path2_2 = nn.Sequential() 101 | path2_2:add(SpatialConvolution(384,384,3,1,1,1,1,0)):add(ReLU(true)) 102 | path2_depth_concat:add(path2_2) 103 | path2:add(path2_depth_concat) 104 | depth_concat:add(path2) 105 | 106 | local path3 = nn.Sequential() 107 | path3:add(SpatialAveragePooling(3,3,1,1,1,1)) 108 | path3:add(SpatialConvolution(1280,192,1,1,1,1,0,0)):add(ReLU(true)) 109 | depth_concat:add(path3) 110 | 111 | local path4 = nn.Sequential() 112 | path4:add(SpatialConvolution(1280,320,1,1,1,1,0,0)):add(ReLU(true)) 113 | depth_concat:add(path4) 114 | 115 | return depth_concat 116 | end 117 | 118 | local function inception_asymmetric_expanded2() 119 | local SpatialConvolution = nn.SpatialConvolution 120 | local SpatialMaxPooling = nn.SpatialMaxPooling 121 | local SpatialAveragePooling = nn.SpatialAveragePooling 122 | 
local ReLU = nn.ReLU 123 | 124 | local depth_concat = nn.Concat(2) 125 | local path1 = nn.Sequential() 126 | path1:add(SpatialConvolution(2048,448,1,1,1,1,0,0)):add(ReLU(true)) 127 | path1:add(SpatialConvolution(448,384,3,3,1,1,1,1)):add(ReLU(true)) 128 | local path1_depth_concat = nn.Concat(2) 129 | local path1_1 = nn.Sequential() 130 | path1_1:add(SpatialConvolution(384,384,1,3,1,1,0,1)):add(ReLU(true)) 131 | path1_depth_concat:add(path1_1) 132 | local path1_2 = nn.Sequential() 133 | path1_2:add(SpatialConvolution(384,384,3,1,1,1,1,0)):add(ReLU(true)) 134 | path1_depth_concat:add(path1_2) 135 | path1:add(path1_depth_concat) 136 | depth_concat:add(path1) 137 | 138 | local path2 = nn.Sequential() 139 | path2:add(SpatialConvolution(2048,384,1,1,1,1,0,0)):add(ReLU(true)) 140 | local path2_depth_concat = nn.Concat(2) 141 | local path2_1 = nn.Sequential() 142 | path2_1:add(SpatialConvolution(384,384,1,3,1,1,0,1)):add(ReLU(true)) 143 | path2_depth_concat:add(path2_1) 144 | local path2_2 = nn.Sequential() 145 | path2_2:add(SpatialConvolution(384,384,3,1,1,1,1,0)):add(ReLU(true)) 146 | path2_depth_concat:add(path2_2) 147 | path2:add(path2_depth_concat) 148 | depth_concat:add(path2) 149 | 150 | local path3 = nn.Sequential() 151 | path3:add(SpatialAveragePooling(3,3,1,1,1,1)) 152 | path3:add(SpatialConvolution(2048,192,1,1,1,1,0,0)):add(ReLU(true)) 153 | depth_concat:add(path3) 154 | 155 | local path4 = nn.Sequential() 156 | path4:add(SpatialConvolution(2048,320,1,1,1,1,0,0)):add(ReLU(true)) 157 | depth_concat:add(path4) 158 | 159 | return depth_concat 160 | end 161 | 162 | 163 | local function inception_grid_reduce(n_input_maps, n_output_maps) 164 | local SpatialConvolution = nn.SpatialConvolution 165 | local SpatialMaxPooling = nn.SpatialMaxPooling 166 | local ReLU = nn.ReLU 167 | 168 | local conv_output_maps = (n_output_maps - n_input_maps)/2 169 | 170 | local depth_concat = nn.Concat(2) 171 | 172 | local path1 = nn.Sequential() 173 | 
path1:add(SpatialConvolution(n_input_maps,conv_output_maps,1,1,1,1,0,0)):add(ReLU(true)) 174 | path1:add(SpatialConvolution(conv_output_maps,conv_output_maps,3,3,1,1,1,1)):add(ReLU(true)) 175 | path1:add(SpatialConvolution(conv_output_maps,conv_output_maps,3,3,2,2,0,0)):add(ReLU(true)) 176 | depth_concat:add(path1) 177 | 178 | local path2 = nn.Sequential() 179 | path2:add(SpatialConvolution(n_input_maps,conv_output_maps,1,1,1,1,0,0)):add(ReLU(true)) 180 | path2:add(SpatialConvolution(conv_output_maps,conv_output_maps,3,3,2,2,0,0)):add(ReLU(true)) 181 | depth_concat:add(path2) 182 | 183 | local path3 = nn.Sequential() 184 | path3:add(SpatialMaxPooling(3,3,2,2,0,0)) 185 | depth_concat:add(path3) 186 | 187 | return depth_concat 188 | end 189 | 190 | local SpatialConvolution = nn.SpatialConvolution 191 | local SpatialMaxPooling = nn.SpatialMaxPooling 192 | local SpatialAveragePooling = nn.SpatialAveragePooling 193 | local ReLU = nn.ReLU 194 | 195 | local model = nn.Sequential() 196 | -- Begin Inception "stem" 197 | model:add(SpatialConvolution(3,32,3,3,2,2,0,0)):add(ReLU(true)) 198 | model:add(SpatialConvolution(32,32,3,3,1,1,0,0)):add(ReLU(true)) 199 | model:add(SpatialConvolution(32,64,3,3,1,1,1,1)):add(ReLU(true)) 200 | model:add(SpatialMaxPooling(3,3,2,2,0,0)) 201 | model:add(SpatialConvolution(64,80,3,3,1,1,0,0)):add(ReLU(true)) 202 | model:add(SpatialConvolution(80,192,3,3,2,2,0,0)):add(ReLU(true)) 203 | model:add(SpatialConvolution(192,288,3,3,1,1,1,1)):add(ReLU(true)) 204 | 205 | model:add(inception_duplicate()) 206 | model:add(inception_duplicate()) 207 | model:add(inception_duplicate()) 208 | model:add(inception_grid_reduce(288,768)) 209 | 210 | model:add(inception_asymmetric()) 211 | model:add(inception_asymmetric()) 212 | model:add(inception_asymmetric()) 213 | model:add(inception_asymmetric()) 214 | model:add(inception_asymmetric()) 215 | model:add(inception_grid_reduce(768,1280)) 216 | 217 | model:add(inception_asymmetric_expanded1()) 218 | 
model:add(inception_asymmetric_expanded2()) 219 | model:add(SpatialAveragePooling(8,8,1,1,0,0)) 220 | 221 | model:add(nn.View(1024):setNumInputDims(3)) 222 | model:add(nn.Linear(1024,1000)):add(nn.ReLU(true)) 223 | 224 | model:get(1).gradInput = nil 225 | 226 | return model 227 | -------------------------------------------------------------------------------- /models/resnet.lua: -------------------------------------------------------------------------------- 1 | 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- The ResNet model definition 10 | -- 11 | 12 | local depth = 18 -- Imagenet: 18, 34, 50, 101, 152 | Cifar: 20, 32, 44, 56, 110, 1202 13 | local shortcutType = 'B' -- B/C 14 | local dataset = 'imagenet' -- imagenet/cifar10 15 | local iChannels 16 | 17 | local nn = require 'nn' 18 | --require 'cunn' 19 | 20 | local Convolution = nn.SpatialConvolution 21 | local Avg = nn.SpatialAveragePooling 22 | local ReLU = nn.ReLU 23 | local Max = nn.SpatialMaxPooling 24 | local SBatchNorm = nn.SpatialBatchNormalization 25 | 26 | -- The shortcut layer is either identity or 1x1 convolution 27 | local function shortcut(nInputPlane, nOutputPlane, stride) 28 | local useConv = shortcutType == 'C' or 29 | (shortcutType == 'B' and nInputPlane ~= nOutputPlane) 30 | if useConv then 31 | -- 1x1 convolution 32 | return nn.Sequential() 33 | :add(Convolution(nInputPlane, nOutputPlane, 1, 1, stride, stride)) 34 | :add(SBatchNorm(nOutputPlane)) 35 | elseif nInputPlane ~= nOutputPlane then 36 | -- Strided, zero-padded identity shortcut 37 | return nn.Sequential() 38 | :add(nn.SpatialAveragePooling(1, 1, stride, stride)) 39 | :add(nn.Concat(2) 40 | :add(nn.Identity()) 41 | :add(nn.MulConstant(0))) 42 | else 43 | 
return nn.Identity() 44 | end 45 | end 46 | 47 | -- The basic residual layer block for 18 and 34 layer network, and the 48 | -- CIFAR networks 49 | local function basicblock(n, stride) 50 | local nInputPlane = iChannels 51 | iChannels = n 52 | 53 | local s = nn.Sequential() 54 | s:add(Convolution(nInputPlane,n,3,3,stride,stride,1,1)) 55 | s:add(SBatchNorm(n)) 56 | s:add(ReLU(true)) 57 | s:add(Convolution(n,n,3,3,1,1,1,1)) 58 | s:add(SBatchNorm(n)) 59 | 60 | return nn.Sequential() 61 | :add(nn.ConcatTable() 62 | :add(s) 63 | :add(shortcut(nInputPlane, n, stride))) 64 | :add(nn.CAddTable(true)) 65 | :add(ReLU(true)) 66 | end 67 | 68 | -- The bottleneck residual layer for 50, 101, and 152 layer networks 69 | local function bottleneck(n, stride) 70 | local nInputPlane = iChannels 71 | iChannels = n * 4 72 | 73 | local s = nn.Sequential() 74 | s:add(Convolution(nInputPlane,n,1,1,1,1,0,0)) 75 | s:add(SBatchNorm(n)) 76 | s:add(ReLU(true)) 77 | s:add(Convolution(n,n,3,3,stride,stride,1,1)) 78 | s:add(SBatchNorm(n)) 79 | s:add(ReLU(true)) 80 | s:add(Convolution(n,n*4,1,1,1,1,0,0)) 81 | s:add(SBatchNorm(n * 4)) 82 | 83 | return nn.Sequential() 84 | :add(nn.ConcatTable() 85 | :add(s) 86 | :add(shortcut(nInputPlane, n * 4, stride))) 87 | :add(nn.CAddTable(true)) 88 | :add(ReLU(true)) 89 | end 90 | 91 | -- Creates count residual blocks with specified number of features 92 | local function layer(block, features, count, stride) 93 | local s = nn.Sequential() 94 | for i=1,count do 95 | s:add(block(features, i == 1 and stride or 1)) 96 | end 97 | return s 98 | end 99 | 100 | local model = nn.Sequential() 101 | if dataset == 'imagenet' then 102 | -- Configurations for ResNet: 103 | -- num. 
residual blocks, num features, residual block function 104 | local cfg = { 105 | [18] = {{2, 2, 2, 2}, 512, basicblock}, 106 | [34] = {{3, 4, 6, 3}, 512, basicblock}, 107 | [50] = {{3, 4, 6, 3}, 2048, bottleneck}, 108 | [101] = {{3, 4, 23, 3}, 2048, bottleneck}, 109 | [152] = {{3, 8, 36, 3}, 2048, bottleneck}, 110 | } 111 | 112 | assert(cfg[depth], 'Invalid depth: ' .. tostring(depth)) 113 | local def, nFeatures, block = table.unpack(cfg[depth]) 114 | iChannels = 64 115 | print('ResNet-' .. depth .. ' ImageNet') 116 | 117 | -- The ResNet ImageNet model 118 | model:add(Convolution(3,64,7,7,2,2,3,3)) 119 | model:add(SBatchNorm(64)) 120 | model:add(ReLU(true)) 121 | model:add(Max(3,3,2,2,1,1)) 122 | model:add(layer(block, 64, def[1])) 123 | model:add(layer(block, 128, def[2], 2)) 124 | model:add(layer(block, 256, def[3], 2)) 125 | model:add(layer(block, 512, def[4], 2)) 126 | model:add(Avg(7, 7, 1, 1)) 127 | model:add(nn.View(nFeatures):setNumInputDims(3)) 128 | model:add(nn.Linear(nFeatures, 1000)) 129 | elseif dataset == 'cifar10' then 130 | -- Model type specifies number of layers for CIFAR-10 model 131 | assert((depth - 2) % 6 == 0, 'depth should be one of 20, 32, 44, 56, 110, 1202') 132 | local n = (depth - 2) / 6 133 | iChannels = 16 134 | print('ResNet-' .. depth .. ' CIFAR-10') 135 | 136 | -- The ResNet CIFAR-10 model 137 | model:add(Convolution(3,16,3,3,1,1,1,1)) 138 | model:add(SBatchNorm(16)) 139 | model:add(ReLU(true)) 140 | model:add(layer(basicblock, 16, n)) 141 | model:add(layer(basicblock, 32, n, 2)) 142 | model:add(layer(basicblock, 64, n, 2)) 143 | model:add(Avg(8, 8, 1, 1)) 144 | model:add(nn.View(64):setNumInputDims(3)) 145 | model:add(nn.Linear(64, 10)) 146 | else 147 | error('invalid dataset: ' .. 
dataset) 148 | end 149 | 150 | local function ConvInit(name) 151 | for k,v in pairs(model:findModules(name)) do 152 | local n = v.kW*v.kH*v.nOutputPlane 153 | v.weight:normal(0,math.sqrt(2/n)) 154 | --if cudnn.version >= 4000 then 155 | -- v.bias = nil 156 | -- v.gradBias = nil 157 | --else 158 | -- v.bias:zero() 159 | --end 160 | end 161 | end 162 | local function BNInit(name) 163 | for k,v in pairs(model:findModules(name)) do 164 | v.weight:fill(1) 165 | v.bias:zero() 166 | end 167 | end 168 | 169 | ConvInit('nn.SpatialConvolution') 170 | ConvInit('nn.SpatialConvolution') 171 | BNInit('fbnn.SpatialBatchNormalization') 172 | BNInit('nn.SpatialBatchNormalization') 173 | BNInit('nn.SpatialBatchNormalization') 174 | for k,v in pairs(model:findModules('nn.Linear')) do 175 | v.bias:zero() 176 | end 177 | --model:cuda() 178 | 179 | -- if opt.cudnn == 'deterministic' then 180 | -- model:apply(function(m) 181 | -- if m.setMode then m:setMode(1,1,1) end 182 | -- end) 183 | -- end 184 | 185 | model:get(1).gradInput = nil 186 | 187 | return model 188 | -------------------------------------------------------------------------------- /models/vgg.lua: -------------------------------------------------------------------------------- 1 | local classes = 1000 2 | local modelType = 'A' -- on a titan black, B/D/E run out of memory even for batch-size 32 3 | 4 | -- Create tables describing VGG configurations A, B, D, E 5 | local cfg = {} 6 | if modelType == 'A' then 7 | cfg = {64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'} 8 | elseif modelType == 'B' then 9 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'} 10 | elseif modelType == 'D' then 11 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'} 12 | elseif modelType == 'E' then 13 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'} 14 | else 15 | error('Unknown model type: ' .. 
modelType .. ' | Please specify a modelType A or B or D or E') 16 | end 17 | 18 | local features = nn.Sequential() 19 | do 20 | local iChannels = 3; 21 | for k,v in ipairs(cfg) do 22 | if v == 'M' then 23 | features:add(nn.SpatialMaxPooling(2,2,2,2)) 24 | else 25 | local oChannels = v; 26 | local conv3 = nn.SpatialConvolution(iChannels,oChannels,3,3,1,1,1,1); 27 | features:add(conv3) 28 | features:add(nn.ReLU(true)) 29 | iChannels = oChannels; 30 | end 31 | end 32 | end 33 | 34 | --features:cuda() 35 | --features = makeDataParallel(features, nGPU) -- defined in util.lua 36 | 37 | local classifier = nn.Sequential() 38 | classifier:add(nn.View(512*7*7)) 39 | classifier:add(nn.Linear(512*7*7, 4096)) 40 | classifier:add(nn.Threshold(0, 1e-6)) 41 | --classifier:add(nn.BatchNormalization(4096, 1e-3)) 42 | classifier:add(nn.Dropout(0.5)) 43 | classifier:add(nn.Linear(4096, 4096)) 44 | classifier:add(nn.Threshold(0, 1e-6)) 45 | --classifier:add(nn.BatchNormalization(4096, 1e-3)) 46 | classifier:add(nn.Dropout(0.5)) 47 | classifier:add(nn.Linear(4096, classes)) 48 | classifier:add(nn.LogSoftMax()) 49 | --classifier:cuda() 50 | 51 | local model = nn.Sequential() 52 | model:add(features):add(classifier) 53 | 54 | return model 55 | -------------------------------------------------------------------------------- /opts.lua: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- Contains options required by run.lua 3 | -- 4 | -- Written by: Abhishek Chaurasia 5 | -- Dated: 6th June, 2016 6 | -------------------------------------------------------------------------------- 7 | 8 | local opts = {} 9 | 10 | lapp = require 'pl.lapp' 11 | function opts.parse(arg) 12 | local opt = lapp [[ 13 | 14 | Command line options: 15 | -m, --model (default '') Path & filename of network model to profile 16 | -p, --platform (default cpu) Select profiling platform (cpu|cuda) 17 | -r, 
--res (default 1x3x231x231) Input image resolution Channel x Width x Height 18 | -e, --eye (default 0) Network eye 19 | -i, --iter (default 1000) Averaging iterations 20 | -s, --save (default -) Save the float model to file as in 21 | [a]scii or as in [b]inary format (a|b) 22 | --verbose (default detail) detail/medium/compact 23 | --MACs Use multiply-add when counting ops 24 | 25 | Example: 26 | th profile-model.lua --model <'path/filename.lua'> --res 1x3x231x231 27 | 28 | ]] 29 | 30 | return opt 31 | end 32 | 33 | return opts 34 | -------------------------------------------------------------------------------- /profile-model.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'xlua' 3 | require 'sys' 4 | 5 | local lapp = assert(require('pl.lapp')) 6 | local opts = assert(require('opts')) 7 | local profileTime = assert(require('src/modelTimer.lua')) 8 | 9 | -- to load a 64-bit model in binary, we override torch.DiskFile if 32bit machine (ARM): 10 | local systembit = tonumber(io.popen("getconf LONG_BIT"):read('*a')) 11 | if systembit == 32 then 12 | require('libbincompat') 13 | end 14 | 15 | local pf = function(...) print(string.format(...)) end 16 | local r = sys.COLORS.red 17 | local g = sys.COLORS.green 18 | local b = sys.COLORS.blue 19 | local n = sys.COLORS.none 20 | local THIS = sys.COLORS.blue .. 'THIS' .. 
n 21 | 22 | -- Parsing input arguemnets 23 | opt = opts.parse(arg) 24 | if opt.platform == 'cuda' then 25 | require 'cunn' 26 | require 'cudnn' 27 | end 28 | 29 | torch.setdefaulttensortype('torch.FloatTensor') 30 | 31 | paths.dofile('src/profiler.lua') 32 | -- Loading model 33 | if string.find(opt.model, '.lua', #opt.model-4) then 34 | model = { channel = 3, name = opt.model } 35 | pf('Building %s model \n', r..model.name..n) 36 | net = require (opt.model) 37 | elseif string.find(opt.model, '.net', #opt.model-4) then 38 | model = { channel = 3, name = 'Trained binary network' } 39 | pf('Loading %s model from binary file...\n', r..model.name..n) 40 | net = torch.load(opt.model) 41 | elseif string.find(opt.model, '.net.ascii', #opt.model-10) then 42 | model = { channel = 3, name = 'Trained ascii network' } 43 | pf('Loading %s model from ascii file...\n', r..model.name..n) 44 | net = torch.load(opt.model, 'ascii') 45 | else 46 | error('Network named not recognized') 47 | end 48 | 49 | if net:type() == 'torch.CudaTensor' then 50 | cudnn.convert(net, nn) 51 | net:float() 52 | end 53 | 54 | net:evaluate() 55 | net:clearState() 56 | 57 | local iBatch, iChannel, iWidth, iHeight = string.match(opt.res, '(%d+)x(%d+)x(%d+)x(%d+)') 58 | -- or string.match(opt.res, '(%d+)X(%d+)X(%d+)') 59 | 60 | iBatch = tonumber(iBatch) 61 | iChannel = tonumber(iChannel) 62 | iWidth = tonumber(iWidth) 63 | iHeight = tonumber(iHeight) 64 | 65 | if iChannel ~= 0 then 66 | model.channel = iChannel 67 | end 68 | 69 | local batch = (iBatch ~= 0) and iBatch 70 | local width = (iWidth ~= 0) and iWidth 71 | local height = (iHeight ~= 0) and iHeight or width 72 | 73 | imgBatch = torch.FloatTensor(batch, model.channel, height, width) 74 | 75 | if opt.save == 'a' then 76 | pf('Saving model as model.net.ascii... ') 77 | torch.save('model.net.ascii', net, 'ascii') 78 | pf('Done.\n') 79 | elseif opt.save == 'b' then 80 | pf('Saving model as model.net... 
') 81 | torch.save('model.net', net) 82 | pf('Done.\n') 83 | end 84 | 85 | -- calculate the number of operations performed by the network 86 | if not model.def then 87 | totalOps, layerOps = count_ops(net:clone(), imgBatch) 88 | else 89 | totalOps, layerOps = count_ops(model.def, imgBatch) 90 | end 91 | 92 | pf('Operations estimation for image size: %d x %d', width, height) 93 | 94 | local function detailedPrint(...) 95 | if opt.verbose == 'detail' or opt.verbose == 'medium' then 96 | pf(...) 97 | end 98 | end 99 | 100 | -- Compute per layer opt counts 101 | detailedPrint('\n-----------------------------------------------------------------------------------------------') 102 | detailedPrint('%5s %-29s %20s %11s %15s %9s', 'S.No.', 'Module Name', 'Input Resolution', 'Neurons', 'Ops', '% Ops') 103 | detailedPrint('===============================================================================================') 104 | local opsPerCommonModule = {} 105 | local totalNeurons = 0 106 | for i, info in pairs(layerOps) do 107 | local name = info['name'] 108 | local ops = info['ops'] 109 | local maps = info['maps'] 110 | local neurons = info['neurons'] 111 | if not opsPerCommonModule[name] then 112 | opsPerCommonModule[name] = 0 113 | end 114 | local percOps = (ops/totalOps)*100 115 | if percOps > 1 then 116 | percOps = string.format('%s%9.4f%s', b, percOps, n) 117 | else 118 | percOps = string.format('%9.4f', percOps) 119 | end 120 | if opt.verbose == 'medium' and (ops/totalOps)*100 > 0 then 121 | pf('%5d %s%-29s%s %s%20s%s %11s %s%15s%s %9s', i, g, name, n, r, maps, n, neurons, r, ops, n, percOps) 122 | elseif opt.verbose == 'detail' then 123 | pf('%5d %s%-29s%s %s%20s%s %11s %s%15s%s %9s', i, g, name, n, r, maps, n, neurons, r, ops, n, percOps) 124 | end 125 | totalNeurons = totalNeurons + neurons 126 | opsPerCommonModule[name] = opsPerCommonModule[name] + ops 127 | end 128 | 129 | 
print('-----------------------------------------------------------------------------------------------') 130 | pf(' %s%s%s : %d ', r, 'Total number of trainable parameters', n, net:getParameters():size(1)) 131 | pf(' %s%-36s%s : %d', r, 'Total number of neurons', n, totalNeurons) 132 | print('-----------------------------------------------------------------------------------------------') 133 | print('* Operations per common module *') 134 | -- Print total 135 | local ops = opt.MACs and 'MACs' or 'Ops' 136 | for name, count in pairs(opsPerCommonModule) do 137 | if count > 0 then 138 | print(string.format(' + %-35s: %.4e %s', name, count, ops)) 139 | end 140 | end 141 | pf(' %s%-35s: %.4e %s', b, 'Total', totalOps, ops) 142 | print('===============================================================================================') 143 | 144 | -- time and average over a number of iterations 145 | pf('Profiling %s, %d iterations', r..model.name..n, opt.iter) 146 | net:evaluate() 147 | net:clearState() 148 | time = profileTime:time(net, imgBatch, opt.iter, opt.platform) 149 | 150 | local d = g..'CPU'..n 151 | if 'cuda' == opt.platform then 152 | d = g..'GPU'..n 153 | end 154 | 155 | pf(' Forward average time on %s %s : %.2f ms', THIS, d, time.total * 1e3) 156 | pf(' Performance for %s %s : %.2f G-Ops/s\n', THIS, d, totalOps * 1e-9 / time.total) 157 | -------------------------------------------------------------------------------- /src/modelTimer.lua: -------------------------------------------------------------------------------- 1 | local profileTime = {} 2 | local xlua = assert(require('xlua')) 3 | 4 | local function calc_time_cuda(net, img, iterations) 5 | collectgarbage() 6 | 7 | cutorch.setDevice(1) 8 | cudnn.convert(net, cudnn, function(m) return torch.type(m):find('MaxPooling') end) 9 | net:cuda() 10 | 11 | print('==> using GPU #' .. 
cutorch.getDevice()) 12 | cutorch.synchronize() 13 | 14 | local tmp = false 15 | local timer = torch.Timer() 16 | local timing = torch.FloatTensor(iterations) 17 | local t = 0 18 | 19 | -- iterations plus one to prime the jit 20 | for i=1, (iterations+1) do 21 | xlua.progress(i, iterations) 22 | 23 | timer:reset() 24 | 25 | tmp = net:forward(img:cuda()) 26 | cutorch.synchronize() 27 | tmp:float() 28 | 29 | t = timer:time().real 30 | timing[(i%iterations)+1] = t 31 | end 32 | 33 | return timing:mean(), tmp 34 | end 35 | 36 | local function calc_time_cpu(net, img, iterations) 37 | local tmp = false 38 | local timer = torch.Timer() 39 | local timing = torch.FloatTensor(iterations) 40 | local t = 0 41 | 42 | -- iterations plus one to prime the jit 43 | for i=1, (iterations+1) do 44 | xlua.progress(i, iterations) 45 | 46 | timer:reset() 47 | 48 | tmp = net:forward(img) 49 | 50 | t = timer:time().real 51 | timing[(i%iterations)+1] = t 52 | end 53 | 54 | return timing:mean(), tmp 55 | end 56 | 57 | function profileTime:time(net, img, iterations, platform) 58 | iterations = iterations or 10 59 | local time = { total = 0, conv = 0, mlp = 0, } 60 | 61 | if platform == 'cuda' then 62 | 63 | time.total = calc_time_cuda(net, img, iterations) 64 | 65 | else 66 | 67 | time.total = calc_time_cpu(net, img, iterations) 68 | 69 | end 70 | 71 | return time 72 | end 73 | 74 | return profileTime 75 | -------------------------------------------------------------------------------- /src/profiler.lua: -------------------------------------------------------------------------------- 1 | local op_count 2 | local op_used 3 | local multiply_adds = opt.MACs 4 | 5 | function count_ops(network, input) 6 | op_count = 0 7 | op_used = {} 8 | network:apply(intercept_updateOutput) 9 | inputImg = input[1] 10 | network:forward(input) 11 | network:apply(restore_updateOutput) 12 | return op_count, op_used 13 | end 14 | 15 | -- Intercept updateOutput. At each call increment op_count appropriately. 
16 | function intercept_updateOutput(module) 17 | module.updateOutput_original = module.updateOutput 18 | module.updateOutput = function(self, input) 19 | compute_ops(module, input) 20 | return module:updateOutput_original(input) 21 | end 22 | end 23 | 24 | -- Restore original network behaviour 25 | function restore_updateOutput(module) 26 | assert(module.updateOutput_original, 27 | "restore_updateOutput should be called after intercept_updateOutput!") 28 | module.updateOutput = module.updateOutput_original 29 | module.updateOutput_original = nil 30 | end 31 | 32 | -- Compute #flops that specified module needs to process an input. 33 | -- module_handlers table is at the bottom of this file 34 | function compute_ops(module, input) 35 | local module_name = torch.type(module) --[[ FIX: was an accidental global ]] 36 | local handler = module_handlers[module_name] --[[ FIX: was an accidental global ]] 37 | assert(handler, string.format("No handler for module %s!", module_name)) 38 | local ops = handler(module, input) 39 | op_count = op_count + ops 40 | local maps = 0 41 | local neurons = 0 42 | if torch.type(module) ~= 'nn.JoinTable' and torch.type(module) ~= 'nn.CAddTable' then 43 | for i = 1, input:dim() do 44 | if i == 1 then 45 | maps = input:size(1) 46 | neurons = input:size(1) 47 | else 48 | maps = maps .. ' x ' .. 
input:size(i) 49 | neurons = neurons * input:size(i) 50 | end 51 | end 52 | else 53 | end 54 | table.insert(op_used, {name = torch.type(module), 55 | ops = ops, 56 | maps = maps, 57 | neurons = neurons}) 58 | end 59 | 60 | -------------------------------------------------------------------------------- 61 | ------------------------------- Module handlers -------------------------------- 62 | -------------------------------------------------------------------------------- 63 | 64 | local function ops_nothing(module, input) 65 | return 0 66 | end 67 | 68 | local function ops_linear(module, input) 69 | local batch_size = input:dim() == 2 and input:size(1) or 1 70 | local weight_ops = module.weight:nElement() * (multiply_adds and 1 or 2) 71 | local bias_ops = module.bias:nElement() 72 | local ops_per_sample = weight_ops + bias_ops 73 | return batch_size * ops_per_sample 74 | end 75 | 76 | local function ops_logsoftmax(module, input) 77 | local batch_size = input:dim() == 2 and input:size(1) or 1 78 | local input_dim = input:dim() == 2 and input:size(2) or input:size(1) 79 | local expminusapprox_ops = 1 -- around 8 in Torch 80 | -- +2 for accumulation and substraction in two loops 81 | local ops_per_elem = expminusapprox_ops + 1 + 1 82 | local ops_per_sample = input_dim * ops_per_elem 83 | return batch_size * ops_per_sample 84 | end 85 | 86 | -- WARNING: an oversimplified version 87 | local function ops_nonlinearity(module, input) 88 | return input:nElement() 89 | end 90 | 91 | local function ops_convolution(module, input) 92 | assert(input:dim() == 4, "ops_convolution supports only batched inputs!") 93 | assert(input:size(2) == module.nInputPlane, "number of input planes doesn't match!") 94 | local batch_size = input:size(1) 95 | local input_planes = input:size(2) 96 | local input_height = input:size(3) 97 | local input_width = input:size(4) 98 | 99 | -- ops per output element 100 | local kernel_ops = module.kH * module.kW * input_planes * (multiply_adds and 1 or 2) 
101 | local bias_ops = 1 102 | local ops_per_element = kernel_ops + bias_ops 103 | 104 | local output_width = math.floor((input_width + 2 * module.padW - module.kW) / module.dW + 1) 105 | local output_height = math.floor((input_height + 2 * module.padH - module.kH) / module.dH + 1) 106 | 107 | return batch_size * module.nOutputPlane * output_width * output_height * ops_per_element 108 | end 109 | 110 | local function ops_fullconvolution(module, input) 111 | assert(input:dim() == 4, "ops_fullconvolution supports only batched inputs!") 112 | assert(input:size(2) == module.nInputPlane, "number of input planes doesn't match!") 113 | local batch_size = input:size(1) 114 | local input_planes = input:size(2) 115 | local input_height = input:size(3) 116 | local input_width = input:size(4) 117 | 118 | -- ops per input element 119 | local single_kernel_ops = module.kH * module.kW * input_planes * (multiply_adds and 1 or 2) 120 | local sample_kernel_ops = module.nOutputPlane * input_width * input_height * single_kernel_ops --[[ FIX: every input location contributes to all nOutputPlane maps; was input_planes, which counted nInputPlane twice ]] 121 | 122 | local output_width = (input_width - 1) * module.dW - 2 * module.padW + module.kW + module.adjW 123 | local output_height = (input_height - 1) * module.dH - 2 * module.padH + module.kH + module.adjH --[[ FIX: was module.padW ]] 124 | local bias_ops = output_width * output_height * module.nOutputPlane 125 | 126 | return batch_size * (sample_kernel_ops + bias_ops) 127 | end 128 | 129 | local function ops_dilatedconvolution(module, input) 130 | assert(input:dim() == 4, "ops_convolution supports only batched inputs!") 131 | assert(input:size(2) == module.nInputPlane, "number of input planes doesn't match!") 132 | local batch_size = input:size(1) 133 | local input_planes = input:size(2) 134 | local input_height = input:size(3) 135 | local input_width = input:size(4) 136 | local dilW = module.dilationW 137 | local dilH = module.dilationH 138 | 139 | -- ops per output element 140 | local kernel_ops = module.kH * module.kW * input_planes * (multiply_adds and 1 or 2) 141 | local bias_ops = 
1 142 | local ops_per_element = kernel_ops + bias_ops 143 | 144 | local output_width = math.floor( 145 | (input_width + 2 * module.padW - dilW * (module.kW - 1) - 1) 146 | /module.dW + 1) --[[ FIX: nn.SpatialDilatedConvolution output size is floor((in + 2*pad - dil*(k-1) - 1)/stride) + 1; term was "+ 1" ]] 147 | local output_height = math.floor( 148 | (input_height + 2 * module.padH - dilH * (module.kH - 1) - 1) 149 | /module.dH + 1) --[[ FIX: same "- 1" correction as width ]] 150 | 151 | return batch_size * module.nOutputPlane * output_width * output_height * ops_per_element 152 | end 153 | 154 | local function ops_pooling(module, input) 155 | assert(input:dim() == 4, "ops_averagepooling supports only batched inputs!") 156 | local batch_size = input:size(1) 157 | local input_planes = input:size(2) 158 | local input_height = input:size(3) 159 | local input_width = input:size(4) 160 | 161 | local kernel_ops = module.kH * module.kW 162 | 163 | local output_width = math.floor((input_width + 2 * module.padW - module.kW) / module.dW + 1) 164 | local output_height = math.floor((input_height + 2 * module.padH - module.kH) / module.dH + 1) 165 | 166 | return batch_size * input_planes * output_width * output_height * kernel_ops 167 | end 168 | 169 | local function ops_unpooling(module, input) 170 | assert(input:dim() == 4, "ops_unpooling supports only batched inputs!") 171 | local batch_size = input:size(1) 172 | local input_planes = input:size(2) 173 | local input_height = input:size(3) 174 | local input_width = input:size(4) 175 | 176 | 177 | local output_width = (input_width - 1) * module.pooling.dW - (2 * module.pooling.padW - module.pooling.kW) 178 | local output_height = (input_height - 1) * module.pooling.dH - (2 * module.pooling.padH - module.pooling.kH) 179 | 180 | return batch_size * input_planes * output_width * output_height 181 | end 182 | 183 | local function ops_caddtable(module, input) 184 | assert(torch.type(input) == 'table', "ops_caddtable input should be a table!") 185 | return input[1]:nElement() * #input 186 | end 187 | 188 | local function ops_batchnorm(module, input) 189 | return input:nElement() * 
(multiply_adds and 1 or 2) 190 | end 191 | 192 | local function ops_sum(module, input) 193 | assert(not module.nInputDims, 'nInputDims mode of nn.Sum not supported.') 194 | local ops = 1 195 | for d = 1, input:dim() do 196 | local s = input:size(d) 197 | ops = d ~= module.dimension and ops * s or ops * (s - 1) 198 | end 199 | return ops 200 | end 201 | 202 | local function ops_mulconstant(module, input) 203 | local ops = 1 204 | for d = 1, input:dim() do 205 | ops = ops * input:size(d) 206 | end 207 | return ops 208 | end 209 | 210 | module_handlers = { 211 | -- Containers 212 | ['nn.Sequential'] = ops_nothing, 213 | ['nn.Parallel'] = ops_nothing, 214 | ['nn.Concat'] = ops_nothing, 215 | ['nn.gModule'] = ops_nothing, 216 | ['nn.Identity'] = ops_nothing, 217 | ['nn.DataParallelTable'] = ops_nothing, 218 | ['nn.Contiguous'] = ops_nothing, 219 | ['nn.ConcatTable'] = ops_nothing, 220 | ['nn.JoinTable'] = ops_nothing, 221 | ['nn.Padding'] = ops_nothing, 222 | 223 | -- Nonlinearities 224 | ['nn.ReLU'] = ops_nonlinearity, 225 | ['nn.PReLU'] = ops_nonlinearity, 226 | ['nn.Threshold'] = ops_nonlinearity, 227 | ['nn.LogSoftMax'] = ops_logsoftmax, 228 | ['nn.SoftMax'] = ops_logsoftmax, --TODO Update it with correct ops calculator 229 | ['cudnn.ReLU'] = ops_nonlinearity, 230 | ['cudnn.PReLU'] = ops_nonlinearity, 231 | 232 | -- Basic modules 233 | ['nn.Linear'] = ops_linear, 234 | ['nn.Sum'] = ops_sum, 235 | ['nn.MulConstant'] = ops_mulconstant, 236 | 237 | -- Spatial Modules 238 | ['nn.SpatialConvolution'] = ops_convolution, 239 | ['nn.SpatialConvolutionMM'] = ops_convolution, 240 | ['nn.SpatialDilatedConvolution'] = ops_dilatedconvolution, 241 | ['nn.SpatialFullConvolution'] = ops_fullconvolution, 242 | ['nn.SpatialMaxPooling'] = ops_pooling, 243 | ['nn.SpatialAveragePooling'] = ops_pooling, 244 | ['nn.SpatialMaxUnpooling'] = ops_unpooling, 245 | ['nn.SpatialZeroPadding'] = ops_nothing, 246 | ['nn.BatchNormalization'] = ops_nothing, -- Can be squashed 247 | 
['nn.SpatialBatchNormalization'] = ops_nothing, -- Can be squashed 248 | 249 | ['cudnn.SpatialConvolution'] = ops_convolution, 250 | ['cudnn.SpatialConvolutionMM'] = ops_convolution, 251 | ['cudnn.SpatialDilatedConvolution'] = ops_dilatedconvolution, 252 | ['cudnn.SpatialMaxPooling'] = ops_pooling, 253 | ['cudnn.SpatialAveragePooling'] = ops_pooling, 254 | ['cudnn.SpatialBatchNormalization'] = ops_nothing, -- Can be squashed 255 | 256 | -- Table modules 257 | ['nn.CAddTable'] = ops_caddtable, 258 | 259 | -- Various modules 260 | ['nn.View'] = ops_nothing, 261 | ['nn.Reshape'] = ops_nothing, 262 | ['nn.Dropout'] = ops_nothing, -- Is turned off in inference 263 | ['nn.SpatialDropout'] = ops_nothing, -- Is turned off in inference 264 | ['nn.Concat'] = ops_nothing, 265 | } 266 | --------------------------------------------------------------------------------