├── .gitignore ├── README.md ├── RESULTS.md ├── models ├── ENet-encoder.lua ├── ENet.lua ├── alexnetowt.lua ├── googlenet.lua ├── inception-v3.lua ├── resnet.lua └── vgg.lua ├── opts.lua ├── profile-model.lua └── src ├── modelTimer.lua └── profiler.lua /.gitignore: -------------------------------------------------------------------------------- 1 | # Files 2 | *.sw* 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Torch7-Network Profiler 2 | 3 | Repository to calculate feed-forward time and number of operations taken by a neural network. 4 | 5 | 6 | ## Running the application 7 | 8 | The application can profile either a written definition of a network or an already trained network (saved as either `ascii` or `binary`). Pass in the location of the model using the `-m` or its long equivalent `--model`. 9 | 10 | To profile the written definition of the model it must be defined in a specially formatted table saved to a file with the `.lua` extension. Examples can be found in the [models](models) directory. 11 | 12 | ``` 13 | th profile-model.lua --model <'path/filename.lua'> --res 1x3x231x231 14 | ``` 15 | 16 | To profile the already trained network, pass in the path and file name by again using the `-m/--model` flag. If the network file has standard extensions the application will auto detect if the network is saved as either an `ascii` or `binary` network and load appropriately. 17 | 18 | Profiling the network speed on different platforms is also possible. Currently the default platform is `cpu` but if available the profiler can be targeted to run the networks using `cuda`. 19 | 20 | ``` 21 | th profile-model.lua --model <'path/filename.lua'> --platform <'cpu'|'cuda'> 22 | ``` 23 | 24 | ### License 25 | 26 | This software is released under a creative commons license which allows for personal and research use only. 
For a commercial license please contact the authors. You can view a license summary here: http://creativecommons.org/licenses/by-nc/4.0/ 27 | -------------------------------------------------------------------------------- /RESULTS.md: -------------------------------------------------------------------------------- 1 | # Torch7 Profiling 2 | # RESULTS TABLE 3 | 4 | # ResNet 18 5 | 6 | 720p (720x1280) 7 | 8 | 67.7 Gops/frame 9 | 10 | ### Titan X Pascal 11 | 12 | 20.51 ms 13 | 14 | ### TitanX 15 | 16 | 35.40 ms 17 | 18 | ### GTX 1080 19 | 20 | 25.48 ms 21 | 22 | # AlexNet OWT 23 | operations: 1.43 G 24 | 25 | image size: 224 x 224 26 | 27 | All results are averaged over 100 runs unless otherwise mentioned 28 | 29 | ### Macbook Pro 15in Late 2013 CPU intel i7 30 | 31.90 ms 31 | 32 | ### Intel Core i7 4710HQ (Gigabyte P35x V4) 33 | 25.30 ms (4C8T) 34 | 35 | ### Macbook Pro 15in Late 2013 GPU GT 750M 36 | 25.18 ms 37 | 38 | ### Intel(R) Xeon(R) CPU E5-1620 0 @ 3.60GHz (GPU2) 39 | 462.37 ms (1-core) 40 | 41 | ### nVidia GeForce GTX 980M (Gigabyte P35X v4) 42 | 3.74 ms 43 | 44 | ### nVidia GeForce GTX 980 (GPU2) 45 | 2.99 ms 46 | 47 | ### nVidia GeForce GTX Titan X (GPU3) 48 | 2.57 ms 49 | 50 | ### nVidia GeForce GTX 1080 (GPU1) 51 | 2.00 ms 52 | 53 | ### Titan X Pascal (GPU4) 54 | 1.96 ms 55 | 56 | ### nVidia TX1 CPU 57 | 114.66 ms 58 | 59 | ### nVidia TX1 GPU 32 bits 60 | 25.73 ms 61 | 62 | ### nVidia TX1 CUDNN 4, FP32 thnets: 63 | 64 | | Batch Size | 1 | 2 | 4 | 8 | 16 | 32* | 65 | |:-------------------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:| 66 | | Time (ms per batch) | 54 | 57 | 69 | 93 | 137 | 216 | 67 | | Time (ms per frame) | 54 | 28 | 17 | 12 | 8 | 7 | 68 | 69 | *batch > 32 gets worse 70 | 71 | ### nVidia TX1 CUDNN 4, FP16 thnets: 72 | 73 | 74 | | Batch Size | 1 | 2 | 4 | 8 | 16 | 32 | 75 | |:-------------------:|:-----:|:-----:|:----:|:-----:|:-----:|:-----:| 76 | | Time (ms per batch) | 28 | 33 | 40 | 70 | 135 | 593 | 77 | | Time (ms per frame) | 28 | 16 
| 10 | 9 | 8 | 18 | 78 | 79 | 80 | ### nVidia TX1 CPU thnets: 81 | 82 | batch 1 31.6170 ms 83 | 84 | (batch > 1 is not better in performance) 85 | 86 | ### nVidia TX1 nVidia TX1 thnets cudnn 4 87 | 88 | | Input Resolution | Perf. CPU FP32* (ms) | Perf. GPU FP32 (ms) | Perf. GPU FP16 (ms) | 89 | |:----------------:|:--------------------:|:-------------------:|:-------------------:| 90 | | VGA (640x480) | 1272 | 95 | 58 | 91 | | WXGA (1280x720) | 4406 | 308 | 203 | 92 | | FHD (1920x1080) | 11237 | 673 | 434 | 93 | 94 | *CPU results averaged over 10 runs 95 | -------------------------------------------------------------------------------- /models/ENet-encoder.lua: -------------------------------------------------------------------------------- 1 | local classes = 30 2 | local model = nn.Sequential() 3 | 4 | local ct = 0 5 | function _bottleneck(internal_scale, use_relu, asymetric, dilated, input, output, downsample) 6 | local internal = output / internal_scale 7 | local input_stride = downsample and 2 or 1 8 | 9 | local sum = nn.ConcatTable() 10 | 11 | local main = nn.Sequential() 12 | local other = nn.Sequential() 13 | sum:add(main):add(other) 14 | 15 | main:add(nn.SpatialConvolution(input, internal, input_stride, input_stride, input_stride, input_stride, 0, 0):noBias()) 16 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 17 | if use_relu then main:add(nn.PReLU(internal)) end 18 | if not asymetric and not dilated then 19 | main:add(nn.SpatialConvolution(internal, internal, 3, 3, 1, 1, 1, 1)) 20 | elseif asymetric then 21 | local pad = (asymetric-1) / 2 22 | main:add(nn.SpatialConvolution(internal, internal, asymetric, 1, 1, 1, pad, 0):noBias()) 23 | main:add(nn.SpatialConvolution(internal, internal, 1, asymetric, 1, 1, 0, pad)) 24 | elseif dilated then 25 | main:add(nn.SpatialDilatedConvolution(internal, internal, 3, 3, 1, 1, dilated, dilated, dilated, dilated)) 26 | else 27 | assert(false, 'You shouldn\'t be here') 28 | end 29 | 
main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 30 | if use_relu then main:add(nn.PReLU(internal)) end 31 | main:add(nn.SpatialConvolution(internal, output, 1, 1, 1, 1, 0, 0):noBias()) 32 | main:add(nn.SpatialBatchNormalization(output, 1e-3)) 33 | main:add(nn.SpatialDropout((ct < 5) and 0.01 or 0.1)) 34 | ct = ct + 1 35 | 36 | other:add(nn.Identity()) 37 | if downsample then 38 | other:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 39 | end 40 | if input ~= output then 41 | other:add(nn.Padding(1, output-input, 3)) 42 | end 43 | 44 | return nn.Sequential():add(sum):add(nn.CAddTable()):add(nn.PReLU(output)) 45 | end 46 | 47 | local _ = require 'moses' 48 | local bottleneck = _.bindn(_bottleneck, 4, true, false, false) 49 | local cbottleneck = _.bindn(_bottleneck, 4, true, false, false) 50 | local xbottleneck = _.bindn(_bottleneck, 4, true, 7, false) 51 | local wbottleneck = _.bindn(_bottleneck, 4, true, 5, false) 52 | local dbottleneck = _.bindn(_bottleneck, 4, true, false, 2) 53 | local xdbottleneck = _.bindn(_bottleneck, 4, true, false, 4) 54 | local xxdbottleneck = _.bindn(_bottleneck, 4, true, false, 8) 55 | local xxxdbottleneck = _.bindn(_bottleneck, 4, true, false, 16) 56 | local xxxxdbottleneck = _.bindn(_bottleneck, 4, true, false, 32) 57 | 58 | local initial_block = nn.ConcatTable(2) 59 | initial_block:add(nn.SpatialConvolution(3, 13, 3, 3, 2, 2, 1, 1)) 60 | initial_block:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 61 | 62 | model:add(initial_block) -- 128x256 63 | model:add(nn.JoinTable(2)) -- can't use Concat, because SpatialConvolution needs contiguous gradOutput 64 | model:add(nn.SpatialBatchNormalization(16, 1e-3)) 65 | model:add(nn.PReLU(16)) 66 | model:add(bottleneck(16, 64, true)) -- 64x128 67 | for i = 1,4 do 68 | model:add(bottleneck(64, 64)) 69 | end 70 | model:add(bottleneck(64, 128, true)) -- 32x64 71 | for i = 1,2 do 72 | model:add(cbottleneck(128, 128)) 73 | model:add(dbottleneck(128, 128)) 74 | model:add(wbottleneck(128, 128)) 75 | 
model:add(xdbottleneck(128, 128)) 76 | model:add(cbottleneck(128, 128)) 77 | model:add(xxdbottleneck(128, 128)) 78 | model:add(wbottleneck(128, 128)) 79 | model:add(xxxdbottleneck(128, 128)) 80 | end 81 | model:add(nn.SpatialConvolution(128, classes, 1, 1)) 82 | return model 83 | -------------------------------------------------------------------------------- /models/ENet.lua: -------------------------------------------------------------------------------- 1 | local function getEncoder() 2 | local model = nn.Sequential() 3 | 4 | local ct = 0 5 | function _bottleneck(internal_scale, use_relu, asymetric, dilated, input, output, downsample) 6 | local internal = output / internal_scale 7 | local input_stride = downsample and 2 or 1 8 | 9 | local sum = nn.ConcatTable() 10 | 11 | local main = nn.Sequential() 12 | local other = nn.Sequential() 13 | sum:add(main):add(other) 14 | 15 | main:add(nn.SpatialConvolution(input, internal, input_stride, input_stride, input_stride, input_stride, 0, 0):noBias()) 16 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 17 | if use_relu then main:add(nn.PReLU(internal)) end 18 | if not asymetric and not dilated then 19 | main:add(nn.SpatialConvolution(internal, internal, 3, 3, 1, 1, 1, 1)) 20 | elseif asymetric then 21 | local pad = (asymetric-1) / 2 22 | main:add(nn.SpatialConvolution(internal, internal, asymetric, 1, 1, 1, pad, 0):noBias()) 23 | main:add(nn.SpatialConvolution(internal, internal, 1, asymetric, 1, 1, 0, pad)) 24 | elseif dilated then 25 | main:add(nn.SpatialDilatedConvolution(internal, internal, 3, 3, 1, 1, dilated, dilated, dilated, dilated)) 26 | else 27 | assert(false, 'You shouldn\'t be here') 28 | end 29 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 30 | if use_relu then main:add(nn.PReLU(internal)) end 31 | main:add(nn.SpatialConvolution(internal, output, 1, 1, 1, 1, 0, 0):noBias()) 32 | main:add(nn.SpatialBatchNormalization(output, 1e-3)) 33 | main:add(nn.SpatialDropout((ct < 5) and 0.01 or 0.1)) 
34 | ct = ct + 1 35 | 36 | other:add(nn.Identity()) 37 | if downsample then 38 | other:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 39 | end 40 | if input ~= output then 41 | other:add(nn.Padding(1, output-input, 3)) 42 | end 43 | 44 | return nn.Sequential():add(sum):add(nn.CAddTable()):add(nn.PReLU(output)) 45 | end 46 | 47 | local _ = require 'moses' 48 | local bottleneck = _.bindn(_bottleneck, 4, true, false, false) 49 | local cbottleneck = _.bindn(_bottleneck, 4, true, false, false) 50 | local xbottleneck = _.bindn(_bottleneck, 4, true, 7, false) 51 | local wbottleneck = _.bindn(_bottleneck, 4, true, 5, false) 52 | local dbottleneck = _.bindn(_bottleneck, 4, true, false, 2) 53 | local xdbottleneck = _.bindn(_bottleneck, 4, true, false, 4) 54 | local xxdbottleneck = _.bindn(_bottleneck, 4, true, false, 8) 55 | local xxxdbottleneck = _.bindn(_bottleneck, 4, true, false, 16) 56 | local xxxxdbottleneck = _.bindn(_bottleneck, 4, true, false, 32) 57 | 58 | local initial_block = nn.ConcatTable(2) 59 | initial_block:add(nn.SpatialConvolution(3, 13, 3, 3, 2, 2, 1, 1)) 60 | initial_block:add(nn.SpatialMaxPooling(2, 2, 2, 2)) 61 | 62 | model:add(initial_block) -- 128x256 63 | model:add(nn.JoinTable(2)) -- can't use Concat, because SpatialConvolution needs contiguous gradOutput 64 | model:add(nn.SpatialBatchNormalization(16, 1e-3)) 65 | model:add(nn.PReLU(16)) 66 | model:add(bottleneck(16, 64, true)) -- 64x128 67 | for i = 1,4 do 68 | model:add(bottleneck(64, 64)) 69 | end 70 | model:add(bottleneck(64, 128, true)) -- 32x64 71 | for i = 1,2 do 72 | model:add(cbottleneck(128, 128)) 73 | model:add(dbottleneck(128, 128)) 74 | model:add(wbottleneck(128, 128)) 75 | model:add(xdbottleneck(128, 128)) 76 | model:add(cbottleneck(128, 128)) 77 | model:add(xxdbottleneck(128, 128)) 78 | model:add(wbottleneck(128, 128)) 79 | model:add(xxxdbottleneck(128, 128)) 80 | end 81 | --model:add(nn.SpatialConvolution(128, classes, 1, 1)) 82 | return model 83 | end 84 | 85 | 
-------------------------------------------------------------------------------- 86 | -- Model definition starts here 87 | -------------------------------------------------------------------------------- 88 | 89 | local classes = 30 90 | local model = getEncoder() 91 | -- SpatialMaxUnpooling requires nn modules... 92 | model:apply(function(module) 93 | if module.modules then 94 | for i,submodule in ipairs(module.modules) do 95 | if torch.typename(submodule):match('nn.SpatialMaxPooling') then 96 | module.modules[i] = nn.SpatialMaxPooling(2, 2, 2, 2) -- TODO: make more flexible 97 | end 98 | end 99 | end 100 | end) 101 | 102 | -- find pooling modules 103 | local pooling_modules = {} 104 | model:apply(function(module) 105 | if torch.typename(module):match('nn.SpatialMaxPooling') then 106 | table.insert(pooling_modules, module) 107 | end 108 | end) 109 | assert(#pooling_modules == 3, 'There should be 3 pooling modules') 110 | 111 | function bottleneck(input, output, upsample, reverse_module) 112 | local internal = output / 4 113 | local input_stride = upsample and 2 or 1 114 | 115 | local module = nn.Sequential() 116 | local sum = nn.ConcatTable() 117 | local main = nn.Sequential() 118 | local other = nn.Sequential() 119 | sum:add(main):add(other) 120 | 121 | main:add(nn.SpatialConvolution(input, internal, 1, 1, 1, 1, 0, 0):noBias()) 122 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 123 | main:add(nn.ReLU(true)) 124 | if not upsample then 125 | main:add(nn.SpatialConvolution(internal, internal, 3, 3, 1, 1, 1, 1)) 126 | else 127 | main:add(nn.SpatialFullConvolution(internal, internal, 3, 3, 2, 2, 1, 1, 1, 1)) 128 | end 129 | main:add(nn.SpatialBatchNormalization(internal, 1e-3)) 130 | main:add(nn.ReLU(true)) 131 | main:add(nn.SpatialConvolution(internal, output, 1, 1, 1, 1, 0, 0):noBias()) 132 | main:add(nn.SpatialBatchNormalization(output, 1e-3)) 133 | 134 | other:add(nn.Identity()) 135 | if input ~= output or upsample then 136 | 
other:add(nn.SpatialConvolution(input, output, 1, 1, 1, 1, 0, 0):noBias()) 137 | other:add(nn.SpatialBatchNormalization(output, 1e-3)) 138 | if upsample and reverse_module then 139 | other:add(nn.SpatialMaxUnpooling(reverse_module)) 140 | end 141 | end 142 | 143 | if upsample and not reverse_module then 144 | main:remove(#main.modules) -- remove BN 145 | return main 146 | end 147 | return module:add(sum):add(nn.CAddTable()):add(nn.ReLU(true)) 148 | end 149 | 150 | --model:add(bottleneck(128, 128)) 151 | model:add(bottleneck(128, 64, true, pooling_modules[3])) -- 32x64 152 | model:add(bottleneck(64, 64)) 153 | model:add(bottleneck(64, 64)) 154 | model:add(bottleneck(64, 16, true, pooling_modules[2])) -- 64x128 155 | model:add(bottleneck(16, 16)) 156 | model:add(nn.SpatialFullConvolution(16, classes, 2, 2, 2, 2)) 157 | return model 158 | -------------------------------------------------------------------------------- /models/alexnetowt.lua: -------------------------------------------------------------------------------- 1 | -- from https://code.google.com/p/cuda-convnet2/source/browse/layers/layers-imagenet-1gpu.cfg 2 | -- this is AlexNet that was presented in the One Weird Trick paper. 
http://arxiv.org/abs/1404.5997 3 | local features = nn.Sequential() 4 | features:add(nn.SpatialConvolution(3,64,11,11,4,4,2,2)) -- 224 -> 55 5 | features:add(nn.ReLU(true)) 6 | features:add(nn.SpatialMaxPooling(3,3,2,2)) -- 55 -> 27 7 | features:add(nn.SpatialConvolution(64,192,5,5,1,1,2,2)) -- 27 -> 27 8 | features:add(nn.ReLU(true)) 9 | features:add(nn.SpatialMaxPooling(3,3,2,2)) -- 27 -> 13 10 | features:add(nn.SpatialConvolution(192,384,3,3,1,1,1,1)) -- 13 -> 13 11 | features:add(nn.ReLU(true)) 12 | features:add(nn.SpatialConvolution(384,256,3,3,1,1,1,1)) -- 13 -> 13 13 | features:add(nn.ReLU(true)) 14 | features:add(nn.SpatialConvolution(256,256,3,3,1,1,1,1)) -- 13 -> 13 15 | features:add(nn.ReLU(true)) 16 | features:add(nn.SpatialMaxPooling(3,3,2,2)) -- 13 -> 6 17 | 18 | --features:cuda() 19 | --features = makeDataParallel(features, nGPU) -- defined in util.lua 20 | 21 | local classifier = nn.Sequential() 22 | classifier:add(nn.View(256*6*6)) 23 | 24 | classifier:add(nn.Dropout(0.5)) 25 | classifier:add(nn.Linear(256*6*6, 4096)) 26 | classifier:add(nn.ReLU()) 27 | 28 | classifier:add(nn.Dropout(0.5)) 29 | classifier:add(nn.Linear(4096, 4096)) 30 | classifier:add(nn.ReLU()) 31 | 32 | classifier:add(nn.Linear(4096, 1000)) 33 | classifier:add(nn.LogSoftMax()) 34 | 35 | --classifier:cuda() 36 | 37 | local model = nn.Sequential():add(features):add(classifier) 38 | model.imageSize = 256 39 | model.imageCrop = 224 40 | 41 | return model 42 | -------------------------------------------------------------------------------- /models/googlenet.lua: -------------------------------------------------------------------------------- 1 | -- adapted from nagadomi's CIFAR attempt: https://github.com/nagadomi/kaggle-cifar10-torch7/blob/cuda-convnet2/inception_model.lua 2 | 3 | -- Adapted and taken from Soumith's convnet-benchmarks repo: 4 | -- https://github.com/soumith/convnet-benchmarks 5 | 6 | --The MIT License (MIT) 7 | -- 8 | --Copyright (c) 2016 Soumith Chintala 9 | -- 10 | 
--Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 11 | -- 12 | --The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 13 | -- 14 | --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | 16 | local function inception(depth_dim, input_size, config) 17 | local SpatialConvolution = nn.SpatialConvolution 18 | local SpatialMaxPooling = nn.SpatialMaxPooling 19 | local ReLU = nn.ReLU 20 | 21 | local depth_concat = nn.Concat(depth_dim) 22 | local conv1 = nn.Sequential() 23 | conv1:add(SpatialConvolution(input_size, config[1][1], 1, 1)):add(ReLU(true)) 24 | depth_concat:add(conv1) 25 | 26 | local conv3 = nn.Sequential() 27 | conv3:add(SpatialConvolution(input_size, config[2][1], 1, 1)):add(ReLU(true)) 28 | conv3:add(SpatialConvolution(config[2][1], config[2][2], 3, 3, 1, 1, 1, 1)):add(ReLU(true)) 29 | depth_concat:add(conv3) 30 | 31 | local conv5 = nn.Sequential() 32 | conv5:add(SpatialConvolution(input_size, config[3][1], 1, 1)):add(ReLU(true)) 33 | conv5:add(SpatialConvolution(config[3][1], config[3][2], 5, 5, 1, 1, 2, 2)):add(ReLU(true)) 34 | depth_concat:add(conv5) 35 | 36 | local pool = nn.Sequential() 37 | pool:add(SpatialMaxPooling(config[4][1], config[4][1], 1, 1, 1, 1)) 38 | pool:add(SpatialConvolution(input_size, config[4][2], 1, 1)):add(ReLU(true)) 39 | depth_concat:add(pool) 40 | 41 | return depth_concat 42 | end 43 | 44 | local SpatialConvolution = nn.SpatialConvolution 45 | local SpatialMaxPooling = nn.SpatialMaxPooling 46 | local SpatialAveragePooling = nn.SpatialAveragePooling 47 | local ReLU = nn.ReLU 48 | local model = nn.Sequential() 49 | model:add(SpatialConvolution(3,64,7,7,2,2,3,3)):add(ReLU(true)) 50 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 51 | -- LRN (not added for now) 52 | model:add(SpatialConvolution(64,64,1,1,1,1,0,0)):add(ReLU(true)) 53 | model:add(SpatialConvolution(64,192,3,3,1,1,1,1)):add(ReLU(true)) 54 | -- LRN (not added for now) 55 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 56 | model:add(inception(2, 192, {{ 64}, { 96,128}, {16, 32}, {3, 32}})) -- 256 57 | model:add(inception(2, 256, {{128}, {128,192}, {32, 96}, {3, 64}})) -- 480 58 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 59 | model:add(inception(2, 480, 
{{192}, { 96,208}, {16, 48}, {3, 64}})) -- 4(a) 60 | model:add(inception(2, 512, {{160}, {112,224}, {24, 64}, {3, 64}})) -- 4(b) 61 | model:add(inception(2, 512, {{128}, {128,256}, {24, 64}, {3, 64}})) -- 4(c) 62 | model:add(inception(2, 512, {{112}, {144,288}, {32, 64}, {3, 64}})) -- 4(d) 63 | model:add(inception(2, 528, {{256}, {160,320}, {32,128}, {3,128}})) -- 4(e) (14x14x832) 64 | model:add(SpatialMaxPooling(3,3,2,2,1,1)) 65 | model:add(inception(2, 832, {{256}, {160,320}, {32,128}, {3,128}})) -- 5(a) 66 | model:add(inception(2, 832, {{384}, {192,384}, {48,128}, {3,128}})) -- 5(b) 67 | model:add(SpatialAveragePooling(7,7,1,1)) 68 | model:add(nn.View(1024):setNumInputDims(3)) 69 | ---- model:add(nn.Dropout(0.4)) 70 | model:add(nn.Linear(1024,1000)):add(nn.ReLU(true)) 71 | -- model:add(nn.LogSoftMax()) 72 | model:get(1).gradInput = nil 73 | return model 74 | -------------------------------------------------------------------------------- /models/inception-v3.lua: -------------------------------------------------------------------------------- 1 | 2 | -- Inception-V3 from this paper - 3 | -- https://arxiv.org/pdf/1512.00567v3.pdf 4 | -- and as visualized by http://dgschwend.github.io/netscope/#/preset/inceptionv3 5 | -- Inception uses a 3x299x299 input 6 | 7 | -- This module is from Figure 5 of Inception-V3 paper 8 | local function inception_duplicate() 9 | local SpatialConvolution = nn.SpatialConvolution 10 | local SpatialMaxPooling = nn.SpatialMaxPooling 11 | local SpatialAveragePooling = nn.SpatialAveragePooling 12 | local ReLU = nn.ReLU 13 | 14 | local depth_concat = nn.Concat(2) 15 | local path1 = nn.Sequential() 16 | path1:add(SpatialConvolution(288,64,1,1,1,1,0,0)):add(ReLU(true)) 17 | path1:add(SpatialConvolution(64,96,3,3,1,1,1,1)):add(ReLU(true)) 18 | path1:add(SpatialConvolution(96,96,3,3,1,1,1,1)):add(ReLU(true)) 19 | depth_concat:add(path1) 20 | 21 | local path2 = nn.Sequential() 22 | path2:add(SpatialConvolution(288,48,1,1,1,1,0,0)):add(ReLU(true)) 
23 | path2:add(SpatialConvolution(48,64,3,3,1,1,1,1)):add(ReLU(true)) 24 | depth_concat:add(path2) 25 | 26 | local path3 = nn.Sequential() 27 | path3:add(SpatialAveragePooling(3,3,1,1,1,1)) 28 | path3:add(SpatialConvolution(288,64,1,1,1,1,0,0)) 29 | depth_concat:add(path3) 30 | 31 | local path4 = nn.Sequential() 32 | path4:add(SpatialConvolution(288,64,1,1,1,1,0,0)):add(ReLU(true)) 33 | depth_concat:add(path4) 34 | 35 | return depth_concat 36 | end 37 | 38 | -- This module is from Figure 6 of Inception-V3 paper 39 | local function inception_asymmetric() 40 | local SpatialConvolution = nn.SpatialConvolution 41 | local SpatialMaxPooling = nn.SpatialMaxPooling 42 | local SpatialAveragePooling = nn.SpatialAveragePooling 43 | local ReLU = nn.ReLU 44 | 45 | local depth_concat = nn.Concat(2) 46 | local path1 = nn.Sequential() 47 | path1:add(SpatialConvolution(768,128,1,1,1,1,0,0)):add(ReLU(true)) 48 | path1:add(SpatialConvolution(128,128,1,7,1,1,0,3)):add(ReLU(true)) 49 | path1:add(SpatialConvolution(128,128,7,1,1,1,3,0)):add(ReLU(true)) 50 | path1:add(SpatialConvolution(128,128,1,7,1,1,0,3)):add(ReLU(true)) 51 | path1:add(SpatialConvolution(128,192,7,1,1,1,3,0)):add(ReLU(true)) 52 | depth_concat:add(path1) 53 | 54 | local path2 = nn.Sequential() 55 | path2:add(SpatialConvolution(768,128,1,1,1,1,0,0)):add(ReLU(true)) 56 | path2:add(SpatialConvolution(128,128,1,7,1,1,0,3)):add(ReLU(true)) 57 | path2:add(SpatialConvolution(128,192,7,1,1,1,3,0)):add(ReLU(true)) 58 | depth_concat:add(path2) 59 | 60 | local path3 = nn.Sequential() 61 | path3:add(SpatialAveragePooling(3,3,1,1,1,1)) 62 | path3:add(SpatialConvolution(768,192,1,1,1,1,0,0)):add(ReLU(true)) 63 | depth_concat:add(path3) 64 | 65 | local path4 = nn.Sequential() 66 | path4:add(SpatialConvolution(768,192,1,1,1,1,0,0)):add(ReLU(true)) 67 | depth_concat:add(path4) 68 | 69 | return depth_concat 70 | 71 | end 72 | 73 | -- This and expanded2 are from Figure 7 of Inception-V3 paper 74 | local function 
inception_asymmetric_expanded1() 75 | local SpatialConvolution = nn.SpatialConvolution 76 | local SpatialMaxPooling = nn.SpatialMaxPooling 77 | local SpatialAveragePooling = nn.SpatialAveragePooling 78 | local ReLU = nn.ReLU 79 | 80 | local depth_concat = nn.Concat(2) 81 | local path1 = nn.Sequential() 82 | path1:add(SpatialConvolution(1280,448,1,1,1,1,0,0)):add(ReLU(true)) 83 | path1:add(SpatialConvolution(448,384,3,3,1,1,1,1)):add(ReLU(true)) 84 | local path1_depth_concat = nn.Concat(2) 85 | local path1_1 = nn.Sequential() 86 | path1_1:add(SpatialConvolution(384,384,1,3,1,1,0,1)):add(ReLU(true)) 87 | path1_depth_concat:add(path1_1) 88 | local path1_2 = nn.Sequential() 89 | path1_2:add(SpatialConvolution(384,384,3,1,1,1,1,0)):add(ReLU(true)) 90 | path1_depth_concat:add(path1_2) 91 | path1:add(path1_depth_concat) 92 | depth_concat:add(path1) 93 | 94 | local path2 = nn.Sequential() 95 | path2:add(SpatialConvolution(1280,384,1,1,1,1,0,0)):add(ReLU(true)) 96 | local path2_depth_concat = nn.Concat(2) 97 | local path2_1 = nn.Sequential() 98 | path2_1:add(SpatialConvolution(384,384,1,3,1,1,0,1)):add(ReLU(true)) 99 | path2_depth_concat:add(path2_1) 100 | local path2_2 = nn.Sequential() 101 | path2_2:add(SpatialConvolution(384,384,3,1,1,1,1,0)):add(ReLU(true)) 102 | path2_depth_concat:add(path2_2) 103 | path2:add(path2_depth_concat) 104 | depth_concat:add(path2) 105 | 106 | local path3 = nn.Sequential() 107 | path3:add(SpatialAveragePooling(3,3,1,1,1,1)) 108 | path3:add(SpatialConvolution(1280,192,1,1,1,1,0,0)):add(ReLU(true)) 109 | depth_concat:add(path3) 110 | 111 | local path4 = nn.Sequential() 112 | path4:add(SpatialConvolution(1280,320,1,1,1,1,0,0)):add(ReLU(true)) 113 | depth_concat:add(path4) 114 | 115 | return depth_concat 116 | end 117 | 118 | local function inception_asymmetric_expanded2() 119 | local SpatialConvolution = nn.SpatialConvolution 120 | local SpatialMaxPooling = nn.SpatialMaxPooling 121 | local SpatialAveragePooling = nn.SpatialAveragePooling 122 | 
local ReLU = nn.ReLU 123 | 124 | local depth_concat = nn.Concat(2) 125 | local path1 = nn.Sequential() 126 | path1:add(SpatialConvolution(2048,448,1,1,1,1,0,0)):add(ReLU(true)) 127 | path1:add(SpatialConvolution(448,384,3,3,1,1,1,1)):add(ReLU(true)) 128 | local path1_depth_concat = nn.Concat(2) 129 | local path1_1 = nn.Sequential() 130 | path1_1:add(SpatialConvolution(384,384,1,3,1,1,0,1)):add(ReLU(true)) 131 | path1_depth_concat:add(path1_1) 132 | local path1_2 = nn.Sequential() 133 | path1_2:add(SpatialConvolution(384,384,3,1,1,1,1,0)):add(ReLU(true)) 134 | path1_depth_concat:add(path1_2) 135 | path1:add(path1_depth_concat) 136 | depth_concat:add(path1) 137 | 138 | local path2 = nn.Sequential() 139 | path2:add(SpatialConvolution(2048,384,1,1,1,1,0,0)):add(ReLU(true)) 140 | local path2_depth_concat = nn.Concat(2) 141 | local path2_1 = nn.Sequential() 142 | path2_1:add(SpatialConvolution(384,384,1,3,1,1,0,1)):add(ReLU(true)) 143 | path2_depth_concat:add(path2_1) 144 | local path2_2 = nn.Sequential() 145 | path2_2:add(SpatialConvolution(384,384,3,1,1,1,1,0)):add(ReLU(true)) 146 | path2_depth_concat:add(path2_2) 147 | path2:add(path2_depth_concat) 148 | depth_concat:add(path2) 149 | 150 | local path3 = nn.Sequential() 151 | path3:add(SpatialAveragePooling(3,3,1,1,1,1)) 152 | path3:add(SpatialConvolution(2048,192,1,1,1,1,0,0)):add(ReLU(true)) 153 | depth_concat:add(path3) 154 | 155 | local path4 = nn.Sequential() 156 | path4:add(SpatialConvolution(2048,320,1,1,1,1,0,0)):add(ReLU(true)) 157 | depth_concat:add(path4) 158 | 159 | return depth_concat 160 | end 161 | 162 | 163 | local function inception_grid_reduce(n_input_maps, n_output_maps) 164 | local SpatialConvolution = nn.SpatialConvolution 165 | local SpatialMaxPooling = nn.SpatialMaxPooling 166 | local ReLU = nn.ReLU 167 | 168 | local conv_output_maps = (n_output_maps - n_input_maps)/2 169 | 170 | local depth_concat = nn.Concat(2) 171 | 172 | local path1 = nn.Sequential() 173 | 
path1:add(SpatialConvolution(n_input_maps,conv_output_maps,1,1,1,1,0,0)):add(ReLU(true)) 174 | path1:add(SpatialConvolution(conv_output_maps,conv_output_maps,3,3,1,1,1,1)):add(ReLU(true)) 175 | path1:add(SpatialConvolution(conv_output_maps,conv_output_maps,3,3,2,2,0,0)):add(ReLU(true)) 176 | depth_concat:add(path1) 177 | 178 | local path2 = nn.Sequential() 179 | path2:add(SpatialConvolution(n_input_maps,conv_output_maps,1,1,1,1,0,0)):add(ReLU(true)) 180 | path2:add(SpatialConvolution(conv_output_maps,conv_output_maps,3,3,2,2,0,0)):add(ReLU(true)) 181 | depth_concat:add(path2) 182 | 183 | local path3 = nn.Sequential() 184 | path3:add(SpatialMaxPooling(3,3,2,2,0,0)) 185 | depth_concat:add(path3) 186 | 187 | return depth_concat 188 | end 189 | 190 | local SpatialConvolution = nn.SpatialConvolution 191 | local SpatialMaxPooling = nn.SpatialMaxPooling 192 | local SpatialAveragePooling = nn.SpatialAveragePooling 193 | local ReLU = nn.ReLU 194 | 195 | local model = nn.Sequential() 196 | -- Begin Inception "stem" 197 | model:add(SpatialConvolution(3,32,3,3,2,2,0,0)):add(ReLU(true)) 198 | model:add(SpatialConvolution(32,32,3,3,1,1,0,0)):add(ReLU(true)) 199 | model:add(SpatialConvolution(32,64,3,3,1,1,1,1)):add(ReLU(true)) 200 | model:add(SpatialMaxPooling(3,3,2,2,0,0)) 201 | model:add(SpatialConvolution(64,80,3,3,1,1,0,0)):add(ReLU(true)) 202 | model:add(SpatialConvolution(80,192,3,3,2,2,0,0)):add(ReLU(true)) 203 | model:add(SpatialConvolution(192,288,3,3,1,1,1,1)):add(ReLU(true)) 204 | 205 | model:add(inception_duplicate()) 206 | model:add(inception_duplicate()) 207 | model:add(inception_duplicate()) 208 | model:add(inception_grid_reduce(288,768)) 209 | 210 | model:add(inception_asymmetric()) 211 | model:add(inception_asymmetric()) 212 | model:add(inception_asymmetric()) 213 | model:add(inception_asymmetric()) 214 | model:add(inception_asymmetric()) 215 | model:add(inception_grid_reduce(768,1280)) 216 | 217 | model:add(inception_asymmetric_expanded1()) 218 | 
model:add(inception_asymmetric_expanded2()) 219 | model:add(SpatialAveragePooling(8,8,1,1,0,0)) 220 | 221 | model:add(nn.View(1024):setNumInputDims(3)) 222 | model:add(nn.Linear(1024,1000)):add(nn.ReLU(true)) 223 | 224 | model:get(1).gradInput = nil 225 | 226 | return model 227 | -------------------------------------------------------------------------------- /models/resnet.lua: -------------------------------------------------------------------------------- 1 | 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- The ResNet model definition 10 | -- 11 | 12 | local depth = 18 -- Imagenet: 18, 34, 50, 101, 152 | Cifar: 20, 32, 44, 56, 110, 1202 13 | local shortcutType = 'B' -- B/C 14 | local dataset = 'imagenet' -- imagenet/cifar10 15 | local iChannels 16 | 17 | local nn = require 'nn' 18 | --require 'cunn' 19 | 20 | local Convolution = nn.SpatialConvolution 21 | local Avg = nn.SpatialAveragePooling 22 | local ReLU = nn.ReLU 23 | local Max = nn.SpatialMaxPooling 24 | local SBatchNorm = nn.SpatialBatchNormalization 25 | 26 | -- The shortcut layer is either identity or 1x1 convolution 27 | local function shortcut(nInputPlane, nOutputPlane, stride) 28 | local useConv = shortcutType == 'C' or 29 | (shortcutType == 'B' and nInputPlane ~= nOutputPlane) 30 | if useConv then 31 | -- 1x1 convolution 32 | return nn.Sequential() 33 | :add(Convolution(nInputPlane, nOutputPlane, 1, 1, stride, stride)) 34 | :add(SBatchNorm(nOutputPlane)) 35 | elseif nInputPlane ~= nOutputPlane then 36 | -- Strided, zero-padded identity shortcut 37 | return nn.Sequential() 38 | :add(nn.SpatialAveragePooling(1, 1, stride, stride)) 39 | :add(nn.Concat(2) 40 | :add(nn.Identity()) 41 | :add(nn.MulConstant(0))) 42 | else 43 | 
return nn.Identity() 44 | end 45 | end 46 | 47 | -- The basic residual layer block for 18 and 34 layer network, and the 48 | -- CIFAR networks 49 | local function basicblock(n, stride) 50 | local nInputPlane = iChannels 51 | iChannels = n 52 | 53 | local s = nn.Sequential() 54 | s:add(Convolution(nInputPlane,n,3,3,stride,stride,1,1)) 55 | s:add(SBatchNorm(n)) 56 | s:add(ReLU(true)) 57 | s:add(Convolution(n,n,3,3,1,1,1,1)) 58 | s:add(SBatchNorm(n)) 59 | 60 | return nn.Sequential() 61 | :add(nn.ConcatTable() 62 | :add(s) 63 | :add(shortcut(nInputPlane, n, stride))) 64 | :add(nn.CAddTable(true)) 65 | :add(ReLU(true)) 66 | end 67 | 68 | -- The bottleneck residual layer for 50, 101, and 152 layer networks 69 | local function bottleneck(n, stride) 70 | local nInputPlane = iChannels 71 | iChannels = n * 4 72 | 73 | local s = nn.Sequential() 74 | s:add(Convolution(nInputPlane,n,1,1,1,1,0,0)) 75 | s:add(SBatchNorm(n)) 76 | s:add(ReLU(true)) 77 | s:add(Convolution(n,n,3,3,stride,stride,1,1)) 78 | s:add(SBatchNorm(n)) 79 | s:add(ReLU(true)) 80 | s:add(Convolution(n,n*4,1,1,1,1,0,0)) 81 | s:add(SBatchNorm(n * 4)) 82 | 83 | return nn.Sequential() 84 | :add(nn.ConcatTable() 85 | :add(s) 86 | :add(shortcut(nInputPlane, n * 4, stride))) 87 | :add(nn.CAddTable(true)) 88 | :add(ReLU(true)) 89 | end 90 | 91 | -- Creates count residual blocks with specified number of features 92 | local function layer(block, features, count, stride) 93 | local s = nn.Sequential() 94 | for i=1,count do 95 | s:add(block(features, i == 1 and stride or 1)) 96 | end 97 | return s 98 | end 99 | 100 | local model = nn.Sequential() 101 | if dataset == 'imagenet' then 102 | -- Configurations for ResNet: 103 | -- num. 
residual blocks, num features, residual block function 104 | local cfg = { 105 | [18] = {{2, 2, 2, 2}, 512, basicblock}, 106 | [34] = {{3, 4, 6, 3}, 512, basicblock}, 107 | [50] = {{3, 4, 6, 3}, 2048, bottleneck}, 108 | [101] = {{3, 4, 23, 3}, 2048, bottleneck}, 109 | [152] = {{3, 8, 36, 3}, 2048, bottleneck}, 110 | } 111 | 112 | assert(cfg[depth], 'Invalid depth: ' .. tostring(depth)) 113 | local def, nFeatures, block = table.unpack(cfg[depth]) 114 | iChannels = 64 115 | print('ResNet-' .. depth .. ' ImageNet') 116 | 117 | -- The ResNet ImageNet model 118 | model:add(Convolution(3,64,7,7,2,2,3,3)) 119 | model:add(SBatchNorm(64)) 120 | model:add(ReLU(true)) 121 | model:add(Max(3,3,2,2,1,1)) 122 | model:add(layer(block, 64, def[1])) 123 | model:add(layer(block, 128, def[2], 2)) 124 | model:add(layer(block, 256, def[3], 2)) 125 | model:add(layer(block, 512, def[4], 2)) 126 | model:add(Avg(7, 7, 1, 1)) 127 | model:add(nn.View(nFeatures):setNumInputDims(3)) 128 | model:add(nn.Linear(nFeatures, 1000)) 129 | elseif dataset == 'cifar10' then 130 | -- Model type specifies number of layers for CIFAR-10 model 131 | assert((depth - 2) % 6 == 0, 'depth should be one of 20, 32, 44, 56, 110, 1202') 132 | local n = (depth - 2) / 6 133 | iChannels = 16 134 | print('ResNet-' .. depth .. ' CIFAR-10') 135 | 136 | -- The ResNet CIFAR-10 model 137 | model:add(Convolution(3,16,3,3,1,1,1,1)) 138 | model:add(SBatchNorm(16)) 139 | model:add(ReLU(true)) 140 | model:add(layer(basicblock, 16, n)) 141 | model:add(layer(basicblock, 32, n, 2)) 142 | model:add(layer(basicblock, 64, n, 2)) 143 | model:add(Avg(8, 8, 1, 1)) 144 | model:add(nn.View(64):setNumInputDims(3)) 145 | model:add(nn.Linear(64, 10)) 146 | else 147 | error('invalid dataset: ' .. 
dataset) 148 | end 149 | 150 | local function ConvInit(name) 151 | for k,v in pairs(model:findModules(name)) do 152 | local n = v.kW*v.kH*v.nOutputPlane 153 | v.weight:normal(0,math.sqrt(2/n)) 154 | --if cudnn.version >= 4000 then 155 | -- v.bias = nil 156 | -- v.gradBias = nil 157 | --else 158 | -- v.bias:zero() 159 | --end 160 | end 161 | end 162 | local function BNInit(name) 163 | for k,v in pairs(model:findModules(name)) do 164 | v.weight:fill(1) 165 | v.bias:zero() 166 | end 167 | end 168 | 169 | ConvInit('nn.SpatialConvolution') 170 | ConvInit('nn.SpatialConvolution') 171 | BNInit('fbnn.SpatialBatchNormalization') 172 | BNInit('nn.SpatialBatchNormalization') 173 | BNInit('nn.SpatialBatchNormalization') 174 | for k,v in pairs(model:findModules('nn.Linear')) do 175 | v.bias:zero() 176 | end 177 | --model:cuda() 178 | 179 | -- if opt.cudnn == 'deterministic' then 180 | -- model:apply(function(m) 181 | -- if m.setMode then m:setMode(1,1,1) end 182 | -- end) 183 | -- end 184 | 185 | model:get(1).gradInput = nil 186 | 187 | return model 188 | -------------------------------------------------------------------------------- /models/vgg.lua: -------------------------------------------------------------------------------- 1 | local classes = 1000 2 | local modelType = 'A' -- on a titan black, B/D/E run out of memory even for batch-size 32 3 | 4 | -- Create tables describing VGG configurations A, B, D, E 5 | local cfg = {} 6 | if modelType == 'A' then 7 | cfg = {64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'} 8 | elseif modelType == 'B' then 9 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'} 10 | elseif modelType == 'D' then 11 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'} 12 | elseif modelType == 'E' then 13 | cfg = {64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'} 14 | else 15 | error('Unknown model type: ' .. 
modelType .. ' | Please specify a modelType A or B or D or E') 16 | end 17 | 18 | local features = nn.Sequential() 19 | do 20 | local iChannels = 3; 21 | for k,v in ipairs(cfg) do 22 | if v == 'M' then 23 | features:add(nn.SpatialMaxPooling(2,2,2,2)) 24 | else 25 | local oChannels = v; 26 | local conv3 = nn.SpatialConvolution(iChannels,oChannels,3,3,1,1,1,1); 27 | features:add(conv3) 28 | features:add(nn.ReLU(true)) 29 | iChannels = oChannels; 30 | end 31 | end 32 | end 33 | 34 | --features:cuda() 35 | --features = makeDataParallel(features, nGPU) -- defined in util.lua 36 | 37 | local classifier = nn.Sequential() 38 | classifier:add(nn.View(512*7*7)) 39 | classifier:add(nn.Linear(512*7*7, 4096)) 40 | classifier:add(nn.Threshold(0, 1e-6)) 41 | --classifier:add(nn.BatchNormalization(4096, 1e-3)) 42 | classifier:add(nn.Dropout(0.5)) 43 | classifier:add(nn.Linear(4096, 4096)) 44 | classifier:add(nn.Threshold(0, 1e-6)) 45 | --classifier:add(nn.BatchNormalization(4096, 1e-3)) 46 | classifier:add(nn.Dropout(0.5)) 47 | classifier:add(nn.Linear(4096, classes)) 48 | classifier:add(nn.LogSoftMax()) 49 | --classifier:cuda() 50 | 51 | local model = nn.Sequential() 52 | model:add(features):add(classifier) 53 | 54 | return model 55 | -------------------------------------------------------------------------------- /opts.lua: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- Contains options required by run.lua 3 | -- 4 | -- Written by: Abhishek Chaurasia 5 | -- Dated: 6th June, 2016 6 | -------------------------------------------------------------------------------- 7 | 8 | local opts = {} 9 | 10 | lapp = require 'pl.lapp' 11 | function opts.parse(arg) 12 | local opt = lapp [[ 13 | 14 | Command line options: 15 | -m, --model (default '') Path & filename of network model to profile 16 | -p, --platform (default cpu) Select profiling platform (cpu|cuda) 17 | -r, 
--res (default 1x3x231x231) Input image resolution Channel x Width x Height 18 | -e, --eye (default 0) Network eye 19 | -i, --iter (default 1000) Averaging iterations 20 | -s, --save (default -) Save the float model to file as in 21 | [a]scii or as in [b]inary format (a|b) 22 | --verbose (default detail) detail/medium/compact 23 | --MACs Use multiply-add when counting ops 24 | 25 | Example: 26 | th profile-model.lua --model <'path/filename.lua'> --res 1x3x231x231 27 | 28 | ]] 29 | 30 | return opt 31 | end 32 | 33 | return opts 34 | -------------------------------------------------------------------------------- /profile-model.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'xlua' 3 | require 'sys' 4 | 5 | local lapp = assert(require('pl.lapp')) 6 | local opts = assert(require('opts')) 7 | local profileTime = assert(require('src/modelTimer.lua')) 8 | 9 | -- to load a 64-bit model in binary, we override torch.DiskFile if 32bit machine (ARM): 10 | local systembit = tonumber(io.popen("getconf LONG_BIT"):read('*a')) 11 | if systembit == 32 then 12 | require('libbincompat') 13 | end 14 | 15 | local pf = function(...) print(string.format(...)) end 16 | local r = sys.COLORS.red 17 | local g = sys.COLORS.green 18 | local b = sys.COLORS.blue 19 | local n = sys.COLORS.none 20 | local THIS = sys.COLORS.blue .. 'THIS' .. 
n 21 | 22 | -- Parsing input arguemnets 23 | opt = opts.parse(arg) 24 | if opt.platform == 'cuda' then 25 | require 'cunn' 26 | require 'cudnn' 27 | end 28 | 29 | torch.setdefaulttensortype('torch.FloatTensor') 30 | 31 | paths.dofile('src/profiler.lua') 32 | -- Loading model 33 | if string.find(opt.model, '.lua', #opt.model-4) then 34 | model = { channel = 3, name = opt.model } 35 | pf('Building %s model \n', r..model.name..n) 36 | net = require (opt.model) 37 | elseif string.find(opt.model, '.net', #opt.model-4) then 38 | model = { channel = 3, name = 'Trained binary network' } 39 | pf('Loading %s model from binary file...\n', r..model.name..n) 40 | net = torch.load(opt.model) 41 | elseif string.find(opt.model, '.net.ascii', #opt.model-10) then 42 | model = { channel = 3, name = 'Trained ascii network' } 43 | pf('Loading %s model from ascii file...\n', r..model.name..n) 44 | net = torch.load(opt.model, 'ascii') 45 | else 46 | error('Network named not recognized') 47 | end 48 | 49 | if net:type() == 'torch.CudaTensor' then 50 | cudnn.convert(net, nn) 51 | net:float() 52 | end 53 | 54 | net:evaluate() 55 | net:clearState() 56 | 57 | local iBatch, iChannel, iWidth, iHeight = string.match(opt.res, '(%d+)x(%d+)x(%d+)x(%d+)') 58 | -- or string.match(opt.res, '(%d+)X(%d+)X(%d+)') 59 | 60 | iBatch = tonumber(iBatch) 61 | iChannel = tonumber(iChannel) 62 | iWidth = tonumber(iWidth) 63 | iHeight = tonumber(iHeight) 64 | 65 | if iChannel ~= 0 then 66 | model.channel = iChannel 67 | end 68 | 69 | local batch = (iBatch ~= 0) and iBatch 70 | local width = (iWidth ~= 0) and iWidth 71 | local height = (iHeight ~= 0) and iHeight or width 72 | 73 | imgBatch = torch.FloatTensor(batch, model.channel, height, width) 74 | 75 | if opt.save == 'a' then 76 | pf('Saving model as model.net.ascii... ') 77 | torch.save('model.net.ascii', net, 'ascii') 78 | pf('Done.\n') 79 | elseif opt.save == 'b' then 80 | pf('Saving model as model.net... 
') 81 | torch.save('model.net', net) 82 | pf('Done.\n') 83 | end 84 | 85 | -- calculate the number of operations performed by the network 86 | if not model.def then 87 | totalOps, layerOps = count_ops(net:clone(), imgBatch) 88 | else 89 | totalOps, layerOps = count_ops(model.def, imgBatch) 90 | end 91 | 92 | pf('Operations estimation for image size: %d x %d', width, height) 93 | 94 | local function detailedPrint(...) 95 | if opt.verbose == 'detail' or opt.verbose == 'medium' then 96 | pf(...) 97 | end 98 | end 99 | 100 | -- Compute per layer opt counts 101 | detailedPrint('\n-----------------------------------------------------------------------------------------------') 102 | detailedPrint('%5s %-29s %20s %11s %15s %9s', 'S.No.', 'Module Name', 'Input Resolution', 'Neurons', 'Ops', '% Ops') 103 | detailedPrint('===============================================================================================') 104 | local opsPerCommonModule = {} 105 | local totalNeurons = 0 106 | for i, info in pairs(layerOps) do 107 | local name = info['name'] 108 | local ops = info['ops'] 109 | local maps = info['maps'] 110 | local neurons = info['neurons'] 111 | if not opsPerCommonModule[name] then 112 | opsPerCommonModule[name] = 0 113 | end 114 | local percOps = (ops/totalOps)*100 115 | if percOps > 1 then 116 | percOps = string.format('%s%9.4f%s', b, percOps, n) 117 | else 118 | percOps = string.format('%9.4f', percOps) 119 | end 120 | if opt.verbose == 'medium' and (ops/totalOps)*100 > 0 then 121 | pf('%5d %s%-29s%s %s%20s%s %11s %s%15s%s %9s', i, g, name, n, r, maps, n, neurons, r, ops, n, percOps) 122 | elseif opt.verbose == 'detail' then 123 | pf('%5d %s%-29s%s %s%20s%s %11s %s%15s%s %9s', i, g, name, n, r, maps, n, neurons, r, ops, n, percOps) 124 | end 125 | totalNeurons = totalNeurons + neurons 126 | opsPerCommonModule[name] = opsPerCommonModule[name] + ops 127 | end 128 | 129 | 
print('-----------------------------------------------------------------------------------------------') 130 | pf(' %s%s%s : %d ', r, 'Total number of trainable parameters', n, net:getParameters():size(1)) 131 | pf(' %s%-36s%s : %d', r, 'Total number of neurons', n, totalNeurons) 132 | print('-----------------------------------------------------------------------------------------------') 133 | print('* Operations per common module *') 134 | -- Print total 135 | local ops = opt.MACs and 'MACs' or 'Ops' 136 | for name, count in pairs(opsPerCommonModule) do 137 | if count > 0 then 138 | print(string.format(' + %-35s: %.4e %s', name, count, ops)) 139 | end 140 | end 141 | pf(' %s%-35s: %.4e %s', b, 'Total', totalOps, ops) 142 | print('===============================================================================================') 143 | 144 | -- time and average over a number of iterations 145 | pf('Profiling %s, %d iterations', r..model.name..n, opt.iter) 146 | net:evaluate() 147 | net:clearState() 148 | time = profileTime:time(net, imgBatch, opt.iter, opt.platform) 149 | 150 | local d = g..'CPU'..n 151 | if 'cuda' == opt.platform then 152 | d = g..'GPU'..n 153 | end 154 | 155 | pf(' Forward average time on %s %s : %.2f ms', THIS, d, time.total * 1e3) 156 | pf(' Performance for %s %s : %.2f G-Ops/s\n', THIS, d, totalOps * 1e-9 / time.total) 157 | -------------------------------------------------------------------------------- /src/modelTimer.lua: -------------------------------------------------------------------------------- 1 | local profileTime = {} 2 | local xlua = assert(require('xlua')) 3 | 4 | local function calc_time_cuda(net, img, iterations) 5 | collectgarbage() 6 | 7 | cutorch.setDevice(1) 8 | cudnn.convert(net, cudnn, function(m) return torch.type(m):find('MaxPooling') end) 9 | net:cuda() 10 | 11 | print('==> using GPU #' .. 
cutorch.getDevice()) 12 | cutorch.synchronize() 13 | 14 | local tmp = false 15 | local timer = torch.Timer() 16 | local timing = torch.FloatTensor(iterations) 17 | local t = 0 18 | 19 | -- iterations plus one to prime the jit 20 | for i=1, (iterations+1) do 21 | xlua.progress(i, iterations) 22 | 23 | timer:reset() 24 | 25 | tmp = net:forward(img:cuda()) 26 | cutorch.synchronize() 27 | tmp:float() 28 | 29 | t = timer:time().real 30 | timing[(i%iterations)+1] = t 31 | end 32 | 33 | return timing:mean(), tmp 34 | end 35 | 36 | local function calc_time_cpu(net, img, iterations) 37 | local tmp = false 38 | local timer = torch.Timer() 39 | local timing = torch.FloatTensor(iterations) 40 | local t = 0 41 | 42 | -- iterations plus one to prime the jit 43 | for i=1, (iterations+1) do 44 | xlua.progress(i, iterations) 45 | 46 | timer:reset() 47 | 48 | tmp = net:forward(img) 49 | 50 | t = timer:time().real 51 | timing[(i%iterations)+1] = t 52 | end 53 | 54 | return timing:mean(), tmp 55 | end 56 | 57 | function profileTime:time(net, img, iterations, platform) 58 | iterations = iterations or 10 59 | local time = { total = 0, conv = 0, mlp = 0, } 60 | 61 | if platform == 'cuda' then 62 | 63 | time.total = calc_time_cuda(net, img, iterations) 64 | 65 | else 66 | 67 | time.total = calc_time_cpu(net, img, iterations) 68 | 69 | end 70 | 71 | return time 72 | end 73 | 74 | return profileTime 75 | -------------------------------------------------------------------------------- /src/profiler.lua: -------------------------------------------------------------------------------- 1 | local op_count 2 | local op_used 3 | local multiply_adds = opt.MACs 4 | 5 | function count_ops(network, input) 6 | op_count = 0 7 | op_used = {} 8 | network:apply(intercept_updateOutput) 9 | inputImg = input[1] 10 | network:forward(input) 11 | network:apply(restore_updateOutput) 12 | return op_count, op_used 13 | end 14 | 15 | -- Intercept updateOutput. At each call increment op_count appropriately. 
16 | function intercept_updateOutput(module) 17 | module.updateOutput_original = module.updateOutput 18 | module.updateOutput = function(self, input) 19 | compute_ops(module, input) 20 | return module:updateOutput_original(input) 21 | end 22 | end 23 | 24 | -- Restore original network behaviour 25 | function restore_updateOutput(module) 26 | assert(module.updateOutput_original, 27 | "restore_updateOutput should be called after intercept_updateOutput!") 28 | module.updateOutput = module.updateOutput_original 29 | module.updateOutput_original = nil 30 | end 31 | 32 | -- Compute #flops that specified module needs to process an input. 33 | -- module_handlers table is at the bottom of this file 34 | function compute_ops(module, input) 35 | local module_name = torch.type(module) --[[ FIX: was an accidental global ]] 36 | local handler = module_handlers[module_name] --[[ FIX: was an accidental global ]] 37 | assert(handler, string.format("No handler for module %s!", module_name)) 38 | local ops = handler(module, input) 39 | op_count = op_count + ops 40 | local maps = 0 41 | local neurons = 0 42 | if torch.type(module) ~= 'nn.JoinTable' and torch.type(module) ~= 'nn.CAddTable' then 43 | for i = 1, input:dim() do 44 | if i == 1 then 45 | maps = input:size(1) 46 | neurons = input:size(1) 47 | else 48 | maps = maps .. ' x ' .. 
input:size(i) 49 | neurons = neurons * input:size(i) 50 | end 51 | end 52 | else 53 | end 54 | table.insert(op_used, {name = torch.type(module), 55 | ops = ops, 56 | maps = maps, 57 | neurons = neurons}) 58 | end 59 | 60 | -------------------------------------------------------------------------------- 61 | ------------------------------- Module handlers -------------------------------- 62 | -------------------------------------------------------------------------------- 63 | 64 | local function ops_nothing(module, input) 65 | return 0 66 | end 67 | 68 | local function ops_linear(module, input) 69 | local batch_size = input:dim() == 2 and input:size(1) or 1 70 | local weight_ops = module.weight:nElement() * (multiply_adds and 1 or 2) 71 | local bias_ops = module.bias:nElement() 72 | local ops_per_sample = weight_ops + bias_ops 73 | return batch_size * ops_per_sample 74 | end 75 | 76 | local function ops_logsoftmax(module, input) 77 | local batch_size = input:dim() == 2 and input:size(1) or 1 78 | local input_dim = input:dim() == 2 and input:size(2) or input:size(1) 79 | local expminusapprox_ops = 1 -- around 8 in Torch 80 | -- +2 for accumulation and substraction in two loops 81 | local ops_per_elem = expminusapprox_ops + 1 + 1 82 | local ops_per_sample = input_dim * ops_per_elem 83 | return batch_size * ops_per_sample 84 | end 85 | 86 | -- WARNING: an oversimplified version 87 | local function ops_nonlinearity(module, input) 88 | return input:nElement() 89 | end 90 | 91 | local function ops_convolution(module, input) 92 | assert(input:dim() == 4, "ops_convolution supports only batched inputs!") 93 | assert(input:size(2) == module.nInputPlane, "number of input planes doesn't match!") 94 | local batch_size = input:size(1) 95 | local input_planes = input:size(2) 96 | local input_height = input:size(3) 97 | local input_width = input:size(4) 98 | 99 | -- ops per output element 100 | local kernel_ops = module.kH * module.kW * input_planes * (multiply_adds and 1 or 2) 
101 | local bias_ops = 1 102 | local ops_per_element = kernel_ops + bias_ops 103 | 104 | local output_width = math.floor((input_width + 2 * module.padW - module.kW) / module.dW + 1) 105 | local output_height = math.floor((input_height + 2 * module.padH - module.kH) / module.dH + 1) 106 | 107 | return batch_size * module.nOutputPlane * output_width * output_height * ops_per_element 108 | end 109 | 110 | local function ops_fullconvolution(module, input) 111 | assert(input:dim() == 4, "ops_fullconvolution supports only batched inputs!") 112 | assert(input:size(2) == module.nInputPlane, "number of input planes doesn't match!") 113 | local batch_size = input:size(1) 114 | local input_planes = input:size(2) 115 | local input_height = input:size(3) 116 | local input_width = input:size(4) 117 | 118 | -- ops per input element 119 | local single_kernel_ops = module.kH * module.kW * input_planes * (multiply_adds and 1 or 2) 120 | local sample_kernel_ops = module.nOutputPlane * input_width * input_height * single_kernel_ops --[[ FIX: every input location contributes to all nOutputPlane maps; was input_planes, which counted nInputPlane twice ]] 121 | 122 | local output_width = (input_width - 1) * module.dW - 2 * module.padW + module.kW + module.adjW 123 | local output_height = (input_height - 1) * module.dH - 2 * module.padH + module.kH + module.adjH --[[ FIX: was module.padW ]] 124 | local bias_ops = output_width * output_height * module.nOutputPlane 125 | 126 | return batch_size * (sample_kernel_ops + bias_ops) 127 | end 128 | 129 | local function ops_dilatedconvolution(module, input) 130 | assert(input:dim() == 4, "ops_convolution supports only batched inputs!") 131 | assert(input:size(2) == module.nInputPlane, "number of input planes doesn't match!") 132 | local batch_size = input:size(1) 133 | local input_planes = input:size(2) 134 | local input_height = input:size(3) 135 | local input_width = input:size(4) 136 | local dilW = module.dilationW 137 | local dilH = module.dilationH 138 | 139 | -- ops per output element 140 | local kernel_ops = module.kH * module.kW * input_planes * (multiply_adds and 1 or 2) 141 | local bias_ops = 
1 142 | local ops_per_element = kernel_ops + bias_ops 143 | 144 | local output_width = math.floor( 145 | (input_width + 2 * module.padW - dilW * (module.kW - 1) - 1) 146 | /module.dW + 1) --[[ FIX: nn.SpatialDilatedConvolution output size is floor((in + 2*pad - dil*(k-1) - 1)/stride) + 1; term was "+ 1" ]] 147 | local output_height = math.floor( 148 | (input_height + 2 * module.padH - dilH * (module.kH - 1) - 1) 149 | /module.dH + 1) --[[ FIX: same "- 1" correction as width ]] 150 | 151 | return batch_size * module.nOutputPlane * output_width * output_height * ops_per_element 152 | end 153 | 154 | local function ops_pooling(module, input) 155 | assert(input:dim() == 4, "ops_averagepooling supports only batched inputs!") 156 | local batch_size = input:size(1) 157 | local input_planes = input:size(2) 158 | local input_height = input:size(3) 159 | local input_width = input:size(4) 160 | 161 | local kernel_ops = module.kH * module.kW 162 | 163 | local output_width = math.floor((input_width + 2 * module.padW - module.kW) / module.dW + 1) 164 | local output_height = math.floor((input_height + 2 * module.padH - module.kH) / module.dH + 1) 165 | 166 | return batch_size * input_planes * output_width * output_height * kernel_ops 167 | end 168 | 169 | local function ops_unpooling(module, input) 170 | assert(input:dim() == 4, "ops_unpooling supports only batched inputs!") 171 | local batch_size = input:size(1) 172 | local input_planes = input:size(2) 173 | local input_height = input:size(3) 174 | local input_width = input:size(4) 175 | 176 | 177 | local output_width = (input_width - 1) * module.pooling.dW - (2 * module.pooling.padW - module.pooling.kW) 178 | local output_height = (input_height - 1) * module.pooling.dH - (2 * module.pooling.padH - module.pooling.kH) 179 | 180 | return batch_size * input_planes * output_width * output_height 181 | end 182 | 183 | local function ops_caddtable(module, input) 184 | assert(torch.type(input) == 'table', "ops_caddtable input should be a table!") 185 | return input[1]:nElement() * #input 186 | end 187 | 188 | local function ops_batchnorm(module, input) 189 | return input:nElement() * 
(multiply_adds and 1 or 2) 190 | end 191 | 192 | local function ops_sum(module, input) 193 | assert(not module.nInputDims, 'nInputDims mode of nn.Sum not supported.') 194 | local ops = 1 195 | for d = 1, input:dim() do 196 | local s = input:size(d) 197 | ops = d ~= module.dimension and ops * s or ops * (s - 1) 198 | end 199 | return ops 200 | end 201 | 202 | local function ops_mulconstant(module, input) 203 | local ops = 1 204 | for d = 1, input:dim() do 205 | ops = ops * input:size(d) 206 | end 207 | return ops 208 | end 209 | 210 | module_handlers = { 211 | -- Containers 212 | ['nn.Sequential'] = ops_nothing, 213 | ['nn.Parallel'] = ops_nothing, 214 | ['nn.Concat'] = ops_nothing, 215 | ['nn.gModule'] = ops_nothing, 216 | ['nn.Identity'] = ops_nothing, 217 | ['nn.DataParallelTable'] = ops_nothing, 218 | ['nn.Contiguous'] = ops_nothing, 219 | ['nn.ConcatTable'] = ops_nothing, 220 | ['nn.JoinTable'] = ops_nothing, 221 | ['nn.Padding'] = ops_nothing, 222 | 223 | -- Nonlinearities 224 | ['nn.ReLU'] = ops_nonlinearity, 225 | ['nn.PReLU'] = ops_nonlinearity, 226 | ['nn.Threshold'] = ops_nonlinearity, 227 | ['nn.LogSoftMax'] = ops_logsoftmax, 228 | ['nn.SoftMax'] = ops_logsoftmax, --TODO Update it with correct ops calculator 229 | ['cudnn.ReLU'] = ops_nonlinearity, 230 | ['cudnn.PReLU'] = ops_nonlinearity, 231 | 232 | -- Basic modules 233 | ['nn.Linear'] = ops_linear, 234 | ['nn.Sum'] = ops_sum, 235 | ['nn.MulConstant'] = ops_mulconstant, 236 | 237 | -- Spatial Modules 238 | ['nn.SpatialConvolution'] = ops_convolution, 239 | ['nn.SpatialConvolutionMM'] = ops_convolution, 240 | ['nn.SpatialDilatedConvolution'] = ops_dilatedconvolution, 241 | ['nn.SpatialFullConvolution'] = ops_fullconvolution, 242 | ['nn.SpatialMaxPooling'] = ops_pooling, 243 | ['nn.SpatialAveragePooling'] = ops_pooling, 244 | ['nn.SpatialMaxUnpooling'] = ops_unpooling, 245 | ['nn.SpatialZeroPadding'] = ops_nothing, 246 | ['nn.BatchNormalization'] = ops_nothing, -- Can be squashed 247 | 
['nn.SpatialBatchNormalization'] = ops_nothing, -- Can be squashed 248 | 249 | ['cudnn.SpatialConvolution'] = ops_convolution, 250 | ['cudnn.SpatialConvolutionMM'] = ops_convolution, 251 | ['cudnn.SpatialDilatedConvolution'] = ops_dilatedconvolution, 252 | ['cudnn.SpatialMaxPooling'] = ops_pooling, 253 | ['cudnn.SpatialAveragePooling'] = ops_pooling, 254 | ['cudnn.SpatialBatchNormalization'] = ops_nothing, -- Can be squashed 255 | 256 | -- Table modules 257 | ['nn.CAddTable'] = ops_caddtable, 258 | 259 | -- Various modules 260 | ['nn.View'] = ops_nothing, 261 | ['nn.Reshape'] = ops_nothing, 262 | ['nn.Dropout'] = ops_nothing, -- Is turned off in inference 263 | ['nn.SpatialDropout'] = ops_nothing, -- Is turned off in inference 264 | ['nn.Concat'] = ops_nothing, 265 | } 266 | --------------------------------------------------------------------------------