├── .gitignore ├── CMakeLists.txt ├── README.md ├── demos ├── face-detector-parallel.lua ├── face-detector.lua ├── face-detector │ ├── PyramidPacker.lua │ ├── PyramidUnPacker.lua │ ├── blobParser.lua │ └── face.net ├── filter-bank.lua ├── loopback.lua └── loopback_camera.lua ├── etherflow ├── CMakeLists.txt ├── etherflow.c ├── etherflow.h ├── example.c ├── generic │ └── etherflow.c ├── init.c ├── init.lua └── test │ ├── receive.lua │ └── send.lua ├── ethertbsp ├── CMakeLists.txt ├── ethertbsp.c ├── ethertbsp.h ├── example.c ├── generic │ └── ethertbsp.c ├── init.c ├── init.lua └── test │ ├── receive.lua │ └── send.lua ├── neuflow-1.scm-0.rockspec ├── scripts ├── get-latest-neuflow-image └── load-bitfile ├── segments ├── coef_Abs ├── coef_Sqrt ├── coef_Sqrt_th ├── coef_Sqrt_th_div_3 ├── coef_Sqrt_th_div_32 ├── coef_StdSigm ├── coef_StdSigmAbs ├── coef_StdSigm_abs_err ├── coef_StdSigm_abs_err_all_range ├── coef_Tanh └── coef_TanhAbs └── src ├── Camera.lua ├── Compiler.lua ├── Core.lua ├── CoreUser.lua ├── DmaInterface.lua ├── Interface.lua ├── Linker.lua ├── LinkerExtensions.lua ├── Log.lua ├── Memory.lua ├── NeuFlow.lua ├── Profiler.lua ├── Serial.lua ├── defines.lua ├── defines_ibm_asic.lua ├── defines_pico_m503.lua ├── defines_xilinx_ml605.lua ├── defines_xilinx_ml605_tbsp.lua ├── init.lua ├── rom.lua └── tools.lua /.gitignore: -------------------------------------------------------------------------------- 1 | # Lines starting with '#' are considered comments. 2 | # 3 | # Ignore swap file generated by vim. 
# Build configuration for the neuFlow compiler toolkit.
cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
cmake_policy(VERSION 2.6)

# When installed through Luarocks, infer the install prefix from the
# location of the rocks tree instead of using CMake's default.
if(LUAROCKS_PREFIX)
  message(STATUS "Installing Torch through Luarocks")
  string(REGEX REPLACE "(.*)lib/luarocks/rocks.*" "\\1" CMAKE_INSTALL_PREFIX "${LUAROCKS_PREFIX}")
  message(STATUS "Prefix inferred from Luarocks: ${CMAKE_INSTALL_PREFIX}")
endif()

find_package(Torch REQUIRED)

# Native sub-packages (low-level Ethernet transport layers).
add_subdirectory(etherflow)
add_subdirectory(ethertbsp)

# Pure-Lua sources plus the coefficient segments shipped with the package.
# NOTE(review): file(GLOB) will not notice newly added files until the next
# re-configure; kept as-is to preserve the existing Torch packaging behavior.
set(src)
file(GLOB luasrc src/*.lua segments/*)
add_torch_package(neuflow "${src}" "${luasrc}" "neuFlow")
22 | 23 | On Linux (Ubuntu): 24 | 25 | ``` sh 26 | $ apt-get install gcc g++ git libreadline5-dev cmake wget 27 | $ apt-get install libqt4-core libqt4-gui libqt4-dev 28 | $ apt-get install ffmpeg gnuplot 29 | ``` 30 | 31 | On Mac OS X (> 10.5): get [Homebrew](http://mxcl.github.com/homebrew/) 32 | and then: 33 | 34 | ``` sh 35 | $ brew install git readline cmake wget 36 | $ brew install qt 37 | $ brew install ffmpeg gnuplot 38 | ``` 39 | 40 | You're ready to install Torch7 (www.torch.ch). The most up to date instructions 41 | can be found at the [Torch7 github page](https://github.com/andresy/torch). 42 | 43 | ``` sh 44 | $ git clone git://github.com/andresy/torch.git 45 | $ cd torch 46 | $ mkdir build 47 | $ cd build 48 | 49 | $ cmake .. 50 | OR 51 | $ cmake .. -DCMAKE_INSTALL_PREFIX=/my/install/path 52 | ``` 53 | 54 | Or if you already have a previous Torch7 installed: 55 | 56 | ``` sh 57 | $ luarocks install torch WITH_LUA_JIT=1 # Torch7, an efficient numeric library for Lua 58 | ``` 59 | 60 | You will also need additional packages: 61 | 62 | ``` sh 63 | $ luarocks install image # an image library for Torch7 64 | $ luarocks install nnx # lots of extra neural-net modules 65 | $ luarocks install camera # a camera interface for Linux/MacOS 66 | $ luarocks install ffmpeg # a video decoder for most formats 67 | $ luarocks install inline-c # inline C capability 68 | ``` 69 | 70 | Now that Torch7 has been installed the neuflow package can be installed. 71 | Installing the neuflow package requires you to download the source code 72 | repository. It'll give you access to some demos, to get started: 73 | 74 | ``` sh 75 | $ git clone https://github.com/clementfarabet/neuflow.git 76 | $ cd neuflow 77 | $ luarocks make 78 | ``` 79 | 80 | ## how to run code on neuFlow 81 | 82 | Demos are located in demos/. To get started, you'll need 83 | a standard Xilinx dev board for the Virtex 6: [the ML605 Kit] 84 | (http://www.xilinx.com/products/devkits/EK-V6-ML605-G.htm). 
85 | We provide an image of neuFlow that's pre synthesized/mapped/routed 86 | for the Virtex6 VLX240T on this platform. 87 | 88 | To run any of the demos, follow these instructions (tested on 89 | Ubuntu 9.04, 10.04 and Mac OS X 10.5, 10.6 and 10.7). 90 | 91 | ``` sh 92 | $ git clone https://github.com/clementfarabet/neuflow.git 93 | $ cd neuflow 94 | 95 | # make Xilinx tools available (it implies you have them 96 | # installed somewhere...) 97 | $ source $XILINX_INSTALL_PATH/settings**.sh 98 | 99 | # turn on the ML605, plug the JTAG cable then load one of 100 | # our pre-built bitfiles *: 101 | $ cd scripts 102 | $ ./get-latest-neuflow-image 103 | $ ./load-bitfile neuFlow-ml605.bit 104 | 105 | # at this points, you just have wait 2 seconds that the Ethernet 106 | # LEDs are back on (out of reset) 107 | 108 | # run the simplest demo, a loopback client, to verify your setup **: 109 | $ cd ../demos 110 | $ sudo torch loopback.lua # on Linux 111 | or 112 | $ ./loopback.lua # on OSX 113 | 114 | # before loading a new demo, you have to reset neuFlow: for 115 | # now it is done by pressing the SW10 button (cpu rst) 116 | 117 | # then you can run a typical convnet-based program, a face detector: 118 | $ sudo torch face-detector.lua # on Linux 119 | or 120 | $ ./face-detector.lua # on OSX 121 | ``` 122 | 123 | (*) the load-bitfile script assumes that you have properly installed Xilinx's 124 | USB cable driver. On RedHat and derivatives it works out of the box when 125 | installing Xilinx ISE, but on Ubuntu you'll have to follow these instructions: 126 | http://rmdir.de/~michael/xilinx/. This is not doable on Mac OS X 127 | unfortunately. I usually flash the ML605 board using Ubuntu (even a virtual box 128 | version works), and then run all the demos under Mac OS X. 129 | 130 | (**) you need to have admin privileges on your machine (sudo) to be able to 131 | interact with neuFlow, as we're using a custom low-level Ethernet framing 132 | protocol. 
133 | -------------------------------------------------------------------------------- /demos/face-detector-parallel.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | 3 | -- libs 4 | require 'parallel' 5 | require 'image' 6 | require 'camera' 7 | require 'neuflow' 8 | require 'qtwidget' 9 | 10 | 11 | -- forked process 12 | function worker() 13 | require 'torch' 14 | require 'camera' 15 | 16 | -- camera 17 | local camera = image.Camera{} 18 | 19 | -- image 20 | local frameRGB = torch.Tensor(3,480,640) 21 | 22 | while true do 23 | frameRGB = camera:forward() 24 | 25 | local m = parallel.yield() 26 | if m == 'break' then break end 27 | parallel.parent:send(frameRGB) 28 | 29 | collectgarbage() 30 | end 31 | end 32 | 33 | function parent(arg) 34 | child = parallel.fork() 35 | child:exec(worker) 36 | 37 | ---------------------------------------------------------------------- 38 | -- ARGS: parse user arguments 39 | -- 40 | op = xlua.OptionParser('%prog [options]') 41 | op:option{'-c', '--camera', action='store', dest='camidx', 42 | help='if source=camera, you can specify the camera index: /dev/videoIDX', 43 | default=0} 44 | op:option{'-n', '--network', action='store', dest='network', 45 | help='path to existing [trained] network', 46 | default='face-detector/face.net'} 47 | opt,args = op:parse() 48 | 49 | ---------------------------------------------------------------------- 50 | -- INIT: initialize the neuFlow context 51 | -- a mem manager, the dataflow core, and the compiler 52 | -- 53 | -- platform='xilinx_ml605' or platform='pico_m503' 54 | 55 | local platform = args[1] or 'xilinx_ml605' 56 | nf = neuflow.init { 57 | prog_name = 'face-detector', 58 | platform = platform 59 | } 60 | 61 | ---------------------------------------------------------------------- 62 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 63 | -- how it should interact with the host (data exchange) 64 | -- note: any 
copy**Host() inserted here needs to be matched by 65 | -- a copy**Dev() in the EXEC section. 66 | -- 67 | 68 | -- load pre-trained network from disk 69 | network = torch.load(opt.network) 70 | network_fov = 32 71 | network_sub = 4 72 | softnorm = network.modules[1] 73 | hardnet = nn.Sequential() 74 | for i = 2,#network.modules do 75 | hardnet:add(network.modules[i]) 76 | end 77 | network = hardnet 78 | 79 | -- process input at multiple scales 80 | scales = {0.3, 0.24, 0.192, 0.15, 0.12, 0.1} 81 | 82 | -- use a pyramid packer/unpacker 83 | require 'face-detector/PyramidPacker' 84 | require 'face-detector/PyramidUnPacker' 85 | packer = nn.PyramidPacker(network, scales) 86 | unpacker = nn.PyramidUnPacker(network) 87 | 88 | -- blob parser 89 | parse = require 'face-detector/blobParser' 90 | 91 | -- a gaussian for smoothing the distributions 92 | gaussian = image.gaussian(3,0.15) 93 | 94 | -- generate input data for compiler 95 | frameRGB = torch.Tensor(3,480,640) 96 | frameY = image.rgb2y(frameRGB) 97 | input = packer:forward(frameY) 98 | 99 | -- loop over the main code 100 | nf:beginLoop('main') do 101 | 102 | -- send data to device 103 | input_dev = nf:copyFromHost(input) 104 | 105 | -- compile network 106 | output_dev = nf:compile(network, input_dev) 107 | 108 | -- send result back to host 109 | outputs = nf:copyToHost(output_dev) 110 | 111 | end nf:endLoop('main') 112 | 113 | 114 | -- package hardware network 115 | nf.forward = function(nf,input) 116 | local normed = softnorm:forward(input) 117 | nf:copyToDev(normed) 118 | nf:copyFromDev(outputs) 119 | return outputs 120 | end 121 | 122 | 123 | ---------------------------------------------------------------------- 124 | -- LOAD: load the bytecode on the device, and execute it 125 | -- 126 | nf:sendReset() 127 | nf:loadBytecode() 128 | 129 | ---------------------------------------------------------------------- 130 | -- EXEC: this part executes the host code, and interacts with the dev 131 | -- 132 | 133 | -- 
profiler 134 | p = nf.profiler 135 | 136 | -- zoom 137 | zoom = 0.5 138 | 139 | -- process loop 140 | function process() 141 | p:start('whole-loop','fps') 142 | 143 | -- (1) grab frame 144 | p:start('get-camera-frame') 145 | child:join() 146 | frameRGB = child:receive() 147 | frameRGB = image.scale(frameRGB, 640, 480) 148 | p:lap('get-camera-frame') 149 | 150 | -- (2) transform it into Y space 151 | p:start('RGB->Y') 152 | frameY = image.rgb2y(frameRGB) 153 | p:lap('RGB->Y') 154 | 155 | -- (3) create multiscale pyramid 156 | p:start('pack-pyramid') 157 | pyramid, coordinates = packer:forward(frameY) 158 | p:lap('pack-pyramid') 159 | 160 | -- (4) run pre-trained network on it 161 | p:start('network-inference') 162 | result = nf:forward(pyramid) 163 | p:lap('network-inference') 164 | 165 | -- (5) unpack pyramid 166 | p:start('unpack-pyramid') 167 | distributions = unpacker:forward(result, coordinates) 168 | p:lap('unpack-pyramid') 169 | 170 | -- (6) parse distributions to extract blob centroids 171 | p:start('parse-distributions') 172 | threshold = 0.9 173 | rawresults = {} 174 | for i,distribution in ipairs(distributions) do 175 | local smoothed = image.convolve(distribution[1]:add(1):mul(0.5), gaussian) 176 | parse(smoothed, threshold, rawresults, scales[i]) 177 | end 178 | p:lap('parse-distributions') 179 | 180 | -- (7) clean up results 181 | p:start('clean-up') 182 | detections = {} 183 | for i,res in ipairs(rawresults) do 184 | local scale = res[3] 185 | local x = res[1]*network_sub/scale 186 | local y = res[2]*network_sub/scale 187 | local w = network_fov/scale 188 | local h = network_fov/scale 189 | detections[i] = {x=x, y=y, w=w, h=h} 190 | end 191 | p:lap('clean-up') 192 | end 193 | 194 | -- display loop 195 | function display() 196 | win:gbegin() 197 | win:showpage() 198 | -- (1) display input image + pyramid 199 | image.display{image=frameRGB, win=win} 200 | 201 | -- (2) overlay bounding boxes for each detection 202 | for i,detect in ipairs(detections) do 
203 | win:setcolor(1,0,0) 204 | win:rectangle(detect.x, detect.y, detect.w, detect.h) 205 | win:stroke() 206 | win:setfont(qt.QFont{serif=false,italic=false,size=16}) 207 | win:moveto(detect.x, detect.y-1) 208 | win:show('face') 209 | end 210 | 211 | -- (3) display distributions 212 | local prevx = 0 213 | for i,distribution in ipairs(distributions) do 214 | local prev = distributions[i-1] 215 | if prev then prevx = prevx + prev:size(3) end 216 | image.display{image=distribution[1], win=win, x=prevx, min=0, max=1} 217 | end 218 | 219 | p:lap('whole-loop') 220 | p:displayAll{painter=win, x=5, y=distributions[1]:size(2)+20, font=12} 221 | win:gend() 222 | end 223 | 224 | ---------------------------------------------------------------------- 225 | -- GUI: setup user interface / display 226 | -- 227 | 228 | if not win then 229 | win = qtwidget.newwindow(frameRGB:size(3), frameRGB:size(2), 'Face Detection') 230 | end 231 | 232 | while win:valid() do 233 | process() 234 | display() 235 | collectgarbage() 236 | end 237 | 238 | child:join('break') 239 | 240 | --[[ 241 | timer = qt.QTimer() 242 | timer.interval = 10 243 | timer.singleShot = true 244 | qt.connect ( 245 | timer, 246 | 'timeout()', 247 | function() 248 | process() 249 | display() 250 | collectgarbage() 251 | timer:start() 252 | end 253 | ) 254 | timer:start() 255 | --]] 256 | end 257 | 258 | -- protected env 259 | ok,err = pcall(parent, arg) 260 | if not ok then print(err) end 261 | parallel.close() 262 | -------------------------------------------------------------------------------- /demos/face-detector.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | ---------------------------------------------------------------------- 3 | -- This program demonstrates the computation of a bank of filters 4 | -- over a grayscale image. 
The image is grabbed from a webcam, 5 | -- if available (and if the package 'camera' is installed as well), 6 | -- otherwise, a fixed image (lena) is used as an input. 7 | -- 8 | -- This script demonstrates how to describe a simple algorithm 9 | -- using Torch7's 'nn' package, and how to compile it for neuFlow. 10 | -- 11 | 12 | require 'neuflow' 13 | require 'qt' 14 | require 'qtwidget' 15 | require 'xlua' 16 | require 'inline' 17 | require 'nnx' 18 | require 'camera' 19 | require 'image' 20 | 21 | ---------------------------------------------------------------------- 22 | -- ARGS: parse user arguments 23 | -- 24 | op = xlua.OptionParser('%prog [options]') 25 | op:option{'-c', '--camera', action='store', dest='camidx', 26 | help='if source=camera, you can specify the camera index: /dev/videoIDX', 27 | default=0} 28 | op:option{'-n', '--network', action='store', dest='network', 29 | help='path to existing [trained] network', 30 | default='face-detector/face.net'} 31 | opt,args = op:parse() 32 | 33 | ---------------------------------------------------------------------- 34 | -- INIT: initialize the neuFlow context 35 | -- a mem manager, the dataflow core, and the compiler 36 | -- 37 | -- platform='xilinx_ml605' or platform='pico_m503' 38 | 39 | local platform = args[1] or 'xilinx_ml605' 40 | nf = neuflow.init { 41 | prog_name = 'face-detector', 42 | platform = platform 43 | } 44 | 45 | ---------------------------------------------------------------------- 46 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 47 | -- how it should interact with the host (data exchange) 48 | -- note: any copy**Host() inserted here needs to be matched by 49 | -- a copy**Dev() in the EXEC section. 
50 | -- 51 | 52 | -- load pre-trained network from disk 53 | network = torch.load(opt.network) 54 | network_fov = 32 55 | network_sub = 4 56 | softnorm = network.modules[1] 57 | hardnet = nn.Sequential() 58 | for i = 2,#network.modules do 59 | hardnet:add(network.modules[i]) 60 | end 61 | network = hardnet 62 | 63 | -- process input at multiple scales 64 | scales = {0.3, 0.24, 0.192, 0.15, 0.12, 0.1} 65 | 66 | -- use a pyramid packer/unpacker 67 | require 'face-detector/PyramidPacker' 68 | require 'face-detector/PyramidUnPacker' 69 | packer = nn.PyramidPacker(network, scales) 70 | unpacker = nn.PyramidUnPacker(network) 71 | 72 | -- blob parser 73 | parse = require 'face-detector/blobParser' 74 | 75 | -- a gaussian for smoothing the distributions 76 | gaussian = image.gaussian(3,0.15) 77 | 78 | -- generate input data for compiler 79 | frameRGB = torch.Tensor(3,480,640) 80 | frameY = image.rgb2y(frameRGB) 81 | input = packer:forward(frameY) 82 | 83 | -- loop over the main code 84 | nf:beginLoop('main') do 85 | 86 | -- send data to device 87 | input_dev = nf:copyFromHost(input) 88 | 89 | -- compile network 90 | output_dev = nf:compile(network, input_dev) 91 | 92 | -- send result back to host 93 | outputs = nf:copyToHost(output_dev) 94 | 95 | end nf:endLoop('main') 96 | 97 | -- package hardware network 98 | nf.forward = function(nf,input) 99 | local normed = softnorm:forward(input) 100 | nf:copyToDev(normed) 101 | nf:copyFromDev(outputs) 102 | return outputs 103 | end 104 | 105 | ---------------------------------------------------------------------- 106 | -- LOAD: load the bytecode on the device, and execute it 107 | -- 108 | nf:sendReset() 109 | nf:loadBytecode() 110 | 111 | ---------------------------------------------------------------------- 112 | -- EXEC: this part executes the host code, and interacts with the dev 113 | -- 114 | 115 | -- camera 116 | camera = image.Camera{} 117 | 118 | -- profiler 119 | p = nf.profiler 120 | 121 | -- zoom 122 | zoom = 0.5 123 | 
-- Main processing loop: grab a camera frame, run the pre-trained network
-- on a multiscale pyramid via neuFlow, and convert the resulting
-- distributions into face detections (stored in the global `detections`).
function process()
   p:start('whole-loop','fps')

   -- (1) grab a frame and normalize its size
   p:start('get-camera-frame')
   frameRGB = camera:forward()
   frameRGB = image.scale(frameRGB, 640, 480)
   p:lap('get-camera-frame')

   -- (2) convert to luminance
   p:start('RGB->Y')
   frameY = image.rgb2y(frameRGB)
   p:lap('RGB->Y')

   -- (3) pack all scales into a single pyramid image
   p:start('pack-pyramid')
   pyramid, coordinates = packer:forward(frameY)
   p:lap('pack-pyramid')

   -- (4) run the network on the device
   p:start('network-inference')
   result = nf:forward(pyramid)
   p:lap('network-inference')

   -- (5) split the result back into one distribution per scale
   p:start('unpack-pyramid')
   distributions = unpacker:forward(result, coordinates)
   p:lap('unpack-pyramid')

   -- (6) smooth + threshold each distribution, collecting blob centroids
   p:start('parse-distributions')
   threshold = 0.9
   rawresults = {}
   for scale_idx, distribution in ipairs(distributions) do
      local smoothed = image.convolve(distribution[1]:add(1):mul(0.5), gaussian)
      parse(smoothed, threshold, rawresults, scales[scale_idx])
   end
   p:lap('parse-distributions')

   -- (7) map blob coordinates back into input-image space
   p:start('clean-up')
   detections = {}
   for det_idx, blob in ipairs(rawresults) do
      local s = blob[3]
      detections[det_idx] = {
         x = blob[1]*network_sub/s,
         y = blob[2]*network_sub/s,
         w = network_fov/s,
         h = network_fov/s,
      }
   end
   p:lap('clean-up')
end
win:stroke() 190 | win:setfont(qt.QFont{serif=false,italic=false,size=16}) 191 | win:moveto(detect.x, detect.y-1) 192 | win:show('face') 193 | end 194 | 195 | -- (3) display distributions 196 | local prevx = 0 197 | for i,distribution in ipairs(distributions) do 198 | local prev = distributions[i-1] 199 | if prev then prevx = prevx + prev:size(3) end 200 | image.display{image=distribution[1], win=win, x=prevx, min=0, max=1} 201 | end 202 | 203 | p:lap('whole-loop') 204 | p:displayAll{painter=win, x=5, y=distributions[1]:size(2)+20, font=12} 205 | win:gend() 206 | end 207 | 208 | ---------------------------------------------------------------------- 209 | -- GUI: setup user interface / display 210 | -- 211 | 212 | if not win then 213 | win = qtwidget.newwindow(frameRGB:size(3), frameRGB:size(2), 'Face Detection') 214 | end 215 | 216 | timer = qt.QTimer() 217 | timer.interval = 10 218 | timer.singleShot = true 219 | qt.connect(timer, 220 | 'timeout()', 221 | function() 222 | process() 223 | display() 224 | collectgarbage() 225 | timer:start() 226 | end) 227 | timer:start() 228 | -------------------------------------------------------------------------------- /demos/face-detector/PyramidPacker.lua: -------------------------------------------------------------------------------- 1 | local PyramidPacker, parent = torch.class('nn.PyramidPacker', 'nn.Module') 2 | 3 | function getCoordinates(args) 4 | local scales = args.scales 5 | local step_width = args.step_width 6 | local step_height = args.step_height 7 | local dim_width_orig = args.dim_width 8 | local dim_height_orig = args.dim_height 9 | 10 | local dim_width = math.floor(dim_width_orig*scales[1]) 11 | local dim_height = math.floor(dim_height_orig*scales[1]) 12 | -- we define the coordinates table, which we will fill-in 13 | -- once per each different input or different scales 14 | -- and we will use it to pack and unpack different sclales into/out of 15 | -- one big pack. 
16 | -- The rows of the table are different scales, 17 | -- the columns of the table are: 18 | -- 1 2 3 4 5 6 19 | -- x1 y1 x2 y2 width height 20 | -- 21 | -- (x1, y1) - top left corner, (x2, y2) - bottom right corner, 22 | -- (width, height) - sizes of the current scale 23 | 24 | local coordinates = torch.Tensor(#scales, 6) 25 | coordinates[1][1] = 1 26 | coordinates[1][2] = 1 27 | coordinates[1][3] = dim_width 28 | coordinates[1][4] = dim_height 29 | coordinates[1][5] = dim_width 30 | coordinates[1][6] = dim_height 31 | local max_width = dim_width 32 | local max_height = dim_height 33 | 34 | -- fill the coordinates table and get the size for the big pack 35 | for i=2,#scales,1 do 36 | 37 | dim_width = math.floor(dim_width_orig*scales[i]) 38 | dim_height = math.floor(dim_height_orig*scales[i]) 39 | 40 | -- an even case - putting down 41 | if (i%2 == 0) then 42 | coordinates[i][1] = coordinates[i-1][1] 43 | coordinates[i][2] = (math.floor((coordinates[i-1][4]-1)/step_height) + 1)*step_height+1 44 | else -- an odd case - putting beside 45 | coordinates[i][1] = (math.floor((coordinates[i-1][3]-1)/step_width) + 1)*step_width+1 46 | coordinates[i][2] = coordinates[i-1][2] 47 | end 48 | 49 | coordinates[i][3] = dim_width + coordinates[i][1] - 1 50 | coordinates[i][4] = dim_height + coordinates[i][2] - 1 51 | coordinates[i][5] = dim_width 52 | coordinates[i][6] = dim_height 53 | 54 | max_width = math.max(max_width, coordinates[i][3]) 55 | max_height = math.max(max_height, coordinates[i][4]) 56 | end 57 | 58 | return coordinates, max_width, max_height 59 | end 60 | 61 | local function getSizesTbl(net) 62 | local sizes_tbl = {} 63 | for i=1,#net.modules do 64 | dw = net.modules[i].dW 65 | dh = net.modules[i].dH 66 | kw = net.modules[i].kW 67 | kh = net.modules[i].kH 68 | if((dw ~= nil)and(dh ~= nil)and(kw ~= nil) and(kh ~= nil)) then 69 | table.insert(sizes_tbl, {kw=kw,kh=kh,dw=dw,dh=dh}) 70 | end 71 | end 72 | 73 | return sizes_tbl 74 | end 75 | 76 | local function 
getRange(args) 77 | local sizes_tbl = args.sizes_tbl 78 | local idx_output = args.idx_output 79 | 80 | local x = torch.Tensor(#sizes_tbl+1) 81 | local y = torch.Tensor(#sizes_tbl+1) 82 | x[#sizes_tbl+1] = idx_output 83 | y[#sizes_tbl+1] = idx_output 84 | 85 | for k = #sizes_tbl,1,-1 do 86 | -- rightmost point of the image that affects x(k+1) 87 | x[k] = sizes_tbl[k].kw+ (x[k+1]-1) * sizes_tbl[k].dw 88 | -- leftmost point of the image that affects y(k+1) 89 | y[k] = 1 + (y[k+1]-1) * sizes_tbl[k].dw 90 | end 91 | local left_width = y[1] 92 | local right_width = x[1] 93 | 94 | for k = #sizes_tbl,1,-1 do 95 | -- rightmost point of the image that affects x(k+1) 96 | x[k] = sizes_tbl[k].kh+ (x[k+1]-1) * sizes_tbl[k].dh 97 | -- leftmost point of the image that affects y(k+1) 98 | y[k] = 1 + (y[k+1]-1) * sizes_tbl[k].dh 99 | end 100 | 101 | local left_height = y[1] 102 | local right_height = x[1] 103 | 104 | 105 | return left_width, right_width, left_height, right_height 106 | end 107 | 108 | local function getGlobalSizes(args) 109 | local sizes_tbl = args.sizes_tbl 110 | 111 | -- to find gobal kernel size we use recursive formula: 112 | -- glob_ker(n + 1) = 1 113 | -- glob_ker(n) = ker(n) + (glob_ker(n+1)-1)*step(n) 114 | -- 115 | -- where: ker(n) - kernel size on layer n, step(n) - step size on layer n 116 | -- and n is number of layers that change the size of the input (convolution and subsample) 117 | local left_width1, right_width1, left_height1, right_height1 = getRange({sizes_tbl=sizes_tbl, idx_output=1}) 118 | local ker_width = right_width1 - left_width1 +1 119 | local ker_height = right_height1 - left_height1 +1 120 | 121 | local step_width = 1 122 | local step_height = 1 123 | 124 | -- global step = MUL(step_1, step_2, ... 
-- Pack the input image at every configured scale into one big output
-- tensor, returning (output, coordinates) where `coordinates` gives the
-- slot of each scale as (x1, y1, x2, y2, width, height).
function PyramidPacker:forward(input)
   -- Recompute the packing coordinates only when the input size changed.
   if ((input:size(3) ~= self.dim_width) or (input:size(2) ~= self.dim_height)) then
      self.dim_height = input:size(2)
      self.dim_width = input:size(3)
      self.coordinates, self.max_width, self.max_height =
         getCoordinates({dim_width = self.dim_width, dim_height = self.dim_height,
                         scales = self.scales,
                         step_width = self.step_width, step_height = self.step_height})
   end

   -- BUGFIX: the original compared against the undefined global `dim_z`
   -- (always nil), so the cached channel count was rewritten on every call;
   -- compare against the cached value `self.dimz` as intended.
   if (input:size(1) ~= self.dimz) then self.dimz = input:size(1) end
   self.output:resize(self.dimz, self.max_height, self.max_width):zero()

   -- Using the coordinates table, copy each scale into its slot.  When the
   -- input size is unchanged, the coordinates are reused from the cache.
   for i = 1,#self.scales do
      local temp = self.output:narrow(3,self.coordinates[i][1],self.coordinates[i][5])
      temp = temp:narrow(2,self.coordinates[i][2],self.coordinates[i][6])
      image.scale(input, temp, 'bilinear')
   end

   return self.output, self.coordinates
end
PyramidPacker:write(file) 182 | parent.write(self,file) 183 | file:writeDouble(#self.scales) 184 | for i = 1,#self.scales do 185 | file:writeDouble(self.scales[i]) 186 | end 187 | end 188 | 189 | function PyramidPacker:read(file) 190 | parent.read(self,file) 191 | local nbScales = file:readDouble() 192 | for i = 1,nbScales do 193 | self.scales[i] = file:readDouble() 194 | end 195 | end 196 | -------------------------------------------------------------------------------- /demos/face-detector/PyramidUnPacker.lua: -------------------------------------------------------------------------------- 1 | 2 | local PyramidUnPacker, parent = torch.class('nn.PyramidUnPacker', 'nn.Module') 3 | 4 | local function getSizesTbl(net) 5 | local sizes_tbl = {} 6 | for i=1,#net.modules do 7 | dw = net.modules[i].dW 8 | dh = net.modules[i].dH 9 | kw = net.modules[i].kW 10 | kh = net.modules[i].kH 11 | if((dw ~= nil)and(dh ~= nil)and(kw ~= nil) and(kh ~= nil)) then 12 | table.insert(sizes_tbl, {kw=kw,kh=kh,dw=dw,dh=dh}) 13 | end 14 | end 15 | 16 | return sizes_tbl 17 | end 18 | 19 | local function getRange(args) 20 | local sizes_tbl = args.sizes_tbl 21 | local idx_output = args.idx_output 22 | 23 | local x = torch.Tensor(#sizes_tbl+1) 24 | local y = torch.Tensor(#sizes_tbl+1) 25 | x[#sizes_tbl+1] = idx_output 26 | y[#sizes_tbl+1] = idx_output 27 | 28 | for k = #sizes_tbl,1,-1 do 29 | -- rightmost point of the image that affects x(k+1) 30 | x[k] = sizes_tbl[k].kw+ (x[k+1]-1) * sizes_tbl[k].dw 31 | -- leftmost point of the image that affects y(k+1) 32 | y[k] = 1 + (y[k+1]-1) * sizes_tbl[k].dw 33 | end 34 | local left_width = y[1] 35 | local right_width = x[1] 36 | 37 | for k = #sizes_tbl,1,-1 do 38 | -- rightmost point of the image that affects x(k+1) 39 | x[k] = sizes_tbl[k].kh+ (x[k+1]-1) * sizes_tbl[k].dh 40 | -- leftmost point of the image that affects y(k+1) 41 | y[k] = 1 + (y[k+1]-1) * sizes_tbl[k].dh 42 | end 43 | 44 | local left_height = y[1] 45 | local right_height = x[1] 46 | 47 
-- Split the packed network output back into one map per scale, using the
-- coordinates table produced by the matching PyramidPacker.  Returns a
-- table of narrowed views into `input` (no copies are made).
function PyramidUnPacker:forward(input, coordinates)
   self.coordinates = coordinates
   self.out_tbl = {}

   for scale_idx = 1, coordinates:size(1) do
      local coord = coordinates[scale_idx]
      -- Map pyramid-space pixel coordinates to output-map coordinates,
      -- accounting for the network's global kernel size and stride.
      local col0  = math.floor((coord[1] - 1) / self.step_width)  + 1
      local row0  = math.floor((coord[2] - 1) / self.step_height) + 1
      local ncols = math.floor((coord[5] - self.ker_width)  / self.step_width)  + 1
      local nrows = math.floor((coord[6] - self.ker_height) / self.step_height) + 1

      local slice = input:narrow(3, col0, ncols):narrow(2, row0, nrows)
      self.out_tbl[#self.out_tbl + 1] = slice
   end

   return self.out_tbl
end
100 | 101 | function PyramidUnPacker:backward(input, gradOutput) 102 | error('backward non implemented', 'PyramidUnPacker') 103 | end 104 | 105 | function PyramidUnPacker:write(file) 106 | parent.write(self,file) 107 | file:writeDouble(#self.scales) 108 | for i = 1,#self.scales do 109 | file:writeDouble(self.scales[i]) 110 | end 111 | end 112 | 113 | function PyramidUnPacker:read(file) 114 | parent.read(self,file) 115 | local nbScales = file:readDouble() 116 | for i = 1,nbScales do 117 | self.scales[i] = file:readDouble() 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /demos/face-detector/blobParser.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'inline' 3 | 4 | local parse = inline.load [[ 5 | // get args 6 | const void* id = luaT_checktypename2id(L, "torch.DoubleTensor"); 7 | THDoubleTensor *tensor = luaT_checkudata(L, 1, id); 8 | double threshold = lua_tonumber(L, 2); 9 | int table_blobs = 3; 10 | int idx = lua_objlen(L, 3) + 1; 11 | double scale = lua_tonumber(L, 4); 12 | 13 | // loop over pixels 14 | int x,y; 15 | for (y=0; ysize[0]; y++) { 16 | for (x=0; xsize[1]; x++) { 17 | double val = THDoubleTensor_get2d(tensor, y, x); 18 | if (val > threshold) { 19 | // entry = {} 20 | lua_newtable(L); 21 | int entry = lua_gettop(L); 22 | 23 | // entry[1] = x 24 | lua_pushnumber(L, x); 25 | lua_rawseti(L, entry, 1); 26 | 27 | // entry[2] = y 28 | lua_pushnumber(L, y); 29 | lua_rawseti(L, entry, 2); 30 | 31 | // entry[3] = scale 32 | lua_pushnumber(L, scale); 33 | lua_rawseti(L, entry, 3); 34 | 35 | // blobs[idx] = entry; idx = idx + 1 36 | lua_rawseti(L, table_blobs, idx++); 37 | } 38 | } 39 | } 40 | return 0; 41 | ]] 42 | 43 | return parse 44 | -------------------------------------------------------------------------------- /demos/face-detector/face.net: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/clementfarabet/neuflow/cf3364568c6345767085eb8c36d90e8acc0ebffa/demos/face-detector/face.net -------------------------------------------------------------------------------- /demos/filter-bank.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | ---------------------------------------------------------------------- 3 | -- This program demonstrates the computation of a bank of filters 4 | -- over a grayscale image. The image is grabbed from a webcam, 5 | -- if available (and if the package 'camera' is installed as well), 6 | -- otherwise, a fixed image (lena) is used as an input. 7 | -- 8 | -- This script demonstrates how to describe a simple algorithm 9 | -- using Torch7's 'nn' package, and how to compile it for neuFlow. 10 | -- 11 | 12 | require 'image' 13 | require 'neuflow' 14 | require 'qt' 15 | require 'qtwidget' 16 | 17 | ---------------------------------------------------------------------- 18 | -- INIT: initialize the neuFlow context 19 | -- a mem manager, the dataflow core, and the compiler 20 | -- 21 | -- platform='xilinx_ml605' or platform='pico_m503' 22 | 23 | local platform = arg[1] or 'xilinx_ml605' 24 | nf = neuflow.init { 25 | prog_name = 'filter-bank', 26 | platform = platform 27 | } 28 | 29 | ---------------------------------------------------------------------- 30 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 31 | -- how it should interact with the host (data exchange) 32 | -- note: any copy**Host() inserted here needs to be matched by 33 | -- a copy**Dev() in the EXEC section. 
34 | -- 35 | 36 | -- input data 37 | inputsize = 400 38 | input = image.scale(image.lena()[1], inputsize,inputsize) 39 | 40 | -- compute 16 9x9 random filters on the input, 41 | -- followed by a non-linear activation unit 42 | network = nn.Sequential() 43 | network:add(nn.SpatialConvolution(1,16,9,9)) 44 | network:add(nn.Tanh()) 45 | 46 | -- loop over the main code 47 | nf:beginLoop('main') do 48 | 49 | -- send data to device 50 | input_dev = nf:copyFromHost(input) 51 | 52 | -- compile network 53 | output_dev = nf:compile(network, input_dev) 54 | 55 | -- send result back to host 56 | outputs = nf:copyToHost(output_dev) 57 | 58 | end nf:endLoop('main') 59 | 60 | ---------------------------------------------------------------------- 61 | -- LOAD: load the bytecode on the device, and execute it 62 | -- 63 | nf:sendReset() 64 | nf:loadBytecode() 65 | 66 | ---------------------------------------------------------------------- 67 | -- EXEC: this part executes the host code, and interacts with the dev 68 | -- 69 | 70 | -- profiler 71 | p = nf.profiler 72 | 73 | -- zoom 74 | zoom = 0.5 75 | 76 | -- try to initialize camera, or default to Lena 77 | if xlua.require 'camera' then 78 | camera = image.Camera{} 79 | end 80 | 81 | -- process loop 82 | function process() 83 | p:start('whole-loop','fps') 84 | 85 | if camera then 86 | p:start('get-camera-frame') 87 | local frame = camera:forward() 88 | image.scale(input,frame:narrow(1,2,1)) 89 | p:lap('get-camera-frame') 90 | end 91 | 92 | nf:copyToDev(input) 93 | nf:copyFromDev(outputs) 94 | 95 | win:gbegin() 96 | win:showpage() 97 | 98 | p:start('display') 99 | image.display{image=outputs, win=win, min=-1, max=1, zoom=zoom} 100 | p:lap('display') 101 | 102 | p:lap('whole-loop') 103 | p:displayAll{painter=win, x=outputs:size(3)*4*zoom+10, y=outputs:size(2)*2*zoom+40, font=12} 104 | win:gend() 105 | end 106 | 107 | ---------------------------------------------------------------------- 108 | -- GUI: setup user interface / display 109 
| -- 110 | 111 | if not win then 112 | win = qtwidget.newwindow(outputs:size(3)*6*zoom, outputs:size(2)*3*zoom, 'Filter Bank') 113 | end 114 | 115 | timer = qt.QTimer() 116 | timer.interval = 10 117 | timer.singleShot = true 118 | qt.connect(timer, 119 | 'timeout()', 120 | function() 121 | process() 122 | collectgarbage() 123 | timer:start() 124 | end) 125 | timer:start() 126 | -------------------------------------------------------------------------------- /demos/loopback.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | ---------------------------------------------------------------------- 3 | -- A simple loopback program for neuFlow: send images and receive 4 | -- them back from neuFlow, in a loop. 5 | -- 6 | -- If this script works, it validates: 7 | -- (1) the ethernet interface 8 | -- (2) the embedded openFlow CPU 9 | -- (3) the streamer 10 | -- (4) the DDR2/3 interface 11 | -- 12 | 13 | require 'image' 14 | require 'neuflow' 15 | require 'qt' 16 | require 'qtwidget' 17 | 18 | ---------------------------------------------------------------------- 19 | -- INIT: initialize the neuFlow context 20 | -- a mem manager, the dataflow core, and the compiler 21 | -- 22 | -- platform='xilinx_ml605' or platform='pico_m503' 23 | 24 | local platform = arg[1] or 'xilinx_ml605' 25 | local network_if_name = arg[2] 26 | 27 | nf = neuflow.init { 28 | prog_name = 'loopback', 29 | platform = platform, 30 | network_if_name = network_if_name 31 | } 32 | 33 | ---------------------------------------------------------------------- 34 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 35 | -- how it should interact with the host (data exchange) 36 | -- note: any copy**Host() inserted here needs to be matched by 37 | -- a copy**Dev() in the EXEC section. 
38 | -- 39 | 40 | -- input data 41 | inputsize = 400 42 | 43 | -- rescale 44 | input = image.scale(image.lena(), inputsize,inputsize) 45 | 46 | -- loop over the main code 47 | nf:beginLoop('main') do 48 | 49 | -- send data to device 50 | input_dev = nf:copyFromHost(input) 51 | 52 | -- get it back 53 | outputs = nf:copyToHost(input_dev) 54 | 55 | end nf:endLoop('main') 56 | 57 | ---------------------------------------------------------------------- 58 | -- LOAD: load the bytecode on the device, and execute it 59 | -- 60 | nf:sendReset() 61 | nf:loadBytecode() 62 | 63 | ---------------------------------------------------------------------- 64 | -- EXEC: this part executes the host code, and interacts with the dev 65 | -- 66 | 67 | -- profiler 68 | p = nf.profiler 69 | 70 | -- process loop 71 | function process() 72 | p:start('whole-loop','fps') 73 | 74 | nf:copyToDev(input) 75 | nf:copyFromDev(outputs) 76 | 77 | p:start('compute-error') 78 | error = outputs:clone():add(-1,input):abs() 79 | p:lap('compute-error') 80 | 81 | win:gbegin() 82 | win:showpage() 83 | 84 | p:start('display') 85 | image.display{image=input, win=win, x=0, min=0, max=1} 86 | image.display{image=outputs, win=win, x=input:size(3), min=0, max=1} 87 | image.display{image=error, win=win, x=input:size(3)*2, min=0, max=1} 88 | p:lap('display') 89 | 90 | p:lap('whole-loop') 91 | p:displayAll{painter=win, x=10, y=input:size(2)+20, font=12} 92 | win:gend() 93 | end 94 | 95 | ---------------------------------------------------------------------- 96 | -- GUI: setup user interface / display 97 | -- 98 | 99 | if not win then 100 | win = qtwidget.newwindow(1200,540,'Loopback Test') 101 | end 102 | 103 | timer = qt.QTimer() 104 | timer.interval = 10 105 | timer.singleShot = true 106 | qt.connect(timer, 107 | 'timeout()', 108 | function() 109 | process() 110 | collectgarbage() 111 | timer:start() 112 | end) 113 | timer:start() 114 | -------------------------------------------------------------------------------- 
/demos/loopback_camera.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | ---------------------------------------------------------------------- 3 | -- A simple program for neuFlow: receive images from embedded camera 4 | -- of m503 board 5 | -- them back from neuFlow, in a loop. 6 | -- 7 | -- If this script works, it validates: 8 | -- (1) the ethernet interface 9 | -- (2) the embedded openFlow CPU 10 | -- (3) the streamer 11 | -- (4) the DDR2/3 interface 12 | -- (5) the cameras capture and configuration 13 | 14 | require 'image' 15 | require 'neuflow' 16 | require 'qt' 17 | require 'qtwidget' 18 | 19 | ---------------------------------------------------------------------- 20 | -- INIT: initialize the neuFlow context 21 | -- a mem manager, the dataflow core, and the compiler 22 | -- 23 | nf = neuflow.init { 24 | prog_name = 'loopback', 25 | platform ='pico_m503', 26 | --global_msg_level = 'detailled', 27 | --interface_msg_level = 'detailled', 28 | } 29 | 30 | ---------------------------------------------------------------------- 31 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 32 | -- how it should interact with the host (data exchange) 33 | -- note: any copy**Host() inserted here needs to be matched by 34 | -- a copy**Dev() in the EXEC section. 35 | -- 36 | activeCamera = {'B','A'} 37 | toto = nf.camera:config(activeCamera, 'iic', 'ON') 38 | --toto = nf.camera:config(activeCamera, 'domain', 'RGB') 39 | --toto = nf.camera:config(activeCamera, 'definition', 'QVGA') 40 | toto = nf.camera:config(activeCamera, 'scan', 'PROGRESSIVE') 41 | toto = nf.camera:config(activeCamera, 'color', 'B&W') 42 | --toto = nf.camera:config(activeCamera, 'domain', 'RGB') 43 | --toto = nf.camera:cPROGRESSIVEonfig(activeCamera, 'grab', 'ONESHOT') 44 | --print(' : reg ctrl ' .. 
toto) 45 | 46 | --nf.camera:stopRBCameras() -- Being sure that the Camera is stopped 47 | nf.camera.core:sleep(1) 48 | --nf.camera:startRBCameras() -- Start camera and send images to Running Buffer 49 | nf.camera:enableCameras(activeCamera) 50 | 51 | -- loop over the main code 52 | nf:beginLoop('main') do 53 | 54 | 55 | -- send image from camera to memory 56 | nf.camera:captureOneFrame(activeCamera) 57 | input_dev = nf.camera:getLastFrame(activeCamera) 58 | 59 | -- pass image to host 60 | outputs = nf:copyToHost(input_dev) 61 | --outputs = nf.camera:copyToHostLatestFrame() -- Get the latest complete frame from both camers 62 | --nf.camera.core:sleep(0.15) 63 | 64 | end nf:endLoop('main') 65 | 66 | ---------------------------------------------------------------------- 67 | -- LOAD: load the bytecode on the device, and execute it 68 | -- 69 | nf:sendReset() 70 | nf:loadBytecode() 71 | 72 | ---------------------------------------------------------------------- 73 | -- EXEC: this part executes the host code, and interacts with the dev 74 | -- 75 | 76 | -- profiler 77 | p = nf.profiler 78 | 79 | local framecnt = 0 80 | -- process loop 81 | function process() 82 | p:start('whole-loop','fps') 83 | --end 84 | 85 | nf:copyFromDev(outputs) 86 | 87 | p:start('display') 88 | win:gbegin() 89 | win:showpage() 90 | image.display{image=outputs, win=win, x=0, min=0, max=1} 91 | p:lap('display') 92 | p:lap('whole-loop') 93 | p:displayAll{painter=win, x=10, y=500, font=12} 94 | win:gend() 95 | --end 96 | framecnt = framecnt + 1 97 | end 98 | 99 | ---------------------------------------------------------------------- 100 | -- GUI: setup user interface / display 101 | -- 102 | 103 | torch.setdefaulttensortype('torch.FloatTensor') 104 | 105 | if not win then 106 | win = qtwidget.newwindow(2000,800,'Loopback Camera Test') 107 | end 108 | 109 | timer = qt.QTimer() 110 | timer.interval = 10 111 | timer.singleShot = true 112 | qt.connect(timer, 113 | 'timeout()', 114 | function() 115 | 
process() 116 | collectgarbage() 117 | timer:start() 118 | end) 119 | timer:start() 120 | -------------------------------------------------------------------------------- /etherflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | IF (APPLE) 3 | SET (CMAKE_C_FLAGS "-D_APPLE_=1") 4 | ELSE (APPLE) 5 | SET (CMAKE_C_FLAGS "-D_LINUX_=1") 6 | ENDIF (APPLE) 7 | 8 | INCLUDE_DIRECTORIES (${PROJECT_SOURCE_DIR}/etherflow) 9 | SET(src init.c) 10 | SET(luasrc init.lua) 11 | ADD_TORCH_PACKAGE(etherflow "${src}" "${luasrc}" "neuFlow") 12 | TARGET_LINK_LIBRARIES(etherflow luaT TH) 13 | -------------------------------------------------------------------------------- /etherflow/etherflow.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained API to interface neuFlow 3 | **********************************************************/ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // self-contained (no lua) 12 | #define _NO_LUA_ 13 | 14 | // define template macros 15 | #define TH_CONCAT_3(x,y,z) TH_CONCAT_3_EXPAND(x,y,z) 16 | #define TH_CONCAT_3_EXPAND(x,y,z) x ## y ## z 17 | #define etherflow_(NAME) TH_CONCAT_3(etherflow_, Real, NAME) 18 | #define etherflow_send_(NAME) TH_CONCAT_3(etherflow_send_, Real, NAME) 19 | #define etherflow_receive_(NAME) TH_CONCAT_3(etherflow_receive_, Real, NAME) 20 | 21 | // load templated code 22 | #undef TH_GENERIC_FILE 23 | #include "generic/etherflow.c" 24 | 25 | // generate Float version 26 | #define real float 27 | #define accreal double 28 | #define Real Float 29 | #define TH_REAL_IS_FLOAT 30 | #line 1 TH_GENERIC_FILE 31 | #include TH_GENERIC_FILE 32 | #undef accreal 33 | #undef real 34 | #undef Real 35 | #undef TH_REAL_IS_FLOAT 36 | 37 | // generate Double version 38 | #define real double 39 | #define accreal double 40 | #define Real Double 41 | 
#define TH_REAL_IS_DOUBLE 42 | #line 1 TH_GENERIC_FILE 43 | #include TH_GENERIC_FILE 44 | #undef accreal 45 | #undef real 46 | #undef Real 47 | #undef TH_REAL_IS_DOUBLE 48 | -------------------------------------------------------------------------------- /etherflow/etherflow.h: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained API to interface neuFlow 3 | **********************************************************/ 4 | 5 | /*********************************************************** 6 | * open_socket() 7 | * what: opens an ethernet socket 8 | * params: 9 | * none 10 | * returns: 11 | * socket - a socket descriptor 12 | **********************************************************/ 13 | int etherflow_open_socket_C(const char *dev, unsigned char *destmac, unsigned char *srcmac); 14 | 15 | /*********************************************************** 16 | * close_socket() 17 | * what: closes an ethernet socket 18 | * params: 19 | * socket 20 | * returns: 21 | * none 22 | **********************************************************/ 23 | int etherflow_close_socket_C(); 24 | 25 | /*********************************************************** 26 | * etherflow_send_reset_C() 27 | * what: send a reset Ethernet frame 28 | * params: 29 | * none 30 | * returns: 31 | * return sendto error code 32 | **********************************************************/ 33 | int etherflow_send_reset_C() 34 | 35 | /*********************************************************** 36 | * receive_frame_C() 37 | * what: receives an ethernet frame 38 | * params: 39 | * socket - socket descriptor. 
40 | * buffer - to receive the data 41 | * returns: 42 | * length - nb of bytes read/received 43 | **********************************************************/ 44 | unsigned char * etherflow_receive_frame_C(int *lengthp); 45 | 46 | /*********************************************************** 47 | * send_frame_C() 48 | * what: sends an ethernet frame 49 | * params: 50 | * socket - socket descriptor. 51 | * length - length of data to send 52 | * data_p - data pointer 53 | * returns: 54 | * error code 55 | **********************************************************/ 56 | int etherflow_send_frame_C(short int length, const unsigned char * data_p); 57 | 58 | /*********************************************************** 59 | * send_tensor_byte() 60 | * what: sends a torch byte tensor by breaking it down into 61 | * ethernet packets of maximum size 62 | * params: 63 | * socket - socket descriptor. 64 | * tensor - tensor to send 65 | * returns: 66 | * void 67 | **********************************************************/ 68 | int etherflow_send_ByteTensor_C(unsigned char * data, int size); 69 | 70 | /*********************************************************** 71 | * send_tensor() 72 | * what: sends a torch tensor by breaking it down into 73 | * ethernet packets of maximum size 74 | * a tensor of reals is converted to Q8.8 75 | * params: 76 | * socket - socket descriptor. 77 | * tensor - tensor to send 78 | * returns: 79 | * void 80 | **********************************************************/ 81 | int etherflow_send_FloatTensor_C(float * data, int size); 82 | int etherflow_send_DoubleTensor_C(double * data, int size); 83 | 84 | /*********************************************************** 85 | * receive_tensor_TYPE() 86 | * what: receives a torch tensor by concatenating eth packs 87 | * a tensor of TYPE is created from Q8.8 88 | * params: 89 | * socket - socket descriptor. 
90 | * tensor - tensor to fill 91 | * returns: 92 | * void 93 | **********************************************************/ 94 | int etherflow_receive_FloatTensor_C(float *data, int size, int height); 95 | int etherflow_receive_DoubleTensor_C(double *data, int size, int height); 96 | -------------------------------------------------------------------------------- /etherflow/example.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained example 3 | * Compile: 4 | * gcc -fpic -shared etherflow.c -o libeth.so 5 | * gcc example.c libeth.so -o example 6 | **********************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "etherflow.h" 14 | 15 | #define BINARY_SIZE 32*1024*1024 16 | 17 | #ifdef _LINUX_ 18 | #define ETH_DEV "eth0" 19 | #else // _APPLE_ 20 | #define ETH_DEV "en0" 21 | #endif 22 | 23 | #define abs(a) (a)>0 ? 
(a) : -(a) 24 | 25 | int main() { 26 | // init device 27 | etherflow_open_socket_C(ETH_DEV, NULL, NULL); 28 | 29 | // load code (binary) from file 30 | unsigned char *neuflow_bin = (unsigned char *)malloc(BINARY_SIZE); 31 | memset(neuflow_bin, BINARY_SIZE, 0); 32 | FILE *f = fopen("neuflow.bin", "rb"); 33 | int nread; 34 | if (f) nread = fread(neuflow_bin, 1, BINARY_SIZE, f); 35 | else { 36 | printf("error: could not find neuflow code (neuflow.bin)\n"); 37 | return 1; 38 | } 39 | printf("loaded bytecode [size = %d]\n", nread); 40 | 41 | // load (and exec) code on neuFlow 42 | printf("transmitting bytecode\n"); 43 | etherflow_send_ByteTensor_C(neuflow_bin, BINARY_SIZE); 44 | sleep(1); 45 | printf("transmitted.\n"); 46 | 47 | // data structures 48 | double *input_data = malloc(sizeof(double) * 3 * 400 * 400); 49 | double *output_data = malloc(sizeof(double) * 3 * 400 * 400); 50 | 51 | // initialize data 52 | int i,k; 53 | for (k = 0; k < 3; k++) { 54 | for (i = 0; i < 400*400; i++) { 55 | input_data[k*400*400+i] = k; 56 | output_data[k*400*400+i] = 0; 57 | } 58 | } 59 | 60 | // code is now executing, send data and receive answer in a loop 61 | while (1) { 62 | // send input data (a 3x400x400 image) 63 | double *input_p = input_data; 64 | for (i = 0; i < 3; i++) { 65 | etherflow_send_DoubleTensor_C(input_p, 400*400); 66 | input_p += 400*400; 67 | } 68 | etherflow_receive_frame_C(NULL); 69 | 70 | // receive data, processed by neuFlow (a 3x400x400 image, loopbacked) 71 | etherflow_receive_frame_C(NULL); 72 | double *output_p = output_data; 73 | for (i = 0; i < 3; i++) { 74 | etherflow_receive_DoubleTensor_C(output_p, 400*400, 400); 75 | output_p += 400*400; 76 | } 77 | 78 | // measure loopback error 79 | double error = 0; 80 | double maxerr = 0; 81 | for (i = 0; i < 3*400*400; i++) { 82 | double err = abs(input_data[i] - output_data[i]); 83 | if (err > maxerr) maxerr = err; 84 | error += err; 85 | } 86 | error /= 3*400*400; 87 | printf("average error = %f, max error = 
%f\n", error, maxerr); 88 | } 89 | 90 | // cleanup 91 | free(input_data); 92 | free(output_data); 93 | free(neuflow_bin); 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /etherflow/init.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) 12 | #define torch_string_(NAME) TH_CONCAT_STRING_3(torch., Real, NAME) 13 | #define etherflow_(NAME) TH_CONCAT_3(etherflow_, Real, NAME) 14 | #define etherflow_send_(NAME) TH_CONCAT_3(etherflow_send_, Real, NAME) 15 | #define etherflow_receive_(NAME) TH_CONCAT_3(etherflow_receive_, Real, NAME) 16 | 17 | static const void* torch_FloatTensor_id = NULL; 18 | static const void* torch_DoubleTensor_id = NULL; 19 | 20 | #undef TH_GENERIC_FILE 21 | #include "generic/etherflow.c" 22 | #include "THGenerateFloatTypes.h" 23 | 24 | DLL_EXPORT int luaopen_libetherflow(lua_State *L) 25 | { 26 | torch_FloatTensor_id = luaT_checktypename2id(L, "torch.FloatTensor"); 27 | torch_DoubleTensor_id = luaT_checktypename2id(L, "torch.DoubleTensor"); 28 | 29 | etherflow_FloatApi_init(L); 30 | etherflow_DoubleApi_init(L); 31 | 32 | luaL_register(L, "etherflow.double", etherflow_DoubleApi__); 33 | luaL_register(L, "etherflow.float", etherflow_FloatApi__); 34 | 35 | return 1; 36 | } 37 | -------------------------------------------------------------------------------- /etherflow/init.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | -- 3 | -- Copyright (c) 2010,2011 Clement Farabet, Polina Akselrod 4 | -- 5 | -- Permission is hereby granted, free of charge, to any person obtaining 6 | -- a copy of this software and associated documentation files (the 7 | -- "Software"), to deal in the Software 
without restriction, including 8 | -- without limitation the rights to use, copy, modify, merge, publish, 9 | -- distribute, sublicense, and/or sell copies of the Software, and to 10 | -- permit persons to whom the Software is furnished to do so, subject to 11 | -- the following conditions: 12 | -- 13 | -- The above copyright notice and this permission notice shall be 14 | -- included in all copies or substantial portions of the Software. 15 | -- 16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -- 24 | ---------------------------------------------------------------------- 25 | -- description: 26 | -- etherflow - a raw serial interface over gigabit ethernet, 27 | -- for communication between neuFlow <-> UNIX host. 
28 | -- 29 | -- history: 30 | -- July 16, 2011, 1:46PM - import from Torch5 - Clement Farabet 31 | ---------------------------------------------------------------------- 32 | 33 | require 'torch' 34 | require 'libetherflow' 35 | 36 | function etherflow.open(dev, destmac, srcmac) 37 | return etherflow.double.open_socket(dev, destmac, srcmac) 38 | end 39 | 40 | function etherflow.close(dev) 41 | etherflow.double.close_socket() 42 | end 43 | 44 | function etherflow.sendreset() 45 | return etherflow.double.send_reset() 46 | end 47 | 48 | function etherflow.handshake(bool) 49 | etherflow.double.handshake(bool) 50 | end 51 | 52 | function etherflow.sendstring(str) 53 | etherflow.double.send_frame(str) 54 | end 55 | 56 | function etherflow.receivestring() 57 | return etherflow.double.receive_string() 58 | end 59 | 60 | function etherflow.receiveframe() 61 | return etherflow.double.receive_frame() 62 | end 63 | 64 | function etherflow.sendtensor(tensor) 65 | tensor.etherflow.send_tensor(tensor) 66 | end 67 | 68 | function etherflow.receivetensor(tensor) 69 | tensor.etherflow.receive_tensor(tensor) 70 | end 71 | 72 | function etherflow.loadbytecode(bytetensor) 73 | etherflow.double.send_bytetensor(bytetensor) 74 | end 75 | 76 | function etherflow.setfirstcall(val) 77 | etherflow.double.set_first_call(val) 78 | end 79 | -------------------------------------------------------------------------------- /etherflow/test/receive.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'etherflow' 3 | require 'image' 4 | 5 | etherflow.open() 6 | 7 | t = torch.Tensor(512,512) 8 | 9 | for i = 1,1000 do 10 | print 'waiting for tensor' 11 | sys.tic() 12 | etherflow.setfirstcall(1) 13 | etherflow.receivetensor(t) 14 | print 'got tensor !' 
15 | sys.toc(true) 16 | w = image.display{image=t, win=w, gui=false} 17 | end 18 | -------------------------------------------------------------------------------- /etherflow/test/send.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'etherflow' 3 | require 'image' 4 | 5 | etherflow.open(nil, {0xff,0xff,0xff,0xff,0xff,0xff}, {0x01,0x02,0x03,0x04,0x05,0x06}) 6 | 7 | l = image.lena()[1] 8 | 9 | for i = 1,1000 do 10 | sys.tic() 11 | etherflow.sendtensor(l) 12 | sys.toc(true) 13 | end 14 | -------------------------------------------------------------------------------- /ethertbsp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | IF (APPLE) 3 | SET (CMAKE_C_FLAGS "-D_APPLE_=1") 4 | ELSE (APPLE) 5 | SET (CMAKE_C_FLAGS "-D_LINUX_=1") 6 | ENDIF (APPLE) 7 | 8 | INCLUDE_DIRECTORIES (${PROJECT_SOURCE_DIR}/ethertbsp) 9 | SET(src init.c) 10 | SET(luasrc init.lua) 11 | ADD_TORCH_PACKAGE(ethertbsp "${src}" "${luasrc}" "neuFlow") 12 | TARGET_LINK_LIBRARIES(ethertbsp luaT TH) 13 | -------------------------------------------------------------------------------- /ethertbsp/ethertbsp.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained API to interface neuFlow 3 | **********************************************************/ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // self-contained (no lua) 12 | #define _NO_LUA_ 13 | 14 | // define template macros 15 | #define TH_CONCAT_3(x,y,z) TH_CONCAT_3_EXPAND(x,y,z) 16 | #define TH_CONCAT_3_EXPAND(x,y,z) x ## y ## z 17 | #define ethertbsp_(NAME) TH_CONCAT_3(ethertbsp_, Real, NAME) 18 | #define ethertbsp_send_(NAME) TH_CONCAT_3(ethertbsp_send_, Real, NAME) 19 | #define ethertbsp_receive_(NAME) TH_CONCAT_3(ethertbsp_receive_, Real, NAME) 20 | 21 | // load templated code 22 | #undef 
TH_GENERIC_FILE 23 | #include "generic/ethertbsp.c" 24 | 25 | // generate Float version 26 | #define real float 27 | #define accreal double 28 | #define Real Float 29 | #define TH_REAL_IS_FLOAT 30 | #line 1 TH_GENERIC_FILE 31 | #include TH_GENERIC_FILE 32 | #undef accreal 33 | #undef real 34 | #undef Real 35 | #undef TH_REAL_IS_FLOAT 36 | 37 | // generate Double version 38 | #define real double 39 | #define accreal double 40 | #define Real Double 41 | #define TH_REAL_IS_DOUBLE 42 | #line 1 TH_GENERIC_FILE 43 | #include TH_GENERIC_FILE 44 | #undef accreal 45 | #undef real 46 | #undef Real 47 | #undef TH_REAL_IS_DOUBLE 48 | -------------------------------------------------------------------------------- /ethertbsp/ethertbsp.h: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained API to interface Ethernet to neuFlow 3 | **********************************************************/ 4 | 5 | /*********************************************************** 6 | * open_socket() 7 | * what: opens an ethernet socket 8 | * params: 9 | * dev - network device name 10 | * remote_mac - MAC addr of remote dev 11 | * local_mac - MAC addr of host computer 12 | * 13 | * returns: 14 | * error - 0 for succsess, -1 for error 15 | **********************************************************/ 16 | int ethertbsp_open_socket_C(const char *dev, unsigned char *remote_mac, unsigned char *local_mac); 17 | 18 | /*********************************************************** 19 | * close_socket() 20 | * what: closes the ethernet socket 21 | * params: 22 | * none 23 | * returns: 24 | * none 25 | **********************************************************/ 26 | int ethertbsp_close_socket_C(); 27 | 28 | /*********************************************************** 29 | * send_tensor_byte() 30 | * what: sends a torch byte tensor by breaking it down into 31 | * ethernet packets of maximum size 32 | * 
params: 33 | * data - send tensor as array 34 | * size - length of data array 35 | * returns: 36 | * zero 37 | **********************************************************/ 38 | int ethertbsp_send_ByteTensor_C(unsigned char * data, int size); 39 | 40 | /*********************************************************** 41 | * send_tensor() 42 | * what: sends a torch tensor by breaking it down into 43 | * ethernet packets of maximum size 44 | * a tensor of reals is converted to Q8.8 45 | * params: 46 | * data - send tensor as array 47 | * size - length of data array 48 | * returns: 49 | * zero 50 | **********************************************************/ 51 | int ethertbsp_send_FloatTensor_C(float * data, int size); 52 | int ethertbsp_send_DoubleTensor_C(double * data, int size); 53 | 54 | /*********************************************************** 55 | * receive_tensor_TYPE() 56 | * what: receives a torch tensor by concatenating eth packs 57 | * a tensor of TYPE is created from Q8.8 58 | * params: 59 | * data - tensor as array to be filled 60 | * size - length of data array 61 | * returns: 62 | * zero 63 | **********************************************************/ 64 | int ethertbsp_receive_FloatTensor_C(float *data, int size, int height); 65 | int ethertbsp_receive_DoubleTensor_C(double *data, int size, int height); 66 | -------------------------------------------------------------------------------- /ethertbsp/example.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained example 3 | * Compile: 4 | * gcc -fpic -shared ethertbsp.c -o libeth.so 5 | * gcc example.c libeth.so -o example 6 | **********************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "ethertbsp.h" 14 | 15 | #define BINARY_SIZE 32*1024*1024 16 | 17 | #ifdef _LINUX_ 18 | #define ETH_DEV "eth0" 19 | #else // _APPLE_ 
20 | #define ETH_DEV "en0" 21 | #endif 22 | 23 | #define abs(a) (a)>0 ? (a) : -(a) 24 | 25 | int main() { 26 | // init device 27 | ethertbsp_open_socket_C(ETH_DEV, NULL, NULL); 28 | 29 | // load code (binary) from file 30 | unsigned char *neuflow_bin = (unsigned char *)malloc(BINARY_SIZE); 31 | memset(neuflow_bin, BINARY_SIZE, 0); 32 | FILE *f = fopen("neuflow.bin", "rb"); 33 | int nread; 34 | if (f) nread = fread(neuflow_bin, 1, BINARY_SIZE, f); 35 | else { 36 | printf("error: could not find neuflow code (neuflow.bin)\n"); 37 | return 1; 38 | } 39 | printf("loaded bytecode [size = %d]\n", nread); 40 | 41 | // load (and exec) code on neuFlow 42 | printf("transmitting bytecode\n"); 43 | ethertbsp_send_ByteTensor_C(neuflow_bin, BINARY_SIZE); 44 | sleep(1); 45 | printf("transmitted.\n"); 46 | 47 | // data structures 48 | double *input_data = malloc(sizeof(double) * 3 * 400 * 400); 49 | double *output_data = malloc(sizeof(double) * 3 * 400 * 400); 50 | 51 | // initialize data 52 | int i,k; 53 | for (k = 0; k < 3; k++) { 54 | for (i = 0; i < 400*400; i++) { 55 | input_data[k*400*400+i] = k; 56 | output_data[k*400*400+i] = 0; 57 | } 58 | } 59 | 60 | // code is now executing, send data and receive answer in a loop 61 | while (1) { 62 | // send input data (a 3x400x400 image) 63 | double *input_p = input_data; 64 | for (i = 0; i < 3; i++) { 65 | ethertbsp_send_DoubleTensor_C(input_p, 400*400); 66 | input_p += 400*400; 67 | } 68 | 69 | // receive data, processed by neuFlow (a 3x400x400 image, loopbacked) 70 | double *output_p = output_data; 71 | for (i = 0; i < 3; i++) { 72 | ethertbsp_receive_DoubleTensor_C(output_p, 400*400, 400); 73 | output_p += 400*400; 74 | } 75 | 76 | // measure loopback error 77 | double error = 0; 78 | double maxerr = 0; 79 | for (i = 0; i < 3*400*400; i++) { 80 | double err = abs(input_data[i] - output_data[i]); 81 | if (err > maxerr) maxerr = err; 82 | error += err; 83 | } 84 | error /= 3*400*400; 85 | printf("average error = %f, max error = 
%f\n", error, maxerr); 86 | } 87 | 88 | // cleanup 89 | free(input_data); 90 | free(output_data); 91 | free(neuflow_bin); 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /ethertbsp/init.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) 12 | #define torch_string_(NAME) TH_CONCAT_STRING_3(torch., Real, NAME) 13 | #define ethertbsp_(NAME) TH_CONCAT_3(ethertbsp_, Real, NAME) 14 | #define ethertbsp_send_(NAME) TH_CONCAT_3(ethertbsp_send_, Real, NAME) 15 | #define ethertbsp_receive_(NAME) TH_CONCAT_3(ethertbsp_receive_, Real, NAME) 16 | 17 | static const void* torch_FloatTensor_id = NULL; 18 | static const void* torch_DoubleTensor_id = NULL; 19 | 20 | #undef TH_GENERIC_FILE 21 | #include "generic/ethertbsp.c" 22 | #include "THGenerateFloatTypes.h" 23 | 24 | DLL_EXPORT int luaopen_libethertbsp(lua_State *L) 25 | { 26 | torch_FloatTensor_id = luaT_checktypename2id(L, "torch.FloatTensor"); 27 | torch_DoubleTensor_id = luaT_checktypename2id(L, "torch.DoubleTensor"); 28 | 29 | ethertbsp_FloatApi_init(L); 30 | ethertbsp_DoubleApi_init(L); 31 | 32 | luaL_register(L, "ethertbsp.double", ethertbsp_DoubleApi__); 33 | luaL_register(L, "ethertbsp.float", ethertbsp_FloatApi__); 34 | 35 | return 1; 36 | } 37 | -------------------------------------------------------------------------------- /ethertbsp/init.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | -- 3 | -- Copyright (c) 2010,2011 Clement Farabet, Polina Akselrod 4 | -- 5 | -- Permission is hereby granted, free of charge, to any person obtaining 6 | -- a copy of this software and associated documentation files (the 7 | -- "Software"), to deal in the Software 
without restriction, including 8 | -- without limitation the rights to use, copy, modify, merge, publish, 9 | -- distribute, sublicense, and/or sell copies of the Software, and to 10 | -- permit persons to whom the Software is furnished to do so, subject to 11 | -- the following conditions: 12 | -- 13 | -- The above copyright notice and this permission notice shall be 14 | -- included in all copies or substantial portions of the Software. 15 | -- 16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -- 24 | ---------------------------------------------------------------------- 25 | -- description: 26 | -- ethertbsp - a raw Ethernet packet interface over gigabit ethernet, 27 | -- for communication between neuFlow <-> UNIX host. 
28 | -- 29 | -- history: 30 | -- July 16, 2011, 1:46PM - import from Torch5 - Clement Farabet 31 | -- Wed 25 Apr 2012 22:53:06 EDT - Berin Martini 32 | ---------------------------------------------------------------------- 33 | 34 | require 'torch' 35 | require 'libethertbsp' 36 | 37 | function ethertbsp.open(dev, destmac, srcmac) 38 | return ethertbsp.double.open_socket(dev, destmac, srcmac) 39 | end 40 | 41 | function ethertbsp.close(dev) 42 | ethertbsp.double.close_socket() 43 | end 44 | 45 | function ethertbsp.sendreset() 46 | return ethertbsp.double.send_reset() 47 | end 48 | 49 | function ethertbsp.sendtensor(tensor) 50 | tensor.ethertbsp.send_tensor(tensor) 51 | end 52 | 53 | function ethertbsp.receivetensor(tensor) 54 | tensor.ethertbsp.receive_tensor(tensor) 55 | end 56 | 57 | function ethertbsp.loadbytecode(bytetensor) 58 | ethertbsp.double.send_bytetensor(bytetensor) 59 | end 60 | -------------------------------------------------------------------------------- /ethertbsp/test/receive.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'ethertbsp' 3 | require 'image' 4 | 5 | ethertbsp.open() 6 | 7 | t = torch.Tensor(512,512) 8 | 9 | for i = 1,1000 do 10 | print 'waiting for tensor' 11 | sys.tic() 12 | ethertbsp.receivetensor(t) 13 | print 'got tensor !' 
14 | sys.toc(true) 15 | w = image.display{image=t, win=w, gui=false} 16 | end 17 | -------------------------------------------------------------------------------- /ethertbsp/test/send.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'ethertbsp' 3 | require 'image' 4 | 5 | ethertbsp.open(nil, {0xff,0xff,0xff,0xff,0xff,0xff}, {0x01,0x02,0x03,0x04,0x05,0x06}) 6 | 7 | l = image.lena()[1] 8 | 9 | for i = 1,1000 do 10 | sys.tic() 11 | ethertbsp.sendtensor(l) 12 | sys.toc(true) 13 | end 14 | -------------------------------------------------------------------------------- /neuflow-1.scm-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "neuflow" 2 | version = "1.scm-0" 3 | 4 | source = { 5 | url = "git://github.com/clementfarabet/neuflow", 6 | } 7 | 8 | description = { 9 | summary = "A compiler toolkit for the neuFlow v1 arch", 10 | detailed = [[ 11 | A package to generate the bytecode for and to setup a communication channel with the neuFlow v1 processor. 12 | ]], 13 | homepage = "https://github.com/clementfarabet/neuflow", 14 | license = "MIT/X11" 15 | } 16 | 17 | dependencies = { 18 | "torch >= 7.0", 19 | "xlua >= 1.0", 20 | "nnx >= 0.1", 21 | "luabitop >= 1.0.1", 22 | } 23 | 24 | build = { 25 | type = "command", 26 | build_command = [[ 27 | cmake -E make_directory build; 28 | cd build; 29 | cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)"; 30 | $(MAKE) 31 | ]], 32 | install_command = "cd build && $(MAKE) install" 33 | } 34 | -------------------------------------------------------------------------------- /scripts/get-latest-neuflow-image: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget http://data.neuflow.org/share/neuFlow-ml605.bit 4 | -------------------------------------------------------------------------------- /scripts/load-bitfile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # need an arg 4 | if [ $# == 0 ] 5 | then 6 | echo "syntax:" 7 | echo " load-bitfile bitfile [loads a bitfile]" 8 | echo " load-bitfile bitfile platform [specifies the platform: ml605 | m503 (default=ml605)]" 9 | echo " load-bitfile unlock [unlocks the cable, if not responsive]" 10 | exit 11 | fi 12 | 13 | # require impact in path 14 | if [ ! `which impact` ] 15 | then 16 | echo "impact could not be found..." 17 | echo " > impact is part of Xilinx's ISE toolchain" 18 | echo " > it is used to load a bitfile into any Xilinx's FPGA, via JTAG" 19 | echo " > it comes for free with ISE webpack edition" 20 | echo " > if you already installed it, then simply add the tools to your path:" 21 | echo " $ source /opt/Xilinx/VERSION/.../settings**.sh" 22 | echo " and then re-run this script !"
23 | exit 24 | fi 25 | 26 | # platform 27 | if [ $# == 2 ] 28 | then 29 | if [ $2 == "ml605" ] 30 | then 31 | fpga=2 32 | fi 33 | if [ $2 == "m503" ] 34 | then 35 | fpga=1 36 | fi 37 | echo "--> programming device ${fpga}" 38 | else 39 | fpga=2 40 | fi 41 | 42 | # parse arg 43 | if [ $1 == "unlock" ] 44 | then 45 | echo "--> unlocking cable" 46 | tmp=/tmp/impact_batch_`date` 47 | echo $tmp 48 | echo "cleancablelock" > "$tmp" 49 | echo "quit" >> "$tmp" 50 | else 51 | echo "--> loading bitfile" 52 | tmp=/tmp/impact_batch_`date` 53 | echo $tmp 54 | echo "setmode -bs" > "$tmp" 55 | echo "setcable -p auto" >> "$tmp" 56 | echo "identify" >> "$tmp" 57 | echo "assignFile -p ${fpga} -file" $1 >> "$tmp" 58 | echo "program -p ${fpga}" >> "$tmp" 59 | echo "quit" >> "$tmp" 60 | fi 61 | 62 | # run commands in batch mode 63 | impact -batch "$tmp" 64 | -------------------------------------------------------------------------------- /segments/coef_Abs: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 8 4 | 256 5 | 0 6 | 32767 7 | 256 8 | 0 9 | 32767 10 | 256 11 | 0 12 | 32767 13 | 256 14 | 0 15 | 32767 16 | 256 17 | 0 18 | 32767 19 | 256 20 | 0 21 | 32767 22 | 256 23 | 0 24 | 32767 25 | 256 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Sqrt: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 8 4 | 15 5 | 941 6 | 7598 7 | 33 8 | 430 9 | 1510 10 | 73 11 | 194 12 | 311 13 | 159 14 | 89 15 | 67 16 | 340 17 | 42 18 | 15 19 | 697 20 | 21 21 | 4 22 | 1365 23 | 11 24 | 1 25 | 4096 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Sqrt_th: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 8 4 | 15 5 | 941 6 | 7598 7 | 33 8 | 430 9 | 1510 10 | 73 11 | 194 12 | 311 13 | 159 14 | 89 15 | 67 16 | 340 17 | 42 18 | 15 19 | 697 
20 | 21 21 | 4 22 | 1365 23 | 11 24 | 1 25 | 3840 26 | 1 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Sqrt_th_div_3: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 8 4 | 8 5 | 595 6 | 9985 7 | 17 8 | 279 9 | 1883 10 | 39 11 | 118 12 | 325 13 | 92 14 | 50 15 | 61 16 | 205 17 | 23 18 | 14 19 | 432 20 | 11 21 | 3 22 | 3840 23 | -29 24 | 2 25 | 0 26 | 1 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Sqrt_th_div_32: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 8 4 | 256 5 | 0 6 | 32767 7 | 3 8 | 176 9 | 8915 10 | 6 11 | 76 12 | 1386 13 | 14 14 | 32 15 | 269 16 | 30 17 | 15 18 | 66 19 | 53 20 | 9 21 | 32 22 | 3840 23 | -464 24 | 31 25 | 0 26 | 1 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_StdSigm: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 5 5 | 410 6 | 810 7 | 77 8 | 184 9 | 267 10 | 253 11 | 1 12 | 4 13 | 293 14 | 0 15 | 3 16 | 293 17 | 0 18 | 2 19 | 293 20 | 0 21 | 1 22 | 293 23 | 0 24 | 0 25 | -2147483648 26 | -2147483648 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_StdSigmAbs: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 8 4 | 16 5 | 349 6 | 509 7 | 158 8 | 68 9 | 142 10 | 280 11 | 0 12 | 7 13 | 293 14 | 0 15 | 6 16 | 293 17 | 0 18 | 5 19 | 293 20 | 0 21 | 4 22 | 293 23 | 0 24 | 3 25 | 293 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_StdSigm_abs_err: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 2 5 | 430 6 | 1172 7 | 8 8 | 403 9 | 815 10 | 27 11 | 342 12 | 634 13 | 57 14 | 268 15 | 
496 16 | 101 17 | 183 18 | 374 19 | 157 20 | 101 21 | 267 22 | 219 23 | 36 24 | 157 25 | 278 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_StdSigm_abs_err_all_range: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 0 5 | 437 6 | 1172 7 | 8 8 | 403 9 | 815 10 | 27 11 | 342 12 | 634 13 | 57 14 | 268 15 | 496 16 | 101 17 | 183 18 | 374 19 | 157 20 | 101 21 | 267 22 | 219 23 | 36 24 | 157 25 | 278 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Tanh: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 2 5 | 245 6 | 562 7 | 39 8 | 165 9 | 295 10 | 130 11 | 60 12 | 156 13 | 215 14 | 8 15 | 58 16 | 250 17 | 0 18 | 21 19 | 254 20 | 0 21 | 20 22 | 255 23 | 0 24 | 19 25 | 256 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_TanhAbs: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 8 4 | 0 5 | 250 6 | 562 7 | 39 8 | 165 9 | 295 10 | 130 11 | 60 12 | 156 13 | 215 14 | 8 15 | 58 16 | 250 17 | 0 18 | 21 19 | 254 20 | 0 21 | 20 22 | 255 23 | 0 24 | 19 25 | 256 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /src/Camera.lua: -------------------------------------------------------------------------------- 1 | 2 | ---------------------------------------------------------------------- 3 | --- Class: Camera 4 | -- 5 | -- This class provides a set of methods to exchange data/info with the Camera. 
6 | -- 7 | local Camera = torch.class('neuflow.Camera') 8 | 9 | function Camera:__init(args) 10 | -- args: 11 | self.nf = args.nf 12 | self.core = args.nf.core 13 | self.msg_level = args.msg_level or 'none' -- 'detailled' or 'none' or 'concise' 14 | self.frames = {} 15 | 16 | self.nb_frames = 4 -- number of frames in running buffer 17 | -- self.Aw_ = 640 18 | -- self.Ah_ = 480 19 | -- self.Bw_ = 640 20 | -- self.Bh_ = 480 21 | self.size = { 22 | ['B'] = {['width'] = 640, ['height'] = 480, ['component'] = 3}, 23 | ['A'] = {['width'] = 640, ['height'] = 480, ['component'] = 3} 24 | } 25 | 26 | self.mask = { 27 | ['counter'] = {['A'] = 0x0000000c, ['B'] = 0x000c0000}, 28 | ['status'] = {['A'] = 0x00000001, ['B'] = 0x00010000}, 29 | } 30 | 31 | self.conf = { 32 | ['acquisition'] = { 33 | ['value'] = {['ON'] = 0x1, ['OFF'] = 0x0}, 34 | ['mask'] = 0x1, 35 | ['index'] = 10}, 36 | ['definition'] = { 37 | ['value'] = {['QVGA'] = 0x1, ['VGA'] = 0x0}, 38 | ['mask'] = 0x1, 39 | ['index'] = 0}, 40 | ['framerate'] = { 41 | ['value'] = {['60FPS'] = 0x1, ['30FPS'] = 0x0}, 42 | ['mask'] = 0x1, 43 | ['index'] = 1}, 44 | ['color'] = { 45 | ['value'] = {['COLOR'] = 0x0, ['B&W'] = 0x1}, 46 | ['mask'] = 0x1, 47 | ['index'] = 2}, 48 | ['domain'] = { 49 | ['value'] = {['RGB'] = 0x1, ['YUV'] = 0x0}, 50 | ['mask'] = 0x1, 51 | ['index'] = 3}, 52 | ['scan'] = { 53 | ['value'] = {['INTERLACED'] = 0x0, ['PROGRESSIVE'] = 0x1}, 54 | ['mask'] = 0x1, 55 | ['index'] = 4}, 56 | ['grab'] = { 57 | ['value'] = {['ONESHOT'] = 0x1, ['CONTINUOUS'] = 0x0}, 58 | ['mask'] = 0x1, 59 | ['index'] = 8}, 60 | ['power'] = { 61 | ['value'] = {['ON'] = 0x1, ['OFF'] = 0x0}, 62 | ['mask'] = 0x1, 63 | ['index'] = 11}, 64 | ['iic'] = { 65 | ['value'] = {['ON'] = 0x1, ['OFF'] = 0x0}, 66 | ['mask'] = 0x1, 67 | ['index'] = 12} 68 | } 69 | 70 | -- Memorize here the camera register value 71 | self.reg_ctrl = 0x00000000 72 | self.reg_status = 0x00000000 73 | 74 | self.cam_param = { 75 | ['A'] = {['port_addrs'] = 
dma.camera_A_port_id, ['offset'] = 0}, 76 | ['B'] = {['port_addrs'] = dma.camera_B_port_id, ['offset'] = 16} 77 | } 78 | 79 | -- compulsory 80 | if (self.core == nil) then 81 | error(' ERROR: requires a Dataflow Core') 82 | end 83 | end 84 | 85 | function Camera:config(cameraID, param, value) 86 | local temp_mask 87 | local temp_offset 88 | local lcameraID 89 | if #cameraID == 1 then 90 | lcameraID = {cameraID} 91 | else 92 | lcameraID = cameraID 93 | end 94 | 95 | for i = 1,#lcameraID do 96 | temp_offset = self.conf[param].index + self.cam_param[lcameraID[i]].offset 97 | -- Unset all dedicated bits of the config paramater 98 | temp_mask = bit.bnot(bit.lshift(self.conf[param].mask,temp_offset)) 99 | self.reg_ctrl = bit.band(self.reg_ctrl, temp_mask) 100 | -- Set the new value in reg_ctrl 101 | temp_mask = bit.lshift(self.conf[param].value[value],temp_offset) 102 | self.reg_ctrl = bit.bor(self.reg_ctrl, temp_mask) 103 | 104 | -- Adjust camera memory size to definition 105 | if param == 'definition' then 106 | if value == 'QVGA' then 107 | self.size[lcameraID[i]].width = 320 108 | self.size[lcameraID[i]].height = 240 109 | else 110 | self.size[lcameraID[i]].width = 640 111 | self.size[lcameraID[i]].height = 480 112 | end 113 | end 114 | if param == 'color' then 115 | if value == 'B&W' then 116 | self.size[lcameraID[i]].component = 1 117 | else 118 | self.size[lcameraID[i]].component = 3 119 | end 120 | end 121 | end 122 | 123 | return self.reg_ctrl 124 | end 125 | 126 | 127 | function Camera:initCamera(cameraID, alloc_frames) 128 | 129 | self.frames[cameraID] = alloc_frames 130 | print(' : init Camera ' .. 
cameraID) 131 | 132 | -- puts the cameras in standby 133 | local reg_ctrl = self.core:allocRegister() 134 | self:config(cameraID,'power','ON') 135 | self.core:setreg(reg_ctrl, self.reg_ctrl) 136 | self.core:iowrite(oFlower.io_gpios, reg_ctrl) 137 | --self.core:sleep(1) 138 | self.core:message('Camera: Init done') 139 | end 140 | 141 | -- Not stable for now because of the camera settings. Use getLastFrame instead 142 | function Camera:getLastFrameSafe(cameraID) 143 | local outputs = {} 144 | local lcameraID 145 | 146 | if #cameraID == 1 then 147 | lcameraID = {cameraID} 148 | else 149 | lcameraID = cameraID 150 | end 151 | 152 | for i = 1,#lcameraID do 153 | table.insert(outputs, self.frames[lcameraID[i]]) 154 | 155 | self.core:closePortSafe(self.cam_param[lcameraID[i]].port_addrs) 156 | end 157 | return outputs 158 | end 159 | 160 | function Camera:getLastFrame(cameraID) 161 | local outputs = {} 162 | 163 | local reg_acqst = self.core:allocRegister() 164 | local reg_tmp = self.core:allocRegister() 165 | local lcameraID 166 | 167 | local mask_status = 0x00000000 168 | if #cameraID == 1 then 169 | lcameraID = {cameraID} 170 | else 171 | lcameraID = cameraID 172 | end 173 | for i = 1,#lcameraID do 174 | mask_status = bit.bor(mask_status, self.mask.status[lcameraID[i]]) 175 | end 176 | self.core:loopUntilStart() 177 | self.core:ioread(oFlower.io_gpios, reg_acqst) 178 | self.core:bitandi(reg_acqst, mask_status, reg_tmp) 179 | self.core:compi(reg_tmp, 0x00000000, reg_tmp) 180 | self.core:loopUntilEndIfNonZero(reg_tmp) 181 | 182 | for i = 1,#lcameraID do 183 | table.insert(outputs, self.frames[lcameraID[i]]) 184 | self.core:closePort(self.cam_param[lcameraID[i]].port_addrs) 185 | end 186 | 187 | return outputs 188 | end 189 | 190 | function Camera:captureOneFrame(cameraID) 191 | local lcameraID 192 | 193 | local reg_ctrl = self.core:allocRegister() 194 | local reg_acqst = self.core:allocRegister() 195 | local reg_tmp = self.core:allocRegister() 196 | 197 | local mask_ctrl 
= 0x00000000 198 | local mask_status = 0x00000000 199 | 200 | -- Enable camera acquisition 201 | if #cameraID == 1 then 202 | lcameraID = {cameraID} 203 | else 204 | lcameraID = cameraID 205 | end 206 | 207 | for i = 1,#lcameraID do 208 | self.core:openPortWr(self.cam_param[lcameraID[i]].port_addrs, self.frames[lcameraID[i]]) 209 | mask_status = bit.bor(mask_status, self.mask.status[lcameraID[i]]) 210 | end 211 | 212 | -- trigger acquisition 213 | mask_ctrl = self:config(cameraID, 'acquisition', 'ON') 214 | self.core:setreg(reg_ctrl, mask_ctrl) 215 | self.core:iowrite(oFlower.io_gpios, reg_ctrl) 216 | 217 | -- loop until acquisition has started 218 | self.core:loopUntilStart() 219 | self.core:ioread(oFlower.io_gpios, reg_acqst) 220 | self.core:bitandi(reg_acqst, mask_status, reg_tmp) 221 | self.core:compi(reg_tmp, mask_status, reg_tmp) 222 | self.core:loopUntilEndIfNonZero(reg_tmp) 223 | 224 | -- Once the acquisition start. Disable the acquisition for the next frame 225 | mask_ctrl = self:config(cameraID, 'acquisition', 'OFF') 226 | self.core:setreg(reg_ctrl, mask_ctrl) 227 | self.core:iowrite(oFlower.io_gpios, reg_ctrl) 228 | end 229 | 230 | function Camera:enableCameras(cameraID) 231 | local lcameraID 232 | if #cameraID == 1 then 233 | lcameraID = {cameraID} 234 | else 235 | lcameraID = cameraID 236 | end 237 | 238 | for i=1,#lcameraID do 239 | local image_tensor = torch.Tensor(self.size[lcameraID[i]].height, self.size[lcameraID[i]].width*self.size[lcameraID[i]].component) 240 | local image_segment = self.core.mem:allocPersistentData(image_tensor, '2D') 241 | 242 | self:initCamera(lcameraID[i], image_segment) 243 | end 244 | self.core:sleep(1) 245 | end 246 | 247 | function Camera:startRBCameras() -- Start camera and send images to Running Buffer 248 | 249 | print(' : enable Camera: ' .. self.size['A'].width * self.size['A'].component .. 'x' .. 
self.size['A'].height) 250 | 251 | local image_tensor_A = torch.Tensor(self.size['A'].height, self.size['A'].width*self.size['A'].component) 252 | local image_segment_A = self.core.mem:allocPersistentData(image_tensor_A, '2D') 253 | 254 | local image_tensor_B = torch.Tensor(self.size['B'].height, self.size['B'].width*self.size['B'].component) 255 | local image_segment_B = self.core.mem:allocPersistentData(image_tensor_B, '2D') 256 | 257 | -- The two cameras have to be initialized in the same time if an IIC configuration occured. 258 | self:initCamera('B', image_segment_B) 259 | self:initCamera('A', image_segment_A) 260 | 261 | self.core:sleep(2) 262 | 263 | -- Global setup for DMA port (camera A and B) to make continuous 264 | local stride_bit_shift = math.log(1024) / math.log(2) 265 | 266 | self.core:send_selectModule(blast_bus.area_streamer, blast_bus.addr_mem_streamer_0+dma.camera_A_port_id, 1) 267 | self.core:send_setup(0, 16*1024*1024, stride_bit_shift, 1) 268 | 269 | self.core:send_selectModule(blast_bus.area_streamer, blast_bus.addr_mem_streamer_0+dma.camera_B_port_id, 1) 270 | self.core:send_setup(0, 16*1024*1024, stride_bit_shift, 1) 271 | 272 | -- Open the streamer ports for writing 273 | self.core:openPortWr(dma.camera_B_port_id, self.frames['B']) 274 | self.core:openPortWr(dma.camera_A_port_id, self.frames['A']) 275 | 276 | --self.core:sleep(0.1) 277 | -- Start cameras sending images 278 | local reg_ctrl = self.core:allocRegister() 279 | local mask_ctrl = self:config({'B','A'}, 'acquisition', 'ON') 280 | 281 | -- trigger acquisition 282 | self.core:setreg(reg_ctrl, mask_ctrl) 283 | self.core:iowrite(oFlower.io_gpios, reg_ctrl) 284 | end 285 | 286 | function Camera:stopRBCameras() -- Stop camera sending to Running Buffer 287 | 288 | local reg_acqst = self.core:allocRegister() 289 | local mask_status = bit.bor(self.mask.status['A'], self.mask.status['B']) 290 | local mask_ctrl = self:config({'A','B'}, 'acquisition', 'OFF') 291 | 292 | -- Once the 
acquisition stop. Disable the acquisition for the next frame 293 | self.core:setreg(reg_acqst, mask_ctrl) 294 | self.core:iowrite(oFlower.io_gpios, reg_acqst) 295 | self.core:nop(100) -- small delay 296 | 297 | -- wait for the frame to finish being sent 298 | -- self.core:loopUntilStart() 299 | -- self.core:ioread(oFlower.io_gpios, reg_acqst) 300 | -- self.core:bitandi(reg_acqst, mask_status, reg_acqst) 301 | -- self.core:compi(reg_acqst, 0x00000000, reg_acqst) 302 | -- self.core:loopUntilEndIfNonZero(reg_acqst) 303 | 304 | -- reset ports setup 305 | self.core:configureStreamer(0, 16*1024*1024, 1024, {dma.camera_A_port_id, dma.camera_B_port_id}) 306 | end 307 | 308 | function Camera:copyToHostLatestFrame() -- Get the latest complete frame 309 | 310 | local reg_acqst = self.core:allocRegister() 311 | self.core:ioread(oFlower.io_gpios, reg_acqst) 312 | 313 | self:streamLatestFrameFromPort('B', reg_acqst, dma.ethernet_read_port_id, 'full') 314 | self.nf.ethernet:streamFromHost(self.nf.ethernet.ack_stream[1], 'ack_stream') 315 | self:streamLatestFrameFromPort('A', reg_acqst, dma.ethernet_read_port_id, 'full') 316 | 317 | return torch.Tensor(2, self.size['A'].height, self.size['A'].width) 318 | end 319 | 320 | function Camera:streamLatestFrameFromPort(cameraID, reg_acqst, port_addr, port_addr_range) 321 | 322 | function coordinateOffset(coordinate, offset) 323 | return { 324 | coordinate = coordinate, 325 | calc = function(self) 326 | return self.coordinate:calc() + offset 327 | end 328 | } 329 | end 330 | 331 | 332 | local goto_ends = {} 333 | local reg_count = self.core:allocRegister() 334 | 335 | for ii = (self.nb_frames-1), 1, -1 do 336 | -- copy camera status into reg but masked for frame count 337 | self.core:bitandi(reg_acqst, self.mask.counter[cameraID], reg_count) 338 | self.core:compi(reg_count, ii, reg_count) -- test if current frame is 'ii' 339 | 340 | -- if current frame not eq to 'ii' (reg_count == 0) goto next possible option 341 | 
self.core:gotoTagIfZero(nil, reg_count) -- goto next pos 342 | local goto_next = self.core.linker:getLastReference() 343 | 344 | -- read the last frame in running buffer 345 | self.core:configPort{ 346 | index = port_addr, 347 | action = 'fetch+read+sync+close', 348 | data = { 349 | x = self.frames[cameraID].x, 350 | y = coordinateOffset(self.frames[cameraID].y, ((ii-1)*self.size[cameraID].height)), 351 | w = self.size[cameraID].width * self.size[cameraID].component, 352 | h = self.size[cameraID].height 353 | }, 354 | 355 | range = port_addr_range 356 | } 357 | 358 | self.core:gotoTag(nil) -- finish so goto end 359 | goto_ends[ii] = self.core.linker:getLastReference() 360 | 361 | -- next pos 362 | goto_next.goto_tag = self.core:makeGotoTag() 363 | self.core:nop() 364 | end 365 | 366 | -- if got here only option left is to read the following frame 367 | self.core:configPort { 368 | index = port_addr, 369 | action = 'fetch+read+sync+close', 370 | data = { 371 | x = self.frames[cameraID].x, 372 | y = coordinateOffset(self.frames[cameraID].y, ((self.nb_frames-1)*self.size[cameraID].height)), 373 | w = self.size[cameraID].width * self.size[cameraID].component, 374 | h = self.size[cameraID].height 375 | }, 376 | 377 | range = port_addr_range 378 | } 379 | 380 | -- end point 381 | local goto_end_tag = self.core:makeGotoTag() 382 | self.core:nop() 383 | 384 | for i, goto_end in pairs(goto_ends) do 385 | goto_end.goto_tag = goto_end_tag 386 | end 387 | end 388 | -------------------------------------------------------------------------------- /src/DmaInterface.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | --- Class: DmaEthernet 3 | -- 4 | -- This class provides a set of methods to exchange data/info with the host. 
5 | -- 6 | local DmaEthernet = torch.class('neuflow.DmaEthernet') 7 | 8 | xrequire 'ethertbsp' 9 | 10 | function DmaEthernet:__init(args) 11 | -- args: 12 | self.nf = args.nf 13 | self.core = args.core 14 | self.profiler = self.nf.profiler 15 | 16 | self.msg_level = args.msg_level or 'none' -- 'detailled' or 'none' or 'concise' 17 | self.max_packet_size = args.max_packet_size or 1500 -- caller override, default 1500; was '1500 or args.max_packet_size' which always yielded 1500 18 | 19 | -- compulsory 20 | if (self.core == nil) then 21 | error(' ERROR: requires a Dataflow Core') 22 | end 23 | 24 | -- data ack 25 | self.ack_tensor = torch.Tensor(1,1,32) 26 | self.ack_stream = self.nf:allocHeap(self.ack_tensor) 27 | end 28 | 29 | function DmaEthernet:open(network_if_name) 30 | if(network_if_name) then 31 | ethertbsp.open(network_if_name) 32 | else 33 | ethertbsp.open() 34 | end 35 | end 36 | 37 | function DmaEthernet:close() 38 | ethertbsp.close() 39 | end 40 | 41 | function DmaEthernet:sendReset() 42 | if (-1 == ethertbsp.sendreset()) then 43 | print(' fail') 44 | end 45 | end 46 | 47 | function DmaEthernet:dev_copyToHost(tensor) 48 | -- profiler ack 49 | self.nf.core:executionTimeSensitive(function() 50 | self:streamToHost(self.ack_stream[1], 'ack_stream') 51 | end) 52 | 53 | for i = 1, (#tensor-1) do 54 | self.nf.core:executionTimeSensitive(function() 55 | self:streamToHost(tensor[i], 'default') 56 | --self:streamFromHost(self.ack_stream[1], 'ack_stream') 57 | end) 58 | end 59 | 60 | self.nf.core:executionTimeSensitive(function() 61 | self:streamToHost(tensor[#tensor], 'default') 62 | end) 63 | end 64 | 65 | function DmaEthernet:dev_copyFromHost(tensor) 66 | for i = 1,#tensor do 67 | self.nf.core:executionTimeSensitive(function() 68 | self:streamFromHost(tensor[i], 'default') 69 | end) 70 | end 71 | end 72 | 73 | function DmaEthernet:dev_receiveBytecode() 74 | self:loadByteCode() 75 | end 76 | 77 | function DmaEthernet:host_copyToDev(tensor) 78 | self.profiler:start('copy-to-dev') 79 | for i = 1,tensor:size(1) do 80 | ethertbsp.sendtensor(tensor[i]) 81 | end
self.profiler:lap('copy-to-dev')
end

-- Receive the result tensor back from the device over the ethertbsp link.
-- The first receive blocks on an ack tensor, which doubles as a profiler
-- measurement of the on-board processing time; the data then arrives one
-- 2D slice at a time.
function DmaEthernet:host_copyFromDev(tensor)
   -- profiler ack
   self.profiler:start('on-board-processing')
   self.profiler:setColor('on-board-processing', 'blue')
   ethertbsp.receivetensor(self.ack_tensor)
   self.profiler:lap('on-board-processing')

   self.profiler:start('copy-from-dev')
   ethertbsp.receivetensor(tensor[1])
   for i = 2,tensor:size(1) do
      --ethertbsp.sendtensor(self.ack_tensor)
      ethertbsp.receivetensor(tensor[i])
   end
   self.profiler:lap('copy-from-dev')
end

-- Push a compiled bytecode image to the device over the ethertbsp link.
function DmaEthernet:host_sendBytecode(bytecode)
   self.profiler:start('load-bytecode')
   ethertbsp.loadbytecode(bytecode)
   self.profiler:lap('load-bytecode')
end

-- DEPRECATED: emit bytecode that streams a string out of the DMA ethernet
-- port (the payload is padded to the 64-byte ethernet minimum).
function DmaEthernet:printToEthernet(str)
   print("DEPRECATED")

   -- Printing to ethernet involves initializing a transfer with the driver,
   -- then writing the data (frame), then triggering the transfer.

   if (self.msg_level == 'detailled') then
      self.core:print(string.format('[ETHERNET TX : %s]',str))
   end

   -- verify data size >= 64 (minimum ethernet payload)
   str = str .. '\n'
   local data_size = string.len(str)
   if (data_size < 64) then
      data_size = 64
   end

   -- allocate string in memory (TODO: this call is wrong, it allocates the right size,
   -- but the data will be corrupted, need to implement a allocString function)
   local fake_string = {x = 0, y = 0, w = math.ceil(data_size/2), h = 1}

   -- stream data to DMA ethernet interface
   self.core:configPort{index = dma.ethernet_read_port_id,
                        action = 'fetch+read+sync+close',
                        data = fake_string,
                        range = 'full'
   }
end

-- Emit bytecode that streams the memory area 'stream' ({x,y,w,h}, in 16-bit
-- words) from device memory out to the host. 'mode' is currently unused.
function DmaEthernet:streamToHost(stream, tag, mode)
   local data_size = stream.w * stream.h * 2

   -- estimate number of eth packets
   local nb_packets = math.ceil(data_size / self.max_packet_size)

   -- debug
   if (self.msg_level ~= 'none') then
      self.core:message(string.format('eth: sending %0d packets [tag = %s]', nb_packets, tag))
   end

   -- stream data (tensor) out with a write ack
   -- self.core:configPort{index = -1, action = 'write', data = {x=0, y=0, w=32, h=1}}
   self.core:configPort{index = dma.ethernet_read_port_id,
                        action = 'fetch+read+sync+close',
                        data = stream,
                        range = 'full'}
   -- self.core:configPort{index = -1, action = 'sync+close'}

end

-- Emit bytecode that receives a stream from the host and writes it into the
-- device memory area 'stream'. Errors out on payloads smaller than the
-- 64-byte ethernet minimum.
function DmaEthernet:streamFromHost(stream, tag)
   -- verify data size >= 64 (minimum ethernet payload)
   local data_size = stream.w * stream.h * 2
   if (data_size < 64) then
      error(' ERROR: cant stream data packets smaller than 64 bytes')
   end

   -- estimate number of eth packets
   local nb_packets = math.ceil(data_size / self.max_packet_size)

   -- debug
   if (self.msg_level ~= 'none') then
      self.core:message(string.format('eth: requesting %0d packets [tag = %s]', nb_packets, tag))
   end

   -- stream data in
   self.core:configPort{index = dma.ethernet_write_port_id,
                        action = 'write',
                        data = stream,
                        range = 'full'}
   self.core:configPort{index = dma.ethernet_write_port_id,
                        action = 'sync+close',
                        range = 'full'}
end

-- Emit bytecode that receives a full bytecode image from the host (fixed
-- 1024 x 16k area), then jumps to the bootloader entry point.
function DmaEthernet:loadByteCode()
   -- Creating a stream
   local bytecode_stream = {x = 0, y = 0, w = 1024, h = 16*1024}

   -- Regular streamFromHost
   self:streamFromHost(bytecode_stream, 'bytecode')

   -- ACK to indicate that bytecode has been received
   --self.core:configPort{index = 0, action = 'fetch+read+sync+close', data = {x = 0, y = 0, w = 64, h = 1}}

   -- Jump to address 0 and execute
   self.core:gotoGlobal(bootloader.entry_point)
end

----------------------------------------------------------------------
-- /src/Linker.lua
----------------------------------------------------------------------

----------------------------------------------------------------------
--- Class: Linker
--
-- This class is used to manage and link the bytecode.
-- The bytecode contains processes and data:
-- (1) a process is an action recognized by the virtual machine
--     running on the dataflow computer
-- (2) data is used by processes
--
local Linker = torch.class('neuflow.Linker')

-- Constructor. The bytecode is held as a doubly linked list: nodes with a
-- 'bytes' field are 8-byte instructions, nodes without one are sentinels
-- marking section boundaries.
--
-- args:
--   disassemble : if true, dump() also disassembles the generated stream
--   init_offset : optional byte offset at which the code starts; when
--                 non-zero, nop padding is prepended
function Linker:__init(args)
   -- args
   self.disassemble = args.disassemble

   -- the bytecode array
   local sentinel_node = {}
   self.instruction_list = {
      start_node = sentinel_node,
      end_node = sentinel_node,
      start_sentinel = sentinel_node,
      end_sentinel = sentinel_node
   }

   local init_offset = (args.init_offset or 0) + 1

   -- only if we start NOT from page zero
   if (init_offset ~= 1) then

      -- init padding, one 8-byte nop per slot
      -- NOTE(review): loop bound assumes init_offset is a multiple of 8 -- confirm
      for aa = 0, ((init_offset/8)-1) do
         self:appendInstruction{bytes = {0,0,0,0,0,0,0,0}}
      end

      -- Sentinel to separate init padding from next process
      self:appendSentinel()
   end

   self.counter_bytes = 0
end

-- Return the node most recently appended to the instruction list.
function Linker:getLastReference()
   return self.instruction_list.end_node
end

function Linker:getReference()
   error('# ERROR : Deprecated')
end

-- Resolve every goto tag to its destination instruction node (stored in
-- node.goto_instr). A tag references a node plus a signed node offset.
function Linker:linkGotos()

   local goto_table = {}
   local node = self.instruction_list.start_node
   while node do
      if node.goto_tag then
         goto_table[node] = node.goto_tag
      end

      node = node.next
   end

   -- if destination node is a sentinel, try linking to a node in the next
   -- direction, if cannot then in the prev direction. Throw an error if
   -- cannot find a non sentinel node.
   -- FIX: declared local -- the original leaked 'checkNode' into the global
   -- environment on every call
   local function checkNode(ref_node, reverse)

      if nil ~= ref_node.bytes then
         return ref_node
      else
         if ref_node.next and not reverse then
            return checkNode(ref_node.next)
         elseif ref_node.prev then
            return checkNode(ref_node.prev, true)
         else
            error('# ERROR : could not link goto')
         end
      end
   end

   for node in pairs(goto_table) do
      local ref_node = goto_table[node].ref
      local offset = goto_table[node].offset

      -- walk |offset| nodes backward or forward from the reference node
      if offset <= 0 then
         local ii = 0
         while ii > offset do

            ref_node = ref_node.prev
            ii = ii - 1
         end
      else
         local ii = 0
         while ii < offset do

            ref_node = ref_node.next
            ii = ii + 1
         end
      end

      ref_node = checkNode(ref_node)

      -- remove just processed goto tab from table
      goto_table[node] = nil

      -- ref_node is destination instr
      node.goto_instr = ref_node
   end
end

-- Index every instruction node sequentially, then rewrite the 32-bit
-- argument of each goto with its destination's index. Returns the total
-- instruction count.
function Linker:resolveGotos()
   local addr_index = {}
   local ii = 0

   local node = self.instruction_list.start_node
   while node do
      if node.bytes ~= nil then
         addr_index[node] = ii
         ii = ii + 1
      end

      node = node.next
   end

   local node = self.instruction_list.start_node
   while node do
      if node.goto_instr ~= nil then
         self:rewriteARG32(node.bytes, addr_index[node.goto_instr])
      end

      node = node.next
   end

   return ii
end

-- Rewrite the 32-bit argument of every instruction that references a memory
-- segment with the segment's final computed offset.
function Linker:resolveMemSegments()
   local node = self.instruction_list.start_node

   while node do
      if node.mem_offset ~= nil then
         self:rewriteARG32(node.bytes, node.mem_offset:calc())
      end

      node = node.next
   end
end

-- Flatten the list into a plain byte array (8 bytes per instruction),
-- skipping sentinel nodes.
function Linker:genBytecode()
   local node = self.instruction_list.start_node
   local instruction_output = {}
   local ii = 0

   while node do
      if node.bytes ~= nil then
         instruction_output[ii+1] = node.bytes[1]
         instruction_output[ii+2] = node.bytes[2]
         instruction_output[ii+3] = node.bytes[3]
         instruction_output[ii+4] = node.bytes[4]
         instruction_output[ii+5] = node.bytes[5]
         instruction_output[ii+6] = node.bytes[6]
         instruction_output[ii+7] = node.bytes[7]
         instruction_output[ii+8] = node.bytes[8]

         ii = ii + 8
      end

      node = node.next
   end

   return instruction_output
end

-- Append a sentinel node ('start', 'end' or unmarked) to both the sentinel
-- chain and the main instruction chain.
function Linker:appendSentinel(mode)
   assert('start' == mode or 'end' == mode or nil == mode)

   local new_sentinel = {mode = mode}
   local last_sentinel = self.instruction_list.end_sentinel
   local last_node = self.instruction_list.end_node

   last_sentinel.next_sentinel = new_sentinel
   new_sentinel.prev_sentinel = last_sentinel
   self.instruction_list.end_sentinel = new_sentinel

   last_node.next = new_sentinel
   new_sentinel.prev = last_node
   self.instruction_list.end_node = new_sentinel
end

-- Append an instruction node at the list tail, serializing its fields into
-- 'bytes' first if needed.
function Linker:appendInstruction(instruction)

   if not instruction.bytes then
      instruction.bytes = self:newInstructionBytes(instruction)
   end

   local node = self.instruction_list.end_node

   node.next = instruction
   instruction.prev = node
   self.instruction_list.end_node = instruction
end

-- Serialize an instruction into its 8-byte form: bytes 1-4 = arg32_1
-- (little endian), bytes 5-7 = arg8_3/arg8_2/arg8_1, byte 8 = opcode.
function Linker:newInstructionBytes(args)

   -- parse args
   local opcode = args.opcode or oFlower.op_nop
   local arg8_1 = args.arg8_1 or 0
   local arg8_2 = args.arg8_2 or 0
   local arg8_3 = args.arg8_3 or 0
   local arg32_1 = args.arg32_1 or 0
   local bytes = {}

   -- serialize opcode + args
   bytes[1] = math.floor(arg32_1/256^0) % 256
   bytes[2] = math.floor(arg32_1/256^1) % 256
   bytes[3] = math.floor(arg32_1/256^2) % 256
   bytes[4] = math.floor(arg32_1/256^3) % 256
   bytes[5] = arg8_3
   bytes[6] = arg8_2
   bytes[7] = arg8_1
   bytes[8] = opcode

   return bytes
end

-- Overwrite the 32-bit argument (bytes 1-4, little endian) of an already
-- serialized instruction in place.
function Linker:rewriteARG32(instr_bytes, uint32)
   instr_bytes[1] = math.floor(uint32/256^0) % 256
   instr_bytes[2] = math.floor(uint32/256^1) % 256
   instr_bytes[3] = math.floor(uint32/256^2) % 256
   instr_bytes[4] = math.floor(uint32/256^3) % 256
end

-- Insert 'instruction' right after 'node' in the doubly linked list.
function Linker:insertInstruction(node, instruction)

   instruction.next = node.next
   instruction.prev = node

   instruction.next.prev = instruction
   instruction.prev.next = instruction
end

-- Splice the chain [seg_start .. seg_end] in right after 'earlier_node'.
function Linker:insertSegment(earlier_node, seg_start, seg_end)
   local later_node = earlier_node.next

   earlier_node.next = seg_start
   seg_start.prev = earlier_node

   later_node.prev = seg_end
   seg_end.next = later_node
end

-- Unlink the chain [seg_start .. seg_end]; the segment keeps its internal
-- links so it can be re-inserted elsewhere.
function Linker:removeSegment(seg_start, seg_end)
   local earlier_node = seg_start.prev
   local later_node = seg_end.next

   earlier_node.next = later_node
   later_node.prev = earlier_node
end

-- Walk the list (tail-recursively) and pad with nops so that a sensitive
-- section delimited by 'start'/'end' sentinels never straddles a page
-- boundary: when a page break falls inside one, the section is shifted
-- into the new page.
function Linker:alignSensitiveCode(walker)
   walker = walker or {
      current_node = self.instruction_list.start_node,
      sentinel_start = nil,
      sentinel_nesting = 0,
      sentinel_size = 0,
      bytecode_size = 0,
   }

   if nil == walker.current_node.bytes then
      -- sentinel

      if 'start' == walker.current_node.mode then
         if 0 == walker.sentinel_nesting then
            walker.sentinel_start = walker.current_node
            walker.sentinel_size = 0
         end
         walker.sentinel_nesting = walker.sentinel_nesting + 1
      end

      if 'end' == walker.current_node.mode then
         walker.sentinel_nesting = walker.sentinel_nesting - 1
         assert(0 <= walker.sentinel_nesting)
      end
   else
      -- instr
      walker.bytecode_size = walker.bytecode_size + 1

      if 0 < walker.sentinel_nesting then
         if (1 == (walker.bytecode_size % (oFlower.page_size_b/8))) then
            -- current node is first of new page

            if walker.sentinel_start ~= walker.current_node.prev then
               -- shift sensitive section into new page; a section larger
               -- than a page can never be aligned
               assert((oFlower.page_size_b/8) > walker.sentinel_size)

               local before_sensitive = walker.sentinel_start.next
               for i = 1, walker.sentinel_size do
                  self:insertInstruction(before_sensitive, {bytes = {0,0,0,0,0,0,0,0}})
                  before_sensitive = before_sensitive.next
                  walker.bytecode_size = walker.bytecode_size + 1
               end
            end
         end

         walker.sentinel_size = walker.sentinel_size + 1
      end
   end

   if walker.current_node.next then
      walker.current_node = walker.current_node.next
      return self:alignSensitiveCode(walker)
   end
end

-- Final pass: link and align everything, generate the byte stream, and
-- write instructions plus embedded data into info.tensor. Returns the
-- number of bytes written.
function Linker:dump(info, mem)

   self:linkGotos()
   self:alignSensitiveCode()
   local instr_nb = self:resolveGotos()

   mem:adjustBytecodeSize(instr_nb*8)

   self:resolveMemSegments()
   local instr = self:genBytecode()

   -- optional disassemble
   if self.disassemble then
      neuflow.tools.disassemble(instr, {length = #instr})
   end

   -- parse argument
   assert(info.tensor)
   info.bigendian = info.bigendian or 0

   -- print all the instructions
   self:dump_instructions(instr, info.tensor)

   -- and embedded data
   self:dump_embedded_data(info, info.tensor, mem)

   -- print memory area statistics
   mem:printAreaStatistics()

   return self.counter_bytes
end

-- Copy the flat instruction array into the output tensor at the current
-- byte counter.
function Linker:dump_instructions(instr, tensor)
   -- copy instructions into tensor
   for i=1, #instr do
      tensor[self.counter_bytes+1] = instr[i]
      self.counter_bytes = self.counter_bytes + 1
   end
end

-- Serialize every embedded memory entry (optional bias vector followed by
-- 2D data) into the output tensor at its allocated offset, converting to
-- fixed point and honoring info.bigendian.
function Linker:dump_embedded_data(info, tensor, mem)
   -- pad initial offset for raw data
   self.counter_bytes = mem.embedded.start.y * streamer.stride_b
      + mem.embedded.start.x * streamer.word_b

   for i=1, #mem.embedded do
      -- FIX: declared local -- the original leaked 'mem_entry' (and the
      -- scratch variables below) into the global environment
      local mem_entry = mem.embedded[i]

      -- set offset in file
      if ('number' == type(mem_entry.y)) then
         self.counter_bytes = mem_entry.y * streamer.stride_b + mem_entry.x * streamer.word_b
      else
         self.counter_bytes = mem_entry.y:calc() * streamer.stride_b + mem_entry.x:calc() * streamer.word_b
      end

      if (mem_entry.bias ~= nil) then
         for b = 1,mem_entry.bias:size(1) do
            -- fixed-point conversion, masked to the machine word
            local dataTwos = math.floor(mem_entry.bias[b] * num.one + 0.5)
            dataTwos = bit.band(dataTwos, num.mask)
            for j=0,(num.size_b - 1) do
               -- get char from short
               local tempchar
               if (info.bigendian == 1) then
                  tempchar = math.floor(dataTwos / (256^((num.size_b - 1)-j))) % 256
               else
                  tempchar = math.floor(dataTwos / (256^j)) % 256
               end
               tensor[self.counter_bytes+1] = tempchar
               self.counter_bytes = self.counter_bytes + 1
            end
         end
      end

      for r=1,mem_entry.data:size(1) do
         for c=1,mem_entry.data:size(2) do
            local dataTwos = math.floor(mem_entry.data[r][c] * num.one + 0.5)
            dataTwos = bit.band(dataTwos, num.mask)
            for j=0,(num.size_b - 1) do
               -- get char from short
               local tempchar
               if (info.bigendian == 1) then
                  tempchar = math.floor(dataTwos / (256^((num.size_b - 1)-j))) % 256
               else
                  tempchar = math.floor(dataTwos / (256^j)) % 256
               end
               tensor[self.counter_bytes+1] = tempchar
               self.counter_bytes = self.counter_bytes + 1
            end
         end
      end
   end
end

----------------------------------------------------------------------
-- /src/LinkerExtensions.lua
----------------------------------------------------------------------

----------------------------------------------------------------------
--- Class: Linker
--
-- This file contains extensions to the Linker class.
--

-- Reorder streamer-port configuration instructions so that they execute
-- during status-wait dead time, wrapped in cacheStart/cacheFinish.
function neuflow.Linker:cacheConfigOptimization()
   -- Filter for the instruction linked list, would be used after 'linkGotos()' and before
   -- 'alignProcessWithPages()'

   -- Beginning from the start of list, move along list until dead time is found.
   -- From that point, descend list looking for configs that can be moved.
   -- If config that can be moved is found, remove segment and then insert the segment in dead time.
   -- Repeat until the end of list is reached


   -- Decode a raw 8-byte instruction into its named fields.
   local function bytesDecode(bytes)
      -- instr bit packing is hard coded, any change in the blast_bus.vh will make errors here

      local instr = {}
      instr.config8_1 = bytes[1]
      instr.config8_2 = bytes[2]
      instr.config8_3 = bytes[3]
      instr.config8_4 = bytes[4]
      instr.config16_1 = (256^1)*bytes[4]+(256^0)*bytes[3]
      -- BUGFIX: the last term was scaled by (256*0), i.e. zero, silently
      -- dropping the low byte of the 32-bit word; it must be 256^0
      instr.config32_1 = (256^3)*bytes[4]+(256^2)*bytes[3]+(256^1)*bytes[2]+(256^0)*bytes[1]

      instr.arg8_3 = bytes[5]
      instr.arg8_2 = bytes[6]
      instr.arg8_1 = bytes[7] -- config_content
      instr.of_opcode = bytes[8] -- openflower opcode

      return instr
   end

   -- makes a table that holds the current state of all the ports, argument 'state' is an old port
   -- state table to be cloned
   local function makePorts(state)
      if not state then state = {} end
      local ports = {}
      ports.addr = state.addr or nil -- if nil no port is being addressed
      -- BUGFIX: was cloned from 'state.addr' (copy-paste error); the
      -- sub-module field must be cloned from 'state.submod'
      ports.submod = state.submod or nil -- if nil no port sub module is being addressed

      for aa = 1, (streamer.nb_ports-1) do
         if not state[aa] then state[aa] = {} end
         ports[aa] = {}
         ports[aa].valid = state[aa].valid or 1 -- if 0, no longer in consideration for reordering
         ports[aa].idle = state[aa].idle or 1 -- if 1, is idle & does not need to be cached set
         ports[aa].active = state[aa].active or 0
         ports[aa].cached = state[aa].cached or 0
         ports[aa].reset = state[aa].reset or 0
ports[aa].prefetch = state[aa].prefetch or 0
      end

      -- mark every port as a candidate for reordering again
      function ports:reset_valid()
         for aa = 1, (streamer.nb_ports-1) do
            ports[aa].valid = 1
         end
      end

      return ports
   end

   -- determines how the current instruction affects which end point the config bus
   -- is interacting with
   local function addressState(of_opcode, config_content, config_addr, config_submod, ports)

      if of_opcode == oFlower.op_writeConfig then
         if config_content == blast_bus.content_command then
            -- last 4 bits of config_addr is the area address
            local area = (config_addr - (config_addr%(2^12)))/(2^12)

            -- group addr and broadcast addr mean more than one port can be active; these will
            -- be ignored as this version of config optimizer can only deal with a single port
            -- being addressed
            if area == blast_bus.area_streamer then
               -- first 12 bits of config_addr is the port address
               ports.addr = config_addr%(2^12)
               ports.submod = config_submod

               if ((ports.addr < 1) or (ports.addr > (streamer.nb_ports-1))) then
                  -- addr zero is broadcast to all ports while any address above the
                  -- number of ports is a group addr, both are ignored
                  ports.addr = nil
                  ports.submod = nil
               end
            else
               ports.addr = nil
               ports.submod = nil
            end
         end
      end
   end

   -- determines if the current instruction has a command that will affect the addressed port;
   -- should be called after addressState in case the addressing command also had a config_instr
   local function portCommand(of_opcode, config_content, config_instr, ports)
      for aa = 1, (streamer.nb_ports-1) do
         ports[aa].reset = 0
         ports[aa].prefetch = 0
      end

      local command = false

      if of_opcode == oFlower.op_writeConfig then
         if ports.addr and (config_content == blast_bus.content_command or
                            config_content == blast_bus.content_instruc) then

            command = true

            if config_instr == blast_bus.instruc_config then
               -- place holder for addressing without an opcode
            elseif config_instr == blast_bus.instruc_setAdd then
               -- set group address, default is broadcast address
               -- addr could be set to a different area code which
               -- would mean the addressState would need to be changed
            elseif config_instr == blast_bus.instruc_reset then
               ports[ports.addr].valid = 0
               ports[ports.addr].reset = 1
            elseif config_instr == blast_bus.instruc_cacheStart then
               ports[ports.addr].valid = 0
               ports[ports.addr].cached = 1
            elseif config_instr == blast_bus.instruc_cacheFinish then
               ports[ports.addr].valid = 0
               ports[ports.addr].cached = 0
            elseif config_instr == blast_bus.instruc_activate then
               ports[ports.addr].valid = 0
               ports[ports.addr].idle = 0
               ports[ports.addr].active = 1
            elseif config_instr == blast_bus.instruc_deActivate then
               ports[ports.addr].active = 0
            elseif config_instr == blast_bus.instruc_control_1 then
               -- prefetch
               ports[ports.addr].valid = 0
               ports[ports.addr].idle = 0
               ports[ports.addr].prefetch = 1
            else
               print("WARNING: Unknown comand sent to streamer")
               command = false
            end
         end
      end
      return command
   end

   -- true if the current instruction sends a config word to the addressed port's sub module
   local function portConfig(of_opcode, config_content, ports)
      local config = false

      if of_opcode == oFlower.op_writeConfig then
         if ports.addr and config_content == blast_bus.content_config then
            -- sending config words to sub module, currently only can move sub mod 2
            -- global and timeout config is ignored

            config = true
         end
      end

      return config
   end

   -- true if the current instruction polls a port status, i.e. dead time
   -- during which reordered configs could execute
   local function portWaitStatus(of_opcode, config_content, ports)
      local wait = false

      -- TODO: have estimate of time spent in wait and if there is enough time for
      -- a config reorder set wait to true

      if ports.addr and (of_opcode == oFlower.op_getStatus) then
         if config_content == blast_bus.status_primed then
            wait = true
         elseif config_content == blast_bus.status_done then
            wait = true
         end
      end

      return wait
   end

   -- build an instruction node carrying the streamer 'cacheStart' command
   local function makeCacheSetInstr()
      local instr_bytes = self:newInstructionBytes {
         opcode = oFlower.op_writeConfig,
         arg8_1 = blast_bus.content_instruc,
         arg32_1 = blast_bus.instruc_cacheStart
      }

      return {bytes = instr_bytes}
   end

   -- build an instruction node carrying the streamer 'cacheFinish' command
   local function makeCacheUnsetInstr()
      local instr_bytes = self:newInstructionBytes {
         opcode = oFlower.op_writeConfig,
         arg8_1 = blast_bus.content_instruc,
         arg32_1 = blast_bus.instruc_cacheFinish
      }

      return {bytes = instr_bytes}
   end

   -- build an instruction node that addresses streamer port 'addr'
   -- (and optionally sub module 'submod')
   local function makeAddrInstr(addr, submod)
      submod = submod or 0
      local configWord = blast_bus.area_streamer*(2^28) + addr*(2^16) + submod*(2^8)

      local instr_bytes = self:newInstructionBytes {
         opcode = oFlower.op_writeConfig,
         arg8_1 = blast_bus.content_command,
         arg32_1 = configWord
      }

      return {bytes = instr_bytes}
   end

   local function findConfigSegment(node, ports)
      -- start_node is an instruction that addresses a port and the 2nd sub mod
      -- end_node is the last config instruction
      local start_node = nil
      local end_node = nil
      local search = true

      while (search and node) do
         if node.bytes ~= nil then
            local instr = bytesDecode(node.bytes)

            addressState(instr.of_opcode, instr.arg8_1, instr.config16_1, instr.config8_2, ports)
            portCommand(instr.of_opcode, instr.arg8_1, instr.config8_1, ports)

            if (ports.submod == 2 and ports[ports.addr].valid == 1) then
               start_node = node

               node = node.next
               local nb_config = 0
               while (search and node) do

                  if node.bytes == nil then
                     search = false
                     break
                  end

                  local instr = bytesDecode(node.bytes)

                  if portCommand(instr.of_opcode, instr.arg8_1, instr.config8_1, ports) then
                     search = false
                     break
                  end

                  if portConfig(instr.of_opcode, instr.arg8_1, ports) then
                     nb_config = nb_config + 1
                  end

                  if nb_config == 5 then
                     search = false
                     end_node = node
                  end

                  node = node.next
               end
            end
         end
         if node then node = node.next end
      end

      return start_node, end_node
   end

   local function findWaitAddrNode(node, target_addr)
      -- NOTE: if there is any other instr between the addr instr and the wait instr, make an
      -- addr instr node and insert it before the wait instr
      local ports = makePorts()

      while not ports.addr do
         node = node.prev

         local instr = bytesDecode(node.bytes)
         addressState(instr.of_opcode, instr.arg8_1, instr.config16_1, instr.config8_2, ports)

         if ports.addr == target_addr then
            break
         else
            local tmp_ports = makePorts(ports)
            tmp_ports.addr = target_addr

            local command = portCommand(instr.of_opcode, instr.arg8_1, instr.config8_1, tmp_ports)
            local config = portConfig(instr.of_opcode, instr.arg8_1, tmp_ports)

            if config or command then
               local addr_node = makeAddrInstr(target_addr)
               self:insertInstruction(node, addr_node)
               node = addr_node

               break
            end
         end
      end

      return node
   end

   -- main walk: track port state, and at each status-wait try to pull a
   -- movable config segment up into the dead time
   local node = self.instruction_list.start_sentinel
   local ports = makePorts()

   while node do
      if node.bytes ~= nil then
         local instr = bytesDecode(node.bytes)

         addressState(instr.of_opcode, instr.arg8_1, instr.config16_1, instr.config8_2, ports)
         portCommand(instr.of_opcode, instr.arg8_1, instr.config8_1, ports)

         --while portWaitStatus(instr.of_opcode, instr.arg8_1, ports) do -- only with time estimate
         if portWaitStatus(instr.of_opcode, instr.arg8_1, ports) then
            ports:reset_valid()
            local descent_ports = makePorts(ports)
            local descent_node = node.next
            local start_node = nil
            local end_node = nil

            start_node, end_node = findConfigSegment(descent_node, descent_ports)

            if start_node and end_node then
               -- find the addr instr node used to addr port for the wait for status instr node
               local wait_addr_node = findWaitAddrNode(node, ports.addr)

               -- insert instruction to re-addr port after first making it
               local new_addr = makeAddrInstr(descent_ports.addr, 2)
               self:insertInstruction(end_node, new_addr)

               -- NOTE: idle/not idle might not be correct, probably should not use until sure
               -- and just use the cache every time
               --if ports[descent_ports.addr].idle then
               local cache_set = makeCacheSetInstr()
               local cache_unset = makeCacheUnsetInstr()

               -- insert cache set instr in segment if using caching
               self:insertInstruction(start_node, cache_set)

               -- insert instr to unset cache if using caching
               -- (if port is not idle at dead time)
               self:insertInstruction(new_addr, cache_unset)
               --end

               -- remove (cut) the config segment
               self:removeSegment(start_node, end_node)

               -- re-insert segment in its new place
               self:insertSegment(wait_addr_node.prev, start_node, end_node)
            end
         end
      end

      node = node.next
   end
end

----------------------------------------------------------------------
-- /src/Log.lua
----------------------------------------------------------------------

----------------------------------------------------------------------
--- Class: Log
--
-- logs info during compilation.
--
local Log = torch.class('neuflow.Log')

-- Open the log file 'file' for writing; raises if it cannot be opened.
function Log:__init(file)
   self.logFile = assert(io.open(file, "w"))
end

-- Append 'msg' to the log file.
function Log:write(msg)
   self.logFile:write(msg)
end

-- Close the underlying file handle.
function Log:close()
   self.logFile:close()
end

----------------------------------------------------------------------
-- /src/Memory.lua
----------------------------------------------------------------------
--[[ Class: Memory

This class is used to allocate areas of memory in a controlled manner. It
generates offsets and areas. If data needs to be written to the bytecode start
up stream, that is done in the Linker class.

The offsets and memory areas are represented in pixels. Conceptually the memory
is considered to be a large rectangular matrix.

The requirements for the memory that gets allocated vary, but for our purposes
they can be grouped into three broad types. As such when requesting a memory
allocation the way that memory will be used needs to be considered and the
correct alloc function selected. The definition of these 3 types are as follows:

1) Embedded data (e.g., kernels) whose value is known at compile time and thus
   would benefit by being written to memory when the bytecode is sent at start
   up.

2) Persistent data (e.g., circular image buffers). The contents of this memory
   may be updated or change in time but the area addressing it does not. This
   allocation is used when data needs to be preserved between multiple layers
   in a conv net or between multiple runs of the program.

3) Managed data (e.g., intermediate results). The contents of which only need
   to exist to pass data between operations or layers in a program. Area
   allocations of this type can be freed and reused in a managed fashion as the
   need arises.
28 | 29 | --]] 30 | 31 | local Memory = torch.class('neuflow.Memory') 32 | 33 | function Memory:__init(args) 34 | 35 | self.prog_name = args.prog_name 36 | self.init_offset = (args.init_offset or 0) + 1 37 | self.bytecode_size_b = 0 38 | 39 | -- table of embedded data segments 40 | self.embedded = { 41 | ['start'] = { 42 | ['x'] = 0, 43 | ['y'] = 0, 44 | }, 45 | ['current'] = { 46 | ['x'] = 0, 47 | ['y'] = 0, 48 | }, 49 | ['layer'] = { 50 | ['h'] = 0, 51 | ['packing'] = 'kernel' 52 | } 53 | } 54 | 55 | -- table of persistent data segments 56 | self.persistent = { 57 | ['start'] = { 58 | ['x'] = 0, 59 | ['y'] = 0, 60 | }, 61 | ['current'] = { 62 | ['x'] = 0, 63 | ['y'] = 0, 64 | }, 65 | ['layer'] = { 66 | ['h'] = 0, 67 | ['packing'] = '1D' 68 | } 69 | } 70 | 71 | -- table of managed data segments 72 | self.managed = { 73 | ['start'] = { 74 | ['x'] = 0, 75 | ['y'] = 0, 76 | }, 77 | ['current'] = { 78 | ['x'] = 0, 79 | ['y'] = 0, 80 | }, 81 | ['layer'] = { 82 | ['h'] = 0, 83 | ['packing'] = '1D' 84 | }, 85 | } 86 | end 87 | 88 | function Memory:adjustBytecodeSize(size_in_bytes) 89 | 90 | self.bytecode_size_b = size_in_bytes 91 | 92 | self.embedded.start.x = 0 93 | self.embedded.start.y = math.ceil((size_in_bytes + 1) / streamer.stride_b) 94 | 95 | self.persistent.start.x = 0 96 | self.persistent.start.y = self.embedded.start.y + self.embedded.current.y + 1 97 | 98 | self.managed.start.x = 0 99 | self.managed.start.y = self.persistent.start.y + self.persistent.current.y + 1 100 | end 101 | 102 | function Memory:constructCoordinate(area, coor) 103 | return { 104 | coor = coor, 105 | start = self[area].start, 106 | offset = self[area].current[coor], 107 | calc = function(self) 108 | return self.start[self.coor] + self.offset 109 | end 110 | } 111 | end 112 | 113 | --[[ Allocate Embedded Data 114 | 115 | By default the data is reformatted & treated as a kernel. If non kernel data 116 | needs to be embedded an explicit 1D or 2D packing argument needs to be 117 | passed in. 
If 2D is selected but the width of the data is larger then the 118 | streamer (memory) stride, packing is reverted to 1D. 119 | --]] 120 | function Memory:allocEmbeddedData(data_, bias_, packing) 121 | packing = packing or 'kernel' 122 | assert(packing == 'kernel' or packing == '1D' or packing == '2D') 123 | assert(packing == 'kernel' or (not bias_)) 124 | 125 | local orig_w_ = data_:size(2) 126 | local orig_h_ = data_:size(1) 127 | local w_ 128 | local h_ 129 | local offset_width 130 | local offset_height 131 | 132 | if (('2D' == packing) and (orig_w_ > streamer.stride_w)) then 133 | print(" WARNING: Current Embedded Data tensor cannot be written with 2D packing, switching to 1D.") 134 | packing = '1D' 135 | end 136 | 137 | if 'kernel' == packing then 138 | local dh = grid.kernel_height - orig_h_ 139 | local kernel = torch.zeros(grid.kernel_height, grid.kernel_width) 140 | 141 | -- copy incoming data to the bottom left corner of kernel 142 | for r = 1, orig_h_ do 143 | for c = 1, orig_w_ do 144 | kernel[r+dh][c] = data_[r][c] 145 | end 146 | end 147 | 148 | -- overwrite with new transformed values 149 | data_ = kernel 150 | h_ = 1 151 | 152 | if bias_ then 153 | w_ = data_:size(1) * data_:size(2) + bias_:size(1) 154 | else 155 | w_ = data_:size(1) * data_:size(2) 156 | end 157 | elseif '1D' == packing then 158 | w_ = orig_w_ * orig_h_ 159 | h_ = 1 160 | else 161 | w_ = orig_w_ 162 | h_ = orig_h_ 163 | end 164 | 165 | if '2D' ~= packing then 166 | offset_width = w_ % streamer.stride_w 167 | offset_height = math.floor(w_ / streamer.stride_w) 168 | 169 | if '2D' == self.embedded.layer.packing then 170 | self.embedded.current.x = 0 171 | self.embedded.current.y = self.embedded.current.y + self.embedded.layer.h 172 | self.embedded.layer.h = 0 173 | end 174 | else 175 | offset_width = w_ 176 | offset_height = h_ 177 | 178 | -- check if current data fits in the line 179 | if (self.embedded.current.x + w_) > streamer.stride_w then 180 | self.embedded.current.x = 0 181 | 
self.embedded.current.y = self.embedded.current.y + self.embedded.layer.h 182 | self.embedded.layer.h = 0 183 | end 184 | end 185 | 186 | -- the layer height is the height of the maximum data area in the layer 187 | if self.embedded.layer.h < h_ then 188 | self.embedded.layer.h = h_ 189 | end 190 | 191 | self.embedded[ #self.embedded+1 ] = { 192 | x = self:constructCoordinate('embedded', 'x'), 193 | y = self:constructCoordinate('embedded', 'y'), 194 | w = w_, 195 | h = h_, 196 | orig_w = orig_w_, 197 | orig_h = orig_h_, 198 | data = data_, 199 | bias = bias_ 200 | } 201 | 202 | self.embedded.current.x = self.embedded.current.x + offset_width 203 | 204 | if '2D' ~= packing then 205 | self.embedded.current.y = self.embedded.current.y + offset_height 206 | 207 | -- check if we did not step out of the line 208 | if (self.embedded.current.x > streamer.stride_w) then 209 | self.embedded.current.y = self.embedded.current.y + 1 210 | self.embedded.current.x = self.embedded.current.x - streamer.stride_w 211 | end 212 | end 213 | 214 | -- alignment of addresses to physical memory pages 215 | if (self.embedded.current.x % streamer.align_w) ~= 0 then 216 | self.embedded.current.x = (math.floor(self.embedded.current.x/streamer.align_w) + 1) * streamer.align_w 217 | -- and check if we did not step out of the line again 218 | if (self.embedded.current.x > streamer.stride_w) then 219 | self.embedded.current.x = 0 220 | self.embedded.current.y = self.embedded.current.y + self.embedded.layer.h 221 | self.embedded.layer.h = 0 222 | end 223 | end 224 | 225 | self.embedded.layer.packing = packing 226 | 227 | return self.embedded[ #self.embedded ] 228 | end 229 | 230 | --[[ Allocate Persistent Data 231 | 232 | Data can be transformed to use 1D or 2D packing depending on packing 233 | argument. If 2D is selected but the width of the data is larger then the 234 | streamer (memory) stride, packing is reverted to 1D. 
--]]
-- Allocate a tensor in the Persistent data area (survives across program
-- runs, not part of the bytecode image). data_ must be a 2D tensor;
-- packing is '1D' (default, flattened) or '2D' (rectangular).
-- Returns the allocation entry (x, y, w, h, orig_w, orig_h, data).
function Memory:allocPersistentData(data_, packing)
   packing = packing or '1D'
   assert(packing == '1D' or packing == '2D')

   local orig_w_ = data_:size(2)
   local orig_h_ = data_:size(1)
   local w_
   local h_
   local offset_width
   local offset_height

   -- 2D packing requires one data row to fit in one memory line
   if (('2D' == packing) and (orig_w_ > streamer.stride_w)) then
      print(" WARNING: Current Persistent Data tensor cannot be written with 2D packing, switching to 1D.")
      packing = '1D'
   end

   if '1D' == packing then
      -- flatten into a single run of words (may wrap over several lines)
      w_ = orig_w_ * orig_h_
      h_ = 1

      offset_width = w_ % streamer.stride_w
      offset_height = math.floor(w_ / streamer.stride_w)

      -- switching packing mode: restart at the beginning of a fresh line
      if '1D' ~= self.persistent.layer.packing then
         self.persistent.current.x = 0
         self.persistent.current.y = self.persistent.current.y + self.persistent.layer.h
         self.persistent.layer.h = 0
      end
   else
      -- 2D: keep the tensor's rectangular shape
      w_ = orig_w_
      h_ = orig_h_

      offset_width = w_
      offset_height = h_

      -- check if current data fits in the line
      if (self.persistent.current.x + w_) > streamer.stride_w then
         self.persistent.current.x = 0
         self.persistent.current.y = self.persistent.current.y + self.persistent.layer.h
         self.persistent.layer.h = 0
      end
   end

   -- the layer height is the height of the maximum data area in the layer
   if self.persistent.layer.h < h_ then
      self.persistent.layer.h = h_
   end

   -- record the allocation; coordinates are resolved lazily
   self.persistent[ #self.persistent+1 ] = {
      x = self:constructCoordinate('persistent', 'x'),
      y = self:constructCoordinate('persistent', 'y'),
      w = w_,
      h = h_,
      orig_w = orig_w_,
      orig_h = orig_h_,
      data = data_
   }

   -- advance the write cursor past this allocation
   self.persistent.current.x = self.persistent.current.x + offset_width

   if '1D' == packing then
      self.persistent.current.y = self.persistent.current.y + offset_height

      -- check if we did not step out of the line
      if (self.persistent.current.x > streamer.stride_w) then
         self.persistent.current.y = self.persistent.current.y + 1
         self.persistent.current.x = self.persistent.current.x - streamer.stride_w
      end
   end

   -- alignment of addresses to physical memory pages
   if (self.persistent.current.x % streamer.align_w) ~= 0 then
      self.persistent.current.x = (math.floor(self.persistent.current.x/streamer.align_w) + 1)*streamer.align_w
      -- and check if we did not step out of the line again
      if (self.persistent.current.x > streamer.stride_w) then
         self.persistent.current.x = 0
         self.persistent.current.y = self.persistent.current.y + self.persistent.layer.h
         self.persistent.layer.h = 0
      end
   end

   self.persistent.layer.packing = packing

   return self.persistent[ #self.persistent ]
end

--[[ Allocate Managed Data

   Data can be transformed to use 1D or 2D packing depending on packing
   argument. If 2D is selected but the width of the data is larger than the
   streamer (memory) stride, packing is reverted to 1D.

   If the end of physical memory is reached, function will start overwriting
   from the start of the Managed memory space.
--]]
-- Allocate a tensor in the Managed data area (the heap). data_ must be a 2D
-- tensor; packing is '1D' (default) or '2D'. Unlike the other areas this one
-- wraps: when the heap is full, allocation restarts at the origin and old
-- layers are overwritten. Returns the allocation entry.
function Memory:allocManagedData(data_, packing)
   packing = packing or '1D'
   assert(packing == '1D' or packing == '2D')

   local orig_w_ = data_:size(2)
   local orig_h_ = data_:size(1)
   local w_
   local h_
   local offset_width
   local offset_height

   -- 2D packing requires one data row to fit in one memory line
   if (('2D' == packing) and (orig_w_ > streamer.stride_w)) then
      print(" WARNING: Current Managed Data tensor cannot be written with 2D packing, switching to 1D.")
      packing = '1D'
   end

   if '1D' == packing then
      -- flatten into a single run of words (may wrap over several lines)
      w_ = orig_w_ * orig_h_
      h_ = 1

      offset_width = w_ % streamer.stride_w
      offset_height = math.floor(w_ / streamer.stride_w)

      -- switching packing mode: restart at the beginning of a fresh line
      if '1D' ~= self.managed.layer.packing then
         self.managed.current.x = 0
         self.managed.current.y = self.managed.current.y + self.managed.layer.h
         self.managed.layer.h = 0
      end
   else
      -- 2D: keep the tensor's rectangular shape
      w_ = orig_w_
      h_ = orig_h_

      offset_width = w_
      offset_height = h_

      -- check if current data fits in the line
      if (self.managed.current.x + w_) > streamer.stride_w then
         self.managed.current.x = 0
         self.managed.current.y = self.managed.current.y + self.managed.layer.h
         self.managed.layer.h = 0
      end
   end

   -- check if there is space in the mem if not start overwriting first layers
   if (self.managed.current.y + offset_height) > memory.size_r then
      print(" WARNING: Overwriting the first layers of heap!")
      self.managed.current.x = 0
      self.managed.current.y = 0
      self.managed.layer.h = 0
   end

   -- the layer height is the height of the maximum data area in the layer
   if self.managed.layer.h < h_ then
      self.managed.layer.h = h_
   end

   -- record the allocation; coordinates are resolved lazily
   self.managed[ #self.managed+1 ] = {
      x = self:constructCoordinate('managed', 'x'),
      y = self:constructCoordinate('managed', 'y'),
      w = w_,
      h = h_,
      orig_w = orig_w_,
      orig_h = orig_h_,
      data = data_
   }

   -- advance the write cursor past this allocation
   self.managed.current.x = self.managed.current.x + offset_width

   if '1D' == packing then
      self.managed.current.y = self.managed.current.y + offset_height

      -- check if we did not step out of the line
      if (self.managed.current.x > streamer.stride_w) then
         self.managed.current.y = self.managed.current.y + 1
         self.managed.current.x = self.managed.current.x - streamer.stride_w
      end
   end

   -- alignment of addresses to physical memory pages
   if (self.managed.current.x % streamer.align_w) ~= 0 then
      self.managed.current.x = (math.floor(self.managed.current.x/streamer.align_w) + 1)*streamer.align_w
      -- and check if we did not step out of the line again
      if (self.managed.current.x > streamer.stride_w) then
         self.managed.current.x = 0
         self.managed.current.y = self.managed.current.y + self.managed.layer.h
         self.managed.layer.h = 0
      end
   end

   self.managed.layer.packing = packing

   return self.managed[#self.managed]
end

-- Print a summary of the three memory areas (embedded / persistent / managed):
-- start address, size and end address in bytes, plus the expected binary size.
-- NOTE(review): the *_start_b / *_size_b variables below are not declared
-- 'local', so they leak into the global environment — confirm nothing else
-- reads them before making them local.
function Memory:printAreaStatistics()

   embedded_start_b = self.embedded.start.y * streamer.stride_b
      + self.embedded.start.x * streamer.word_b

   embedded_size_b = self.embedded.current.y * streamer.stride_b
      + self.embedded.current.x * streamer.word_b

   persistent_start_b = self.persistent.start.y * streamer.stride_b
      + self.persistent.start.x * streamer.word_b

   persistent_size_b = self.persistent.current.y * streamer.stride_b
   if (self.persistent.current.x ~= 0) then
      -- if we did not just step a new line
      -- take into account all the lines we wrote (the last entry's height is enough)
      -- if not all the lines are filled till the end we are counting more than we should here,
      -- but for checking collision it's OK
      persistent_size_b = persistent_size_b + self.persistent[#self.persistent].h * streamer.stride_b
   end

   managed_start_b = self.managed.start.y * streamer.stride_b
      + self.managed.start.x * streamer.word_b

   managed_size_b = self.managed.current.y * streamer.stride_b
   if (self.managed.current.x ~= 0) then
      -- if we did not just step a new line
      -- take into account all the lines we wrote (the last entry's height is enough)
      -- if not all the lines are filled till the end we are counting more than we should here,
      -- but for checking collision it's OK
      managed_size_b = managed_size_b + (self.managed[#self.managed].h * streamer.stride_b)
   end

   local binary_size = embedded_start_b+embedded_size_b

   print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
   print(c.Cyan .. '-openFlow-' .. c.Magenta .. ' ConvNet Name ' ..
      c.none ..'[ ' .. self.prog_name .. ' ]\n')
   print(
      string.format(" bytecode segment: start = %10d, size = %10d, end = %10d",
         self.init_offset,
         self.bytecode_size_b-self.init_offset,
         self.bytecode_size_b)
   )
   print(
      string.format(" embedded data segment: start = %10d, size = %10d, end = %10d",
         embedded_start_b,
         embedded_size_b,
         embedded_start_b+embedded_size_b)
   )
   print(
      string.format("persistent data segment: start = %10d, size = %10d, end = %10d",
         persistent_start_b,
         persistent_size_b,
         persistent_start_b+persistent_size_b)
   )
   print(
      string.format(" managed data segment: start = %10d, size = %10d, end = %10d",
         managed_start_b,
         managed_size_b,
         memory.size_b)
   )
   print(
      string.format("\n the binary file size should be = %10d, total memory used = %10d",
         binary_size,
         managed_start_b+managed_size_b)
   )
   print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

end

--------------------------------------------------------------------------------
-- /src/NeuFlow.lua
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
-- NeuFlow
-- a class to abstract the neuFlow processor
--

----------------------------------------------------------------------
-- register class + constructor
--
local NeuFlow = torch.class('neuflow.NeuFlow')

-- Construct a NeuFlow front-end.
-- args (all optional):
--   prog_name        - name used for generated files (default 'temp')
--   mode             - 'runtime' (default), 'simulation' or 'rom'
--   use_ethernet     - NOTE(review): this arg is read but then unconditionally
--                      overwritten by (mode == 'runtime') a few lines below —
--                      confirm whether the arg is meant to be honored
--   serial_device    - tty path for the serial console (default: none)
--   network_if_name  - network interface to bind the ethernet driver to
--   *_msg_level      - per-component verbosity, default global_msg_level
function NeuFlow:__init(args)
   -- parse args
   args = args or {}
   self.prog_name = args.prog_name or 'temp'
   self.use_ethernet = args.use_ethernet or false
   self.serial_device = args.serial_device or false
   self.global_msg_level = args.global_msg_level or 'none'
   self.mode = args.mode or 'runtime' -- or 'simulation' or 'rom'
   self.use_ethernet = (self.mode == 'runtime')
   if(args.network_if_name) then
      self.network_if_name = args.network_if_name
   end

   -- default offsets, for conveniency
   args.offset_code = args.offset_code or bootloader.entry_point_b
   -- in simul, bypass header
   if self.mode == 'simulation' then
      args.offset_code = 0
   end

   -- instantiate core, with all args
   args.msg_level = args.core_msg_level or self.global_msg_level
   self.core = neuflow.Core(args)

   -- instantiate the compiler, relies on the core
   self.compiler = neuflow.Compiler {
      optimize_across_layers = true,
      core = self.core,
      msg_level = args.compiler_msg_level or self.global_msg_level
   }

   -- use a profiler
   self.profiler = neuflow.Profiler()

   -- instantiate the interface: DMA ethernet for pico/tbsp platforms
   -- (no handshake), regular ethernet (with handshake) otherwise
   if (self.core.platform == 'pico_m503') or (self.core.platform == 'xilinx_ml605_tbsp') then
      self.handshake = false
      self.ethernet = neuflow.DmaEthernet {
         msg_level = args.ethernet_msg_level or self.global_msg_level,
         core = self.core,
         nf = self
      }
   else
      self.handshake = true
      self.ethernet = neuflow.Ethernet {
         msg_level = args.ethernet_msg_level or self.global_msg_level,
         core = self.core,
         nf = self
      }
   end

   if self.core.platform == 'pico_m503' then
      self.camera = neuflow.Camera {
         msg_level = args.camera_msg_level or self.global_msg_level,
         nf = self
      }
   end

   -- for loops: this retains a list of jump locations
   self.loopTags = {}

   -- ethernet socket (auto found for now); fall back to no-ethernet on failure
   if self.use_ethernet then
      print ' loading ethernet driver'
      if self.ethernet:open(self.network_if_name) ~= 0 then
         self.use_ethernet = false
      end
   end

   -- serial dev
   if self.serial_device then
      self.tty = neuflow.Serial(self.serial_device, '57600')
   end

   -- bytecode has a constant size (oFlower bios)
   self.bytecodesize = bootloader.load_size

   -- and finally initialize hardware
   self:initialize()
end

----------------------------------------------------------------------
-- ending functions: this is not clean for now, but ensures that
-- the hardware stays in sync.
96 | -- 97 | function NeuFlow:cleanup() 98 | if self.use_ethernet then 99 | self.ethernet:close() 100 | end 101 | if self.tty then 102 | self.tty:cleanup() 103 | end 104 | end 105 | 106 | ---------------------------------------------------------------------- 107 | -- print messages / send message 108 | -- 109 | function NeuFlow:printMessage() 110 | if self.tty then 111 | print(self.tty:read()) 112 | end 113 | end 114 | 115 | function NeuFlow:sendMessage(message) 116 | if self.tty then 117 | self.tty:write(message) 118 | end 119 | end 120 | 121 | ---------------------------------------------------------------------- 122 | -- initialize system 123 | -- 124 | function NeuFlow:initialize(args) 125 | -- args 126 | if args and args.selftest then 127 | self.core:bootSequence{selftest=true} 128 | else 129 | self.core:bootSequence{selftest=false} 130 | end 131 | end 132 | 133 | ---------------------------------------------------------------------- 134 | -- high-level memory functions 135 | -- 136 | function NeuFlow:allocHeap(tensor) 137 | local alloc_list = {} 138 | if type(tensor) == 'table' then 139 | local first = true 140 | for i = 1,#tensor do 141 | if tensor[i]:nDimension() ~= 2 then 142 | xlua.error('only supports list of 2D tensors','NeuFlow.allocHeap') 143 | end 144 | local segment = self.core.mem:allocManagedData(tensor[i]) 145 | table.insert(alloc_list, segment) 146 | first = false 147 | end 148 | else 149 | local dims = tensor:nDimension() 150 | if dims == 2 then 151 | local segment = self.core.mem:allocManagedData(tensor) 152 | table.insert(alloc_list, segment) 153 | elseif dims == 3 then 154 | local first = true 155 | for i = 1,tensor:size(1) do 156 | local segment = self.core.mem:allocManagedData(tensor[i]) 157 | table.insert(alloc_list, segment) 158 | first = false 159 | end 160 | else 161 | error('tensors must have 2 or 3 dimensions') 162 | end 163 | end 164 | return alloc_list 165 | end 166 | 167 | function NeuFlow:allocDataPacked(tensor,bias) 168 | local 
alloc_list = {}
   if type(tensor) == 'table' then
      for i = 1,#tensor do
         if tensor[i]:nDimension() ~= 2 then
            xlua.error('only supports list of 2D tensors','NeuFlow.allocHeap')
         end
         local segment
         if bias then
            segment = self.core.mem:allocEmbeddedData(tensor[i], bias[i])
         else
            segment = self.core.mem:allocEmbeddedData(tensor[i])
         end
         table.insert(alloc_list, segment)
      end
   else
      local dims = tensor:nDimension()
      if dims == 2 then
         local segment
         if bias then
            segment = self.core.mem:allocEmbeddedData(tensor, bias)
         else
            segment = self.core.mem:allocEmbeddedData(tensor)
         end
         table.insert(alloc_list, segment)
      elseif dims == 3 then
         -- one embedded allocation per plane, each with its own bias slice
         for i = 1,tensor:size(1) do
            local segment
            if bias then
               segment = self.core.mem:allocEmbeddedData(tensor[i], bias:narrow(1,i,1))
            else
               segment = self.core.mem:allocEmbeddedData(tensor[i])
            end
            table.insert(alloc_list, segment)
         end
      else
         error('tensors must have 2 or 3 dimensions')
      end
   end
   return alloc_list
end

-- Allocate read-only data for a 2D tensor, a 3D tensor or a table of 2D
-- tensors. In 'simulation' mode the data goes into the embedded area (so it
-- is part of the bytecode image); otherwise into the persistent area.
-- Returns a list of allocation segments.
function NeuFlow:allocData(tensor)
   local alloc_list = {}
   if type(tensor) == 'table' then
      for i = 1,#tensor do
         if tensor[i]:nDimension() ~= 2 then
            xlua.error('only supports list of 2D tensors','NeuFlow.allocPersistentData')
         end
         if self.mode == 'simulation' then
            local segment = self.core.mem:allocEmbeddedData(tensor[i], nil, '1D')
            table.insert(alloc_list, segment)
         else
            local segment = self.core.mem:allocPersistentData(tensor[i])
            table.insert(alloc_list, segment)
         end
      end
   else
      local dims = tensor:nDimension()
      if dims == 2 then
         if self.mode == 'simulation' then
            -- BUG FIX: was tensor[i] — 'i' is undefined in this branch
            -- (tensor is a single 2D tensor here, not a list)
            local segment = self.core.mem:allocEmbeddedData(tensor, nil, '1D')
            table.insert(alloc_list, segment)
         else
            local segment = self.core.mem:allocPersistentData(tensor)
            table.insert(alloc_list, segment)
         end
      elseif dims == 3 then
         for i = 1,tensor:size(1) do
            if self.mode == 'simulation' then
               local segment = self.core.mem:allocEmbeddedData(tensor[i], nil, '1D')
               table.insert(alloc_list, segment)
            else
               local segment = self.core.mem:allocPersistentData(tensor[i])
               table.insert(alloc_list, segment)
            end
         end
      else
         error('tensors must have 2 or 3 dimensions')
      end
   end
   return alloc_list
end

-- Device-to-device copy. source/dest are streams or lists of streams;
-- when dest is nil, heap space is allocated for it. Returns dest.
function NeuFlow:copy(source, dest)
   -- check if source/dest are lists of streams, or streams
   if #source == 0 then
      source = {source}
      if dest then
         dest = {dest}
      end
   end

   -- if no dest, create it
   if not dest then
      dest = self:allocHeap(source)
   end

   -- process a list of streams
   for i = 1,#source do
      self.core:copy(source[i],dest[i])
   end

   -- return result
   return dest
end

-- Copy data from the host into device memory (ethernet transfer), or a plain
-- device copy from embedded data when in 'simulation' mode. Returns dest.
function NeuFlow:copyFromHost(source, dest)
   -- if no dest, create it
   if not dest then
      dest = self:allocHeap(source)
   end
   -- check if dest is a list of streams, or a stream
   local ldest
   if #dest == 0 then
      ldest = {dest}
   else
      ldest = dest
   end
   -- if simulation, we replace this transfer by a plain copy
   if self.mode == 'simulation' then
      -- alloc in constant data:
      source = self:allocData(source)
      print(' copy host->dev [simul]: ' .. #ldest .. 'x' .. ldest[1].orig_h .. 'x' .. ldest[1].orig_w)
      self:copy(source,ldest)
   else
      -- process list of streams
      print(' copy host->dev: ' .. #ldest .. 'x' .. ldest[1].orig_h .. 'x' ..
ldest[1].orig_w) 295 | 296 | self.ethernet:dev_copyFromHost(ldest) 297 | end 298 | 299 | return dest 300 | end 301 | 302 | function NeuFlow:copyToHost(source, dest) 303 | -- no ack in simulation 304 | local ack 305 | if self.mode == 'simulation' or (not self.handshake) then 306 | ack = 'no-ack' 307 | end 308 | 309 | -- check if source is a list of streams, or a stream 310 | local lsource 311 | if #source == 0 then 312 | lsource = {source} 313 | else 314 | lsource = source 315 | end 316 | 317 | -- record original sizes 318 | local orig_h = lsource[1].orig_h 319 | local orig_w = lsource[1].orig_w 320 | 321 | -- process list of streams 322 | print(' copy dev->host: ' .. #lsource .. 'x' .. lsource[1].orig_h .. 'x' .. lsource[1].orig_w) 323 | 324 | self.ethernet:dev_copyToHost(lsource, ack) 325 | 326 | -- create/resize dest 327 | if not dest then 328 | dest = torch.Tensor() 329 | end 330 | dest:resize(#lsource, orig_h, orig_w) 331 | return dest 332 | end 333 | 334 | ---------------------------------------------------------------------- 335 | -- wrappers for compilers 336 | -- 337 | function NeuFlow:compile(network, input) 338 | -- retrieve IDs 339 | local inputs 340 | if #input == 0 then 341 | inputs = { input } 342 | else 343 | inputs = input 344 | end 345 | 346 | local outputs 347 | outputs, self.gops = self.compiler:processNetwork(network, inputs) 348 | 349 | return outputs 350 | end 351 | 352 | ---------------------------------------------------------------------- 353 | -- high-level GOTO functions 354 | -- 355 | function NeuFlow:beginLoop(tag) 356 | self.loopTags.tag = self.core:makeGotoTag() 357 | self.loopTags.tag.offset = 1 358 | end 359 | 360 | function NeuFlow:endLoop(tag) 361 | self.core:defaults() 362 | self.core:gotoTag(self.loopTags.tag) 363 | end 364 | 365 | function NeuFlow:term() 366 | self.core:terminate() 367 | end 368 | 369 | ---------------------------------------------------------------------- 370 | -- write bytecode in binary/hex mode 371 | -- 372 
-- Dump the linked bytecode to a tensor, and optionally to files.
-- args is a list of output specs: {format='bin'|'hex'|'rom', width=N, length=L}.
-- With an empty list ({}), no file is written and only the tensor is returned.
function NeuFlow:writeBytecode(args)
   local tensor = torch.ByteTensor(self.bytecodesize):zero()

   -- generate binary once
   local tensor_size = self.core.linker:dump(
      {
         tensor = tensor,
      },
      self.core.mem
   )

   local filepath
   if next(args) ~= nil then -- called with arguments passed in
      filepath = '/tmp/' .. self.prog_name .. '-' .. os.date("%Y_%m_%d_%H_%M_%S") .. '.bin'
      local file = assert(torch.DiskFile(filepath,'w'):binary())
      file:writeString(tensor:storage():string():sub(1, tensor_size))
      assert(file:close())
   end

   -- generate all outputs ('out' instead of shadowing the 'args' list)
   for _,out in ipairs(args) do
      -- args
      local format = out.format or 'bin' -- or 'hex'
      local width = out.width or 8
      local length = out.length

      if format == 'bin' then
         -- simple copy
         -- BUG FIX: a space was missing after '-v', producing 'cp -v/tmp/...'
         os.execute('cp -v ' .. filepath .. ' ' .. self.prog_name .. '.bin')
      elseif format == 'hex' then
         local filehex = self.prog_name ..'.hex'..tostring(width)
         neuflow.tools.readBinWriteHex(filepath, filehex, width, length)
      elseif format == 'rom' then
         local filev = self.prog_name ..'.v'
         neuflow.tools.readBinWriteRom(filepath, filev, width, 'flow_rom')
      else
         -- BUG FIX: 'rom' is a supported format, list it in the message
         error('format should be one of: bin | hex | rom')
      end
   end

   return tensor
end

----------------------------------------------------------------------
-- execute simulation (testbench)
--
-- Export the compiled code as hex images, move them where the testbench
-- expects them, and run the testbench script.
-- NOTE(review): relies on a global 'options.tb_args' — confirm it is set by
-- the calling script before this runs.
function NeuFlow:execSimulation(args)
   local testbench = args.testbench or error('please provide a testbench script')
   local cache_hex = args.cache_hex or error('please provide path for cache hex mask')
   local mem_hex = args.mem_hex or error('please provide path for mem hex mask')

   print(' exporting compiled code [hex]')
   self:writeBytecode{{format='hex', width=oFlower.bus_, length=oFlower.cache_size_b},
                      {format='hex', width=streamer.mem_bus_}}

   -- platform-dependent memories:
   if self.core.platform == 'ibm_asic' then
      os.execute('mv '..self.prog_name..'.hex64 '..cache_hex)
      -- the 256-bit memory image is split into 8 byte-column files
      for subidx = 0,7 do
         os.execute('cut -c'..(subidx*8+1)..'-'..(subidx*8+8)..' '
            ..self.prog_name..'.hex256 > '..mem_hex..'.'..(subidx+1))
      end
      os.execute('rm '..self.prog_name..'.hex256 ')
   else
      os.execute('mv '..self.prog_name..'.hex64 '..cache_hex)
      os.execute('mv '..self.prog_name..'.hex256 '..mem_hex)
   end

   local c = sys.COLORS
   print(c._cyan)
   print(' running compiled bytecode in simulation')
   local path = paths.dirname(testbench)
   local script = paths.basename(testbench)
   os.execute('cd ' .. path .. '; ./' .. script .. ' ' .. options.tb_args)
   print(c.none)
end

----------------------------------------------------------------------
-- transmit reset
--
function NeuFlow:sendReset()
   self.ethernet:sendReset()
end

----------------------------------------------------------------------
-- tell device to wait for the bytecode to be sent from the host
--
function NeuFlow:receiveBytecode()
   self.ethernet:dev_receiveBytecode()
end

----------------------------------------------------------------------
-- send bytecode to device
--
function NeuFlow:sendBytecode(bytecode)
   self:loadBytecode(bytecode)
end

----------------------------------------------------------------------
-- transmit bytecode
--
-- Send a bytecode tensor to the device; when called without an argument,
-- generate the bytecode first via writeBytecode{}.
function NeuFlow:loadBytecode(bytecode)
   if bytecode then
      -- then transmit bytecode
      print(' transmitting bytecode')
      self.ethernet:host_sendBytecode(bytecode)
   else
      -- if no bytecode given, first dump it to file, then load it from there
      self:loadBytecode(self:writeBytecode{})
   end
end

----------------------------------------------------------------------
-- transmit bytecode (from file)
--
-- Read a raw bytecode file and transmit it to the device.
function NeuFlow:loadBytecodeFromFile(filename)
   local file = assert(io.open(filename, "r"))
   local tensor = self:convertBytecodeString(file:read("*all"))
   file:close()

   self:loadBytecode(tensor)
end

-- Convert a raw byte string into a ByteTensor of self.bytecodesize.
-- NOTE(review): string.gfind is the pre-Lua-5.1 name of string.gmatch
-- (deprecated since 5.1) — works under Torch7's LuaJIT compat, but gmatch
-- would be the portable spelling.
function NeuFlow:convertBytecodeString(bytes)
   local tensor = torch.ByteTensor(self.bytecodesize)
   local i = 1
   for b in string.gfind(bytes, ".") do
      tensor[i] = string.byte(b)
      i = i+1
   end

   return tensor
end

----------------------------------------------------------------------
-- transmit tensor
--
function NeuFlow:copyToDev(tensor)
   self.ethernet:host_copyToDev(tensor)
end

----------------------------------------------------------------------
-- receive tensor
--
function NeuFlow:copyFromDev(tensor)
   self.ethernet:host_copyFromDev(tensor, self.handshake)
end

--------------------------------------------------------------------------------
-- /src/Profiler.lua
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
-- Profiler: a simple class to help profiling code
--------------------------------------------------------------------------------
local Profiler = torch.class('neuflow.Profiler')

-- mode == 'off' disables printing; verbose echoes event markers as they occur.
function Profiler:__init(mode,verbose)
   self.events = {}   -- events by name
   self.list = {}     -- events in creation order
   self.off = (mode == 'off') or false
   self.verbose = verbose or false
end

-- Start (or restart) the named timer; pass 'fps' to also report a frame rate.
function Profiler:start(name, fps)
   if self.events[name] then
      -- update
      self.events[name].cpu = os.clock()
      self.events[name].real = sys.clock()
   else
      -- create
      self.events[name] = {cpu=os.clock(), real=sys.clock(), name=name}
      self.list[#self.list+1] = self.events[name]
   end
   if fps and fps == 'fps' then
      self.events[name].fps = true
   end
   if self.verbose then io.write('<' .. name .. '>') io.flush() end
end

-- Set the display color of an existing event (used by displayAll).
function Profiler:setColor(name, color)
   if self.events[name] then
      -- update
      self.events[name].color = color
   else
      error('# ERROR: There is no such profiler - '.. name..', create it first')
   end
end


-- Elapsed CPU time since start(name), optionally divided (e.g. per iteration).
function Profiler:cpu(name,divider)
   local delta = os.clock() - self.events[name].cpu
   if divider then delta = delta / divider end
   self.events[name].cpud = delta
   return delta
end

-- Elapsed wall-clock time since start(name), optionally divided.
function Profiler:real(name,divider)
   local delta = sys.clock() - self.events[name].real
   if divider then delta = delta / divider end
   self.events[name].reald = delta
   return delta
end

-- Record both real and cpu laps; returns real, cpu.
function Profiler:lap(name,divider)
   local r = self:real(name,divider)
   local c = self:cpu(name,divider)
   if self.verbose then io.write('\r') self:print(name) end
   return r,c
end

function Profiler:format(name)
   return string.format('$ real | cpu: %f | %f <%s>',
      self.events[name].reald or -1, self.events[name].cpud or -1, name)
end

function Profiler:print(name)
   if not self.off then
      print(self:format(name))
   end
end

-- Build a multi-line report for all events, in creation order.
function Profiler:formatAll()
   local str = '$ profiler report:'
   for i = 1,#self.list do
      if self.list[i].fps then
         str = str .. '\n' .. string.format('$ real %f | cpu %f <%s> = %f fps',
            self.list[i].reald or -1,
            self.list[i].cpud or -1,
            self.list[i].name,
            1/self.list[i].reald)
      else
         str = str .. '\n' ..
string.format('$ real %f | cpu %f <%s>',
            self.list[i].reald or -1,
            self.list[i].cpud or -1,
            self.list[i].name)
      end
   end
   return str
end

function Profiler:printAll()
   if not self.off then
      print(self:formatAll())
   end
end

-- Render all events onto a qt painter.
-- args: x, y (origin), zoom, painter (or win), font (size, default 24*zoom).
function Profiler:displayAll(args)
   -- args
   local x = args.x or 0
   local y = args.y or 0
   local zoom = args.zoom or 1
   local painter = args.painter or args.win
   local font = args.font or 24*zoom
   if not painter then error('# ERROR: Profiler.displayAll() needs a painter') end

   painter:setfont(qt.QFont{serif=false,italic=false,size=font})
   if not self.off then
      for i = 1,#self.list do
         painter:setcolor(self.list[i].color or "black")
         local str
         if self.list[i].fps then
            str = string.format('$ real %f | cpu %f <%s> = %f fps',
               self.list[i].reald or -1,
               self.list[i].cpud or -1,
               self.list[i].name,
               1/self.list[i].reald)
         else
            str = string.format('$ real %f | cpu %f <%s>',
               self.list[i].reald or -1,
               self.list[i].cpud or -1,
               self.list[i].name)
         end
         -- disp line:
         painter:moveto(x,y); y = y + font*1.5
         painter:show(str)
      end
   end
end

--------------------------------------------------------------------------------
-- /src/Serial.lua
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
-- Serial
-- a class to read/write through serial port
--------------------------------------------------------------------------------

----------------------------------------------------------------------
-- register class + constructor
--
local Serial = torch.class('neuflow.Serial')

-- Open the serial device at the given baud rate and spawn a background
-- thread that echoes everything the device prints.
-- dev  - tty path (default '/dev/tty')
-- baud - line speed (default 57600)
function Serial:__init(dev,baud)
   -- device + speed
   -- BUG FIX: defaults are applied before any use of dev/baud; the original
   -- concatenated the raw 'dev' into the warning string first, which raised
   -- a concat error when dev was nil
   self.dev = dev or '/dev/tty'
   self.baud = baud or 57600

   -- error messages
   self.WARNING_NOTFOUND = '# serial: warning, device ' .. self.dev .. ' not found'

   -- dev exists ?
   if not paths.filep(self.dev) then
      print(self.WARNING_NOTFOUND)
      return
   end

   -- this is linux dependent ?
   local ret = sys.execute('stty -F ' .. self.dev .. ' ' .. self.baud .. ' min 0 time 1')

   -- dev exists ?
   if ret ~= '' then
      print(self.WARNING_NOTFOUND)
      return
   end

   -- file descriptors
   self.devr = io.open(self.dev, 'r')
   self.devw = io.open(self.dev, 'w')

   -- background reader
   require 'thread'
   local function dumpTTY ()
      local c = sys.COLORS
      local highlight = c._cyan
      local none = c.none
      while true do
         local fromTTY = self:read()
         if fromTTY then print(fromTTY) end
      end
   end
   thread.newthread(dumpTTY, {})
end

-- Close the open file handles.
-- BUG FIX: the original called self.dev:close(), but self.dev is the device
-- *path* (a string) — the open handles are self.devr/self.devw, and they may
-- be nil when __init bailed out early.
function Serial:cleanup()
   if self.devr then self.devr:close() end
   if self.devw then self.devw:close() end
end

-- Read one line from the device (non-blocking thanks to the stty settings).
function Serial:read()
   return self.devr:read('*l')
end

-- Write a line to the device.
function Serial:write(line)
   return self.devw:write(line)
end

--------------------------------------------------------------------------------
-- /src/defines.lua
--------------------------------------------------------------------------------

-- -*- lua -*-

----------------------------------------------------------------------
--- Useful abbrevs
--
kB = 1024
MB = 1024*1024
GB = 1024*1024*1024
kHz = 1000
MHz = 1000*1000
GHz = 1000*1000*1000

----------------------------------------------------------------------
--- Blast Bus parameters
--
blast_bus = {
   -- Addressing :
   area_streamer = 1,
   area_tile = 2,
   area_memctrl = 3,
   area_dma = 4,
   --
   addr_broadcast = 0,
   addr_conv_0 = 1,
   addr_conv_1 = 2,
   addr_comb_0 = 16,
   addr_mapp_0 = 24,
   addr_div_0 = 28,
   addr_grid_0 = 256,
   addr_mem_streamer_0 = 1,
31 | addr_mem_streamer_1 = 2, 32 | addr_mem_streamer_2 = 3, 33 | addr_mem_streamer_3 = 4, 34 | addr_mem_streamer_4 = 5, 35 | addr_mem_streamer_5 = 6, 36 | addr_mem_streamer_6 = 7, 37 | addr_mem_streamer_7 = 8, 38 | addr_dma = 0, 39 | addr_memctrl = 0, 40 | -- 41 | subAddr_router = 0, 42 | subAddr_operator = 1, 43 | subAddr_cacher = 2, 44 | subAddr_IO = 3, 45 | subAddr_none = 0, 46 | subAddr_memTimeouts = 0, 47 | subAddr_memGlobals = 1, 48 | subAddr_memLocals = 2, 49 | 50 | -- Content: 51 | content_nothing = 0, 52 | content_command = 1, 53 | content_instruc = 2, 54 | content_config = 3, 55 | content_valid = 1, 56 | 57 | -- Instructions 58 | instruc_config = 0, 59 | instruc_setAdd = 1, 60 | instruc_activate = 2, 61 | instruc_deActivate = 3, 62 | instruc_reset = 4, 63 | instruc_RESERVED_1 = 5, 64 | instruc_control_0 = 6, 65 | instruc_control_1 = 7, 66 | instruc_control_2 = 8, 67 | instruc_control_3 = 9, 68 | instruc_control_4 = 10, 69 | instruc_control_5 = 11, 70 | instruc_control_6 = 12, 71 | instruc_control_7 = 13, 72 | instruc_cacheStart = 14, 73 | instruc_cacheFinish = 15, 74 | 75 | -- Status 76 | status_notAddressed = 0, 77 | status_idle = 1, 78 | status_busy = 2, 79 | status_done = 3, 80 | status_primed = 4, 81 | status_unconfigured = 5, 82 | status_misconfigured = 6 83 | } 84 | 85 | 86 | ---------------------------------------------------------------------- 87 | --- OpenFlower Instruction Set. 
88 | -- 89 | oFlower = { 90 | -- Opcodes 91 | op_writeConfig = 0, 92 | op_getStatus = 1, 93 | op_writeStream = 2, 94 | op_routeStream = 3, 95 | op_writeWord = 4, 96 | op_readWord = 5, 97 | op_setReg = 6, 98 | op_goto = 7, 99 | op_add = 8, 100 | op_control = 9, 101 | op_and = 10, 102 | op_or = 11, 103 | op_comp = 12, 104 | op_shr = 13, 105 | op_nop = 14, 106 | op_term = 15, 107 | 108 | -- Register map 109 | reg_operation = 0, 110 | reg_size = 1, 111 | reg_type = 2, 112 | reg_state = 3, 113 | reg_counter = 4, 114 | reg_loops = 5, 115 | reg_status = 6, 116 | reg_sys_A = 7, 117 | reg_sys_B = 8, 118 | reg_sys_C = 9, 119 | reg_A = 10, 120 | reg_B = 11, 121 | reg_C = 12, 122 | reg_D = 13, 123 | reg_E = 14, 124 | reg_F = 15, 125 | 126 | -- ctrl map 127 | ctrl_lock_config_bus = 0, 128 | 129 | -- I/O Map 130 | io_uart = 0, 131 | io_uart_status = 1, 132 | io_dma = 2, 133 | io_dma_status = 3, 134 | io_ethernet = 4, 135 | io_ethernet_status = 5, 136 | io_iic = 6, 137 | io_iic_status = 7, 138 | io_spi = 8, 139 | io_spi_status = 8, 140 | io_gpios = 10, 141 | io_timer = 11, 142 | io_timer_ctrl = 12, 143 | 144 | -- CPU types 145 | type_uint8 = 8, 146 | type_uint16 = 4, 147 | type_uint32 = 2, 148 | type_uint64 = 1, 149 | 150 | -- clock 151 | clock_freq = 100*MHz, 152 | uart_freq = 57600, 153 | 154 | -- nb of dmas (this includes instruction path) 155 | nb_dmas = 2 156 | } 157 | do 158 | -- Cache 159 | oFlower.cache_size_b = 64*kB 160 | oFlower.page_size_b = oFlower.cache_size_b/2 161 | oFlower.bus_ = 64 162 | oFlower.bus_b = oFlower.bus_/8 163 | end 164 | 165 | 166 | ---------------------------------------------------------------------- 167 | --- Grid parameters 168 | -- 169 | grid = {} 170 | do 171 | -- nb of grids 172 | grid.nb_grids = 1 173 | -- global IOs 174 | grid.nb_ios = 6 175 | -- conv 176 | grid.nb_convs = 4 177 | grid.kernel_width = 10 178 | grid.kernel_height = 10 179 | -- mapper 180 | grid.nb_mappers = 4 181 | grid.mapper_segs = 8 182 | -- generic ALUs 183 | grid.nb_alus 
= 4 184 | -- clock: 185 | grid.clock_freq = 200*MHz 186 | end 187 | 188 | 189 | ---------------------------------------------------------------------- 190 | --- General DMAs 191 | -- 192 | dma = {} 193 | do 194 | -- global DMA IOs 195 | dma.nb_ios = 2 196 | dma.ethernet_write_port_id = 2 197 | dma.ethernet_read_port_id = 3 198 | end 199 | 200 | 201 | ---------------------------------------------------------------------- 202 | --- Streamer parameters 203 | -- 204 | -- Units: 205 | -- _: bits 206 | -- _b: bytes 207 | -- _w: words (1 word = word_b bytes) 208 | -- _r: memory rows (1 row = size_b bytes) 209 | -- _i: integers (1 int = 4 bytes) 210 | -- 211 | streamer = {} 212 | do 213 | -- physical params 214 | streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios 215 | -- geometry 216 | streamer.mem_bus_ = 256 217 | streamer.mem_bus_b = 256 / 8 218 | streamer.stride_b = 2048 219 | streamer.word_b = 2 220 | streamer.align_b = streamer.mem_bus_ / 8 221 | streamer.stride_w = streamer.stride_b / streamer.word_b 222 | streamer.align_w = streamer.align_b / streamer.word_b 223 | -- clock 224 | streamer.clock_freq = 200*MHz 225 | end 226 | 227 | 228 | ---------------------------------------------------------------------- 229 | --- Memory parameters 230 | -- 231 | -- the parameters are expressed in different units: 232 | -- _: bits 233 | -- _b: bytes 234 | -- _w: words (1 word = word_b bytes) 235 | -- _r: memory rows (1 row = size_b bytes) 236 | -- _i: integers (1 int = 4 bytes) 237 | -- 238 | memory = {} 239 | do 240 | -- size: 241 | memory.size_b = 512*MB 242 | memory.size_w = memory.size_b / streamer.word_b 243 | memory.size_r = memory.size_b / streamer.stride_b 244 | -- clock: 245 | memory.clock_freq = 400*MHz 246 | -- bandwidth 247 | memory.bus_ = 32 248 | memory.is_ddr = true 249 | memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1) 250 | memory.bandwidth_b = memory.bandwidth_ / 8 251 | memory.bandwidth_w = memory.bandwidth_b / 
streamer.word_b 252 | 253 | memory.offset_text = 0 254 | end 255 | 256 | 257 | ---------------------------------------------------------------------- 258 | --- Extra Streamer parameters 259 | -- 260 | do 261 | -- parallel streams: this is application dependent 262 | streamer.max_parallel_rd_streams = grid.nb_convs + 1 263 | streamer.max_parallel_wr_streams = 1 264 | streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams 265 | -- bandwidth per stream: 266 | streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b 267 | streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams 268 | streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor 269 | -- bandwidth first check 270 | if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then 271 | print('ERROR internal bandwidth too high: ' 272 | .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s' 273 | .. ' > external bandwidth available: ' 274 | .. streamer.mem_bandwidth_b/1e9 ..'GB/s') 275 | os.exit() 276 | end 277 | -- continous streaming per rd port: 278 | -- this is based on the observation that: 279 | -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b 280 | local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams 281 | local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams 282 | streamer.min_timeout_rd = math.ceil(dead_cycles_rd / 283 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 284 | - streamer.max_parallel_streams)) 285 | streamer.min_timeout_wr = math.ceil(dead_cycles_wr / 286 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 287 | - streamer.max_parallel_streams)) 288 | --print('# streamer min timeouts: wr='.. streamer.min_timeout_wr 289 | -- .. ' and rd=' .. 
streamer.min_timeout_rd) 290 | -- for these timeouts, we compute necessary buffers to insure no one is starving 291 | streamer.min_cache_rd = (math.ceil(streamer.word_b * (dead_cycles_rd 292 | + streamer.min_timeout_rd 293 | *(streamer.max_parallel_streams-1)) 294 | / streamer.mem_bus_b)) 295 | streamer.min_cache_wr = (math.ceil(streamer.word_b * (dead_cycles_wr 296 | + streamer.min_timeout_wr 297 | *(streamer.max_parallel_streams-1)) 298 | / streamer.mem_bus_b)) 299 | --print('# streamer min cache sizes: wr='..streamer.min_cache_wr 300 | -- ..' and rd='..streamer.min_cache_rd) 301 | end 302 | 303 | 304 | ---------------------------------------------------------------------- 305 | --- Num parameters 306 | -- 307 | num = {} 308 | do 309 | num.size_b = 2 310 | num.size_ = 16 311 | num.frac_ = 8 312 | num.int_ = num.size_-num.frac_ 313 | num.max = (2^(num.size_-1)-1) / 2^num.frac_ 314 | num.min = -(2^(num.size_-1)) / 2^num.frac_ 315 | num.one = 2^num.frac_ 316 | num.res = 1 / 2^num.frac_ 317 | num.precision = num.res 318 | num.mask = 0xFFFF 319 | end 320 | 321 | 322 | ---------------------------------------------------------------------- 323 | --- System Banner 324 | -- 325 | banner = 326 | '------------------------------------------------------------\r\n' .. 327 | '-- _ _ __ neuFlow [v.1.0] --\r\n' .. 328 | '-- ( | )/_/ --\r\n' .. 329 | '-- __( >O< ) This code runs on --\r\n' .. 330 | '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' .. 331 | '-- --\r\n' .. 332 | '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' .. 
333 | '------------------------------------------------------------' 334 | 335 | 336 | ---------------------------------------------------------------------- 337 | --- BootLoader parameters 338 | -- 339 | bootloader = {} 340 | do 341 | bootloader.entry_point_b = oFlower.cache_size_b 342 | bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b 343 | bootloader.load_size = 32*MB 344 | end 345 | -------------------------------------------------------------------------------- /src/defines_ibm_asic.lua: -------------------------------------------------------------------------------- 1 | -- -*- lua -*- 2 | 3 | ---------------------------------------------------------------------- 4 | --- Useful abbrevs 5 | -- 6 | kB = 1024 7 | MB = 1024*1024 8 | GB = 1024*1024*1024 9 | kHz = 1000 10 | MHz = 1000*1000 11 | GHz = 1000*1000*1000 12 | 13 | ---------------------------------------------------------------------- 14 | --- Blast Bus parameters 15 | -- 16 | blast_bus = { 17 | -- Addressing : 18 | area_streamer = 1, 19 | area_tile = 2, 20 | area_memctrl = 3, 21 | area_dma = 4, 22 | -- 23 | addr_broadcast = 0, 24 | addr_conv_0 = 1, 25 | addr_conv_1 = 2, 26 | addr_comb_0 = 16, 27 | addr_mapp_0 = 24, 28 | addr_div_0 = 28, 29 | addr_grid_0 = 256, 30 | addr_mem_streamer_0 = 1, 31 | addr_mem_streamer_1 = 2, 32 | addr_mem_streamer_2 = 3, 33 | addr_mem_streamer_3 = 4, 34 | addr_mem_streamer_4 = 5, 35 | addr_mem_streamer_5 = 6, 36 | addr_mem_streamer_6 = 7, 37 | addr_mem_streamer_7 = 8, 38 | addr_dma = 0, 39 | addr_memctrl = 0, 40 | -- 41 | subAddr_router = 0, 42 | subAddr_operator = 1, 43 | subAddr_cacher = 2, 44 | subAddr_IO = 3, 45 | subAddr_none = 0, 46 | subAddr_memTimeouts = 0, 47 | subAddr_memGlobals = 1, 48 | subAddr_memLocals = 2, 49 | 50 | -- Content: 51 | content_nothing = 0, 52 | content_command = 1, 53 | content_instruc = 2, 54 | content_config = 3, 55 | content_valid = 1, 56 | 57 | -- Instructions 58 | instruc_config = 0, 59 | instruc_setAdd = 1, 60 | 
instruc_activate = 2, 61 | instruc_deActivate = 3, 62 | instruc_reset = 4, 63 | instruc_RESERVED_1 = 5, 64 | instruc_control_0 = 6, 65 | instruc_control_1 = 7, 66 | instruc_control_2 = 8, 67 | instruc_control_3 = 9, 68 | instruc_control_4 = 10, 69 | instruc_control_5 = 11, 70 | instruc_control_6 = 12, 71 | instruc_control_7 = 13, 72 | instruc_cacheStart = 14, 73 | instruc_cacheFinish = 15, 74 | 75 | -- Status 76 | status_notAddressed = 0, 77 | status_idle = 1, 78 | status_busy = 2, 79 | status_done = 3, 80 | status_primed = 4, 81 | status_unconfigured = 5, 82 | status_misconfigured = 6 83 | } 84 | 85 | 86 | ---------------------------------------------------------------------- 87 | --- OpenFlower Instruction Set. 88 | -- 89 | oFlower = { 90 | -- Opcodes 91 | op_writeConfig = 0, 92 | op_getStatus = 1, 93 | op_writeStream = 2, 94 | op_routeStream = 3, 95 | op_writeWord = 4, 96 | op_readWord = 5, 97 | op_setReg = 6, 98 | op_goto = 7, 99 | op_add = 8, 100 | op_control = 9, 101 | op_and = 10, 102 | op_or = 11, 103 | op_comp = 12, 104 | op_shr = 13, 105 | op_nop = 14, 106 | op_term = 15, 107 | 108 | -- Register map 109 | reg_operation = 0, 110 | reg_size = 1, 111 | reg_type = 2, 112 | reg_state = 3, 113 | reg_counter = 4, 114 | reg_loops = 5, 115 | reg_status = 6, 116 | reg_sys_A = 7, 117 | reg_sys_B = 8, 118 | reg_sys_C = 9, 119 | reg_A = 10, 120 | reg_B = 11, 121 | reg_C = 12, 122 | reg_D = 13, 123 | reg_E = 14, 124 | reg_F = 15, 125 | 126 | -- ctrl map 127 | ctrl_lock_config_bus = 0, 128 | 129 | -- I/O Map 130 | io_uart = 0, 131 | io_uart_status = 1, 132 | io_dma = 2, 133 | io_dma_status = 3, 134 | io_ethernet = 4, 135 | io_ethernet_status = 5, 136 | io_iic = 6, 137 | io_iic_status = 7, 138 | io_spi = 8, 139 | io_spi_status = 8, 140 | io_gpios = 10, 141 | io_timer = 11, 142 | io_timer_ctrl = 12, 143 | 144 | -- CPU types 145 | type_uint8 = 8, 146 | type_uint16 = 4, 147 | type_uint32 = 2, 148 | type_uint64 = 1, 149 | 150 | -- clock 151 | clock_freq = 200*MHz, 152 | 
uart_freq = 57600, 153 | 154 | -- nb of dmas (this includes instruction path) 155 | nb_dmas = 2 156 | } 157 | do 158 | -- Cache 159 | oFlower.cache_size_b = 64*kB 160 | oFlower.page_size_b = oFlower.cache_size_b/2 161 | oFlower.bus_ = 64 162 | oFlower.bus_b = oFlower.bus_/8 163 | end 164 | 165 | 166 | ---------------------------------------------------------------------- 167 | --- Grid parameters 168 | -- 169 | grid = {} 170 | do 171 | -- nb of grids 172 | grid.nb_grids = 1 173 | -- global IOs 174 | grid.nb_ios = 7 175 | -- conv 176 | grid.nb_convs = 4 177 | grid.kernel_width = 10 178 | grid.kernel_height = 10 179 | -- mapper 180 | grid.nb_mappers = 4 181 | grid.mapper_segs = 8 182 | -- generic ALUs 183 | grid.nb_alus = 4 184 | -- clock: 185 | grid.clock_freq = 400*MHz 186 | end 187 | 188 | 189 | ---------------------------------------------------------------------- 190 | --- General DMAs 191 | -- 192 | dma = {} 193 | do 194 | -- global DMA IOs 195 | dma.nb_ios = 2 196 | end 197 | 198 | 199 | ---------------------------------------------------------------------- 200 | --- Streamer parameters 201 | -- 202 | -- Units: 203 | -- _: bits 204 | -- _b: bytes 205 | -- _w: words (1 word = word_b bytes) 206 | -- _r: memory rows (1 row = size_b bytes) 207 | -- _i: integers (1 int = 4 bytes) 208 | -- 209 | streamer = {} 210 | do 211 | -- physical params 212 | streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios * grid.nb_grids 213 | -- geometry 214 | streamer.mem_bus_ = 256 215 | streamer.mem_bus_b = 256 / 8 216 | streamer.stride_b = 2048 217 | streamer.word_b = 2 218 | streamer.align_b = streamer.mem_bus_ / 8 219 | streamer.stride_w = streamer.stride_b / streamer.word_b 220 | streamer.align_w = streamer.align_b / streamer.word_b 221 | -- clock 222 | streamer.clock_freq = 400*MHz 223 | end 224 | 225 | 226 | ---------------------------------------------------------------------- 227 | --- Memory parameters 228 | -- 229 | -- the parameters are expressed in different 
units: 230 | -- _: bits 231 | -- _b: bytes 232 | -- _w: words (1 word = word_b bytes) 233 | -- _r: memory rows (1 row = size_b bytes) 234 | -- _i: integers (1 int = 4 bytes) 235 | -- 236 | memory = {} 237 | do 238 | -- size: 239 | memory.size_b = 16*MB 240 | memory.size_w = memory.size_b / streamer.word_b 241 | memory.size_r = memory.size_b / streamer.stride_b 242 | -- clock: 243 | memory.clock_freq = 400*MHz 244 | -- bandwidth 245 | memory.bus_ = 64 246 | memory.is_ddr = true 247 | memory.is_dual = true 248 | memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1) 249 | memory.bandwidth_b = memory.bandwidth_ / 8 250 | memory.bandwidth_w = memory.bandwidth_b / streamer.word_b 251 | 252 | memory.offset_text = 0 253 | end 254 | 255 | 256 | ---------------------------------------------------------------------- 257 | --- Extra Streamer parameters 258 | -- 259 | do 260 | -- parallel streams: this is application dependent 261 | streamer.max_parallel_rd_streams = grid.nb_convs + 1 262 | streamer.max_parallel_wr_streams = 1 263 | streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams 264 | -- bandwidth per stream: 265 | streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b 266 | streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams 267 | streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor 268 | -- bandwidth first check 269 | if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then 270 | print('ERROR internal bandwidth too high: ' 271 | .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s' 272 | .. ' > external bandwidth available: ' 273 | .. 
streamer.mem_bandwidth_b/1e9 ..'GB/s') 274 | os.exit() 275 | end 276 | -- continous streaming per rd port: 277 | -- this is based on the observation that: 278 | -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b 279 | local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams 280 | local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams 281 | streamer.min_timeout_rd = math.ceil(dead_cycles_rd / 282 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 283 | - streamer.max_parallel_streams)) 284 | streamer.min_timeout_wr = math.ceil(dead_cycles_wr / 285 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 286 | - streamer.max_parallel_streams)) 287 | --print('# streamer min timeouts: wr='.. streamer.min_timeout_wr 288 | -- .. ' and rd=' .. streamer.min_timeout_rd) 289 | -- for these timeouts, we compute necessary buffers to insure no one is starving 290 | streamer.min_cache_rd = (math.ceil(streamer.word_b * (dead_cycles_rd 291 | + streamer.min_timeout_rd 292 | *(streamer.max_parallel_streams-1)) 293 | / streamer.mem_bus_b)) 294 | streamer.min_cache_wr = (math.ceil(streamer.word_b * (dead_cycles_wr 295 | + streamer.min_timeout_wr 296 | *(streamer.max_parallel_streams-1)) 297 | / streamer.mem_bus_b)) 298 | --print('# streamer min cache sizes: wr='..streamer.min_cache_wr 299 | -- ..' 
end


----------------------------------------------------------------------
--- Num parameters (fixed-point format)
--
num = {}
do
   num.size_b = 2
   num.size_ = 16
   num.frac_ = 8
   num.int_ = num.size_-num.frac_
   num.max = (2^(num.size_-1)-1) / 2^num.frac_
   num.min = -(2^(num.size_-1)) / 2^num.frac_
   num.one = 2^num.frac_
   num.res = 1 / 2^num.frac_
   num.precision = num.res
   num.mask = 0xFFFF
end


----------------------------------------------------------------------
--- System Banner
--
banner =
   '------------------------------------------------------------\r\n' ..
   '-- _ _ __ neuFlow [v.1.0] --\r\n' ..
   '-- ( | )/_/ --\r\n' ..
   '-- __( >O< ) This code runs on --\r\n' ..
   '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' ..
   '-- --\r\n' ..
   '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' ..
   '------------------------------------------------------------'


----------------------------------------------------------------------
--- BootLoader parameters
--
bootloader = {}
do
   bootloader.entry_point_b = oFlower.cache_size_b
   bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b
   bootloader.load_size = 32*MB
end

----------------------------------------------------------------------
-- /src/defines_pico_m503.lua
----------------------------------------------------------------------
-- -*- lua -*-

----------------------------------------------------------------------
--- Useful abbrevs
--
kB  = 1024
MB  = 1024*1024
GB  = 1024*1024*1024
kHz = 1000
MHz = 1000*1000
GHz = 1000*1000*1000

----------------------------------------------------------------------
--- Blast Bus parameters
--
blast_bus = {
   -- addressing: functional areas
   area_streamer = 1, area_tile = 2, area_memctrl = 3, area_dma = 4,
   -- tile/grid addresses
   addr_broadcast = 0, addr_conv_0 = 1, addr_conv_1 = 2,
   addr_comb_0 = 16, addr_mapp_0 = 24, addr_div_0 = 28, addr_grid_0 = 256,
   addr_mem_streamer_0 = 1, addr_mem_streamer_1 = 2, addr_mem_streamer_2 = 3,
   addr_mem_streamer_3 = 4, addr_mem_streamer_4 = 5, addr_mem_streamer_5 = 6,
   addr_mem_streamer_6 = 7, addr_mem_streamer_7 = 8,
   addr_dma = 0, addr_memctrl = 0,
   -- sub-addresses within an area
   subAddr_router = 0, subAddr_operator = 1, subAddr_cacher = 2, subAddr_IO = 3,
   subAddr_none = 0, subAddr_memTimeouts = 0, subAddr_memGlobals = 1, subAddr_memLocals = 2,

   -- content carried on the bus
   content_nothing = 0, content_command = 1, content_instruc = 2,
   content_config = 3, content_valid = 1,

   -- instructions
   instruc_config = 0, instruc_setAdd = 1, instruc_activate = 2,
   instruc_deActivate = 3, instruc_reset = 4, instruc_RESERVED_1 = 5,
   instruc_control_0 = 6, instruc_control_1 = 7, instruc_control_2 = 8,
   instruc_control_3 = 9, instruc_control_4 = 10, instruc_control_5 = 11,
   instruc_control_6 = 12, instruc_control_7 = 13,
   instruc_cacheStart = 14, instruc_cacheFinish = 15,

   -- status codes reported back
   status_notAddressed = 0, status_idle = 1, status_busy = 2, status_done = 3,
   status_primed = 4, status_unconfigured = 5, status_misconfigured = 6
}


----------------------------------------------------------------------
--- OpenFlower Instruction Set.
--
oFlower = {
   -- opcodes
   op_writeConfig = 0, op_getStatus = 1, op_writeStream = 2, op_routeStream = 3,
   op_writeWord = 4, op_readWord = 5, op_setReg = 6, op_goto = 7,
   op_add = 8, op_control = 9, op_and = 10, op_or = 11,
   op_comp = 12, op_shr = 13, op_nop = 14, op_term = 15,

   -- register map
   reg_operation = 0, reg_size = 1, reg_type = 2, reg_state = 3,
   reg_counter = 4, reg_loops = 5, reg_status = 6,
   reg_sys_A = 7, reg_sys_B = 8, reg_sys_C = 9,
   reg_A = 10, reg_B = 11, reg_C = 12, reg_D = 13, reg_E = 14, reg_F = 15,

   -- ctrl map
   ctrl_lock_config_bus = 0,

   -- I/O map
   io_uart = 0, io_uart_status = 1, io_dma = 2, io_dma_status = 3,
   io_ethernet = 4, io_ethernet_status = 5, io_iic = 6, io_iic_status = 7,
   -- NOTE(review): io_spi and io_spi_status share id 8 and id 9 is unused --
   -- looks like a typo for 9, but kept as-is; verify against the RTL I/O map
   io_spi = 8, io_spi_status = 8,
   io_gpios = 10, io_timer = 11, io_timer_ctrl = 12,

   -- CPU transfer type codes
   type_uint8 = 8, type_uint16 = 4, type_uint32 = 2, type_uint64 = 1,

   -- clocks
   clock_freq = 100*MHz,
   uart_freq = 57600,

   -- nb of dmas (this includes the instruction path)
   nb_dmas = 2
}
do
   -- instruction cache geometry
   oFlower.cache_size_b = 64*kB
   oFlower.page_size_b = oFlower.cache_size_b/2
   oFlower.bus_ = 64
   oFlower.bus_b = oFlower.bus_/8
end

----------------------------------------------------------------------
--- General DMAs (this target adds two camera ports)
--
dma = {}
do
   dma.nb_ios = 4
   dma.ethernet_write_port_id = 2
   dma.ethernet_read_port_id = 3
   dma.camera_A_port_id = 4
   dma.camera_B_port_id = 5
end

----------------------------------------------------------------------
--- Grid parameters
--
grid = {}
do
   grid.nb_grids = 1          -- nb of grids
   grid.nb_ios = 6            -- global IOs
   grid.nb_convs = 4          -- convolvers
   grid.kernel_width = 10
   grid.kernel_height = 10
   grid.nb_mappers = 4        -- mappers
   grid.mapper_segs = 8
   grid.nb_alus = 4           -- generic ALUs
   grid.clock_freq = 200*MHz
end


----------------------------------------------------------------------
--- Streamer parameters
-- Units: _ bits | _b bytes | _w words (word_b bytes) | _r rows (stride_b bytes) | _i ints (4 bytes)
--
streamer = {}
do
   -- physical params
   streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios
   -- geometry
   streamer.mem_bus_ = 256
   streamer.mem_bus_b = 256 / 8
   streamer.stride_b = 2048
   streamer.word_b = 2
   streamer.align_b = streamer.mem_bus_ / 8
   streamer.stride_w = streamer.stride_b / streamer.word_b
   streamer.align_w = streamer.align_b / streamer.word_b
   -- clock
   streamer.clock_freq = 200*MHz
end


----------------------------------------------------------------------
--- Memory parameters (same unit suffixes as the streamer)
--
memory = {}
do
   memory.size_b = 512*MB
   memory.size_w = memory.size_b / streamer.word_b
   memory.size_r = memory.size_b / streamer.stride_b
   memory.clock_freq = 400*MHz
   memory.bus_ = 32
   memory.is_ddr = true
   memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1)
   memory.bandwidth_b = memory.bandwidth_ / 8
   memory.bandwidth_w = memory.bandwidth_b / streamer.word_b

   memory.offset_text = 0
end


----------------------------------------------------------------------
--- Extra Streamer parameters (derived)
--
do
   -- parallel streams: application dependent
   streamer.max_parallel_rd_streams = grid.nb_convs + 1
   streamer.max_parallel_wr_streams = 1
   streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams
   -- bandwidth per stream
   streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b
   streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams
   streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor
   -- sanity check: grid demand must fit external bandwidth
   if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then
      print('ERROR internal bandwidth too high: '
            .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s'
            .. ' > external bandwidth available: '
            .. streamer.mem_bandwidth_b/1e9 ..'GB/s')
      os.exit()
   end
   -- min timeouts for continuous streaming, from:
   -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b
   local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams
   local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams
   streamer.min_timeout_rd = math.ceil(dead_cycles_rd /
      ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) - streamer.max_parallel_streams))
   streamer.min_timeout_wr = math.ceil(dead_cycles_wr /
      ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) - streamer.max_parallel_streams))
   -- buffers needed so no port starves at those timeouts
   streamer.min_cache_rd = math.ceil(streamer.word_b *
      (dead_cycles_rd + streamer.min_timeout_rd*(streamer.max_parallel_streams-1))
      / streamer.mem_bus_b)
   streamer.min_cache_wr = math.ceil(streamer.word_b *
      (dead_cycles_wr + streamer.min_timeout_wr*(streamer.max_parallel_streams-1))
      / streamer.mem_bus_b)
end


----------------------------------------------------------------------
--- Num parameters (fixed-point format)
--
num = {}
do
   num.size_b = 2
   num.size_ = 16
   num.frac_ = 8
   num.int_ = num.size_-num.frac_
   num.max = (2^(num.size_-1)-1) / 2^num.frac_
   num.min = -(2^(num.size_-1)) / 2^num.frac_
   num.one = 2^num.frac_
   num.res = 1 / 2^num.frac_
   num.precision = num.res
   num.mask = 0xFFFF
end


----------------------------------------------------------------------
--- System Banner
--
banner =
   '------------------------------------------------------------\r\n' ..
   '-- _ _ __ neuFlow [v.1.0] --\r\n' ..
   '-- ( | )/_/ --\r\n' ..
   '-- __( >O< ) This code runs on --\r\n' ..
   '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' ..
   '-- --\r\n' ..
   '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' ..
333 | '------------------------------------------------------------' 334 | 335 | 336 | ---------------------------------------------------------------------- 337 | --- BootLoader parameters 338 | -- 339 | bootloader = {} 340 | do 341 | bootloader.entry_point_b = oFlower.cache_size_b 342 | bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b 343 | bootloader.load_size = 32*MB 344 | end 345 | -------------------------------------------------------------------------------- /src/defines_xilinx_ml605.lua: -------------------------------------------------------------------------------- 1 | -- -*- lua -*- 2 | 3 | ---------------------------------------------------------------------- 4 | --- Useful abbrevs 5 | -- 6 | kB = 1024 7 | MB = 1024*1024 8 | GB = 1024*1024*1024 9 | kHz = 1000 10 | MHz = 1000*1000 11 | GHz = 1000*1000*1000 12 | 13 | ---------------------------------------------------------------------- 14 | --- Blast Bus parameters 15 | -- 16 | blast_bus = { 17 | -- Addressing : 18 | area_streamer = 1, 19 | area_tile = 2, 20 | area_memctrl = 3, 21 | area_dma = 4, 22 | -- 23 | addr_broadcast = 0, 24 | addr_conv_0 = 1, 25 | addr_conv_1 = 2, 26 | addr_comb_0 = 16, 27 | addr_mapp_0 = 24, 28 | addr_div_0 = 28, 29 | addr_grid_0 = 256, 30 | addr_mem_streamer_0 = 1, 31 | addr_mem_streamer_1 = 2, 32 | addr_mem_streamer_2 = 3, 33 | addr_mem_streamer_3 = 4, 34 | addr_mem_streamer_4 = 5, 35 | addr_mem_streamer_5 = 6, 36 | addr_mem_streamer_6 = 7, 37 | addr_mem_streamer_7 = 8, 38 | addr_dma = 0, 39 | addr_memctrl = 0, 40 | -- 41 | subAddr_router = 0, 42 | subAddr_operator = 1, 43 | subAddr_cacher = 2, 44 | subAddr_IO = 3, 45 | subAddr_none = 0, 46 | subAddr_memTimeouts = 0, 47 | subAddr_memGlobals = 1, 48 | subAddr_memLocals = 2, 49 | 50 | -- Content: 51 | content_nothing = 0, 52 | content_command = 1, 53 | content_instruc = 2, 54 | content_config = 3, 55 | content_valid = 1, 56 | 57 | -- Instructions 58 | instruc_config = 0, 59 | instruc_setAdd = 1, 60 | 
instruc_activate = 2, 61 | instruc_deActivate = 3, 62 | instruc_reset = 4, 63 | instruc_RESERVED_1 = 5, 64 | instruc_control_0 = 6, 65 | instruc_control_1 = 7, 66 | instruc_control_2 = 8, 67 | instruc_control_3 = 9, 68 | instruc_control_4 = 10, 69 | instruc_control_5 = 11, 70 | instruc_control_6 = 12, 71 | instruc_control_7 = 13, 72 | instruc_cacheStart = 14, 73 | instruc_cacheFinish = 15, 74 | 75 | -- Status 76 | status_notAddressed = 0, 77 | status_idle = 1, 78 | status_busy = 2, 79 | status_done = 3, 80 | status_primed = 4, 81 | status_unconfigured = 5, 82 | status_misconfigured = 6 83 | } 84 | 85 | 86 | ---------------------------------------------------------------------- 87 | --- OpenFlower Instruction Set. 88 | -- 89 | oFlower = { 90 | -- Opcodes 91 | op_writeConfig = 0, 92 | op_getStatus = 1, 93 | op_writeStream = 2, 94 | op_routeStream = 3, 95 | op_writeWord = 4, 96 | op_readWord = 5, 97 | op_setReg = 6, 98 | op_goto = 7, 99 | op_add = 8, 100 | op_control = 9, 101 | op_and = 10, 102 | op_or = 11, 103 | op_comp = 12, 104 | op_shr = 13, 105 | op_nop = 14, 106 | op_term = 15, 107 | 108 | -- Register map 109 | reg_operation = 0, 110 | reg_size = 1, 111 | reg_type = 2, 112 | reg_state = 3, 113 | reg_counter = 4, 114 | reg_loops = 5, 115 | reg_status = 6, 116 | reg_sys_A = 7, 117 | reg_sys_B = 8, 118 | reg_sys_C = 9, 119 | reg_A = 10, 120 | reg_B = 11, 121 | reg_C = 12, 122 | reg_D = 13, 123 | reg_E = 14, 124 | reg_F = 15, 125 | 126 | -- ctrl map 127 | ctrl_lock_config_bus = 0, 128 | 129 | -- I/O Map 130 | io_uart = 0, 131 | io_uart_status = 1, 132 | io_dma = 2, 133 | io_dma_status = 3, 134 | io_ethernet = 4, 135 | io_ethernet_status = 5, 136 | io_iic = 6, 137 | io_iic_status = 7, 138 | io_spi = 8, 139 | io_spi_status = 8, 140 | io_gpios = 10, 141 | io_timer = 11, 142 | io_timer_ctrl = 12, 143 | 144 | -- CPU types 145 | type_uint8 = 8, 146 | type_uint16 = 4, 147 | type_uint32 = 2, 148 | type_uint64 = 1, 149 | 150 | -- clock 151 | clock_freq = 100*MHz, 152 | 
uart_freq = 57600, 153 | 154 | -- nb of dmas (this includes instruction path) 155 | nb_dmas = 2 156 | } 157 | do 158 | -- Cache 159 | oFlower.cache_size_b = 64*kB 160 | oFlower.page_size_b = oFlower.cache_size_b/2 161 | oFlower.bus_ = 64 162 | oFlower.bus_b = oFlower.bus_/8 163 | end 164 | 165 | 166 | ---------------------------------------------------------------------- 167 | --- Grid parameters 168 | -- 169 | grid = {} 170 | do 171 | -- nb of grids 172 | grid.nb_grids = 1 173 | -- global IOs 174 | grid.nb_ios = 6 175 | -- conv 176 | grid.nb_convs = 4 177 | grid.kernel_width = 10 178 | grid.kernel_height = 10 179 | -- mapper 180 | grid.nb_mappers = 4 181 | grid.mapper_segs = 8 182 | -- generic ALUs 183 | grid.nb_alus = 4 184 | -- clock: 185 | grid.clock_freq = 200*MHz 186 | end 187 | 188 | 189 | ---------------------------------------------------------------------- 190 | --- General DMAs 191 | -- 192 | dma = {} 193 | do 194 | -- global DMA IOs 195 | dma.nb_ios = 0 196 | end 197 | 198 | 199 | ---------------------------------------------------------------------- 200 | --- Streamer parameters 201 | -- 202 | -- Units: 203 | -- _: bits 204 | -- _b: bytes 205 | -- _w: words (1 word = word_b bytes) 206 | -- _r: memory rows (1 row = size_b bytes) 207 | -- _i: integers (1 int = 4 bytes) 208 | -- 209 | streamer = {} 210 | do 211 | -- physical params 212 | streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios 213 | -- geometry 214 | streamer.mem_bus_ = 256 215 | streamer.mem_bus_b = 256 / 8 216 | streamer.stride_b = 2048 217 | streamer.word_b = 2 218 | streamer.align_b = streamer.mem_bus_ / 8 219 | streamer.stride_w = streamer.stride_b / streamer.word_b 220 | streamer.align_w = streamer.align_b / streamer.word_b 221 | -- clock 222 | streamer.clock_freq = 200*MHz 223 | end 224 | 225 | 226 | ---------------------------------------------------------------------- 227 | --- Memory parameters 228 | -- 229 | -- the parameters are expressed in different units: 230 | -- _: 
bits 231 | -- _b: bytes 232 | -- _w: words (1 word = word_b bytes) 233 | -- _r: memory rows (1 row = size_b bytes) 234 | -- _i: integers (1 int = 4 bytes) 235 | -- 236 | memory = {} 237 | do 238 | -- size: 239 | memory.size_b = 512*MB 240 | memory.size_w = memory.size_b / streamer.word_b 241 | memory.size_r = memory.size_b / streamer.stride_b 242 | -- clock: 243 | memory.clock_freq = 400*MHz 244 | -- bandwidth 245 | memory.bus_ = 32 246 | memory.is_ddr = true 247 | memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1) 248 | memory.bandwidth_b = memory.bandwidth_ / 8 249 | memory.bandwidth_w = memory.bandwidth_b / streamer.word_b 250 | 251 | memory.offset_text = 0 252 | end 253 | 254 | 255 | ---------------------------------------------------------------------- 256 | --- Extra Streamer parameters 257 | -- 258 | do 259 | -- parallel streams: this is application dependent 260 | streamer.max_parallel_rd_streams = grid.nb_convs + 1 261 | streamer.max_parallel_wr_streams = 1 262 | streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams 263 | -- bandwidth per stream: 264 | streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b 265 | streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams 266 | streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor 267 | -- bandwidth first check 268 | if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then 269 | print('ERROR internal bandwidth too high: ' 270 | .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s' 271 | .. ' > external bandwidth available: ' 272 | .. 
streamer.mem_bandwidth_b/1e9 ..'GB/s') 273 | os.exit() 274 | end 275 | -- continuous streaming per rd port: 276 | -- this is based on the observation that: 277 | -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b 278 | local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams 279 | local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams 280 | streamer.min_timeout_rd = math.ceil(dead_cycles_rd / 281 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 282 | - streamer.max_parallel_streams)) 283 | streamer.min_timeout_wr = math.ceil(dead_cycles_wr / 284 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 285 | - streamer.max_parallel_streams)) 286 | --print('# streamer min timeouts: wr='.. streamer.min_timeout_wr 287 | -- .. ' and rd=' .. streamer.min_timeout_rd) 288 | -- for these timeouts, we compute necessary buffers to ensure no one is starving 289 | streamer.min_cache_rd = (math.ceil(streamer.word_b * (dead_cycles_rd 290 | + streamer.min_timeout_rd 291 | *(streamer.max_parallel_streams-1)) 292 | / streamer.mem_bus_b)) 293 | streamer.min_cache_wr = (math.ceil(streamer.word_b * (dead_cycles_wr 294 | + streamer.min_timeout_wr 295 | *(streamer.max_parallel_streams-1)) 296 | / streamer.mem_bus_b)) 297 | --print('# streamer min cache sizes: wr='..streamer.min_cache_wr 298 | -- ..'
and rd='..streamer.min_cache_rd) 299 | end 300 | 301 | 302 | ---------------------------------------------------------------------- 303 | --- Num parameters 304 | -- 305 | num = {} 306 | do 307 | num.size_b = 2 308 | num.size_ = 16 309 | num.frac_ = 8 310 | num.int_ = num.size_-num.frac_ 311 | num.max = (2^(num.size_-1)-1) / 2^num.frac_ 312 | num.min = -(2^(num.size_-1)) / 2^num.frac_ 313 | num.one = 2^num.frac_ 314 | num.res = 1 / 2^num.frac_ 315 | num.precision = num.res 316 | num.mask = 0xFFFF 317 | end 318 | 319 | 320 | ---------------------------------------------------------------------- 321 | --- System Banner 322 | -- 323 | banner = 324 | '------------------------------------------------------------\r\n' .. 325 | '-- _ _ __ neuFlow [v.1.0] --\r\n' .. 326 | '-- ( | )/_/ --\r\n' .. 327 | '-- __( >O< ) This code runs on --\r\n' .. 328 | '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' .. 329 | '-- --\r\n' .. 330 | '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' .. 331 | '------------------------------------------------------------' 332 | 333 | 334 | ---------------------------------------------------------------------- 335 | --- BootLoader parameters 336 | -- 337 | bootloader = {} 338 | do 339 | bootloader.entry_point_b = oFlower.cache_size_b 340 | bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b 341 | bootloader.load_size = 32*MB 342 | end 343 | -------------------------------------------------------------------------------- /src/defines_xilinx_ml605_tbsp.lua: -------------------------------------------------------------------------------- 1 | -- -*- lua -*- 2 | 3 | ---------------------------------------------------------------------- 4 | --- Useful abbrevs 5 | -- 6 | kB = 1024 7 | MB = 1024*1024 8 | GB = 1024*1024*1024 9 | kHz = 1000 10 | MHz = 1000*1000 11 | GHz = 1000*1000*1000 12 | 13 | ---------------------------------------------------------------------- 14 | --- Blast Bus parameters 15 | -- 16 | blast_bus = { 17 | -- 
Addressing : 18 | area_streamer = 1, 19 | area_tile = 2, 20 | area_memctrl = 3, 21 | area_dma = 4, 22 | -- 23 | addr_broadcast = 0, 24 | addr_conv_0 = 1, 25 | addr_conv_1 = 2, 26 | addr_comb_0 = 16, 27 | addr_mapp_0 = 24, 28 | addr_div_0 = 28, 29 | addr_grid_0 = 256, 30 | addr_mem_streamer_0 = 1, 31 | addr_mem_streamer_1 = 2, 32 | addr_mem_streamer_2 = 3, 33 | addr_mem_streamer_3 = 4, 34 | addr_mem_streamer_4 = 5, 35 | addr_mem_streamer_5 = 6, 36 | addr_mem_streamer_6 = 7, 37 | addr_mem_streamer_7 = 8, 38 | addr_dma = 0, 39 | addr_memctrl = 0, 40 | -- 41 | subAddr_router = 0, 42 | subAddr_operator = 1, 43 | subAddr_cacher = 2, 44 | subAddr_IO = 3, 45 | subAddr_none = 0, 46 | subAddr_memTimeouts = 0, 47 | subAddr_memGlobals = 1, 48 | subAddr_memLocals = 2, 49 | 50 | -- Content: 51 | content_nothing = 0, 52 | content_command = 1, 53 | content_instruc = 2, 54 | content_config = 3, 55 | content_valid = 1, 56 | 57 | -- Instructions 58 | instruc_config = 0, 59 | instruc_setAdd = 1, 60 | instruc_activate = 2, 61 | instruc_deActivate = 3, 62 | instruc_reset = 4, 63 | instruc_RESERVED_1 = 5, 64 | instruc_control_0 = 6, 65 | instruc_control_1 = 7, 66 | instruc_control_2 = 8, 67 | instruc_control_3 = 9, 68 | instruc_control_4 = 10, 69 | instruc_control_5 = 11, 70 | instruc_control_6 = 12, 71 | instruc_control_7 = 13, 72 | instruc_cacheStart = 14, 73 | instruc_cacheFinish = 15, 74 | 75 | -- Status 76 | status_notAddressed = 0, 77 | status_idle = 1, 78 | status_busy = 2, 79 | status_done = 3, 80 | status_primed = 4, 81 | status_unconfigured = 5, 82 | status_misconfigured = 6 83 | } 84 | 85 | 86 | ---------------------------------------------------------------------- 87 | --- OpenFlower Instruction Set. 
88 | -- 89 | oFlower = { 90 | -- Opcodes 91 | op_writeConfig = 0, 92 | op_getStatus = 1, 93 | op_writeStream = 2, 94 | op_routeStream = 3, 95 | op_writeWord = 4, 96 | op_readWord = 5, 97 | op_setReg = 6, 98 | op_goto = 7, 99 | op_add = 8, 100 | op_control = 9, 101 | op_and = 10, 102 | op_or = 11, 103 | op_comp = 12, 104 | op_shr = 13, 105 | op_nop = 14, 106 | op_term = 15, 107 | 108 | -- Register map 109 | reg_operation = 0, 110 | reg_size = 1, 111 | reg_type = 2, 112 | reg_state = 3, 113 | reg_counter = 4, 114 | reg_loops = 5, 115 | reg_status = 6, 116 | reg_sys_A = 7, 117 | reg_sys_B = 8, 118 | reg_sys_C = 9, 119 | reg_A = 10, 120 | reg_B = 11, 121 | reg_C = 12, 122 | reg_D = 13, 123 | reg_E = 14, 124 | reg_F = 15, 125 | 126 | -- ctrl map 127 | ctrl_lock_config_bus = 0, 128 | 129 | -- I/O Map 130 | io_uart = 0, 131 | io_uart_status = 1, 132 | io_dma = 2, 133 | io_dma_status = 3, 134 | io_ethernet = 4, 135 | io_ethernet_status = 5, 136 | io_iic = 6, 137 | io_iic_status = 7, 138 | io_spi = 8,
-- NOTE(review): io_spi_status below maps to the same address as io_spi (both 8).
-- Every other device in this I/O map places its status register at addr+1
-- (uart 0/1, dma 2/3, ethernet 4/5, iic 6/7), and address 9 is unused before
-- io_gpios = 10, so this looks like a copy-paste error for 9 — confirm against
-- the openFlower HDL register map before changing.
139 | io_spi_status = 8, 140 | io_gpios = 10, 141 | io_timer = 11, 142 | io_timer_ctrl = 12, 143 | 144 | -- CPU types 145 | type_uint8 = 8, 146 | type_uint16 = 4, 147 | type_uint32 = 2, 148 | type_uint64 = 1, 149 | 150 | -- clock 151 | clock_freq = 100*MHz, 152 | uart_freq = 57600, 153 | 154 | -- nb of dmas (this includes instruction path) 155 | nb_dmas = 2 156 | } 157 | do 158 | -- Cache 159 | oFlower.cache_size_b = 64*kB 160 | oFlower.page_size_b = oFlower.cache_size_b/2 161 | oFlower.bus_ = 64 162 | oFlower.bus_b = oFlower.bus_/8 163 | end 164 | 165 | 166 | ---------------------------------------------------------------------- 167 | --- Grid parameters 168 | -- 169 | grid = {} 170 | do 171 | -- nb of grids 172 | grid.nb_grids = 1 173 | -- global IOs 174 | grid.nb_ios = 6 175 | -- conv 176 | grid.nb_convs = 4 177 | grid.kernel_width = 10 178 | grid.kernel_height = 10 179 | -- mapper 180 | grid.nb_mappers = 4 181 | grid.mapper_segs = 8 182 | -- generic ALUs 183 | grid.nb_alus
= 4 184 | -- clock: 185 | grid.clock_freq = 200*MHz 186 | end 187 | 188 | 189 | ---------------------------------------------------------------------- 190 | --- General DMAs 191 | -- 192 | dma = {} 193 | do 194 | -- global DMA IOs 195 | dma.nb_ios = 2 196 | dma.ethernet_write_port_id = 2 197 | dma.ethernet_read_port_id = 3 198 | end 199 | 200 | 201 | ---------------------------------------------------------------------- 202 | --- Streamer parameters 203 | -- 204 | -- Units: 205 | -- _: bits 206 | -- _b: bytes 207 | -- _w: words (1 word = word_b bytes) 208 | -- _r: memory rows (1 row = size_b bytes) 209 | -- _i: integers (1 int = 4 bytes) 210 | -- 211 | streamer = {} 212 | do 213 | -- physical params 214 | streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios 215 | -- geometry 216 | streamer.mem_bus_ = 256 217 | streamer.mem_bus_b = 256 / 8 218 | streamer.stride_b = 2048 219 | streamer.word_b = 2 220 | streamer.align_b = streamer.mem_bus_ / 8 221 | streamer.stride_w = streamer.stride_b / streamer.word_b 222 | streamer.align_w = streamer.align_b / streamer.word_b 223 | -- clock 224 | streamer.clock_freq = 200*MHz 225 | end 226 | 227 | 228 | ---------------------------------------------------------------------- 229 | --- Memory parameters 230 | -- 231 | -- the parameters are expressed in different units: 232 | -- _: bits 233 | -- _b: bytes 234 | -- _w: words (1 word = word_b bytes) 235 | -- _r: memory rows (1 row = size_b bytes) 236 | -- _i: integers (1 int = 4 bytes) 237 | -- 238 | memory = {} 239 | do 240 | -- size: 241 | memory.size_b = 512*MB 242 | memory.size_w = memory.size_b / streamer.word_b 243 | memory.size_r = memory.size_b / streamer.stride_b 244 | -- clock: 245 | memory.clock_freq = 400*MHz 246 | -- bandwidth 247 | memory.bus_ = 32 248 | memory.is_ddr = true 249 | memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1) 250 | memory.bandwidth_b = memory.bandwidth_ / 8 251 | memory.bandwidth_w = memory.bandwidth_b / 
streamer.word_b 252 | 253 | memory.offset_text = 0 254 | end 255 | 256 | 257 | ---------------------------------------------------------------------- 258 | --- Extra Streamer parameters 259 | -- 260 | do 261 | -- parallel streams: this is application dependent 262 | streamer.max_parallel_rd_streams = grid.nb_convs + 1 263 | streamer.max_parallel_wr_streams = 1 264 | streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams 265 | -- bandwidth per stream: 266 | streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b 267 | streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams 268 | streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor 269 | -- bandwidth first check 270 | if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then 271 | print('ERROR internal bandwidth too high: ' 272 | .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s' 273 | .. ' > external bandwidth available: ' 274 | .. streamer.mem_bandwidth_b/1e9 ..'GB/s') 275 | os.exit() 276 | end 277 | -- continuous streaming per rd port: 278 | -- this is based on the observation that: 279 | -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b 280 | local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams 281 | local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams 282 | streamer.min_timeout_rd = math.ceil(dead_cycles_rd / 283 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 284 | - streamer.max_parallel_streams)) 285 | streamer.min_timeout_wr = math.ceil(dead_cycles_wr / 286 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 287 | - streamer.max_parallel_streams)) 288 | --print('# streamer min timeouts: wr='.. streamer.min_timeout_wr 289 | -- .. ' and rd=' ..
streamer.min_timeout_rd) 290 | -- for these timeouts, we compute necessary buffers to insure no one is starving 291 | streamer.min_cache_rd = (math.ceil(streamer.word_b * (dead_cycles_rd 292 | + streamer.min_timeout_rd 293 | *(streamer.max_parallel_streams-1)) 294 | / streamer.mem_bus_b)) 295 | streamer.min_cache_wr = (math.ceil(streamer.word_b * (dead_cycles_wr 296 | + streamer.min_timeout_wr 297 | *(streamer.max_parallel_streams-1)) 298 | / streamer.mem_bus_b)) 299 | --print('# streamer min cache sizes: wr='..streamer.min_cache_wr 300 | -- ..' and rd='..streamer.min_cache_rd) 301 | end 302 | 303 | 304 | ---------------------------------------------------------------------- 305 | --- Num parameters 306 | -- 307 | num = {} 308 | do 309 | num.size_b = 2 310 | num.size_ = 16 311 | num.frac_ = 8 312 | num.int_ = num.size_-num.frac_ 313 | num.max = (2^(num.size_-1)-1) / 2^num.frac_ 314 | num.min = -(2^(num.size_-1)) / 2^num.frac_ 315 | num.one = 2^num.frac_ 316 | num.res = 1 / 2^num.frac_ 317 | num.precision = num.res 318 | num.mask = 0xFFFF 319 | end 320 | 321 | 322 | ---------------------------------------------------------------------- 323 | --- System Banner 324 | -- 325 | banner = 326 | '------------------------------------------------------------\r\n' .. 327 | '-- _ _ __ neuFlow [v.1.0] --\r\n' .. 328 | '-- ( | )/_/ --\r\n' .. 329 | '-- __( >O< ) This code runs on --\r\n' .. 330 | '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' .. 331 | '-- --\r\n' .. 332 | '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' .. 
333 | '------------------------------------------------------------' 334 | 335 | 336 | ---------------------------------------------------------------------- 337 | --- BootLoader parameters 338 | -- 339 | bootloader = {} 340 | do 341 | bootloader.entry_point_b = oFlower.cache_size_b 342 | bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b 343 | bootloader.load_size = 32*MB 344 | end 345 | -------------------------------------------------------------------------------- /src/init.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | -- 3 | -- Copyright (c) 2010,2011 Clement Farabet, Polina Akselrod, Berin Martini 4 | -- 5 | -- Permission is hereby granted, free of charge, to any person obtaining 6 | -- a copy of this software and associated documentation files (the 7 | -- "Software"), to deal in the Software without restriction, including 8 | -- without limitation the rights to use, copy, modify, merge, publish, 9 | -- distribute, sublicense, and/or sell copies of the Software, and to 10 | -- permit persons to whom the Software is furnished to do so, subject to 11 | -- the following conditions: 12 | -- 13 | -- The above copyright notice and this permission notice shall be 14 | -- included in all copies or substantial portions of the Software. 15 | -- 16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -- 24 | ---------------------------------------------------------------------- 25 | -- description: 26 | -- neuflow - a compiler toolkit + communication for neuFlow. 27 | -- 28 | -- history: 29 | -- July 16, 2011, 1:51PM - import from Torch5 - Clement Farabet 30 | ---------------------------------------------------------------------- 31 | 32 | -- dependencies 33 | require 'xlua' 34 | require 'os' 35 | require 'torch' 36 | require 'nnx' 37 | require 'bit' 38 | 39 | -- main table 40 | neuflow = {} 41 | 42 | -- load all submodules 43 | torch.include('neuflow', 'defines.lua') 44 | torch.include('neuflow', 'tools.lua') 45 | torch.include('neuflow', 'rom.lua') 46 | torch.include('neuflow', 'Profiler.lua') 47 | torch.include('neuflow', 'Log.lua') 48 | torch.include('neuflow', 'Memory.lua') 49 | torch.include('neuflow', 'Compiler.lua') 50 | torch.include('neuflow', 'Interface.lua') 51 | torch.include('neuflow', 'DmaInterface.lua') 52 | torch.include('neuflow', 'Camera.lua') 53 | torch.include('neuflow', 'Core.lua') 54 | torch.include('neuflow', 'CoreUser.lua') 55 | torch.include('neuflow', 'Linker.lua') 56 | torch.include('neuflow', 'LinkerExtensions.lua') 57 | torch.include('neuflow', 'Serial.lua') 58 | torch.include('neuflow', 'NeuFlow.lua') 59 | 60 | -- shortcut for user interface: 61 | neuflow.init = neuflow.NeuFlow 62 | 63 | -- create a path in home dir to store things 64 | -- like coefficients for example 65 | neuflow.coefpath = os.getenv('HOME')..'/.neuflow/coefs' 66 | os.execute('mkdir -p ' .. neuflow.coefpath) 67 | os.execute('chmod a+rw ' .. neuflow.coefpath) 68 | 69 | -- migrate all the coefficients 70 | os.execute('cp ' .. sys.concat(sys.fpath(), 'coef_*') .. ' ' .. neuflow.coefpath) 71 | os.execute('chmod a+rw ' .. neuflow.coefpath .. 
'/*') 72 | 73 | -- return table 74 | return neuflow 75 | -------------------------------------------------------------------------------- /src/rom.lua: -------------------------------------------------------------------------------- 1 | 2 | neuflow.tools.romTemplate = [[ 3 | /*************************************************************************************************** 4 | * Module: #ROM_NAME 5 | * 6 | * Description: Sync ROM, with registered output. 7 | * This is a template: macros of that kind #*** need to be replaced... 8 | * 9 | * TODO: rst is commented out for now, because not tolerated by XST... 10 | * 11 | * Created: December 13, 2009, 12:11PM 12 | * 13 | * Author: Clement Farabet 14 | **************************************************************************************************/ 15 | `ifndef _#ROM_NAME_ `define _#ROM_NAME_ 16 | 17 | module #ROM_NAME 18 | #(parameter 19 | CPU_ADDR_WIDTH = 32, 20 | ADDR_WIDTH = #ADDR_WIDTH, 21 | DATA_WIDTH = #DATA_WIDTH) 22 | (input wire clk, 23 | input wire rst, 24 | input wire [CPU_ADDR_WIDTH-1:0] address, 25 | output reg [DATA_WIDTH-1:0] data, 26 | input wire en ); 27 | 28 | 29 | /************************************************************************************** 30 | * Internal address 31 | **************************************************************************************/ 32 | wire [ADDR_WIDTH-1:0] addr; 33 | assign addr = address[ADDR_WIDTH-1:0]; 34 | 35 | 36 | /************************************************************************************** 37 | * ROM Storage... a simple case statement. 38 | **************************************************************************************/ 39 | always @ (posedge clk) begin : ROM_STORAGE_ 40 | if (en) begin 41 | case (addr) 42 | #STORAGE 43 | default: data <= #OUTPUT_ON_RESET; 44 | endcase 45 | end 46 | end 47 | 48 | endmodule 49 | 50 | `endif // `ifndef _#ROM_NAME_ 51 | ]] 52 | --------------------------------------------------------------------------------