├── .gitignore ├── CMakeLists.txt ├── README.md ├── demos ├── face-detector-parallel.lua ├── face-detector.lua ├── face-detector │ ├── PyramidPacker.lua │ ├── PyramidUnPacker.lua │ ├── blobParser.lua │ └── face.net ├── filter-bank.lua ├── loopback.lua └── loopback_camera.lua ├── etherflow ├── CMakeLists.txt ├── etherflow.c ├── etherflow.h ├── example.c ├── generic │ └── etherflow.c ├── init.c ├── init.lua └── test │ ├── receive.lua │ └── send.lua ├── ethertbsp ├── CMakeLists.txt ├── ethertbsp.c ├── ethertbsp.h ├── example.c ├── generic │ └── ethertbsp.c ├── init.c ├── init.lua └── test │ ├── receive.lua │ └── send.lua ├── neuflow-1.scm-0.rockspec ├── scripts ├── get-latest-neuflow-image └── load-bitfile ├── segments ├── coef_Abs ├── coef_Sqrt ├── coef_Sqrt_th ├── coef_Sqrt_th_div_3 ├── coef_Sqrt_th_div_32 ├── coef_StdSigm ├── coef_StdSigmAbs ├── coef_StdSigm_abs_err ├── coef_StdSigm_abs_err_all_range ├── coef_Tanh └── coef_TanhAbs └── src ├── Camera.lua ├── Compiler.lua ├── Core.lua ├── CoreUser.lua ├── DmaInterface.lua ├── Interface.lua ├── Linker.lua ├── LinkerExtensions.lua ├── Log.lua ├── Memory.lua ├── NeuFlow.lua ├── Profiler.lua ├── Serial.lua ├── defines.lua ├── defines_ibm_asic.lua ├── defines_pico_m503.lua ├── defines_xilinx_ml605.lua ├── defines_xilinx_ml605_tbsp.lua ├── init.lua ├── rom.lua └── tools.lua /.gitignore: -------------------------------------------------------------------------------- 1 | # Lines starting with '#' are considered comments. 2 | # 3 | # Ignore swap file generated by vim. 
# Build configuration for the neuFlow compiler toolkit.
cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
cmake_policy(VERSION 2.6)

# When installed through Luarocks, infer the install prefix from the
# location of the rocks tree instead of using CMake's default.
if(LUAROCKS_PREFIX)
  message(STATUS "Installing Torch through Luarocks")
  string(REGEX REPLACE "(.*)lib/luarocks/rocks.*" "\\1" CMAKE_INSTALL_PREFIX "${LUAROCKS_PREFIX}")
  message(STATUS "Prefix inferred from Luarocks: ${CMAKE_INSTALL_PREFIX}")
endif()

find_package(Torch REQUIRED)

# Native sub-packages (low-level Ethernet transport layers).
add_subdirectory(etherflow)
add_subdirectory(ethertbsp)

# Pure-Lua sources plus the coefficient segments shipped with the package.
# NOTE(review): file(GLOB) will not notice newly added files until the next
# re-configure; kept as-is to preserve the existing Torch packaging behavior.
set(src)
file(GLOB luasrc src/*.lua segments/*)
add_torch_package(neuflow "${src}" "${luasrc}" "neuFlow")
22 | 23 | On Linux (Ubuntu): 24 | 25 | ``` sh 26 | $ apt-get install gcc g++ git libreadline5-dev cmake wget 27 | $ apt-get install libqt4-core libqt4-gui libqt4-dev 28 | $ apt-get install ffmpeg gnuplot 29 | ``` 30 | 31 | On Mac OS X (> 10.5): get [Homebrew](http://mxcl.github.com/homebrew/) 32 | and then: 33 | 34 | ``` sh 35 | $ brew install git readline cmake wget 36 | $ brew install qt 37 | $ brew install ffmpeg gnuplot 38 | ``` 39 | 40 | You're ready to install Torch7 (www.torch.ch). The most up to date instructions 41 | can be found at the [Torch7 github page](https://github.com/andresy/torch). 42 | 43 | ``` sh 44 | $ git clone git://github.com/andresy/torch.git 45 | $ cd torch 46 | $ mkdir build 47 | $ cd build 48 | 49 | $ cmake .. 50 | OR 51 | $ cmake .. -DCMAKE_INSTALL_PREFIX=/my/install/path 52 | ``` 53 | 54 | Or if you already have a previous Torch7 installed: 55 | 56 | ``` sh 57 | $ luarocks install torch WITH_LUA_JIT=1 # Torch7, an efficient numeric library for Lua 58 | ``` 59 | 60 | You will also need additional packages: 61 | 62 | ``` sh 63 | $ luarocks install image # an image library for Torch7 64 | $ luarocks install nnx # lots of extra neural-net modules 65 | $ luarocks install camera # a camera interface for Linux/MacOS 66 | $ luarocks install ffmpeg # a video decoder for most formats 67 | $ luarocks install inline-c # inline C capability 68 | ``` 69 | 70 | Now that Torch7 has been installed the neuflow package can be installed. 71 | Installing the neuflow package requires you to download the source code 72 | repository. It'll give you access to some demos, to get started: 73 | 74 | ``` sh 75 | $ git clone https://github.com/clementfarabet/neuflow.git 76 | $ cd neuflow 77 | $ luarocks make 78 | ``` 79 | 80 | ## how to run code on neuFlow 81 | 82 | Demos are located in demos/. To get started, you'll need 83 | a standard Xilinx dev board for the Virtex 6: [the ML605 Kit] 84 | (http://www.xilinx.com/products/devkits/EK-V6-ML605-G.htm). 
85 | We provide an image of neuFlow that's pre synthesized/mapped/routed 86 | for the Virtex6 VLX240T on this platform. 87 | 88 | To run any of the demos, follow these instructions (tested on 89 | Ubuntu 9.04, 10.04 and Mac OS X 10.5, 10.6 and 10.7). 90 | 91 | ``` sh 92 | $ git clone https://github.com/clementfarabet/neuflow.git 93 | $ cd neuflow 94 | 95 | # make Xilinx tools available (it implies you have them 96 | # installed somewhere...) 97 | $ source $XILINX_INSTALL_PATH/settings**.sh 98 | 99 | # turn on the ML605, plug the JTAG cable then load one of 100 | # our pre-built bitfiles *: 101 | $ cd scripts 102 | $ ./get-latest-neuflow-image 103 | $ ./load-bitfile neuFlow-ml605.bit 104 | 105 | # at this points, you just have wait 2 seconds that the Ethernet 106 | # LEDs are back on (out of reset) 107 | 108 | # run the simplest demo, a loopback client, to verify your setup **: 109 | $ cd ../demos 110 | $ sudo torch loopback.lua # on Linux 111 | or 112 | $ ./loopback.lua # on OSX 113 | 114 | # before loading a new demo, you have to reset neuFlow: for 115 | # now it is done by pressing the SW10 button (cpu rst) 116 | 117 | # then you can run a typical convnet-based program, a face detector: 118 | $ sudo torch face-detector.lua # on Linux 119 | or 120 | $ ./face-detector.lua # on OSX 121 | ``` 122 | 123 | (*) the load-bitfile script assumes that you have properly installed Xilinx's 124 | USB cable driver. On RedHat and derivatives it works out of the box when 125 | installing Xilinx ISE, but on Ubuntu you'll have to follow these instructions: 126 | http://rmdir.de/~michael/xilinx/. This is not doable on Mac OS X 127 | unfortunately. I usually flash the ML605 board using Ubuntu (even a virtual box 128 | version works), and then run all the demos under Mac OS X. 129 | 130 | (**) you need to have admin privileges on your machine (sudo) to be able to 131 | interact with neuFlow, as we're using a custom low-level Ethernet framing 132 | protocol. 
133 | -------------------------------------------------------------------------------- /demos/face-detector-parallel.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | 3 | -- libs 4 | require 'parallel' 5 | require 'image' 6 | require 'camera' 7 | require 'neuflow' 8 | require 'qtwidget' 9 | 10 | 11 | -- forked process 12 | function worker() 13 | require 'torch' 14 | require 'camera' 15 | 16 | -- camera 17 | local camera = image.Camera{} 18 | 19 | -- image 20 | local frameRGB = torch.Tensor(3,480,640) 21 | 22 | while true do 23 | frameRGB = camera:forward() 24 | 25 | local m = parallel.yield() 26 | if m == 'break' then break end 27 | parallel.parent:send(frameRGB) 28 | 29 | collectgarbage() 30 | end 31 | end 32 | 33 | function parent(arg) 34 | child = parallel.fork() 35 | child:exec(worker) 36 | 37 | ---------------------------------------------------------------------- 38 | -- ARGS: parse user arguments 39 | -- 40 | op = xlua.OptionParser('%prog [options]') 41 | op:option{'-c', '--camera', action='store', dest='camidx', 42 | help='if source=camera, you can specify the camera index: /dev/videoIDX', 43 | default=0} 44 | op:option{'-n', '--network', action='store', dest='network', 45 | help='path to existing [trained] network', 46 | default='face-detector/face.net'} 47 | opt,args = op:parse() 48 | 49 | ---------------------------------------------------------------------- 50 | -- INIT: initialize the neuFlow context 51 | -- a mem manager, the dataflow core, and the compiler 52 | -- 53 | -- platform='xilinx_ml605' or platform='pico_m503' 54 | 55 | local platform = args[1] or 'xilinx_ml605' 56 | nf = neuflow.init { 57 | prog_name = 'face-detector', 58 | platform = platform 59 | } 60 | 61 | ---------------------------------------------------------------------- 62 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 63 | -- how it should interact with the host (data exchange) 64 | -- note: any 
copy**Host() inserted here needs to be matched by 65 | -- a copy**Dev() in the EXEC section. 66 | -- 67 | 68 | -- load pre-trained network from disk 69 | network = torch.load(opt.network) 70 | network_fov = 32 71 | network_sub = 4 72 | softnorm = network.modules[1] 73 | hardnet = nn.Sequential() 74 | for i = 2,#network.modules do 75 | hardnet:add(network.modules[i]) 76 | end 77 | network = hardnet 78 | 79 | -- process input at multiple scales 80 | scales = {0.3, 0.24, 0.192, 0.15, 0.12, 0.1} 81 | 82 | -- use a pyramid packer/unpacker 83 | require 'face-detector/PyramidPacker' 84 | require 'face-detector/PyramidUnPacker' 85 | packer = nn.PyramidPacker(network, scales) 86 | unpacker = nn.PyramidUnPacker(network) 87 | 88 | -- blob parser 89 | parse = require 'face-detector/blobParser' 90 | 91 | -- a gaussian for smoothing the distributions 92 | gaussian = image.gaussian(3,0.15) 93 | 94 | -- generate input data for compiler 95 | frameRGB = torch.Tensor(3,480,640) 96 | frameY = image.rgb2y(frameRGB) 97 | input = packer:forward(frameY) 98 | 99 | -- loop over the main code 100 | nf:beginLoop('main') do 101 | 102 | -- send data to device 103 | input_dev = nf:copyFromHost(input) 104 | 105 | -- compile network 106 | output_dev = nf:compile(network, input_dev) 107 | 108 | -- send result back to host 109 | outputs = nf:copyToHost(output_dev) 110 | 111 | end nf:endLoop('main') 112 | 113 | 114 | -- package hardware network 115 | nf.forward = function(nf,input) 116 | local normed = softnorm:forward(input) 117 | nf:copyToDev(normed) 118 | nf:copyFromDev(outputs) 119 | return outputs 120 | end 121 | 122 | 123 | ---------------------------------------------------------------------- 124 | -- LOAD: load the bytecode on the device, and execute it 125 | -- 126 | nf:sendReset() 127 | nf:loadBytecode() 128 | 129 | ---------------------------------------------------------------------- 130 | -- EXEC: this part executes the host code, and interacts with the dev 131 | -- 132 | 133 | -- 
profiler 134 | p = nf.profiler 135 | 136 | -- zoom 137 | zoom = 0.5 138 | 139 | -- process loop 140 | function process() 141 | p:start('whole-loop','fps') 142 | 143 | -- (1) grab frame 144 | p:start('get-camera-frame') 145 | child:join() 146 | frameRGB = child:receive() 147 | frameRGB = image.scale(frameRGB, 640, 480) 148 | p:lap('get-camera-frame') 149 | 150 | -- (2) transform it into Y space 151 | p:start('RGB->Y') 152 | frameY = image.rgb2y(frameRGB) 153 | p:lap('RGB->Y') 154 | 155 | -- (3) create multiscale pyramid 156 | p:start('pack-pyramid') 157 | pyramid, coordinates = packer:forward(frameY) 158 | p:lap('pack-pyramid') 159 | 160 | -- (4) run pre-trained network on it 161 | p:start('network-inference') 162 | result = nf:forward(pyramid) 163 | p:lap('network-inference') 164 | 165 | -- (5) unpack pyramid 166 | p:start('unpack-pyramid') 167 | distributions = unpacker:forward(result, coordinates) 168 | p:lap('unpack-pyramid') 169 | 170 | -- (6) parse distributions to extract blob centroids 171 | p:start('parse-distributions') 172 | threshold = 0.9 173 | rawresults = {} 174 | for i,distribution in ipairs(distributions) do 175 | local smoothed = image.convolve(distribution[1]:add(1):mul(0.5), gaussian) 176 | parse(smoothed, threshold, rawresults, scales[i]) 177 | end 178 | p:lap('parse-distributions') 179 | 180 | -- (7) clean up results 181 | p:start('clean-up') 182 | detections = {} 183 | for i,res in ipairs(rawresults) do 184 | local scale = res[3] 185 | local x = res[1]*network_sub/scale 186 | local y = res[2]*network_sub/scale 187 | local w = network_fov/scale 188 | local h = network_fov/scale 189 | detections[i] = {x=x, y=y, w=w, h=h} 190 | end 191 | p:lap('clean-up') 192 | end 193 | 194 | -- display loop 195 | function display() 196 | win:gbegin() 197 | win:showpage() 198 | -- (1) display input image + pyramid 199 | image.display{image=frameRGB, win=win} 200 | 201 | -- (2) overlay bounding boxes for each detection 202 | for i,detect in ipairs(detections) do 
203 | win:setcolor(1,0,0) 204 | win:rectangle(detect.x, detect.y, detect.w, detect.h) 205 | win:stroke() 206 | win:setfont(qt.QFont{serif=false,italic=false,size=16}) 207 | win:moveto(detect.x, detect.y-1) 208 | win:show('face') 209 | end 210 | 211 | -- (3) display distributions 212 | local prevx = 0 213 | for i,distribution in ipairs(distributions) do 214 | local prev = distributions[i-1] 215 | if prev then prevx = prevx + prev:size(3) end 216 | image.display{image=distribution[1], win=win, x=prevx, min=0, max=1} 217 | end 218 | 219 | p:lap('whole-loop') 220 | p:displayAll{painter=win, x=5, y=distributions[1]:size(2)+20, font=12} 221 | win:gend() 222 | end 223 | 224 | ---------------------------------------------------------------------- 225 | -- GUI: setup user interface / display 226 | -- 227 | 228 | if not win then 229 | win = qtwidget.newwindow(frameRGB:size(3), frameRGB:size(2), 'Face Detection') 230 | end 231 | 232 | while win:valid() do 233 | process() 234 | display() 235 | collectgarbage() 236 | end 237 | 238 | child:join('break') 239 | 240 | --[[ 241 | timer = qt.QTimer() 242 | timer.interval = 10 243 | timer.singleShot = true 244 | qt.connect ( 245 | timer, 246 | 'timeout()', 247 | function() 248 | process() 249 | display() 250 | collectgarbage() 251 | timer:start() 252 | end 253 | ) 254 | timer:start() 255 | --]] 256 | end 257 | 258 | -- protected env 259 | ok,err = pcall(parent, arg) 260 | if not ok then print(err) end 261 | parallel.close() 262 | -------------------------------------------------------------------------------- /demos/face-detector.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | ---------------------------------------------------------------------- 3 | -- This program demonstrates the computation of a bank of filters 4 | -- over a grayscale image. 
The image is grabbed from a webcam, 5 | -- if available (and if the package 'camera' is installed as well), 6 | -- otherwise, a fixed image (lena) is used as an input. 7 | -- 8 | -- This script demonstrates how to describe a simple algorithm 9 | -- using Torch7's 'nn' package, and how to compile it for neuFlow. 10 | -- 11 | 12 | require 'neuflow' 13 | require 'qt' 14 | require 'qtwidget' 15 | require 'xlua' 16 | require 'inline' 17 | require 'nnx' 18 | require 'camera' 19 | require 'image' 20 | 21 | ---------------------------------------------------------------------- 22 | -- ARGS: parse user arguments 23 | -- 24 | op = xlua.OptionParser('%prog [options]') 25 | op:option{'-c', '--camera', action='store', dest='camidx', 26 | help='if source=camera, you can specify the camera index: /dev/videoIDX', 27 | default=0} 28 | op:option{'-n', '--network', action='store', dest='network', 29 | help='path to existing [trained] network', 30 | default='face-detector/face.net'} 31 | opt,args = op:parse() 32 | 33 | ---------------------------------------------------------------------- 34 | -- INIT: initialize the neuFlow context 35 | -- a mem manager, the dataflow core, and the compiler 36 | -- 37 | -- platform='xilinx_ml605' or platform='pico_m503' 38 | 39 | local platform = args[1] or 'xilinx_ml605' 40 | nf = neuflow.init { 41 | prog_name = 'face-detector', 42 | platform = platform 43 | } 44 | 45 | ---------------------------------------------------------------------- 46 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 47 | -- how it should interact with the host (data exchange) 48 | -- note: any copy**Host() inserted here needs to be matched by 49 | -- a copy**Dev() in the EXEC section. 
50 | -- 51 | 52 | -- load pre-trained network from disk 53 | network = torch.load(opt.network) 54 | network_fov = 32 55 | network_sub = 4 56 | softnorm = network.modules[1] 57 | hardnet = nn.Sequential() 58 | for i = 2,#network.modules do 59 | hardnet:add(network.modules[i]) 60 | end 61 | network = hardnet 62 | 63 | -- process input at multiple scales 64 | scales = {0.3, 0.24, 0.192, 0.15, 0.12, 0.1} 65 | 66 | -- use a pyramid packer/unpacker 67 | require 'face-detector/PyramidPacker' 68 | require 'face-detector/PyramidUnPacker' 69 | packer = nn.PyramidPacker(network, scales) 70 | unpacker = nn.PyramidUnPacker(network) 71 | 72 | -- blob parser 73 | parse = require 'face-detector/blobParser' 74 | 75 | -- a gaussian for smoothing the distributions 76 | gaussian = image.gaussian(3,0.15) 77 | 78 | -- generate input data for compiler 79 | frameRGB = torch.Tensor(3,480,640) 80 | frameY = image.rgb2y(frameRGB) 81 | input = packer:forward(frameY) 82 | 83 | -- loop over the main code 84 | nf:beginLoop('main') do 85 | 86 | -- send data to device 87 | input_dev = nf:copyFromHost(input) 88 | 89 | -- compile network 90 | output_dev = nf:compile(network, input_dev) 91 | 92 | -- send result back to host 93 | outputs = nf:copyToHost(output_dev) 94 | 95 | end nf:endLoop('main') 96 | 97 | -- package hardware network 98 | nf.forward = function(nf,input) 99 | local normed = softnorm:forward(input) 100 | nf:copyToDev(normed) 101 | nf:copyFromDev(outputs) 102 | return outputs 103 | end 104 | 105 | ---------------------------------------------------------------------- 106 | -- LOAD: load the bytecode on the device, and execute it 107 | -- 108 | nf:sendReset() 109 | nf:loadBytecode() 110 | 111 | ---------------------------------------------------------------------- 112 | -- EXEC: this part executes the host code, and interacts with the dev 113 | -- 114 | 115 | -- camera 116 | camera = image.Camera{} 117 | 118 | -- profiler 119 | p = nf.profiler 120 | 121 | -- zoom 122 | zoom = 0.5 123 | 
-- Main processing loop: grab a camera frame, run the pre-trained network
-- on a multiscale pyramid via neuFlow, and convert the resulting
-- distributions into face detections (stored in the global `detections`).
function process()
   p:start('whole-loop','fps')

   -- (1) grab a frame and normalize its size
   p:start('get-camera-frame')
   frameRGB = camera:forward()
   frameRGB = image.scale(frameRGB, 640, 480)
   p:lap('get-camera-frame')

   -- (2) convert to luminance
   p:start('RGB->Y')
   frameY = image.rgb2y(frameRGB)
   p:lap('RGB->Y')

   -- (3) pack all scales into a single pyramid image
   p:start('pack-pyramid')
   pyramid, coordinates = packer:forward(frameY)
   p:lap('pack-pyramid')

   -- (4) run the network on the device
   p:start('network-inference')
   result = nf:forward(pyramid)
   p:lap('network-inference')

   -- (5) split the result back into one distribution per scale
   p:start('unpack-pyramid')
   distributions = unpacker:forward(result, coordinates)
   p:lap('unpack-pyramid')

   -- (6) smooth + threshold each distribution, collecting blob centroids
   p:start('parse-distributions')
   threshold = 0.9
   rawresults = {}
   for scale_idx, distribution in ipairs(distributions) do
      local smoothed = image.convolve(distribution[1]:add(1):mul(0.5), gaussian)
      parse(smoothed, threshold, rawresults, scales[scale_idx])
   end
   p:lap('parse-distributions')

   -- (7) map blob coordinates back into input-image space
   p:start('clean-up')
   detections = {}
   for det_idx, blob in ipairs(rawresults) do
      local s = blob[3]
      detections[det_idx] = {
         x = blob[1]*network_sub/s,
         y = blob[2]*network_sub/s,
         w = network_fov/s,
         h = network_fov/s,
      }
   end
   p:lap('clean-up')
end
win:stroke() 190 | win:setfont(qt.QFont{serif=false,italic=false,size=16}) 191 | win:moveto(detect.x, detect.y-1) 192 | win:show('face') 193 | end 194 | 195 | -- (3) display distributions 196 | local prevx = 0 197 | for i,distribution in ipairs(distributions) do 198 | local prev = distributions[i-1] 199 | if prev then prevx = prevx + prev:size(3) end 200 | image.display{image=distribution[1], win=win, x=prevx, min=0, max=1} 201 | end 202 | 203 | p:lap('whole-loop') 204 | p:displayAll{painter=win, x=5, y=distributions[1]:size(2)+20, font=12} 205 | win:gend() 206 | end 207 | 208 | ---------------------------------------------------------------------- 209 | -- GUI: setup user interface / display 210 | -- 211 | 212 | if not win then 213 | win = qtwidget.newwindow(frameRGB:size(3), frameRGB:size(2), 'Face Detection') 214 | end 215 | 216 | timer = qt.QTimer() 217 | timer.interval = 10 218 | timer.singleShot = true 219 | qt.connect(timer, 220 | 'timeout()', 221 | function() 222 | process() 223 | display() 224 | collectgarbage() 225 | timer:start() 226 | end) 227 | timer:start() 228 | -------------------------------------------------------------------------------- /demos/face-detector/PyramidPacker.lua: -------------------------------------------------------------------------------- 1 | local PyramidPacker, parent = torch.class('nn.PyramidPacker', 'nn.Module') 2 | 3 | function getCoordinates(args) 4 | local scales = args.scales 5 | local step_width = args.step_width 6 | local step_height = args.step_height 7 | local dim_width_orig = args.dim_width 8 | local dim_height_orig = args.dim_height 9 | 10 | local dim_width = math.floor(dim_width_orig*scales[1]) 11 | local dim_height = math.floor(dim_height_orig*scales[1]) 12 | -- we define the coordinates table, which we will fill-in 13 | -- once per each different input or different scales 14 | -- and we will use it to pack and unpack different sclales into/out of 15 | -- one big pack. 
16 | -- The rows of the table are different scales, 17 | -- the columns of the table are: 18 | -- 1 2 3 4 5 6 19 | -- x1 y1 x2 y2 width height 20 | -- 21 | -- (x1, y1) - top left corner, (x2, y2) - bottom right corner, 22 | -- (width, height) - sizes of the current scale 23 | 24 | local coordinates = torch.Tensor(#scales, 6) 25 | coordinates[1][1] = 1 26 | coordinates[1][2] = 1 27 | coordinates[1][3] = dim_width 28 | coordinates[1][4] = dim_height 29 | coordinates[1][5] = dim_width 30 | coordinates[1][6] = dim_height 31 | local max_width = dim_width 32 | local max_height = dim_height 33 | 34 | -- fill the coordinates table and get the size for the big pack 35 | for i=2,#scales,1 do 36 | 37 | dim_width = math.floor(dim_width_orig*scales[i]) 38 | dim_height = math.floor(dim_height_orig*scales[i]) 39 | 40 | -- an even case - putting down 41 | if (i%2 == 0) then 42 | coordinates[i][1] = coordinates[i-1][1] 43 | coordinates[i][2] = (math.floor((coordinates[i-1][4]-1)/step_height) + 1)*step_height+1 44 | else -- an odd case - putting beside 45 | coordinates[i][1] = (math.floor((coordinates[i-1][3]-1)/step_width) + 1)*step_width+1 46 | coordinates[i][2] = coordinates[i-1][2] 47 | end 48 | 49 | coordinates[i][3] = dim_width + coordinates[i][1] - 1 50 | coordinates[i][4] = dim_height + coordinates[i][2] - 1 51 | coordinates[i][5] = dim_width 52 | coordinates[i][6] = dim_height 53 | 54 | max_width = math.max(max_width, coordinates[i][3]) 55 | max_height = math.max(max_height, coordinates[i][4]) 56 | end 57 | 58 | return coordinates, max_width, max_height 59 | end 60 | 61 | local function getSizesTbl(net) 62 | local sizes_tbl = {} 63 | for i=1,#net.modules do 64 | dw = net.modules[i].dW 65 | dh = net.modules[i].dH 66 | kw = net.modules[i].kW 67 | kh = net.modules[i].kH 68 | if((dw ~= nil)and(dh ~= nil)and(kw ~= nil) and(kh ~= nil)) then 69 | table.insert(sizes_tbl, {kw=kw,kh=kh,dw=dw,dh=dh}) 70 | end 71 | end 72 | 73 | return sizes_tbl 74 | end 75 | 76 | local function 
getRange(args) 77 | local sizes_tbl = args.sizes_tbl 78 | local idx_output = args.idx_output 79 | 80 | local x = torch.Tensor(#sizes_tbl+1) 81 | local y = torch.Tensor(#sizes_tbl+1) 82 | x[#sizes_tbl+1] = idx_output 83 | y[#sizes_tbl+1] = idx_output 84 | 85 | for k = #sizes_tbl,1,-1 do 86 | -- rightmost point of the image that affects x(k+1) 87 | x[k] = sizes_tbl[k].kw+ (x[k+1]-1) * sizes_tbl[k].dw 88 | -- leftmost point of the image that affects y(k+1) 89 | y[k] = 1 + (y[k+1]-1) * sizes_tbl[k].dw 90 | end 91 | local left_width = y[1] 92 | local right_width = x[1] 93 | 94 | for k = #sizes_tbl,1,-1 do 95 | -- rightmost point of the image that affects x(k+1) 96 | x[k] = sizes_tbl[k].kh+ (x[k+1]-1) * sizes_tbl[k].dh 97 | -- leftmost point of the image that affects y(k+1) 98 | y[k] = 1 + (y[k+1]-1) * sizes_tbl[k].dh 99 | end 100 | 101 | local left_height = y[1] 102 | local right_height = x[1] 103 | 104 | 105 | return left_width, right_width, left_height, right_height 106 | end 107 | 108 | local function getGlobalSizes(args) 109 | local sizes_tbl = args.sizes_tbl 110 | 111 | -- to find gobal kernel size we use recursive formula: 112 | -- glob_ker(n + 1) = 1 113 | -- glob_ker(n) = ker(n) + (glob_ker(n+1)-1)*step(n) 114 | -- 115 | -- where: ker(n) - kernel size on layer n, step(n) - step size on layer n 116 | -- and n is number of layers that change the size of the input (convolution and subsample) 117 | local left_width1, right_width1, left_height1, right_height1 = getRange({sizes_tbl=sizes_tbl, idx_output=1}) 118 | local ker_width = right_width1 - left_width1 +1 119 | local ker_height = right_height1 - left_height1 +1 120 | 121 | local step_width = 1 122 | local step_height = 1 123 | 124 | -- global step = MUL(step_1, step_2, ... 
-- Pack the input image at every configured scale into one big output
-- tensor, returning (output, coordinates) where `coordinates` gives the
-- slot of each scale as (x1, y1, x2, y2, width, height).
function PyramidPacker:forward(input)
   -- Recompute the packing coordinates only when the input size changed.
   if ((input:size(3) ~= self.dim_width) or (input:size(2) ~= self.dim_height)) then
      self.dim_height = input:size(2)
      self.dim_width = input:size(3)
      self.coordinates, self.max_width, self.max_height =
         getCoordinates({dim_width = self.dim_width, dim_height = self.dim_height,
                         scales = self.scales,
                         step_width = self.step_width, step_height = self.step_height})
   end

   -- BUGFIX: the original compared against the undefined global `dim_z`
   -- (always nil), so the cached channel count was rewritten on every call;
   -- compare against the cached value `self.dimz` as intended.
   if (input:size(1) ~= self.dimz) then self.dimz = input:size(1) end
   self.output:resize(self.dimz, self.max_height, self.max_width):zero()

   -- Using the coordinates table, copy each scale into its slot.  When the
   -- input size is unchanged, the coordinates are reused from the cache.
   for i = 1,#self.scales do
      local temp = self.output:narrow(3,self.coordinates[i][1],self.coordinates[i][5])
      temp = temp:narrow(2,self.coordinates[i][2],self.coordinates[i][6])
      image.scale(input, temp, 'bilinear')
   end

   return self.output, self.coordinates
end
PyramidPacker:write(file) 182 | parent.write(self,file) 183 | file:writeDouble(#self.scales) 184 | for i = 1,#self.scales do 185 | file:writeDouble(self.scales[i]) 186 | end 187 | end 188 | 189 | function PyramidPacker:read(file) 190 | parent.read(self,file) 191 | local nbScales = file:readDouble() 192 | for i = 1,nbScales do 193 | self.scales[i] = file:readDouble() 194 | end 195 | end 196 | -------------------------------------------------------------------------------- /demos/face-detector/PyramidUnPacker.lua: -------------------------------------------------------------------------------- 1 | 2 | local PyramidUnPacker, parent = torch.class('nn.PyramidUnPacker', 'nn.Module') 3 | 4 | local function getSizesTbl(net) 5 | local sizes_tbl = {} 6 | for i=1,#net.modules do 7 | dw = net.modules[i].dW 8 | dh = net.modules[i].dH 9 | kw = net.modules[i].kW 10 | kh = net.modules[i].kH 11 | if((dw ~= nil)and(dh ~= nil)and(kw ~= nil) and(kh ~= nil)) then 12 | table.insert(sizes_tbl, {kw=kw,kh=kh,dw=dw,dh=dh}) 13 | end 14 | end 15 | 16 | return sizes_tbl 17 | end 18 | 19 | local function getRange(args) 20 | local sizes_tbl = args.sizes_tbl 21 | local idx_output = args.idx_output 22 | 23 | local x = torch.Tensor(#sizes_tbl+1) 24 | local y = torch.Tensor(#sizes_tbl+1) 25 | x[#sizes_tbl+1] = idx_output 26 | y[#sizes_tbl+1] = idx_output 27 | 28 | for k = #sizes_tbl,1,-1 do 29 | -- rightmost point of the image that affects x(k+1) 30 | x[k] = sizes_tbl[k].kw+ (x[k+1]-1) * sizes_tbl[k].dw 31 | -- leftmost point of the image that affects y(k+1) 32 | y[k] = 1 + (y[k+1]-1) * sizes_tbl[k].dw 33 | end 34 | local left_width = y[1] 35 | local right_width = x[1] 36 | 37 | for k = #sizes_tbl,1,-1 do 38 | -- rightmost point of the image that affects x(k+1) 39 | x[k] = sizes_tbl[k].kh+ (x[k+1]-1) * sizes_tbl[k].dh 40 | -- leftmost point of the image that affects y(k+1) 41 | y[k] = 1 + (y[k+1]-1) * sizes_tbl[k].dh 42 | end 43 | 44 | local left_height = y[1] 45 | local right_height = x[1] 46 | 47 
-- Split the packed network output back into one map per scale, using the
-- coordinates table produced by the matching PyramidPacker.  Returns a
-- table of narrowed views into `input` (no copies are made).
function PyramidUnPacker:forward(input, coordinates)
   self.coordinates = coordinates
   self.out_tbl = {}

   for scale_idx = 1, coordinates:size(1) do
      local coord = coordinates[scale_idx]
      -- Map pyramid-space pixel coordinates to output-map coordinates,
      -- accounting for the network's global kernel size and stride.
      local col0  = math.floor((coord[1] - 1) / self.step_width)  + 1
      local row0  = math.floor((coord[2] - 1) / self.step_height) + 1
      local ncols = math.floor((coord[5] - self.ker_width)  / self.step_width)  + 1
      local nrows = math.floor((coord[6] - self.ker_height) / self.step_height) + 1

      local slice = input:narrow(3, col0, ncols):narrow(2, row0, nrows)
      self.out_tbl[#self.out_tbl + 1] = slice
   end

   return self.out_tbl
end
100 | 101 | function PyramidUnPacker:backward(input, gradOutput) 102 | error('backward non implemented', 'PyramidUnPacker') 103 | end 104 | 105 | function PyramidUnPacker:write(file) 106 | parent.write(self,file) 107 | file:writeDouble(#self.scales) 108 | for i = 1,#self.scales do 109 | file:writeDouble(self.scales[i]) 110 | end 111 | end 112 | 113 | function PyramidUnPacker:read(file) 114 | parent.read(self,file) 115 | local nbScales = file:readDouble() 116 | for i = 1,nbScales do 117 | self.scales[i] = file:readDouble() 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /demos/face-detector/blobParser.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'inline' 3 | 4 | local parse = inline.load [[ 5 | // get args 6 | const void* id = luaT_checktypename2id(L, "torch.DoubleTensor"); 7 | THDoubleTensor *tensor = luaT_checkudata(L, 1, id); 8 | double threshold = lua_tonumber(L, 2); 9 | int table_blobs = 3; 10 | int idx = lua_objlen(L, 3) + 1; 11 | double scale = lua_tonumber(L, 4); 12 | 13 | // loop over pixels 14 | int x,y; 15 | for (y=0; ysize[0]; y++) { 16 | for (x=0; xsize[1]; x++) { 17 | double val = THDoubleTensor_get2d(tensor, y, x); 18 | if (val > threshold) { 19 | // entry = {} 20 | lua_newtable(L); 21 | int entry = lua_gettop(L); 22 | 23 | // entry[1] = x 24 | lua_pushnumber(L, x); 25 | lua_rawseti(L, entry, 1); 26 | 27 | // entry[2] = y 28 | lua_pushnumber(L, y); 29 | lua_rawseti(L, entry, 2); 30 | 31 | // entry[3] = scale 32 | lua_pushnumber(L, scale); 33 | lua_rawseti(L, entry, 3); 34 | 35 | // blobs[idx] = entry; idx = idx + 1 36 | lua_rawseti(L, table_blobs, idx++); 37 | } 38 | } 39 | } 40 | return 0; 41 | ]] 42 | 43 | return parse 44 | -------------------------------------------------------------------------------- /demos/face-detector/face.net: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/clementfarabet/neuflow/cf3364568c6345767085eb8c36d90e8acc0ebffa/demos/face-detector/face.net -------------------------------------------------------------------------------- /demos/filter-bank.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | ---------------------------------------------------------------------- 3 | -- This program demonstrates the computation of a bank of filters 4 | -- over a grayscale image. The image is grabbed from a webcam, 5 | -- if available (and if the package 'camera' is installed as well), 6 | -- otherwise, a fixed image (lena) is used as an input. 7 | -- 8 | -- This script demonstrates how to describe a simple algorithm 9 | -- using Torch7's 'nn' package, and how to compile it for neuFlow. 10 | -- 11 | 12 | require 'image' 13 | require 'neuflow' 14 | require 'qt' 15 | require 'qtwidget' 16 | 17 | ---------------------------------------------------------------------- 18 | -- INIT: initialize the neuFlow context 19 | -- a mem manager, the dataflow core, and the compiler 20 | -- 21 | -- platform='xilinx_ml605' or platform='pico_m503' 22 | 23 | local platform = arg[1] or 'xilinx_ml605' 24 | nf = neuflow.init { 25 | prog_name = 'filter-bank', 26 | platform = platform 27 | } 28 | 29 | ---------------------------------------------------------------------- 30 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 31 | -- how it should interact with the host (data exchange) 32 | -- note: any copy**Host() inserted here needs to be matched by 33 | -- a copy**Dev() in the EXEC section. 
34 | -- 35 | 36 | -- input data 37 | inputsize = 400 38 | input = image.scale(image.lena()[1], inputsize,inputsize) 39 | 40 | -- compute 16 9x9 random filters on the input, 41 | -- followed by a non-linear activation unit 42 | network = nn.Sequential() 43 | network:add(nn.SpatialConvolution(1,16,9,9)) 44 | network:add(nn.Tanh()) 45 | 46 | -- loop over the main code 47 | nf:beginLoop('main') do 48 | 49 | -- send data to device 50 | input_dev = nf:copyFromHost(input) 51 | 52 | -- compile network 53 | output_dev = nf:compile(network, input_dev) 54 | 55 | -- send result back to host 56 | outputs = nf:copyToHost(output_dev) 57 | 58 | end nf:endLoop('main') 59 | 60 | ---------------------------------------------------------------------- 61 | -- LOAD: load the bytecode on the device, and execute it 62 | -- 63 | nf:sendReset() 64 | nf:loadBytecode() 65 | 66 | ---------------------------------------------------------------------- 67 | -- EXEC: this part executes the host code, and interacts with the dev 68 | -- 69 | 70 | -- profiler 71 | p = nf.profiler 72 | 73 | -- zoom 74 | zoom = 0.5 75 | 76 | -- try to initialize camera, or default to Lena 77 | if xlua.require 'camera' then 78 | camera = image.Camera{} 79 | end 80 | 81 | -- process loop 82 | function process() 83 | p:start('whole-loop','fps') 84 | 85 | if camera then 86 | p:start('get-camera-frame') 87 | local frame = camera:forward() 88 | image.scale(input,frame:narrow(1,2,1)) 89 | p:lap('get-camera-frame') 90 | end 91 | 92 | nf:copyToDev(input) 93 | nf:copyFromDev(outputs) 94 | 95 | win:gbegin() 96 | win:showpage() 97 | 98 | p:start('display') 99 | image.display{image=outputs, win=win, min=-1, max=1, zoom=zoom} 100 | p:lap('display') 101 | 102 | p:lap('whole-loop') 103 | p:displayAll{painter=win, x=outputs:size(3)*4*zoom+10, y=outputs:size(2)*2*zoom+40, font=12} 104 | win:gend() 105 | end 106 | 107 | ---------------------------------------------------------------------- 108 | -- GUI: setup user interface / display 109 
| -- 110 | 111 | if not win then 112 | win = qtwidget.newwindow(outputs:size(3)*6*zoom, outputs:size(2)*3*zoom, 'Filter Bank') 113 | end 114 | 115 | timer = qt.QTimer() 116 | timer.interval = 10 117 | timer.singleShot = true 118 | qt.connect(timer, 119 | 'timeout()', 120 | function() 121 | process() 122 | collectgarbage() 123 | timer:start() 124 | end) 125 | timer:start() 126 | -------------------------------------------------------------------------------- /demos/loopback.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | ---------------------------------------------------------------------- 3 | -- A simple loopback program for neuFlow: send images and receive 4 | -- them back from neuFlow, in a loop. 5 | -- 6 | -- If this script works, it validates: 7 | -- (1) the ethernet interface 8 | -- (2) the embedded openFlow CPU 9 | -- (3) the streamer 10 | -- (4) the DDR2/3 interface 11 | -- 12 | 13 | require 'image' 14 | require 'neuflow' 15 | require 'qt' 16 | require 'qtwidget' 17 | 18 | ---------------------------------------------------------------------- 19 | -- INIT: initialize the neuFlow context 20 | -- a mem manager, the dataflow core, and the compiler 21 | -- 22 | -- platform='xilinx_ml605' or platform='pico_m503' 23 | 24 | local platform = arg[1] or 'xilinx_ml605' 25 | local network_if_name = arg[2] 26 | 27 | nf = neuflow.init { 28 | prog_name = 'loopback', 29 | platform = platform, 30 | network_if_name = network_if_name 31 | } 32 | 33 | ---------------------------------------------------------------------- 34 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 35 | -- how it should interact with the host (data exchange) 36 | -- note: any copy**Host() inserted here needs to be matched by 37 | -- a copy**Dev() in the EXEC section. 
38 | -- 39 | 40 | -- input data 41 | inputsize = 400 42 | 43 | -- rescale 44 | input = image.scale(image.lena(), inputsize,inputsize) 45 | 46 | -- loop over the main code 47 | nf:beginLoop('main') do 48 | 49 | -- send data to device 50 | input_dev = nf:copyFromHost(input) 51 | 52 | -- get it back 53 | outputs = nf:copyToHost(input_dev) 54 | 55 | end nf:endLoop('main') 56 | 57 | ---------------------------------------------------------------------- 58 | -- LOAD: load the bytecode on the device, and execute it 59 | -- 60 | nf:sendReset() 61 | nf:loadBytecode() 62 | 63 | ---------------------------------------------------------------------- 64 | -- EXEC: this part executes the host code, and interacts with the dev 65 | -- 66 | 67 | -- profiler 68 | p = nf.profiler 69 | 70 | -- process loop 71 | function process() 72 | p:start('whole-loop','fps') 73 | 74 | nf:copyToDev(input) 75 | nf:copyFromDev(outputs) 76 | 77 | p:start('compute-error') 78 | error = outputs:clone():add(-1,input):abs() 79 | p:lap('compute-error') 80 | 81 | win:gbegin() 82 | win:showpage() 83 | 84 | p:start('display') 85 | image.display{image=input, win=win, x=0, min=0, max=1} 86 | image.display{image=outputs, win=win, x=input:size(3), min=0, max=1} 87 | image.display{image=error, win=win, x=input:size(3)*2, min=0, max=1} 88 | p:lap('display') 89 | 90 | p:lap('whole-loop') 91 | p:displayAll{painter=win, x=10, y=input:size(2)+20, font=12} 92 | win:gend() 93 | end 94 | 95 | ---------------------------------------------------------------------- 96 | -- GUI: setup user interface / display 97 | -- 98 | 99 | if not win then 100 | win = qtwidget.newwindow(1200,540,'Loopback Test') 101 | end 102 | 103 | timer = qt.QTimer() 104 | timer.interval = 10 105 | timer.singleShot = true 106 | qt.connect(timer, 107 | 'timeout()', 108 | function() 109 | process() 110 | collectgarbage() 111 | timer:start() 112 | end) 113 | timer:start() 114 | -------------------------------------------------------------------------------- 
/demos/loopback_camera.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env torch 2 | ---------------------------------------------------------------------- 3 | -- A simple program for neuFlow: receive images from embedded camera 4 | -- of m503 board 5 | -- them back from neuFlow, in a loop. 6 | -- 7 | -- If this script works, it validates: 8 | -- (1) the ethernet interface 9 | -- (2) the embedded openFlow CPU 10 | -- (3) the streamer 11 | -- (4) the DDR2/3 interface 12 | -- (5) the cameras capture and configuration 13 | 14 | require 'image' 15 | require 'neuflow' 16 | require 'qt' 17 | require 'qtwidget' 18 | 19 | ---------------------------------------------------------------------- 20 | -- INIT: initialize the neuFlow context 21 | -- a mem manager, the dataflow core, and the compiler 22 | -- 23 | nf = neuflow.init { 24 | prog_name = 'loopback', 25 | platform ='pico_m503', 26 | --global_msg_level = 'detailled', 27 | --interface_msg_level = 'detailled', 28 | } 29 | 30 | ---------------------------------------------------------------------- 31 | -- ELABORATION: describe the algorithm to be run on neuFlow, and 32 | -- how it should interact with the host (data exchange) 33 | -- note: any copy**Host() inserted here needs to be matched by 34 | -- a copy**Dev() in the EXEC section. 35 | -- 36 | activeCamera = {'B','A'} 37 | toto = nf.camera:config(activeCamera, 'iic', 'ON') 38 | --toto = nf.camera:config(activeCamera, 'domain', 'RGB') 39 | --toto = nf.camera:config(activeCamera, 'definition', 'QVGA') 40 | toto = nf.camera:config(activeCamera, 'scan', 'PROGRESSIVE') 41 | toto = nf.camera:config(activeCamera, 'color', 'B&W') 42 | --toto = nf.camera:config(activeCamera, 'domain', 'RGB') 43 | --toto = nf.camera:cPROGRESSIVEonfig(activeCamera, 'grab', 'ONESHOT') 44 | --print(' : reg ctrl ' .. 
toto) 45 | 46 | --nf.camera:stopRBCameras() -- Being sure that the Camera is stopped 47 | nf.camera.core:sleep(1) 48 | --nf.camera:startRBCameras() -- Start camera and send images to Running Buffer 49 | nf.camera:enableCameras(activeCamera) 50 | 51 | -- loop over the main code 52 | nf:beginLoop('main') do 53 | 54 | 55 | -- send image from camera to memory 56 | nf.camera:captureOneFrame(activeCamera) 57 | input_dev = nf.camera:getLastFrame(activeCamera) 58 | 59 | -- pass image to host 60 | outputs = nf:copyToHost(input_dev) 61 | --outputs = nf.camera:copyToHostLatestFrame() -- Get the latest complete frame from both camers 62 | --nf.camera.core:sleep(0.15) 63 | 64 | end nf:endLoop('main') 65 | 66 | ---------------------------------------------------------------------- 67 | -- LOAD: load the bytecode on the device, and execute it 68 | -- 69 | nf:sendReset() 70 | nf:loadBytecode() 71 | 72 | ---------------------------------------------------------------------- 73 | -- EXEC: this part executes the host code, and interacts with the dev 74 | -- 75 | 76 | -- profiler 77 | p = nf.profiler 78 | 79 | local framecnt = 0 80 | -- process loop 81 | function process() 82 | p:start('whole-loop','fps') 83 | --end 84 | 85 | nf:copyFromDev(outputs) 86 | 87 | p:start('display') 88 | win:gbegin() 89 | win:showpage() 90 | image.display{image=outputs, win=win, x=0, min=0, max=1} 91 | p:lap('display') 92 | p:lap('whole-loop') 93 | p:displayAll{painter=win, x=10, y=500, font=12} 94 | win:gend() 95 | --end 96 | framecnt = framecnt + 1 97 | end 98 | 99 | ---------------------------------------------------------------------- 100 | -- GUI: setup user interface / display 101 | -- 102 | 103 | torch.setdefaulttensortype('torch.FloatTensor') 104 | 105 | if not win then 106 | win = qtwidget.newwindow(2000,800,'Loopback Camera Test') 107 | end 108 | 109 | timer = qt.QTimer() 110 | timer.interval = 10 111 | timer.singleShot = true 112 | qt.connect(timer, 113 | 'timeout()', 114 | function() 115 | 
process() 116 | collectgarbage() 117 | timer:start() 118 | end) 119 | timer:start() 120 | -------------------------------------------------------------------------------- /etherflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | IF (APPLE) 3 | SET (CMAKE_C_FLAGS "-D_APPLE_=1") 4 | ELSE (APPLE) 5 | SET (CMAKE_C_FLAGS "-D_LINUX_=1") 6 | ENDIF (APPLE) 7 | 8 | INCLUDE_DIRECTORIES (${PROJECT_SOURCE_DIR}/etherflow) 9 | SET(src init.c) 10 | SET(luasrc init.lua) 11 | ADD_TORCH_PACKAGE(etherflow "${src}" "${luasrc}" "neuFlow") 12 | TARGET_LINK_LIBRARIES(etherflow luaT TH) 13 | -------------------------------------------------------------------------------- /etherflow/etherflow.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained API to interface neuFlow 3 | **********************************************************/ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // self-contained (no lua) 12 | #define _NO_LUA_ 13 | 14 | // define template macros 15 | #define TH_CONCAT_3(x,y,z) TH_CONCAT_3_EXPAND(x,y,z) 16 | #define TH_CONCAT_3_EXPAND(x,y,z) x ## y ## z 17 | #define etherflow_(NAME) TH_CONCAT_3(etherflow_, Real, NAME) 18 | #define etherflow_send_(NAME) TH_CONCAT_3(etherflow_send_, Real, NAME) 19 | #define etherflow_receive_(NAME) TH_CONCAT_3(etherflow_receive_, Real, NAME) 20 | 21 | // load templated code 22 | #undef TH_GENERIC_FILE 23 | #include "generic/etherflow.c" 24 | 25 | // generate Float version 26 | #define real float 27 | #define accreal double 28 | #define Real Float 29 | #define TH_REAL_IS_FLOAT 30 | #line 1 TH_GENERIC_FILE 31 | #include TH_GENERIC_FILE 32 | #undef accreal 33 | #undef real 34 | #undef Real 35 | #undef TH_REAL_IS_FLOAT 36 | 37 | // generate Double version 38 | #define real double 39 | #define accreal double 40 | #define Real Double 41 | 
#define TH_REAL_IS_DOUBLE 42 | #line 1 TH_GENERIC_FILE 43 | #include TH_GENERIC_FILE 44 | #undef accreal 45 | #undef real 46 | #undef Real 47 | #undef TH_REAL_IS_DOUBLE 48 | -------------------------------------------------------------------------------- /etherflow/etherflow.h: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained API to interface neuFlow 3 | **********************************************************/ 4 | 5 | /*********************************************************** 6 | * open_socket() 7 | * what: opens an ethernet socket 8 | * params: 9 | * none 10 | * returns: 11 | * socket - a socket descriptor 12 | **********************************************************/ 13 | int etherflow_open_socket_C(const char *dev, unsigned char *destmac, unsigned char *srcmac); 14 | 15 | /*********************************************************** 16 | * close_socket() 17 | * what: closes an ethernet socket 18 | * params: 19 | * socket 20 | * returns: 21 | * none 22 | **********************************************************/ 23 | int etherflow_close_socket_C(); 24 | 25 | /*********************************************************** 26 | * etherflow_send_reset_C() 27 | * what: send a reset Ethernet frame 28 | * params: 29 | * none 30 | * returns: 31 | * return sendto error code 32 | **********************************************************/ 33 | int etherflow_send_reset_C() 34 | 35 | /*********************************************************** 36 | * receive_frame_C() 37 | * what: receives an ethernet frame 38 | * params: 39 | * socket - socket descriptor. 
40 | * buffer - to receive the data 41 | * returns: 42 | * length - nb of bytes read/received 43 | **********************************************************/ 44 | unsigned char * etherflow_receive_frame_C(int *lengthp); 45 | 46 | /*********************************************************** 47 | * send_frame_C() 48 | * what: sends an ethernet frame 49 | * params: 50 | * socket - socket descriptor. 51 | * length - length of data to send 52 | * data_p - data pointer 53 | * returns: 54 | * error code 55 | **********************************************************/ 56 | int etherflow_send_frame_C(short int length, const unsigned char * data_p); 57 | 58 | /*********************************************************** 59 | * send_tensor_byte() 60 | * what: sends a torch byte tensor by breaking it down into 61 | * ethernet packets of maximum size 62 | * params: 63 | * socket - socket descriptor. 64 | * tensor - tensor to send 65 | * returns: 66 | * void 67 | **********************************************************/ 68 | int etherflow_send_ByteTensor_C(unsigned char * data, int size); 69 | 70 | /*********************************************************** 71 | * send_tensor() 72 | * what: sends a torch tensor by breaking it down into 73 | * ethernet packets of maximum size 74 | * a tensor of reals is converted to Q8.8 75 | * params: 76 | * socket - socket descriptor. 77 | * tensor - tensor to send 78 | * returns: 79 | * void 80 | **********************************************************/ 81 | int etherflow_send_FloatTensor_C(float * data, int size); 82 | int etherflow_send_DoubleTensor_C(double * data, int size); 83 | 84 | /*********************************************************** 85 | * receive_tensor_TYPE() 86 | * what: receives a torch tensor by concatenating eth packs 87 | * a tensor of TYPE is created from Q8.8 88 | * params: 89 | * socket - socket descriptor. 
90 | * tensor - tensor to fill 91 | * returns: 92 | * void 93 | **********************************************************/ 94 | int etherflow_receive_FloatTensor_C(float *data, int size, int height); 95 | int etherflow_receive_DoubleTensor_C(double *data, int size, int height); 96 | -------------------------------------------------------------------------------- /etherflow/example.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained example 3 | * Compile: 4 | * gcc -fpic -shared etherflow.c -o libeth.so 5 | * gcc example.c libeth.so -o example 6 | **********************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "etherflow.h" 14 | 15 | #define BINARY_SIZE 32*1024*1024 16 | 17 | #ifdef _LINUX_ 18 | #define ETH_DEV "eth0" 19 | #else // _APPLE_ 20 | #define ETH_DEV "en0" 21 | #endif 22 | 23 | #define abs(a) (a)>0 ? 
(a) : -(a) 24 | 25 | int main() { 26 | // init device 27 | etherflow_open_socket_C(ETH_DEV, NULL, NULL); 28 | 29 | // load code (binary) from file 30 | unsigned char *neuflow_bin = (unsigned char *)malloc(BINARY_SIZE); 31 | memset(neuflow_bin, BINARY_SIZE, 0); 32 | FILE *f = fopen("neuflow.bin", "rb"); 33 | int nread; 34 | if (f) nread = fread(neuflow_bin, 1, BINARY_SIZE, f); 35 | else { 36 | printf("error: could not find neuflow code (neuflow.bin)\n"); 37 | return 1; 38 | } 39 | printf("loaded bytecode [size = %d]\n", nread); 40 | 41 | // load (and exec) code on neuFlow 42 | printf("transmitting bytecode\n"); 43 | etherflow_send_ByteTensor_C(neuflow_bin, BINARY_SIZE); 44 | sleep(1); 45 | printf("transmitted.\n"); 46 | 47 | // data structures 48 | double *input_data = malloc(sizeof(double) * 3 * 400 * 400); 49 | double *output_data = malloc(sizeof(double) * 3 * 400 * 400); 50 | 51 | // initialize data 52 | int i,k; 53 | for (k = 0; k < 3; k++) { 54 | for (i = 0; i < 400*400; i++) { 55 | input_data[k*400*400+i] = k; 56 | output_data[k*400*400+i] = 0; 57 | } 58 | } 59 | 60 | // code is now executing, send data and receive answer in a loop 61 | while (1) { 62 | // send input data (a 3x400x400 image) 63 | double *input_p = input_data; 64 | for (i = 0; i < 3; i++) { 65 | etherflow_send_DoubleTensor_C(input_p, 400*400); 66 | input_p += 400*400; 67 | } 68 | etherflow_receive_frame_C(NULL); 69 | 70 | // receive data, processed by neuFlow (a 3x400x400 image, loopbacked) 71 | etherflow_receive_frame_C(NULL); 72 | double *output_p = output_data; 73 | for (i = 0; i < 3; i++) { 74 | etherflow_receive_DoubleTensor_C(output_p, 400*400, 400); 75 | output_p += 400*400; 76 | } 77 | 78 | // measure loopback error 79 | double error = 0; 80 | double maxerr = 0; 81 | for (i = 0; i < 3*400*400; i++) { 82 | double err = abs(input_data[i] - output_data[i]); 83 | if (err > maxerr) maxerr = err; 84 | error += err; 85 | } 86 | error /= 3*400*400; 87 | printf("average error = %f, max error = 
%f\n", error, maxerr); 88 | } 89 | 90 | // cleanup 91 | free(input_data); 92 | free(output_data); 93 | free(neuflow_bin); 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /etherflow/init.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) 12 | #define torch_string_(NAME) TH_CONCAT_STRING_3(torch., Real, NAME) 13 | #define etherflow_(NAME) TH_CONCAT_3(etherflow_, Real, NAME) 14 | #define etherflow_send_(NAME) TH_CONCAT_3(etherflow_send_, Real, NAME) 15 | #define etherflow_receive_(NAME) TH_CONCAT_3(etherflow_receive_, Real, NAME) 16 | 17 | static const void* torch_FloatTensor_id = NULL; 18 | static const void* torch_DoubleTensor_id = NULL; 19 | 20 | #undef TH_GENERIC_FILE 21 | #include "generic/etherflow.c" 22 | #include "THGenerateFloatTypes.h" 23 | 24 | DLL_EXPORT int luaopen_libetherflow(lua_State *L) 25 | { 26 | torch_FloatTensor_id = luaT_checktypename2id(L, "torch.FloatTensor"); 27 | torch_DoubleTensor_id = luaT_checktypename2id(L, "torch.DoubleTensor"); 28 | 29 | etherflow_FloatApi_init(L); 30 | etherflow_DoubleApi_init(L); 31 | 32 | luaL_register(L, "etherflow.double", etherflow_DoubleApi__); 33 | luaL_register(L, "etherflow.float", etherflow_FloatApi__); 34 | 35 | return 1; 36 | } 37 | -------------------------------------------------------------------------------- /etherflow/init.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | -- 3 | -- Copyright (c) 2010,2011 Clement Farabet, Polina Akselrod 4 | -- 5 | -- Permission is hereby granted, free of charge, to any person obtaining 6 | -- a copy of this software and associated documentation files (the 7 | -- "Software"), to deal in the Software 
without restriction, including 8 | -- without limitation the rights to use, copy, modify, merge, publish, 9 | -- distribute, sublicense, and/or sell copies of the Software, and to 10 | -- permit persons to whom the Software is furnished to do so, subject to 11 | -- the following conditions: 12 | -- 13 | -- The above copyright notice and this permission notice shall be 14 | -- included in all copies or substantial portions of the Software. 15 | -- 16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -- 24 | ---------------------------------------------------------------------- 25 | -- description: 26 | -- etherflow - a raw serial interface over gigabit ethernet, 27 | -- for communication between neuFlow <-> UNIX host. 
28 | -- 29 | -- history: 30 | -- July 16, 2011, 1:46PM - import from Torch5 - Clement Farabet 31 | ---------------------------------------------------------------------- 32 | 33 | require 'torch' 34 | require 'libetherflow' 35 | 36 | function etherflow.open(dev, destmac, srcmac) 37 | return etherflow.double.open_socket(dev, destmac, srcmac) 38 | end 39 | 40 | function etherflow.close(dev) 41 | etherflow.double.close_socket() 42 | end 43 | 44 | function etherflow.sendreset() 45 | return etherflow.double.send_reset() 46 | end 47 | 48 | function etherflow.handshake(bool) 49 | etherflow.double.handshake(bool) 50 | end 51 | 52 | function etherflow.sendstring(str) 53 | etherflow.double.send_frame(str) 54 | end 55 | 56 | function etherflow.receivestring() 57 | return etherflow.double.receive_string() 58 | end 59 | 60 | function etherflow.receiveframe() 61 | return etherflow.double.receive_frame() 62 | end 63 | 64 | function etherflow.sendtensor(tensor) 65 | tensor.etherflow.send_tensor(tensor) 66 | end 67 | 68 | function etherflow.receivetensor(tensor) 69 | tensor.etherflow.receive_tensor(tensor) 70 | end 71 | 72 | function etherflow.loadbytecode(bytetensor) 73 | etherflow.double.send_bytetensor(bytetensor) 74 | end 75 | 76 | function etherflow.setfirstcall(val) 77 | etherflow.double.set_first_call(val) 78 | end 79 | -------------------------------------------------------------------------------- /etherflow/test/receive.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'etherflow' 3 | require 'image' 4 | 5 | etherflow.open() 6 | 7 | t = torch.Tensor(512,512) 8 | 9 | for i = 1,1000 do 10 | print 'waiting for tensor' 11 | sys.tic() 12 | etherflow.setfirstcall(1) 13 | etherflow.receivetensor(t) 14 | print 'got tensor !' 
15 | sys.toc(true) 16 | w = image.display{image=t, win=w, gui=false} 17 | end 18 | -------------------------------------------------------------------------------- /etherflow/test/send.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'etherflow' 3 | require 'image' 4 | 5 | etherflow.open(nil, {0xff,0xff,0xff,0xff,0xff,0xff}, {0x01,0x02,0x03,0x04,0x05,0x06}) 6 | 7 | l = image.lena()[1] 8 | 9 | for i = 1,1000 do 10 | sys.tic() 11 | etherflow.sendtensor(l) 12 | sys.toc(true) 13 | end 14 | -------------------------------------------------------------------------------- /ethertbsp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | IF (APPLE) 3 | SET (CMAKE_C_FLAGS "-D_APPLE_=1") 4 | ELSE (APPLE) 5 | SET (CMAKE_C_FLAGS "-D_LINUX_=1") 6 | ENDIF (APPLE) 7 | 8 | INCLUDE_DIRECTORIES (${PROJECT_SOURCE_DIR}/ethertbsp) 9 | SET(src init.c) 10 | SET(luasrc init.lua) 11 | ADD_TORCH_PACKAGE(ethertbsp "${src}" "${luasrc}" "neuFlow") 12 | TARGET_LINK_LIBRARIES(ethertbsp luaT TH) 13 | -------------------------------------------------------------------------------- /ethertbsp/ethertbsp.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained API to interface neuFlow 3 | **********************************************************/ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // self-contained (no lua) 12 | #define _NO_LUA_ 13 | 14 | // define template macros 15 | #define TH_CONCAT_3(x,y,z) TH_CONCAT_3_EXPAND(x,y,z) 16 | #define TH_CONCAT_3_EXPAND(x,y,z) x ## y ## z 17 | #define ethertbsp_(NAME) TH_CONCAT_3(ethertbsp_, Real, NAME) 18 | #define ethertbsp_send_(NAME) TH_CONCAT_3(ethertbsp_send_, Real, NAME) 19 | #define ethertbsp_receive_(NAME) TH_CONCAT_3(ethertbsp_receive_, Real, NAME) 20 | 21 | // load templated code 22 | #undef 
TH_GENERIC_FILE 23 | #include "generic/ethertbsp.c" 24 | 25 | // generate Float version 26 | #define real float 27 | #define accreal double 28 | #define Real Float 29 | #define TH_REAL_IS_FLOAT 30 | #line 1 TH_GENERIC_FILE 31 | #include TH_GENERIC_FILE 32 | #undef accreal 33 | #undef real 34 | #undef Real 35 | #undef TH_REAL_IS_FLOAT 36 | 37 | // generate Double version 38 | #define real double 39 | #define accreal double 40 | #define Real Double 41 | #define TH_REAL_IS_DOUBLE 42 | #line 1 TH_GENERIC_FILE 43 | #include TH_GENERIC_FILE 44 | #undef accreal 45 | #undef real 46 | #undef Real 47 | #undef TH_REAL_IS_DOUBLE 48 | -------------------------------------------------------------------------------- /ethertbsp/ethertbsp.h: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained API to interface Ethernet to neuFlow 3 | **********************************************************/ 4 | 5 | /*********************************************************** 6 | * open_socket() 7 | * what: opens an ethernet socket 8 | * params: 9 | * dev - network device name 10 | * remote_mac - MAC addr of remote dev 11 | * local_mac - MAC addr of host computer 12 | * 13 | * returns: 14 | * error - 0 for succsess, -1 for error 15 | **********************************************************/ 16 | int ethertbsp_open_socket_C(const char *dev, unsigned char *remote_mac, unsigned char *local_mac); 17 | 18 | /*********************************************************** 19 | * close_socket() 20 | * what: closes the ethernet socket 21 | * params: 22 | * none 23 | * returns: 24 | * none 25 | **********************************************************/ 26 | int ethertbsp_close_socket_C(); 27 | 28 | /*********************************************************** 29 | * send_tensor_byte() 30 | * what: sends a torch byte tensor by breaking it down into 31 | * ethernet packets of maximum size 32 | * 
params: 33 | * data - send tensor as array 34 | * size - length of data array 35 | * returns: 36 | * zero 37 | **********************************************************/ 38 | int ethertbsp_send_ByteTensor_C(unsigned char * data, int size); 39 | 40 | /*********************************************************** 41 | * send_tensor() 42 | * what: sends a torch tensor by breaking it down into 43 | * ethernet packets of maximum size 44 | * a tensor of reals is converted to Q8.8 45 | * params: 46 | * data - send tensor as array 47 | * size - length of data array 48 | * returns: 49 | * zero 50 | **********************************************************/ 51 | int ethertbsp_send_FloatTensor_C(float * data, int size); 52 | int ethertbsp_send_DoubleTensor_C(double * data, int size); 53 | 54 | /*********************************************************** 55 | * receive_tensor_TYPE() 56 | * what: receives a torch tensor by concatenating eth packs 57 | * a tensor of TYPE is created from Q8.8 58 | * params: 59 | * data - tensor as array to be filled 60 | * size - length of data array 61 | * returns: 62 | * zero 63 | **********************************************************/ 64 | int ethertbsp_receive_FloatTensor_C(float *data, int size, int height); 65 | int ethertbsp_receive_DoubleTensor_C(double *data, int size, int height); 66 | -------------------------------------------------------------------------------- /ethertbsp/example.c: -------------------------------------------------------------------------------- 1 | /*********************************************************** 2 | * A self-contained example 3 | * Compile: 4 | * gcc -fpic -shared ethertbsp.c -o libeth.so 5 | * gcc example.c libeth.so -o example 6 | **********************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "ethertbsp.h" 14 | 15 | #define BINARY_SIZE 32*1024*1024 16 | 17 | #ifdef _LINUX_ 18 | #define ETH_DEV "eth0" 19 | #else // _APPLE_ 
20 | #define ETH_DEV "en0" 21 | #endif 22 | 23 | #define abs(a) (a)>0 ? (a) : -(a) 24 | 25 | int main() { 26 | // init device 27 | ethertbsp_open_socket_C(ETH_DEV, NULL, NULL); 28 | 29 | // load code (binary) from file 30 | unsigned char *neuflow_bin = (unsigned char *)malloc(BINARY_SIZE); 31 | memset(neuflow_bin, BINARY_SIZE, 0); 32 | FILE *f = fopen("neuflow.bin", "rb"); 33 | int nread; 34 | if (f) nread = fread(neuflow_bin, 1, BINARY_SIZE, f); 35 | else { 36 | printf("error: could not find neuflow code (neuflow.bin)\n"); 37 | return 1; 38 | } 39 | printf("loaded bytecode [size = %d]\n", nread); 40 | 41 | // load (and exec) code on neuFlow 42 | printf("transmitting bytecode\n"); 43 | ethertbsp_send_ByteTensor_C(neuflow_bin, BINARY_SIZE); 44 | sleep(1); 45 | printf("transmitted.\n"); 46 | 47 | // data structures 48 | double *input_data = malloc(sizeof(double) * 3 * 400 * 400); 49 | double *output_data = malloc(sizeof(double) * 3 * 400 * 400); 50 | 51 | // initialize data 52 | int i,k; 53 | for (k = 0; k < 3; k++) { 54 | for (i = 0; i < 400*400; i++) { 55 | input_data[k*400*400+i] = k; 56 | output_data[k*400*400+i] = 0; 57 | } 58 | } 59 | 60 | // code is now executing, send data and receive answer in a loop 61 | while (1) { 62 | // send input data (a 3x400x400 image) 63 | double *input_p = input_data; 64 | for (i = 0; i < 3; i++) { 65 | ethertbsp_send_DoubleTensor_C(input_p, 400*400); 66 | input_p += 400*400; 67 | } 68 | 69 | // receive data, processed by neuFlow (a 3x400x400 image, loopbacked) 70 | double *output_p = output_data; 71 | for (i = 0; i < 3; i++) { 72 | ethertbsp_receive_DoubleTensor_C(output_p, 400*400, 400); 73 | output_p += 400*400; 74 | } 75 | 76 | // measure loopback error 77 | double error = 0; 78 | double maxerr = 0; 79 | for (i = 0; i < 3*400*400; i++) { 80 | double err = abs(input_data[i] - output_data[i]); 81 | if (err > maxerr) maxerr = err; 82 | error += err; 83 | } 84 | error /= 3*400*400; 85 | printf("average error = %f, max error = 
%f\n", error, maxerr); 86 | } 87 | 88 | // cleanup 89 | free(input_data); 90 | free(output_data); 91 | free(neuflow_bin); 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /ethertbsp/init.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) 12 | #define torch_string_(NAME) TH_CONCAT_STRING_3(torch., Real, NAME) 13 | #define ethertbsp_(NAME) TH_CONCAT_3(ethertbsp_, Real, NAME) 14 | #define ethertbsp_send_(NAME) TH_CONCAT_3(ethertbsp_send_, Real, NAME) 15 | #define ethertbsp_receive_(NAME) TH_CONCAT_3(ethertbsp_receive_, Real, NAME) 16 | 17 | static const void* torch_FloatTensor_id = NULL; 18 | static const void* torch_DoubleTensor_id = NULL; 19 | 20 | #undef TH_GENERIC_FILE 21 | #include "generic/ethertbsp.c" 22 | #include "THGenerateFloatTypes.h" 23 | 24 | DLL_EXPORT int luaopen_libethertbsp(lua_State *L) 25 | { 26 | torch_FloatTensor_id = luaT_checktypename2id(L, "torch.FloatTensor"); 27 | torch_DoubleTensor_id = luaT_checktypename2id(L, "torch.DoubleTensor"); 28 | 29 | ethertbsp_FloatApi_init(L); 30 | ethertbsp_DoubleApi_init(L); 31 | 32 | luaL_register(L, "ethertbsp.double", ethertbsp_DoubleApi__); 33 | luaL_register(L, "ethertbsp.float", ethertbsp_FloatApi__); 34 | 35 | return 1; 36 | } 37 | -------------------------------------------------------------------------------- /ethertbsp/init.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | -- 3 | -- Copyright (c) 2010,2011 Clement Farabet, Polina Akselrod 4 | -- 5 | -- Permission is hereby granted, free of charge, to any person obtaining 6 | -- a copy of this software and associated documentation files (the 7 | -- "Software"), to deal in the Software 
without restriction, including 8 | -- without limitation the rights to use, copy, modify, merge, publish, 9 | -- distribute, sublicense, and/or sell copies of the Software, and to 10 | -- permit persons to whom the Software is furnished to do so, subject to 11 | -- the following conditions: 12 | -- 13 | -- The above copyright notice and this permission notice shall be 14 | -- included in all copies or substantial portions of the Software. 15 | -- 16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -- 24 | ---------------------------------------------------------------------- 25 | -- description: 26 | -- ethertbsp - a raw Ethernet packet interface over gigabit ethernet, 27 | -- for communication between neuFlow <-> UNIX host. 
28 | -- 29 | -- history: 30 | -- July 16, 2011, 1:46PM - import from Torch5 - Clement Farabet 31 | -- Wed 25 Apr 2012 22:53:06 EDT - Berin Martini 32 | ---------------------------------------------------------------------- 33 | 34 | require 'torch' 35 | require 'libethertbsp' 36 | 37 | function ethertbsp.open(dev, destmac, srcmac) 38 | return ethertbsp.double.open_socket(dev, destmac, srcmac) 39 | end 40 | 41 | function ethertbsp.close(dev) 42 | ethertbsp.double.close_socket() 43 | end 44 | 45 | function ethertbsp.sendreset() 46 | return ethertbsp.double.send_reset() 47 | end 48 | 49 | function ethertbsp.sendtensor(tensor) 50 | tensor.ethertbsp.send_tensor(tensor) 51 | end 52 | 53 | function ethertbsp.receivetensor(tensor) 54 | tensor.ethertbsp.receive_tensor(tensor) 55 | end 56 | 57 | function ethertbsp.loadbytecode(bytetensor) 58 | ethertbsp.double.send_bytetensor(bytetensor) 59 | end 60 | -------------------------------------------------------------------------------- /ethertbsp/test/receive.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'ethertbsp' 3 | require 'image' 4 | 5 | ethertbsp.open() 6 | 7 | t = torch.Tensor(512,512) 8 | 9 | for i = 1,1000 do 10 | print 'waiting for tensor' 11 | sys.tic() 12 | ethertbsp.receivetensor(t) 13 | print 'got tensor !' 
14 | sys.toc(true) 15 | w = image.display{image=t, win=w, gui=false} 16 | end 17 | -------------------------------------------------------------------------------- /ethertbsp/test/send.lua: -------------------------------------------------------------------------------- 1 | 2 | require 'ethertbsp' 3 | require 'image' 4 | 5 | ethertbsp.open(nil, {0xff,0xff,0xff,0xff,0xff,0xff}, {0x01,0x02,0x03,0x04,0x05,0x06}) 6 | 7 | l = image.lena()[1] 8 | 9 | for i = 1,1000 do 10 | sys.tic() 11 | ethertbsp.sendtensor(l) 12 | sys.toc(true) 13 | end 14 | -------------------------------------------------------------------------------- /neuflow-1.scm-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "neuflow" 2 | version = "1.scm-0" 3 | 4 | source = { 5 | url = "git://github.com/clementfarabet/neuflow", 6 | } 7 | 8 | description = { 9 | summary = "A compiler toolkit for the neuFlow v1 arch", 10 | detailed = [[ 11 | A package to generate the bytecode for and to setup a communication channel with the neuFlow v1 processor. 12 | ]], 13 | homepage = "https://github.com/clementfarabet/neuflow", 14 | license = "MIT/X11" 15 | } 16 | 17 | dependencies = { 18 | "torch >= 7.0", 19 | "xlua >= 1.0", 20 | "nnx >= 0.1", 21 | "luabitop >= 1.0.1", 22 | } 23 | 24 | build = { 25 | type = "command", 26 | build_command = [[ 27 | cmake -E make_directory build; 28 | cd build; 29 | cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)"; 30 | $(MAKE) 31 | ]], 32 | install_command = "cd build && $(MAKE) install" 33 | } 34 | -------------------------------------------------------------------------------- /scripts/get-latest-neuflow-image: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget http://data.neuflow.org/share/neuFlow-ml605.bit 4 | -------------------------------------------------------------------------------- /scripts/load-bitfile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # need an arg 4 | if [ $# == 0 ] 5 | then 6 | echo "syntax:" 7 | echo " load-bitfile bitfile [loads a bitfile]" 8 | echo " load-bitfile bitfile platform [specifies the platform: ml605 | m503 (default=ml605)]" 9 | echo " load-bitfile unlock [unlocks the cable, if not responsive]" 10 | exit 11 | fi 12 | 13 | # require impact in path 14 | if [ ! `which impact` ] 15 | then 16 | echo "impact could not be found..." 17 | echo " > impact is part of Xilinx's ISE toolchain" 18 | echo " > it is used to load a bitfile into any Xilinx's FPGA, via JTAG" 19 | echo " > it comes for free with ISE webpack edition" 20 | echo " > if you already installed it, then simply add the tools to your path:" 21 | echo " $ source /opt/Xilinx/VERSION/.../settings**.sh" 22 | echo " and then re-run this script !"
23 | exit 24 | fi 25 | 26 | # platform 27 | if [ $# == 2 ] 28 | then 29 | if [ $2 == "ml605" ] 30 | then 31 | fpga=2 32 | fi 33 | if [ $2 == "m503" ] 34 | then 35 | fpga=1 36 | fi 37 | echo "--> programming device ${fpga}" 38 | else 39 | fpga=2 40 | fi 41 | 42 | # parse arg 43 | if [ $1 == "unlock" ] 44 | then 45 | echo "--> unlocking cable" 46 | tmp=/tmp/impact_batch_`date` 47 | echo $tmp 48 | echo "cleancablelock" > "$tmp" 49 | echo "quit" >> "$tmp" 50 | else 51 | echo "--> loading bitfile" 52 | tmp=/tmp/impact_batch_`date` 53 | echo $tmp 54 | echo "setmode -bs" > "$tmp" 55 | echo "setcable -p auto" >> "$tmp" 56 | echo "identify" >> "$tmp" 57 | echo "assignFile -p ${fpga} -file" $1 >> "$tmp" 58 | echo "program -p ${fpga}" >> "$tmp" 59 | echo "quit" >> "$tmp" 60 | fi 61 | 62 | # run commands in batch mode 63 | impact -batch "$tmp" 64 | -------------------------------------------------------------------------------- /segments/coef_Abs: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 8 4 | 256 5 | 0 6 | 32767 7 | 256 8 | 0 9 | 32767 10 | 256 11 | 0 12 | 32767 13 | 256 14 | 0 15 | 32767 16 | 256 17 | 0 18 | 32767 19 | 256 20 | 0 21 | 32767 22 | 256 23 | 0 24 | 32767 25 | 256 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Sqrt: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 8 4 | 15 5 | 941 6 | 7598 7 | 33 8 | 430 9 | 1510 10 | 73 11 | 194 12 | 311 13 | 159 14 | 89 15 | 67 16 | 340 17 | 42 18 | 15 19 | 697 20 | 21 21 | 4 22 | 1365 23 | 11 24 | 1 25 | 4096 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Sqrt_th: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 8 4 | 15 5 | 941 6 | 7598 7 | 33 8 | 430 9 | 1510 10 | 73 11 | 194 12 | 311 13 | 159 14 | 89 15 | 67 16 | 340 17 | 42 18 | 15 19 | 697 
20 | 21 21 | 4 22 | 1365 23 | 11 24 | 1 25 | 3840 26 | 1 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Sqrt_th_div_3: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 8 4 | 8 5 | 595 6 | 9985 7 | 17 8 | 279 9 | 1883 10 | 39 11 | 118 12 | 325 13 | 92 14 | 50 15 | 61 16 | 205 17 | 23 18 | 14 19 | 432 20 | 11 21 | 3 22 | 3840 23 | -29 24 | 2 25 | 0 26 | 1 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Sqrt_th_div_32: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 8 4 | 256 5 | 0 6 | 32767 7 | 3 8 | 176 9 | 8915 10 | 6 11 | 76 12 | 1386 13 | 14 14 | 32 15 | 269 16 | 30 17 | 15 18 | 66 19 | 53 20 | 9 21 | 32 22 | 3840 23 | -464 24 | 31 25 | 0 26 | 1 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_StdSigm: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 5 5 | 410 6 | 810 7 | 77 8 | 184 9 | 267 10 | 253 11 | 1 12 | 4 13 | 293 14 | 0 15 | 3 16 | 293 17 | 0 18 | 2 19 | 293 20 | 0 21 | 1 22 | 293 23 | 0 24 | 0 25 | -2147483648 26 | -2147483648 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_StdSigmAbs: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 8 4 | 16 5 | 349 6 | 509 7 | 158 8 | 68 9 | 142 10 | 280 11 | 0 12 | 7 13 | 293 14 | 0 15 | 6 16 | 293 17 | 0 18 | 5 19 | 293 20 | 0 21 | 4 22 | 293 23 | 0 24 | 3 25 | 293 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_StdSigm_abs_err: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 2 5 | 430 6 | 1172 7 | 8 8 | 403 9 | 815 10 | 27 11 | 342 12 | 634 13 | 57 14 | 268 15 | 
496 16 | 101 17 | 183 18 | 374 19 | 157 20 | 101 21 | 267 22 | 219 23 | 36 24 | 157 25 | 278 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_StdSigm_abs_err_all_range: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 0 5 | 437 6 | 1172 7 | 8 8 | 403 9 | 815 10 | 27 11 | 342 12 | 634 13 | 57 14 | 268 15 | 496 16 | 101 17 | 183 18 | 374 19 | 157 20 | 101 21 | 267 22 | 219 23 | 36 24 | 157 25 | 278 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_Tanh: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 2 5 | 245 6 | 562 7 | 39 8 | 165 9 | 295 10 | 130 11 | 60 12 | 156 13 | 215 14 | 8 15 | 58 16 | 250 17 | 0 18 | 21 19 | 254 20 | 0 21 | 20 22 | 255 23 | 0 24 | 19 25 | 256 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /segments/coef_TanhAbs: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 8 4 | 0 5 | 250 6 | 562 7 | 39 8 | 165 9 | 295 10 | 130 11 | 60 12 | 156 13 | 215 14 | 8 15 | 58 16 | 250 17 | 0 18 | 21 19 | 254 20 | 0 21 | 20 22 | 255 23 | 0 24 | 19 25 | 256 26 | 0 27 | 0 28 | -------------------------------------------------------------------------------- /src/Camera.lua: -------------------------------------------------------------------------------- 1 | 2 | ---------------------------------------------------------------------- 3 | --- Class: Camera 4 | -- 5 | -- This class provides a set of methods to exchange data/info with the Camera. 
6 | -- 7 | local Camera = torch.class('neuflow.Camera') 8 | 9 | function Camera:__init(args) 10 | -- args: 11 | self.nf = args.nf 12 | self.core = args.nf.core 13 | self.msg_level = args.msg_level or 'none' -- 'detailled' or 'none' or 'concise' 14 | self.frames = {} 15 | 16 | self.nb_frames = 4 -- number of frames in running buffer 17 | -- self.Aw_ = 640 18 | -- self.Ah_ = 480 19 | -- self.Bw_ = 640 20 | -- self.Bh_ = 480 21 | self.size = { 22 | ['B'] = {['width'] = 640, ['height'] = 480, ['component'] = 3}, 23 | ['A'] = {['width'] = 640, ['height'] = 480, ['component'] = 3} 24 | } 25 | 26 | self.mask = { 27 | ['counter'] = {['A'] = 0x0000000c, ['B'] = 0x000c0000}, 28 | ['status'] = {['A'] = 0x00000001, ['B'] = 0x00010000}, 29 | } 30 | 31 | self.conf = { 32 | ['acquisition'] = { 33 | ['value'] = {['ON'] = 0x1, ['OFF'] = 0x0}, 34 | ['mask'] = 0x1, 35 | ['index'] = 10}, 36 | ['definition'] = { 37 | ['value'] = {['QVGA'] = 0x1, ['VGA'] = 0x0}, 38 | ['mask'] = 0x1, 39 | ['index'] = 0}, 40 | ['framerate'] = { 41 | ['value'] = {['60FPS'] = 0x1, ['30FPS'] = 0x0}, 42 | ['mask'] = 0x1, 43 | ['index'] = 1}, 44 | ['color'] = { 45 | ['value'] = {['COLOR'] = 0x0, ['B&W'] = 0x1}, 46 | ['mask'] = 0x1, 47 | ['index'] = 2}, 48 | ['domain'] = { 49 | ['value'] = {['RGB'] = 0x1, ['YUV'] = 0x0}, 50 | ['mask'] = 0x1, 51 | ['index'] = 3}, 52 | ['scan'] = { 53 | ['value'] = {['INTERLACED'] = 0x0, ['PROGRESSIVE'] = 0x1}, 54 | ['mask'] = 0x1, 55 | ['index'] = 4}, 56 | ['grab'] = { 57 | ['value'] = {['ONESHOT'] = 0x1, ['CONTINUOUS'] = 0x0}, 58 | ['mask'] = 0x1, 59 | ['index'] = 8}, 60 | ['power'] = { 61 | ['value'] = {['ON'] = 0x1, ['OFF'] = 0x0}, 62 | ['mask'] = 0x1, 63 | ['index'] = 11}, 64 | ['iic'] = { 65 | ['value'] = {['ON'] = 0x1, ['OFF'] = 0x0}, 66 | ['mask'] = 0x1, 67 | ['index'] = 12} 68 | } 69 | 70 | -- Memorize here the camera register value 71 | self.reg_ctrl = 0x00000000 72 | self.reg_status = 0x00000000 73 | 74 | self.cam_param = { 75 | ['A'] = {['port_addrs'] = 
dma.camera_A_port_id, ['offset'] = 0}, 76 | ['B'] = {['port_addrs'] = dma.camera_B_port_id, ['offset'] = 16} 77 | } 78 | 79 | -- compulsory 80 | if (self.core == nil) then 81 | error(' ERROR: requires a Dataflow Core') 82 | end 83 | end 84 | 85 | function Camera:config(cameraID, param, value) 86 | local temp_mask 87 | local temp_offset 88 | local lcameraID 89 | if #cameraID == 1 then 90 | lcameraID = {cameraID} 91 | else 92 | lcameraID = cameraID 93 | end 94 | 95 | for i = 1,#lcameraID do 96 | temp_offset = self.conf[param].index + self.cam_param[lcameraID[i]].offset 97 | -- Unset all dedicated bits of the config paramater 98 | temp_mask = bit.bnot(bit.lshift(self.conf[param].mask,temp_offset)) 99 | self.reg_ctrl = bit.band(self.reg_ctrl, temp_mask) 100 | -- Set the new value in reg_ctrl 101 | temp_mask = bit.lshift(self.conf[param].value[value],temp_offset) 102 | self.reg_ctrl = bit.bor(self.reg_ctrl, temp_mask) 103 | 104 | -- Adjust camera memory size to definition 105 | if param == 'definition' then 106 | if value == 'QVGA' then 107 | self.size[lcameraID[i]].width = 320 108 | self.size[lcameraID[i]].height = 240 109 | else 110 | self.size[lcameraID[i]].width = 640 111 | self.size[lcameraID[i]].height = 480 112 | end 113 | end 114 | if param == 'color' then 115 | if value == 'B&W' then 116 | self.size[lcameraID[i]].component = 1 117 | else 118 | self.size[lcameraID[i]].component = 3 119 | end 120 | end 121 | end 122 | 123 | return self.reg_ctrl 124 | end 125 | 126 | 127 | function Camera:initCamera(cameraID, alloc_frames) 128 | 129 | self.frames[cameraID] = alloc_frames 130 | print(' : init Camera ' .. 
cameraID) 131 | 132 | -- puts the cameras in standby 133 | local reg_ctrl = self.core:allocRegister() 134 | self:config(cameraID,'power','ON') 135 | self.core:setreg(reg_ctrl, self.reg_ctrl) 136 | self.core:iowrite(oFlower.io_gpios, reg_ctrl) 137 | --self.core:sleep(1) 138 | self.core:message('Camera: Init done') 139 | end 140 | 141 | -- Not stable for now because of the camera settings. Use getLastFrame instead 142 | function Camera:getLastFrameSafe(cameraID) 143 | local outputs = {} 144 | local lcameraID 145 | 146 | if #cameraID == 1 then 147 | lcameraID = {cameraID} 148 | else 149 | lcameraID = cameraID 150 | end 151 | 152 | for i = 1,#lcameraID do 153 | table.insert(outputs, self.frames[lcameraID[i]]) 154 | 155 | self.core:closePortSafe(self.cam_param[lcameraID[i]].port_addrs) 156 | end 157 | return outputs 158 | end 159 | 160 | function Camera:getLastFrame(cameraID) 161 | local outputs = {} 162 | 163 | local reg_acqst = self.core:allocRegister() 164 | local reg_tmp = self.core:allocRegister() 165 | local lcameraID 166 | 167 | local mask_status = 0x00000000 168 | if #cameraID == 1 then 169 | lcameraID = {cameraID} 170 | else 171 | lcameraID = cameraID 172 | end 173 | for i = 1,#lcameraID do 174 | mask_status = bit.bor(mask_status, self.mask.status[lcameraID[i]]) 175 | end 176 | self.core:loopUntilStart() 177 | self.core:ioread(oFlower.io_gpios, reg_acqst) 178 | self.core:bitandi(reg_acqst, mask_status, reg_tmp) 179 | self.core:compi(reg_tmp, 0x00000000, reg_tmp) 180 | self.core:loopUntilEndIfNonZero(reg_tmp) 181 | 182 | for i = 1,#lcameraID do 183 | table.insert(outputs, self.frames[lcameraID[i]]) 184 | self.core:closePort(self.cam_param[lcameraID[i]].port_addrs) 185 | end 186 | 187 | return outputs 188 | end 189 | 190 | function Camera:captureOneFrame(cameraID) 191 | local lcameraID 192 | 193 | local reg_ctrl = self.core:allocRegister() 194 | local reg_acqst = self.core:allocRegister() 195 | local reg_tmp = self.core:allocRegister() 196 | 197 | local mask_ctrl 
= 0x00000000 198 | local mask_status = 0x00000000 199 | 200 | -- Enable camera acquisition 201 | if #cameraID == 1 then 202 | lcameraID = {cameraID} 203 | else 204 | lcameraID = cameraID 205 | end 206 | 207 | for i = 1,#lcameraID do 208 | self.core:openPortWr(self.cam_param[lcameraID[i]].port_addrs, self.frames[lcameraID[i]]) 209 | mask_status = bit.bor(mask_status, self.mask.status[lcameraID[i]]) 210 | end 211 | 212 | -- trigger acquisition 213 | mask_ctrl = self:config(cameraID, 'acquisition', 'ON') 214 | self.core:setreg(reg_ctrl, mask_ctrl) 215 | self.core:iowrite(oFlower.io_gpios, reg_ctrl) 216 | 217 | -- loop until acquisition has started 218 | self.core:loopUntilStart() 219 | self.core:ioread(oFlower.io_gpios, reg_acqst) 220 | self.core:bitandi(reg_acqst, mask_status, reg_tmp) 221 | self.core:compi(reg_tmp, mask_status, reg_tmp) 222 | self.core:loopUntilEndIfNonZero(reg_tmp) 223 | 224 | -- Once the acquisition start. Disable the acquisition for the next frame 225 | mask_ctrl = self:config(cameraID, 'acquisition', 'OFF') 226 | self.core:setreg(reg_ctrl, mask_ctrl) 227 | self.core:iowrite(oFlower.io_gpios, reg_ctrl) 228 | end 229 | 230 | function Camera:enableCameras(cameraID) 231 | local lcameraID 232 | if #cameraID == 1 then 233 | lcameraID = {cameraID} 234 | else 235 | lcameraID = cameraID 236 | end 237 | 238 | for i=1,#lcameraID do 239 | local image_tensor = torch.Tensor(self.size[lcameraID[i]].height, self.size[lcameraID[i]].width*self.size[lcameraID[i]].component) 240 | local image_segment = self.core.mem:allocPersistentData(image_tensor, '2D') 241 | 242 | self:initCamera(lcameraID[i], image_segment) 243 | end 244 | self.core:sleep(1) 245 | end 246 | 247 | function Camera:startRBCameras() -- Start camera and send images to Running Buffer 248 | 249 | print(' : enable Camera: ' .. self.size['A'].width * self.size['A'].component .. 'x' .. 
self.size['A'].height) 250 | 251 | local image_tensor_A = torch.Tensor(self.size['A'].height, self.size['A'].width*self.size['A'].component) 252 | local image_segment_A = self.core.mem:allocPersistentData(image_tensor_A, '2D') 253 | 254 | local image_tensor_B = torch.Tensor(self.size['B'].height, self.size['B'].width*self.size['B'].component) 255 | local image_segment_B = self.core.mem:allocPersistentData(image_tensor_B, '2D') 256 | 257 | -- The two cameras have to be initialized in the same time if an IIC configuration occured. 258 | self:initCamera('B', image_segment_B) 259 | self:initCamera('A', image_segment_A) 260 | 261 | self.core:sleep(2) 262 | 263 | -- Global setup for DMA port (camera A and B) to make continuous 264 | local stride_bit_shift = math.log(1024) / math.log(2) 265 | 266 | self.core:send_selectModule(blast_bus.area_streamer, blast_bus.addr_mem_streamer_0+dma.camera_A_port_id, 1) 267 | self.core:send_setup(0, 16*1024*1024, stride_bit_shift, 1) 268 | 269 | self.core:send_selectModule(blast_bus.area_streamer, blast_bus.addr_mem_streamer_0+dma.camera_B_port_id, 1) 270 | self.core:send_setup(0, 16*1024*1024, stride_bit_shift, 1) 271 | 272 | -- Open the streamer ports for writing 273 | self.core:openPortWr(dma.camera_B_port_id, self.frames['B']) 274 | self.core:openPortWr(dma.camera_A_port_id, self.frames['A']) 275 | 276 | --self.core:sleep(0.1) 277 | -- Start cameras sending images 278 | local reg_ctrl = self.core:allocRegister() 279 | local mask_ctrl = self:config({'B','A'}, 'acquisition', 'ON') 280 | 281 | -- trigger acquisition 282 | self.core:setreg(reg_ctrl, mask_ctrl) 283 | self.core:iowrite(oFlower.io_gpios, reg_ctrl) 284 | end 285 | 286 | function Camera:stopRBCameras() -- Stop camera sending to Running Buffer 287 | 288 | local reg_acqst = self.core:allocRegister() 289 | local mask_status = bit.bor(self.mask.status['A'], self.mask.status['B']) 290 | local mask_ctrl = self:config({'A','B'}, 'acquisition', 'OFF') 291 | 292 | -- Once the 
acquisition stop. Disable the acquisition for the next frame 293 | self.core:setreg(reg_acqst, mask_ctrl) 294 | self.core:iowrite(oFlower.io_gpios, reg_acqst) 295 | self.core:nop(100) -- small delay 296 | 297 | -- wait for the frame to finish being sent 298 | -- self.core:loopUntilStart() 299 | -- self.core:ioread(oFlower.io_gpios, reg_acqst) 300 | -- self.core:bitandi(reg_acqst, mask_status, reg_acqst) 301 | -- self.core:compi(reg_acqst, 0x00000000, reg_acqst) 302 | -- self.core:loopUntilEndIfNonZero(reg_acqst) 303 | 304 | -- reset ports setup 305 | self.core:configureStreamer(0, 16*1024*1024, 1024, {dma.camera_A_port_id, dma.camera_B_port_id}) 306 | end 307 | 308 | function Camera:copyToHostLatestFrame() -- Get the latest complete frame 309 | 310 | local reg_acqst = self.core:allocRegister() 311 | self.core:ioread(oFlower.io_gpios, reg_acqst) 312 | 313 | self:streamLatestFrameFromPort('B', reg_acqst, dma.ethernet_read_port_id, 'full') 314 | self.nf.ethernet:streamFromHost(self.nf.ethernet.ack_stream[1], 'ack_stream') 315 | self:streamLatestFrameFromPort('A', reg_acqst, dma.ethernet_read_port_id, 'full') 316 | 317 | return torch.Tensor(2, self.size['A'].height, self.size['A'].width) 318 | end 319 | 320 | function Camera:streamLatestFrameFromPort(cameraID, reg_acqst, port_addr, port_addr_range) 321 | 322 | function coordinateOffset(coordinate, offset) 323 | return { 324 | coordinate = coordinate, 325 | calc = function(self) 326 | return self.coordinate:calc() + offset 327 | end 328 | } 329 | end 330 | 331 | 332 | local goto_ends = {} 333 | local reg_count = self.core:allocRegister() 334 | 335 | for ii = (self.nb_frames-1), 1, -1 do 336 | -- copy camera status into reg but masked for frame count 337 | self.core:bitandi(reg_acqst, self.mask.counter[cameraID], reg_count) 338 | self.core:compi(reg_count, ii, reg_count) -- test if current frame is 'ii' 339 | 340 | -- if current frame not eq to 'ii' (reg_count == 0) goto next possible option 341 | 
self.core:gotoTagIfZero(nil, reg_count) -- goto next pos 342 | local goto_next = self.core.linker:getLastReference() 343 | 344 | -- read the last frame in running buffer 345 | self.core:configPort{ 346 | index = port_addr, 347 | action = 'fetch+read+sync+close', 348 | data = { 349 | x = self.frames[cameraID].x, 350 | y = coordinateOffset(self.frames[cameraID].y, ((ii-1)*self.size[cameraID].height)), 351 | w = self.size[cameraID].width * self.size[cameraID].component, 352 | h = self.size[cameraID].height 353 | }, 354 | 355 | range = port_addr_range 356 | } 357 | 358 | self.core:gotoTag(nil) -- finish so goto end 359 | goto_ends[ii] = self.core.linker:getLastReference() 360 | 361 | -- next pos 362 | goto_next.goto_tag = self.core:makeGotoTag() 363 | self.core:nop() 364 | end 365 | 366 | -- if got here only option left is to read the following frame 367 | self.core:configPort { 368 | index = port_addr, 369 | action = 'fetch+read+sync+close', 370 | data = { 371 | x = self.frames[cameraID].x, 372 | y = coordinateOffset(self.frames[cameraID].y, ((self.nb_frames-1)*self.size[cameraID].height)), 373 | w = self.size[cameraID].width * self.size[cameraID].component, 374 | h = self.size[cameraID].height 375 | }, 376 | 377 | range = port_addr_range 378 | } 379 | 380 | -- end point 381 | local goto_end_tag = self.core:makeGotoTag() 382 | self.core:nop() 383 | 384 | for i, goto_end in pairs(goto_ends) do 385 | goto_end.goto_tag = goto_end_tag 386 | end 387 | end 388 | -------------------------------------------------------------------------------- /src/DmaInterface.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | --- Class: DmaEthernet 3 | -- 4 | -- This class provides a set of methods to exchange data/info with the host. 
5 | -- 6 | local DmaEthernet = torch.class('neuflow.DmaEthernet') 7 | 8 | xrequire 'ethertbsp' 9 | 10 | function DmaEthernet:__init(args) 11 | -- args: 12 | self.nf = args.nf 13 | self.core = args.core 14 | self.profiler = self.nf.profiler 15 | 16 | self.msg_level = args.msg_level or 'none' -- 'detailled' or 'none' or 'concise' 17 | self.max_packet_size = args.max_packet_size or 1500 -- caller override, default 1500; was '1500 or args.max_packet_size' which always yielded 1500 18 | 19 | -- compulsory 20 | if (self.core == nil) then 21 | error(' ERROR: requires a Dataflow Core') 22 | end 23 | 24 | -- data ack 25 | self.ack_tensor = torch.Tensor(1,1,32) 26 | self.ack_stream = self.nf:allocHeap(self.ack_tensor) 27 | end 28 | 29 | function DmaEthernet:open(network_if_name) 30 | if(network_if_name) then 31 | ethertbsp.open(network_if_name) 32 | else 33 | ethertbsp.open() 34 | end 35 | end 36 | 37 | function DmaEthernet:close() 38 | ethertbsp.close() 39 | end 40 | 41 | function DmaEthernet:sendReset() 42 | if (-1 == ethertbsp.sendreset()) then 43 | print(' fail') 44 | end 45 | end 46 | 47 | function DmaEthernet:dev_copyToHost(tensor) 48 | -- profiler ack 49 | self.nf.core:executionTimeSensitive(function() 50 | self:streamToHost(self.ack_stream[1], 'ack_stream') 51 | end) 52 | 53 | for i = 1, (#tensor-1) do 54 | self.nf.core:executionTimeSensitive(function() 55 | self:streamToHost(tensor[i], 'default') 56 | --self:streamFromHost(self.ack_stream[1], 'ack_stream') 57 | end) 58 | end 59 | 60 | self.nf.core:executionTimeSensitive(function() 61 | self:streamToHost(tensor[#tensor], 'default') 62 | end) 63 | end 64 | 65 | function DmaEthernet:dev_copyFromHost(tensor) 66 | for i = 1,#tensor do 67 | self.nf.core:executionTimeSensitive(function() 68 | self:streamFromHost(tensor[i], 'default') 69 | end) 70 | end 71 | end 72 | 73 | function DmaEthernet:dev_receiveBytecode() 74 | self:loadByteCode() 75 | end 76 | 77 | function DmaEthernet:host_copyToDev(tensor) 78 | self.profiler:start('copy-to-dev') 79 | for i = 1,tensor:size(1) do 80 | ethertbsp.sendtensor(tensor[i]) 81 | end
self.profiler:lap('copy-to-dev')
end

-- Receive the result tensor back from the device over the ethertbsp link.
-- The first receive blocks on an ack tensor, which doubles as a profiler
-- measurement of the on-board processing time; the data then arrives one
-- 2D slice at a time.
function DmaEthernet:host_copyFromDev(tensor)
   -- profiler ack
   self.profiler:start('on-board-processing')
   self.profiler:setColor('on-board-processing', 'blue')
   ethertbsp.receivetensor(self.ack_tensor)
   self.profiler:lap('on-board-processing')

   self.profiler:start('copy-from-dev')
   ethertbsp.receivetensor(tensor[1])
   for i = 2,tensor:size(1) do
      --ethertbsp.sendtensor(self.ack_tensor)
      ethertbsp.receivetensor(tensor[i])
   end
   self.profiler:lap('copy-from-dev')
end

-- Push a compiled bytecode image to the device over the ethertbsp link.
function DmaEthernet:host_sendBytecode(bytecode)
   self.profiler:start('load-bytecode')
   ethertbsp.loadbytecode(bytecode)
   self.profiler:lap('load-bytecode')
end

-- DEPRECATED: emit bytecode that streams a string out of the DMA ethernet
-- port (the payload is padded to the 64-byte ethernet minimum).
function DmaEthernet:printToEthernet(str)
   print("DEPRECATED")

   -- Printing to ethernet involves initializing a transfer with the driver,
   -- then writing the data (frame), then triggering the transfer.

   if (self.msg_level == 'detailled') then
      self.core:print(string.format('[ETHERNET TX : %s]',str))
   end

   -- verify data size >= 64 (minimum ethernet payload)
   str = str .. '\n'
   local data_size = string.len(str)
   if (data_size < 64) then
      data_size = 64
   end

   -- allocate string in memory (TODO: this call is wrong, it allocates the right size,
   -- but the data will be corrupted, need to implement a allocString function)
   local fake_string = {x = 0, y = 0, w = math.ceil(data_size/2), h = 1}

   -- stream data to DMA ethernet interface
   self.core:configPort{index = dma.ethernet_read_port_id,
                        action = 'fetch+read+sync+close',
                        data = fake_string,
                        range = 'full'
   }
end

-- Emit bytecode that streams the memory area 'stream' ({x,y,w,h}, in 16-bit
-- words) from device memory out to the host. 'mode' is currently unused.
function DmaEthernet:streamToHost(stream, tag, mode)
   local data_size = stream.w * stream.h * 2

   -- estimate number of eth packets
   local nb_packets = math.ceil(data_size / self.max_packet_size)

   -- debug
   if (self.msg_level ~= 'none') then
      self.core:message(string.format('eth: sending %0d packets [tag = %s]', nb_packets, tag))
   end

   -- stream data (tensor) out with a write ack
   -- self.core:configPort{index = -1, action = 'write', data = {x=0, y=0, w=32, h=1}}
   self.core:configPort{index = dma.ethernet_read_port_id,
                        action = 'fetch+read+sync+close',
                        data = stream,
                        range = 'full'}
   -- self.core:configPort{index = -1, action = 'sync+close'}

end

-- Emit bytecode that receives a stream from the host and writes it into the
-- device memory area 'stream'. Errors out on payloads smaller than the
-- 64-byte ethernet minimum.
function DmaEthernet:streamFromHost(stream, tag)
   -- verify data size >= 64 (minimum ethernet payload)
   local data_size = stream.w * stream.h * 2
   if (data_size < 64) then
      error(' ERROR: cant stream data packets smaller than 64 bytes')
   end

   -- estimate number of eth packets
   local nb_packets = math.ceil(data_size / self.max_packet_size)

   -- debug
   if (self.msg_level ~= 'none') then
      self.core:message(string.format('eth: requesting %0d packets [tag = %s]', nb_packets, tag))
   end

   -- stream data in
   self.core:configPort{index = dma.ethernet_write_port_id,
                        action = 'write',
                        data = stream,
                        range = 'full'}
   self.core:configPort{index = dma.ethernet_write_port_id,
                        action = 'sync+close',
                        range = 'full'}
end

-- Emit bytecode that receives a full bytecode image from the host (fixed
-- 1024 x 16k area), then jumps to the bootloader entry point.
function DmaEthernet:loadByteCode()
   -- Creating a stream
   local bytecode_stream = {x = 0, y = 0, w = 1024, h = 16*1024}

   -- Regular streamFromHost
   self:streamFromHost(bytecode_stream, 'bytecode')

   -- ACK to indicate that bytecode has been received
   --self.core:configPort{index = 0, action = 'fetch+read+sync+close', data = {x = 0, y = 0, w = 64, h = 1}}

   -- Jump to address 0 and execute
   self.core:gotoGlobal(bootloader.entry_point)
end

----------------------------------------------------------------------
-- /src/Linker.lua
----------------------------------------------------------------------

----------------------------------------------------------------------
--- Class: Linker
--
-- This class is used to manage and link the bytecode.
-- The bytecode contains processes and data:
-- (1) a process is an action recognized by the virtual machine
--     running on the dataflow computer
-- (2) data is used by processes
--
local Linker = torch.class('neuflow.Linker')

-- Constructor. The bytecode is held as a doubly linked list: nodes with a
-- 'bytes' field are 8-byte instructions, nodes without one are sentinels
-- marking section boundaries.
--
-- args:
--   disassemble : if true, dump() also disassembles the generated stream
--   init_offset : optional byte offset at which the code starts; when
--                 non-zero, nop padding is prepended
function Linker:__init(args)
   -- args
   self.disassemble = args.disassemble

   -- the bytecode array
   local sentinel_node = {}
   self.instruction_list = {
      start_node = sentinel_node,
      end_node = sentinel_node,
      start_sentinel = sentinel_node,
      end_sentinel = sentinel_node
   }

   local init_offset = (args.init_offset or 0) + 1

   -- only if we start NOT from page zero
   if (init_offset ~= 1) then

      -- init padding, one 8-byte nop per slot
      -- NOTE(review): loop bound assumes init_offset is a multiple of 8 -- confirm
      for aa = 0, ((init_offset/8)-1) do
         self:appendInstruction{bytes = {0,0,0,0,0,0,0,0}}
      end

      -- Sentinel to separate init padding from next process
      self:appendSentinel()
   end

   self.counter_bytes = 0
end

-- Return the node most recently appended to the instruction list.
function Linker:getLastReference()
   return self.instruction_list.end_node
end

function Linker:getReference()
   error('# ERROR : Deprecated')
end

-- Resolve every goto tag to its destination instruction node (stored in
-- node.goto_instr). A tag references a node plus a signed node offset.
function Linker:linkGotos()

   local goto_table = {}
   local node = self.instruction_list.start_node
   while node do
      if node.goto_tag then
         goto_table[node] = node.goto_tag
      end

      node = node.next
   end

   -- if destination node is a sentinel, try linking to a node in the next
   -- direction, if cannot then in the prev direction. Throw an error if
   -- cannot find a non sentinel node.
   -- FIX: declared local -- the original leaked 'checkNode' into the global
   -- environment on every call
   local function checkNode(ref_node, reverse)

      if nil ~= ref_node.bytes then
         return ref_node
      else
         if ref_node.next and not reverse then
            return checkNode(ref_node.next)
         elseif ref_node.prev then
            return checkNode(ref_node.prev, true)
         else
            error('# ERROR : could not link goto')
         end
      end
   end

   for node in pairs(goto_table) do
      local ref_node = goto_table[node].ref
      local offset = goto_table[node].offset

      -- walk |offset| nodes backward or forward from the reference node
      if offset <= 0 then
         local ii = 0
         while ii > offset do

            ref_node = ref_node.prev
            ii = ii - 1
         end
      else
         local ii = 0
         while ii < offset do

            ref_node = ref_node.next
            ii = ii + 1
         end
      end

      ref_node = checkNode(ref_node)

      -- remove just processed goto tab from table
      goto_table[node] = nil

      -- ref_node is destination instr
      node.goto_instr = ref_node
   end
end

-- Index every instruction node sequentially, then rewrite the 32-bit
-- argument of each goto with its destination's index. Returns the total
-- instruction count.
function Linker:resolveGotos()
   local addr_index = {}
   local ii = 0

   local node = self.instruction_list.start_node
   while node do
      if node.bytes ~= nil then
         addr_index[node] = ii
         ii = ii + 1
      end

      node = node.next
   end

   local node = self.instruction_list.start_node
   while node do
      if node.goto_instr ~= nil then
         self:rewriteARG32(node.bytes, addr_index[node.goto_instr])
      end

      node = node.next
   end

   return ii
end

-- Rewrite the 32-bit argument of every instruction that references a memory
-- segment with the segment's final computed offset.
function Linker:resolveMemSegments()
   local node = self.instruction_list.start_node

   while node do
      if node.mem_offset ~= nil then
         self:rewriteARG32(node.bytes, node.mem_offset:calc())
      end

      node = node.next
   end
end

-- Flatten the list into a plain byte array (8 bytes per instruction),
-- skipping sentinel nodes.
function Linker:genBytecode()
   local node = self.instruction_list.start_node
   local instruction_output = {}
   local ii = 0

   while node do
      if node.bytes ~= nil then
         instruction_output[ii+1] = node.bytes[1]
         instruction_output[ii+2] = node.bytes[2]
         instruction_output[ii+3] = node.bytes[3]
         instruction_output[ii+4] = node.bytes[4]
         instruction_output[ii+5] = node.bytes[5]
         instruction_output[ii+6] = node.bytes[6]
         instruction_output[ii+7] = node.bytes[7]
         instruction_output[ii+8] = node.bytes[8]

         ii = ii + 8
      end

      node = node.next
   end

   return instruction_output
end

-- Append a sentinel node ('start', 'end' or unmarked) to both the sentinel
-- chain and the main instruction chain.
function Linker:appendSentinel(mode)
   assert('start' == mode or 'end' == mode or nil == mode)

   local new_sentinel = {mode = mode}
   local last_sentinel = self.instruction_list.end_sentinel
   local last_node = self.instruction_list.end_node

   last_sentinel.next_sentinel = new_sentinel
   new_sentinel.prev_sentinel = last_sentinel
   self.instruction_list.end_sentinel = new_sentinel

   last_node.next = new_sentinel
   new_sentinel.prev = last_node
   self.instruction_list.end_node = new_sentinel
end

-- Append an instruction node at the list tail, serializing its fields into
-- 'bytes' first if needed.
function Linker:appendInstruction(instruction)

   if not instruction.bytes then
      instruction.bytes = self:newInstructionBytes(instruction)
   end

   local node = self.instruction_list.end_node

   node.next = instruction
   instruction.prev = node
   self.instruction_list.end_node = instruction
end

-- Serialize an instruction into its 8-byte form: bytes 1-4 = arg32_1
-- (little endian), bytes 5-7 = arg8_3/arg8_2/arg8_1, byte 8 = opcode.
function Linker:newInstructionBytes(args)

   -- parse args
   local opcode = args.opcode or oFlower.op_nop
   local arg8_1 = args.arg8_1 or 0
   local arg8_2 = args.arg8_2 or 0
   local arg8_3 = args.arg8_3 or 0
   local arg32_1 = args.arg32_1 or 0
   local bytes = {}

   -- serialize opcode + args
   bytes[1] = math.floor(arg32_1/256^0) % 256
   bytes[2] = math.floor(arg32_1/256^1) % 256
   bytes[3] = math.floor(arg32_1/256^2) % 256
   bytes[4] = math.floor(arg32_1/256^3) % 256
   bytes[5] = arg8_3
   bytes[6] = arg8_2
   bytes[7] = arg8_1
   bytes[8] = opcode

   return bytes
end

-- Overwrite the 32-bit argument (bytes 1-4, little endian) of an already
-- serialized instruction in place.
function Linker:rewriteARG32(instr_bytes, uint32)
   instr_bytes[1] = math.floor(uint32/256^0) % 256
   instr_bytes[2] = math.floor(uint32/256^1) % 256
   instr_bytes[3] = math.floor(uint32/256^2) % 256
   instr_bytes[4] = math.floor(uint32/256^3) % 256
end

-- Insert 'instruction' right after 'node' in the doubly linked list.
function Linker:insertInstruction(node, instruction)

   instruction.next = node.next
   instruction.prev = node

   instruction.next.prev = instruction
   instruction.prev.next = instruction
end

-- Splice the chain [seg_start .. seg_end] in right after 'earlier_node'.
function Linker:insertSegment(earlier_node, seg_start, seg_end)
   local later_node = earlier_node.next

   earlier_node.next = seg_start
   seg_start.prev = earlier_node

   later_node.prev = seg_end
   seg_end.next = later_node
end

-- Unlink the chain [seg_start .. seg_end]; the segment keeps its internal
-- links so it can be re-inserted elsewhere.
function Linker:removeSegment(seg_start, seg_end)
   local earlier_node = seg_start.prev
   local later_node = seg_end.next

   earlier_node.next = later_node
   later_node.prev = earlier_node
end

-- Walk the list (tail-recursively) and pad with nops so that a sensitive
-- section delimited by 'start'/'end' sentinels never straddles a page
-- boundary: when a page break falls inside one, the section is shifted
-- into the new page.
function Linker:alignSensitiveCode(walker)
   walker = walker or {
      current_node = self.instruction_list.start_node,
      sentinel_start = nil,
      sentinel_nesting = 0,
      sentinel_size = 0,
      bytecode_size = 0,
   }

   if nil == walker.current_node.bytes then
      -- sentinel

      if 'start' == walker.current_node.mode then
         if 0 == walker.sentinel_nesting then
            walker.sentinel_start = walker.current_node
            walker.sentinel_size = 0
         end
         walker.sentinel_nesting = walker.sentinel_nesting + 1
      end

      if 'end' == walker.current_node.mode then
         walker.sentinel_nesting = walker.sentinel_nesting - 1
         assert(0 <= walker.sentinel_nesting)
      end
   else
      -- instr
      walker.bytecode_size = walker.bytecode_size + 1

      if 0 < walker.sentinel_nesting then
         if (1 == (walker.bytecode_size % (oFlower.page_size_b/8))) then
            -- current node is first of new page

            if walker.sentinel_start ~= walker.current_node.prev then
               -- shift sensitive section into new page; a section larger
               -- than a page can never be aligned
               assert((oFlower.page_size_b/8) > walker.sentinel_size)

               local before_sensitive = walker.sentinel_start.next
               for i = 1, walker.sentinel_size do
                  self:insertInstruction(before_sensitive, {bytes = {0,0,0,0,0,0,0,0}})
                  before_sensitive = before_sensitive.next
                  walker.bytecode_size = walker.bytecode_size + 1
               end
            end
         end

         walker.sentinel_size = walker.sentinel_size + 1
      end
   end

   if walker.current_node.next then
      walker.current_node = walker.current_node.next
      return self:alignSensitiveCode(walker)
   end
end

-- Final pass: link and align everything, generate the byte stream, and
-- write instructions plus embedded data into info.tensor. Returns the
-- number of bytes written.
function Linker:dump(info, mem)

   self:linkGotos()
   self:alignSensitiveCode()
   local instr_nb = self:resolveGotos()

   mem:adjustBytecodeSize(instr_nb*8)

   self:resolveMemSegments()
   local instr = self:genBytecode()

   -- optional disassemble
   if self.disassemble then
      neuflow.tools.disassemble(instr, {length = #instr})
   end

   -- parse argument
   assert(info.tensor)
   info.bigendian = info.bigendian or 0

   -- print all the instructions
   self:dump_instructions(instr, info.tensor)

   -- and embedded data
   self:dump_embedded_data(info, info.tensor, mem)

   -- print memory area statistics
   mem:printAreaStatistics()

   return self.counter_bytes
end

-- Copy the flat instruction array into the output tensor at the current
-- byte counter.
function Linker:dump_instructions(instr, tensor)
   -- copy instructions into tensor
   for i=1, #instr do
      tensor[self.counter_bytes+1] = instr[i]
      self.counter_bytes = self.counter_bytes + 1
   end
end

-- Serialize every embedded memory entry (optional bias vector followed by
-- 2D data) into the output tensor at its allocated offset, converting to
-- fixed point and honoring info.bigendian.
function Linker:dump_embedded_data(info, tensor, mem)
   -- pad initial offset for raw data
   self.counter_bytes = mem.embedded.start.y * streamer.stride_b
      + mem.embedded.start.x * streamer.word_b

   for i=1, #mem.embedded do
      -- FIX: declared local -- the original leaked 'mem_entry' (and the
      -- scratch variables below) into the global environment
      local mem_entry = mem.embedded[i]

      -- set offset in file
      if ('number' == type(mem_entry.y)) then
         self.counter_bytes = mem_entry.y * streamer.stride_b + mem_entry.x * streamer.word_b
      else
         self.counter_bytes = mem_entry.y:calc() * streamer.stride_b + mem_entry.x:calc() * streamer.word_b
      end

      if (mem_entry.bias ~= nil) then
         for b = 1,mem_entry.bias:size(1) do
            -- fixed-point conversion, masked to the machine word
            local dataTwos = math.floor(mem_entry.bias[b] * num.one + 0.5)
            dataTwos = bit.band(dataTwos, num.mask)
            for j=0,(num.size_b - 1) do
               -- get char from short
               local tempchar
               if (info.bigendian == 1) then
                  tempchar = math.floor(dataTwos / (256^((num.size_b - 1)-j))) % 256
               else
                  tempchar = math.floor(dataTwos / (256^j)) % 256
               end
               tensor[self.counter_bytes+1] = tempchar
               self.counter_bytes = self.counter_bytes + 1
            end
         end
      end

      for r=1,mem_entry.data:size(1) do
         for c=1,mem_entry.data:size(2) do
            local dataTwos = math.floor(mem_entry.data[r][c] * num.one + 0.5)
            dataTwos = bit.band(dataTwos, num.mask)
            for j=0,(num.size_b - 1) do
               -- get char from short
               local tempchar
               if (info.bigendian == 1) then
                  tempchar = math.floor(dataTwos / (256^((num.size_b - 1)-j))) % 256
               else
                  tempchar = math.floor(dataTwos / (256^j)) % 256
               end
               tensor[self.counter_bytes+1] = tempchar
               self.counter_bytes = self.counter_bytes + 1
            end
         end
      end
   end
end

----------------------------------------------------------------------
-- /src/LinkerExtensions.lua
----------------------------------------------------------------------

----------------------------------------------------------------------
--- Class: Linker
--
-- This file contains extensions to the Linker class.
--

-- Reorder streamer-port configuration instructions so that they execute
-- during status-wait dead time, wrapped in cacheStart/cacheFinish.
function neuflow.Linker:cacheConfigOptimization()
   -- Filter for the instruction linked list, would be used after 'linkGotos()' and before
   -- 'alignProcessWithPages()'

   -- Beginning from the start of list, move along list until dead time is found.
   -- From that point, descend list looking for configs that can be moved.
   -- If config that can be moved is found, remove segment and then insert the segment in dead time.
   -- Repeat until the end of list is reached


   -- Decode a raw 8-byte instruction into its named fields.
   local function bytesDecode(bytes)
      -- instr bit packing is hard coded, any change in the blast_bus.vh will make errors here

      local instr = {}
      instr.config8_1 = bytes[1]
      instr.config8_2 = bytes[2]
      instr.config8_3 = bytes[3]
      instr.config8_4 = bytes[4]
      instr.config16_1 = (256^1)*bytes[4]+(256^0)*bytes[3]
      -- BUGFIX: the last term was scaled by (256*0), i.e. zero, silently
      -- dropping the low byte of the 32-bit word; it must be 256^0
      instr.config32_1 = (256^3)*bytes[4]+(256^2)*bytes[3]+(256^1)*bytes[2]+(256^0)*bytes[1]

      instr.arg8_3 = bytes[5]
      instr.arg8_2 = bytes[6]
      instr.arg8_1 = bytes[7] -- config_content
      instr.of_opcode = bytes[8] -- openflower opcode

      return instr
   end

   -- makes a table that holds the current state of all the ports, argument 'state' is an old port
   -- state table to be cloned
   local function makePorts(state)
      if not state then state = {} end
      local ports = {}
      ports.addr = state.addr or nil -- if nil no port is being addressed
      -- BUGFIX: was cloned from 'state.addr' (copy-paste error); the
      -- sub-module field must be cloned from 'state.submod'
      ports.submod = state.submod or nil -- if nil no port sub module is being addressed

      for aa = 1, (streamer.nb_ports-1) do
         if not state[aa] then state[aa] = {} end
         ports[aa] = {}
         ports[aa].valid = state[aa].valid or 1 -- if 0, no longer in consideration for reordering
         ports[aa].idle = state[aa].idle or 1 -- if 1, is idle & does not need to be cached set
         ports[aa].active = state[aa].active or 0
         ports[aa].cached = state[aa].cached or 0
         ports[aa].reset = state[aa].reset or 0
ports[aa].prefetch = state[aa].prefetch or 0
      end

      -- mark every port as a candidate for reordering again
      function ports:reset_valid()
         for aa = 1, (streamer.nb_ports-1) do
            ports[aa].valid = 1
         end
      end

      return ports
   end

   -- determines how the current instruction affects which end point the config bus
   -- is interacting with
   local function addressState(of_opcode, config_content, config_addr, config_submod, ports)

      if of_opcode == oFlower.op_writeConfig then
         if config_content == blast_bus.content_command then
            -- last 4 bits of config_addr is the area address
            local area = (config_addr - (config_addr%(2^12)))/(2^12)

            -- group addr and broadcast addr mean more than one port can be active; these will
            -- be ignored as this version of config optimizer can only deal with a single port
            -- being addressed
            if area == blast_bus.area_streamer then
               -- first 12 bits of config_addr is the port address
               ports.addr = config_addr%(2^12)
               ports.submod = config_submod

               if ((ports.addr < 1) or (ports.addr > (streamer.nb_ports-1))) then
                  -- addr zero is broadcast to all ports while any address above the
                  -- number of ports is a group addr, both are ignored
                  ports.addr = nil
                  ports.submod = nil
               end
            else
               ports.addr = nil
               ports.submod = nil
            end
         end
      end
   end

   -- determines if the current instruction has a command that will affect the addressed port;
   -- should be called after addressState in case the addressing command also had a config_instr
   local function portCommand(of_opcode, config_content, config_instr, ports)
      for aa = 1, (streamer.nb_ports-1) do
         ports[aa].reset = 0
         ports[aa].prefetch = 0
      end

      local command = false

      if of_opcode == oFlower.op_writeConfig then
         if ports.addr and (config_content == blast_bus.content_command or
                            config_content == blast_bus.content_instruc) then

            command = true

            if config_instr == blast_bus.instruc_config then
               -- place holder for addressing without an opcode
            elseif config_instr == blast_bus.instruc_setAdd then
               -- set group address, default is broadcast address
               -- addr could be set to a different area code which
               -- would mean the addressState would need to be changed
            elseif config_instr == blast_bus.instruc_reset then
               ports[ports.addr].valid = 0
               ports[ports.addr].reset = 1
            elseif config_instr == blast_bus.instruc_cacheStart then
               ports[ports.addr].valid = 0
               ports[ports.addr].cached = 1
            elseif config_instr == blast_bus.instruc_cacheFinish then
               ports[ports.addr].valid = 0
               ports[ports.addr].cached = 0
            elseif config_instr == blast_bus.instruc_activate then
               ports[ports.addr].valid = 0
               ports[ports.addr].idle = 0
               ports[ports.addr].active = 1
            elseif config_instr == blast_bus.instruc_deActivate then
               ports[ports.addr].active = 0
            elseif config_instr == blast_bus.instruc_control_1 then
               -- prefetch
               ports[ports.addr].valid = 0
               ports[ports.addr].idle = 0
               ports[ports.addr].prefetch = 1
            else
               print("WARNING: Unknown comand sent to streamer")
               command = false
            end
         end
      end
      return command
   end

   -- true if the current instruction sends a config word to the addressed port's sub module
   local function portConfig(of_opcode, config_content, ports)
      local config = false

      if of_opcode == oFlower.op_writeConfig then
         if ports.addr and config_content == blast_bus.content_config then
            -- sending config words to sub module, currently only can move sub mod 2
            -- global and timeout config is ignored

            config = true
         end
      end

      return config
   end

   -- true if the current instruction polls a port status, i.e. dead time
   -- during which reordered configs could execute
   local function portWaitStatus(of_opcode, config_content, ports)
      local wait = false

      -- TODO: have estimate of time spent in wait and if there is enough time for
      -- a config reorder set wait to true

      if ports.addr and (of_opcode == oFlower.op_getStatus) then
         if config_content == blast_bus.status_primed then
            wait = true
         elseif config_content == blast_bus.status_done then
            wait = true
         end
      end

      return wait
   end

   -- build an instruction node carrying the streamer 'cacheStart' command
   local function makeCacheSetInstr()
      local instr_bytes = self:newInstructionBytes {
         opcode = oFlower.op_writeConfig,
         arg8_1 = blast_bus.content_instruc,
         arg32_1 = blast_bus.instruc_cacheStart
      }

      return {bytes = instr_bytes}
   end

   -- build an instruction node carrying the streamer 'cacheFinish' command
   local function makeCacheUnsetInstr()
      local instr_bytes = self:newInstructionBytes {
         opcode = oFlower.op_writeConfig,
         arg8_1 = blast_bus.content_instruc,
         arg32_1 = blast_bus.instruc_cacheFinish
      }

      return {bytes = instr_bytes}
   end

   -- build an instruction node that addresses streamer port 'addr'
   -- (and optionally sub module 'submod')
   local function makeAddrInstr(addr, submod)
      submod = submod or 0
      local configWord = blast_bus.area_streamer*(2^28) + addr*(2^16) + submod*(2^8)

      local instr_bytes = self:newInstructionBytes {
         opcode = oFlower.op_writeConfig,
         arg8_1 = blast_bus.content_command,
         arg32_1 = configWord
      }

      return {bytes = instr_bytes}
   end

   local function findConfigSegment(node, ports)
      -- start_node is an instruction that addresses a port and the 2nd sub mod
      -- end_node is the last config instruction
      local start_node = nil
      local end_node = nil
      local search = true

      while (search and node) do
         if node.bytes ~= nil then
            local instr = bytesDecode(node.bytes)

            addressState(instr.of_opcode, instr.arg8_1, instr.config16_1, instr.config8_2, ports)
            portCommand(instr.of_opcode, instr.arg8_1, instr.config8_1, ports)

            if (ports.submod == 2 and ports[ports.addr].valid == 1) then
               start_node = node

               node = node.next
               local nb_config = 0
               while (search and node) do

                  if node.bytes == nil then
                     search = false
                     break
                  end

                  local instr = bytesDecode(node.bytes)

                  if portCommand(instr.of_opcode, instr.arg8_1, instr.config8_1, ports) then
                     search = false
                     break
                  end

                  if portConfig(instr.of_opcode, instr.arg8_1, ports) then
                     nb_config = nb_config + 1
                  end

                  if nb_config == 5 then
                     search = false
                     end_node = node
                  end

                  node = node.next
               end
            end
         end
         if node then node = node.next end
      end

      return start_node, end_node
   end

   local function findWaitAddrNode(node, target_addr)
      -- NOTE: if there is any other instr between the addr instr and the wait instr, make an
      -- addr instr node and insert it before the wait instr
      local ports = makePorts()

      while not ports.addr do
         node = node.prev

         local instr = bytesDecode(node.bytes)
         addressState(instr.of_opcode, instr.arg8_1, instr.config16_1, instr.config8_2, ports)

         if ports.addr == target_addr then
            break
         else
            local tmp_ports = makePorts(ports)
            tmp_ports.addr = target_addr

            local command = portCommand(instr.of_opcode, instr.arg8_1, instr.config8_1, tmp_ports)
            local config = portConfig(instr.of_opcode, instr.arg8_1, tmp_ports)

            if config or command then
               local addr_node = makeAddrInstr(target_addr)
               self:insertInstruction(node, addr_node)
               node = addr_node

               break
            end
         end
      end

      return node
   end

   -- main walk: track port state, and at each status-wait try to pull a
   -- movable config segment up into the dead time
   local node = self.instruction_list.start_sentinel
   local ports = makePorts()

   while node do
      if node.bytes ~= nil then
         local instr = bytesDecode(node.bytes)

         addressState(instr.of_opcode, instr.arg8_1, instr.config16_1, instr.config8_2, ports)
         portCommand(instr.of_opcode, instr.arg8_1, instr.config8_1, ports)

         --while portWaitStatus(instr.of_opcode, instr.arg8_1, ports) do -- only with time estimate
         if portWaitStatus(instr.of_opcode, instr.arg8_1, ports) then
            ports:reset_valid()
            local descent_ports = makePorts(ports)
            local descent_node = node.next
            local start_node = nil
            local end_node = nil

            start_node, end_node = findConfigSegment(descent_node, descent_ports)

            if start_node and end_node then
               -- find the addr instr node used to addr port for the wait for status instr node
               local wait_addr_node = findWaitAddrNode(node, ports.addr)

               -- insert instruction to re-addr port after first making it
               local new_addr = makeAddrInstr(descent_ports.addr, 2)
               self:insertInstruction(end_node, new_addr)

               -- NOTE: idle/not idle might not be correct, probably should not use until sure
               -- and just use the cache every time
               --if ports[descent_ports.addr].idle then
               local cache_set = makeCacheSetInstr()
               local cache_unset = makeCacheUnsetInstr()

               -- insert cache set instr in segment if using caching
               self:insertInstruction(start_node, cache_set)

               -- insert instr to unset cache if using caching
               -- (if port is not idle at dead time)
               self:insertInstruction(new_addr, cache_unset)
               --end

               -- remove (cut) the config segment
               self:removeSegment(start_node, end_node)

               -- re-insert segment in its new place
               self:insertSegment(wait_addr_node.prev, start_node, end_node)
            end
         end
      end

      node = node.next
   end
end

----------------------------------------------------------------------
-- /src/Log.lua
----------------------------------------------------------------------

----------------------------------------------------------------------
--- Class: Log
--
-- logs info during compilation.
--
local Log = torch.class('neuflow.Log')

-- Open the log file 'file' for writing; raises if it cannot be opened.
function Log:__init(file)
   self.logFile = assert(io.open(file, "w"))
end

-- Append 'msg' to the log file.
function Log:write(msg)
   self.logFile:write(msg)
end

-- Close the underlying file handle.
function Log:close()
   self.logFile:close()
end

----------------------------------------------------------------------
-- /src/Memory.lua
----------------------------------------------------------------------
--[[ Class: Memory

This class is used to allocate areas of memory in a controlled manner. It
generates offsets and areas. If data needs to be written to the bytecode start
up stream, that is done in the Linker class.

The offsets and memory areas are represented in pixels. Conceptually the memory
is considered to be a large rectangular matrix.

The requirements for the memory that gets allocated vary, but for our purposes
they can be grouped into three broad types. As such when requesting a memory
allocation the way that memory will be used needs to be considered and the
correct alloc function selected. The definition of these 3 types are as follows:

1) Embedded data (e.g., kernels) whose value is known at compile time and thus
   would benefit by being written to memory when the bytecode is sent at start
   up.

2) Persistent data (e.g., circular image buffers). The contents of this memory
   may be updated or change in time but the area addressing it does not. This
   allocation is used when data needs to be preserved between multiple layers
   in a conv net or between multiple runs of the program.

3) Managed data (e.g., intermediate results). The contents of which only need
   to exist to pass data between operations or layers in a program. Area
   allocations of this type can be freed and reused in a managed fashion as the
   need arises.
28 | 29 | --]] 30 | 31 | local Memory = torch.class('neuflow.Memory') 32 | 33 | function Memory:__init(args) 34 | 35 | self.prog_name = args.prog_name 36 | self.init_offset = (args.init_offset or 0) + 1 37 | self.bytecode_size_b = 0 38 | 39 | -- table of embedded data segments 40 | self.embedded = { 41 | ['start'] = { 42 | ['x'] = 0, 43 | ['y'] = 0, 44 | }, 45 | ['current'] = { 46 | ['x'] = 0, 47 | ['y'] = 0, 48 | }, 49 | ['layer'] = { 50 | ['h'] = 0, 51 | ['packing'] = 'kernel' 52 | } 53 | } 54 | 55 | -- table of persistent data segments 56 | self.persistent = { 57 | ['start'] = { 58 | ['x'] = 0, 59 | ['y'] = 0, 60 | }, 61 | ['current'] = { 62 | ['x'] = 0, 63 | ['y'] = 0, 64 | }, 65 | ['layer'] = { 66 | ['h'] = 0, 67 | ['packing'] = '1D' 68 | } 69 | } 70 | 71 | -- table of managed data segments 72 | self.managed = { 73 | ['start'] = { 74 | ['x'] = 0, 75 | ['y'] = 0, 76 | }, 77 | ['current'] = { 78 | ['x'] = 0, 79 | ['y'] = 0, 80 | }, 81 | ['layer'] = { 82 | ['h'] = 0, 83 | ['packing'] = '1D' 84 | }, 85 | } 86 | end 87 | 88 | function Memory:adjustBytecodeSize(size_in_bytes) 89 | 90 | self.bytecode_size_b = size_in_bytes 91 | 92 | self.embedded.start.x = 0 93 | self.embedded.start.y = math.ceil((size_in_bytes + 1) / streamer.stride_b) 94 | 95 | self.persistent.start.x = 0 96 | self.persistent.start.y = self.embedded.start.y + self.embedded.current.y + 1 97 | 98 | self.managed.start.x = 0 99 | self.managed.start.y = self.persistent.start.y + self.persistent.current.y + 1 100 | end 101 | 102 | function Memory:constructCoordinate(area, coor) 103 | return { 104 | coor = coor, 105 | start = self[area].start, 106 | offset = self[area].current[coor], 107 | calc = function(self) 108 | return self.start[self.coor] + self.offset 109 | end 110 | } 111 | end 112 | 113 | --[[ Allocate Embedded Data 114 | 115 | By default the data is reformatted & treated as a kernel. If non kernel data 116 | needs to be embedded an explicit 1D or 2D packing argument needs to be 117 | passed in. 
If 2D is selected but the width of the data is larger then the 118 | streamer (memory) stride, packing is reverted to 1D. 119 | --]] 120 | function Memory:allocEmbeddedData(data_, bias_, packing) 121 | packing = packing or 'kernel' 122 | assert(packing == 'kernel' or packing == '1D' or packing == '2D') 123 | assert(packing == 'kernel' or (not bias_)) 124 | 125 | local orig_w_ = data_:size(2) 126 | local orig_h_ = data_:size(1) 127 | local w_ 128 | local h_ 129 | local offset_width 130 | local offset_height 131 | 132 | if (('2D' == packing) and (orig_w_ > streamer.stride_w)) then 133 | print(" WARNING: Current Embedded Data tensor cannot be written with 2D packing, switching to 1D.") 134 | packing = '1D' 135 | end 136 | 137 | if 'kernel' == packing then 138 | local dh = grid.kernel_height - orig_h_ 139 | local kernel = torch.zeros(grid.kernel_height, grid.kernel_width) 140 | 141 | -- copy incoming data to the bottom left corner of kernel 142 | for r = 1, orig_h_ do 143 | for c = 1, orig_w_ do 144 | kernel[r+dh][c] = data_[r][c] 145 | end 146 | end 147 | 148 | -- overwrite with new transformed values 149 | data_ = kernel 150 | h_ = 1 151 | 152 | if bias_ then 153 | w_ = data_:size(1) * data_:size(2) + bias_:size(1) 154 | else 155 | w_ = data_:size(1) * data_:size(2) 156 | end 157 | elseif '1D' == packing then 158 | w_ = orig_w_ * orig_h_ 159 | h_ = 1 160 | else 161 | w_ = orig_w_ 162 | h_ = orig_h_ 163 | end 164 | 165 | if '2D' ~= packing then 166 | offset_width = w_ % streamer.stride_w 167 | offset_height = math.floor(w_ / streamer.stride_w) 168 | 169 | if '2D' == self.embedded.layer.packing then 170 | self.embedded.current.x = 0 171 | self.embedded.current.y = self.embedded.current.y + self.embedded.layer.h 172 | self.embedded.layer.h = 0 173 | end 174 | else 175 | offset_width = w_ 176 | offset_height = h_ 177 | 178 | -- check if current data fits in the line 179 | if (self.embedded.current.x + w_) > streamer.stride_w then 180 | self.embedded.current.x = 0 181 | 
self.embedded.current.y = self.embedded.current.y + self.embedded.layer.h 182 | self.embedded.layer.h = 0 183 | end 184 | end 185 | 186 | -- the layer height is the height of the maximum data area in the layer 187 | if self.embedded.layer.h < h_ then 188 | self.embedded.layer.h = h_ 189 | end 190 | 191 | self.embedded[ #self.embedded+1 ] = { 192 | x = self:constructCoordinate('embedded', 'x'), 193 | y = self:constructCoordinate('embedded', 'y'), 194 | w = w_, 195 | h = h_, 196 | orig_w = orig_w_, 197 | orig_h = orig_h_, 198 | data = data_, 199 | bias = bias_ 200 | } 201 | 202 | self.embedded.current.x = self.embedded.current.x + offset_width 203 | 204 | if '2D' ~= packing then 205 | self.embedded.current.y = self.embedded.current.y + offset_height 206 | 207 | -- check if we did not step out of the line 208 | if (self.embedded.current.x > streamer.stride_w) then 209 | self.embedded.current.y = self.embedded.current.y + 1 210 | self.embedded.current.x = self.embedded.current.x - streamer.stride_w 211 | end 212 | end 213 | 214 | -- alignment of addresses to physical memory pages 215 | if (self.embedded.current.x % streamer.align_w) ~= 0 then 216 | self.embedded.current.x = (math.floor(self.embedded.current.x/streamer.align_w) + 1) * streamer.align_w 217 | -- and check if we did not step out of the line again 218 | if (self.embedded.current.x > streamer.stride_w) then 219 | self.embedded.current.x = 0 220 | self.embedded.current.y = self.embedded.current.y + self.embedded.layer.h 221 | self.embedded.layer.h = 0 222 | end 223 | end 224 | 225 | self.embedded.layer.packing = packing 226 | 227 | return self.embedded[ #self.embedded ] 228 | end 229 | 230 | --[[ Allocate Persistent Data 231 | 232 | Data can be transformed to use 1D or 2D packing depending on packing 233 | argument. If 2D is selected but the width of the data is larger then the 234 | streamer (memory) stride, packing is reverted to 1D. 
--]]
-- Allocate a tensor in the Persistent data area (survives across program
-- runs, not part of the bytecode image). data_ must be a 2D tensor;
-- packing is '1D' (default, flattened) or '2D' (rectangular).
-- Returns the allocation entry (x, y, w, h, orig_w, orig_h, data).
function Memory:allocPersistentData(data_, packing)
   packing = packing or '1D'
   assert(packing == '1D' or packing == '2D')

   local orig_w_ = data_:size(2)
   local orig_h_ = data_:size(1)
   local w_
   local h_
   local offset_width
   local offset_height

   -- 2D packing requires one data row to fit in one memory line
   if (('2D' == packing) and (orig_w_ > streamer.stride_w)) then
      print(" WARNING: Current Persistent Data tensor cannot be written with 2D packing, switching to 1D.")
      packing = '1D'
   end

   if '1D' == packing then
      -- flatten into a single run of words (may wrap over several lines)
      w_ = orig_w_ * orig_h_
      h_ = 1

      offset_width = w_ % streamer.stride_w
      offset_height = math.floor(w_ / streamer.stride_w)

      -- switching packing mode: restart at the beginning of a fresh line
      if '1D' ~= self.persistent.layer.packing then
         self.persistent.current.x = 0
         self.persistent.current.y = self.persistent.current.y + self.persistent.layer.h
         self.persistent.layer.h = 0
      end
   else
      -- 2D: keep the tensor's rectangular shape
      w_ = orig_w_
      h_ = orig_h_

      offset_width = w_
      offset_height = h_

      -- check if current data fits in the line
      if (self.persistent.current.x + w_) > streamer.stride_w then
         self.persistent.current.x = 0
         self.persistent.current.y = self.persistent.current.y + self.persistent.layer.h
         self.persistent.layer.h = 0
      end
   end

   -- the layer height is the height of the maximum data area in the layer
   if self.persistent.layer.h < h_ then
      self.persistent.layer.h = h_
   end

   -- record the allocation; coordinates are resolved lazily
   self.persistent[ #self.persistent+1 ] = {
      x = self:constructCoordinate('persistent', 'x'),
      y = self:constructCoordinate('persistent', 'y'),
      w = w_,
      h = h_,
      orig_w = orig_w_,
      orig_h = orig_h_,
      data = data_
   }

   -- advance the write cursor past this allocation
   self.persistent.current.x = self.persistent.current.x + offset_width

   if '1D' == packing then
      self.persistent.current.y = self.persistent.current.y + offset_height

      -- check if we did not step out of the line
      if (self.persistent.current.x > streamer.stride_w) then
         self.persistent.current.y = self.persistent.current.y + 1
         self.persistent.current.x = self.persistent.current.x - streamer.stride_w
      end
   end

   -- alignment of addresses to physical memory pages
   if (self.persistent.current.x % streamer.align_w) ~= 0 then
      self.persistent.current.x = (math.floor(self.persistent.current.x/streamer.align_w) + 1)*streamer.align_w
      -- and check if we did not step out of the line again
      if (self.persistent.current.x > streamer.stride_w) then
         self.persistent.current.x = 0
         self.persistent.current.y = self.persistent.current.y + self.persistent.layer.h
         self.persistent.layer.h = 0
      end
   end

   self.persistent.layer.packing = packing

   return self.persistent[ #self.persistent ]
end

--[[ Allocate Managed Data

   Data can be transformed to use 1D or 2D packing depending on packing
   argument. If 2D is selected but the width of the data is larger than the
   streamer (memory) stride, packing is reverted to 1D.

   If the end of physical memory is reached, function will start overwriting
   from the start of the Managed memory space.
--]]
-- Allocate a tensor in the Managed data area (the heap). data_ must be a 2D
-- tensor; packing is '1D' (default) or '2D'. Unlike the other areas this one
-- wraps: when the heap is full, allocation restarts at the origin and old
-- layers are overwritten. Returns the allocation entry.
function Memory:allocManagedData(data_, packing)
   packing = packing or '1D'
   assert(packing == '1D' or packing == '2D')

   local orig_w_ = data_:size(2)
   local orig_h_ = data_:size(1)
   local w_
   local h_
   local offset_width
   local offset_height

   -- 2D packing requires one data row to fit in one memory line
   if (('2D' == packing) and (orig_w_ > streamer.stride_w)) then
      print(" WARNING: Current Managed Data tensor cannot be written with 2D packing, switching to 1D.")
      packing = '1D'
   end

   if '1D' == packing then
      -- flatten into a single run of words (may wrap over several lines)
      w_ = orig_w_ * orig_h_
      h_ = 1

      offset_width = w_ % streamer.stride_w
      offset_height = math.floor(w_ / streamer.stride_w)

      -- switching packing mode: restart at the beginning of a fresh line
      if '1D' ~= self.managed.layer.packing then
         self.managed.current.x = 0
         self.managed.current.y = self.managed.current.y + self.managed.layer.h
         self.managed.layer.h = 0
      end
   else
      -- 2D: keep the tensor's rectangular shape
      w_ = orig_w_
      h_ = orig_h_

      offset_width = w_
      offset_height = h_

      -- check if current data fits in the line
      if (self.managed.current.x + w_) > streamer.stride_w then
         self.managed.current.x = 0
         self.managed.current.y = self.managed.current.y + self.managed.layer.h
         self.managed.layer.h = 0
      end
   end

   -- check if there is space in the mem if not start overwriting first layers
   if (self.managed.current.y + offset_height) > memory.size_r then
      print(" WARNING: Overwriting the first layers of heap!")
      self.managed.current.x = 0
      self.managed.current.y = 0
      self.managed.layer.h = 0
   end

   -- the layer height is the height of the maximum data area in the layer
   if self.managed.layer.h < h_ then
      self.managed.layer.h = h_
   end

   -- record the allocation; coordinates are resolved lazily
   self.managed[ #self.managed+1 ] = {
      x = self:constructCoordinate('managed', 'x'),
      y = self:constructCoordinate('managed', 'y'),
      w = w_,
      h = h_,
      orig_w = orig_w_,
      orig_h = orig_h_,
      data = data_
   }

   -- advance the write cursor past this allocation
   self.managed.current.x = self.managed.current.x + offset_width

   if '1D' == packing then
      self.managed.current.y = self.managed.current.y + offset_height

      -- check if we did not step out of the line
      if (self.managed.current.x > streamer.stride_w) then
         self.managed.current.y = self.managed.current.y + 1
         self.managed.current.x = self.managed.current.x - streamer.stride_w
      end
   end

   -- alignment of addresses to physical memory pages
   if (self.managed.current.x % streamer.align_w) ~= 0 then
      self.managed.current.x = (math.floor(self.managed.current.x/streamer.align_w) + 1)*streamer.align_w
      -- and check if we did not step out of the line again
      if (self.managed.current.x > streamer.stride_w) then
         self.managed.current.x = 0
         self.managed.current.y = self.managed.current.y + self.managed.layer.h
         self.managed.layer.h = 0
      end
   end

   self.managed.layer.packing = packing

   return self.managed[#self.managed]
end

-- Print a summary of the three memory areas (embedded / persistent / managed):
-- start address, size and end address in bytes, plus the expected binary size.
-- NOTE(review): the *_start_b / *_size_b variables below are not declared
-- 'local', so they leak into the global environment — confirm nothing else
-- reads them before making them local.
function Memory:printAreaStatistics()

   embedded_start_b = self.embedded.start.y * streamer.stride_b
      + self.embedded.start.x * streamer.word_b

   embedded_size_b = self.embedded.current.y * streamer.stride_b
      + self.embedded.current.x * streamer.word_b

   persistent_start_b = self.persistent.start.y * streamer.stride_b
      + self.persistent.start.x * streamer.word_b

   persistent_size_b = self.persistent.current.y * streamer.stride_b
   if (self.persistent.current.x ~= 0) then
      -- if we did not just step a new line
      -- take into account all the lines we wrote (the last entry's height is enough)
      -- if not all the lines are filled till the end we are counting more than we should here,
      -- but for checking collision it's OK
      persistent_size_b = persistent_size_b + self.persistent[#self.persistent].h * streamer.stride_b
   end

   managed_start_b = self.managed.start.y * streamer.stride_b
      + self.managed.start.x * streamer.word_b

   managed_size_b = self.managed.current.y * streamer.stride_b
   if (self.managed.current.x ~= 0) then
      -- if we did not just step a new line
      -- take into account all the lines we wrote (the last entry's height is enough)
      -- if not all the lines are filled till the end we are counting more than we should here,
      -- but for checking collision it's OK
      managed_size_b = managed_size_b + (self.managed[#self.managed].h * streamer.stride_b)
   end

   local binary_size = embedded_start_b+embedded_size_b

   print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
   print(c.Cyan .. '-openFlow-' .. c.Magenta .. ' ConvNet Name ' ..
      c.none ..'[ ' .. self.prog_name .. ' ]\n')
   print(
      string.format(" bytecode segment: start = %10d, size = %10d, end = %10d",
         self.init_offset,
         self.bytecode_size_b-self.init_offset,
         self.bytecode_size_b)
   )
   print(
      string.format(" embedded data segment: start = %10d, size = %10d, end = %10d",
         embedded_start_b,
         embedded_size_b,
         embedded_start_b+embedded_size_b)
   )
   print(
      string.format("persistent data segment: start = %10d, size = %10d, end = %10d",
         persistent_start_b,
         persistent_size_b,
         persistent_start_b+persistent_size_b)
   )
   print(
      string.format(" managed data segment: start = %10d, size = %10d, end = %10d",
         managed_start_b,
         managed_size_b,
         memory.size_b)
   )
   print(
      string.format("\n the binary file size should be = %10d, total memory used = %10d",
         binary_size,
         managed_start_b+managed_size_b)
   )
   print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

end

--------------------------------------------------------------------------------
-- /src/NeuFlow.lua
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
-- NeuFlow
-- a class to abstract the neuFlow processor
--

----------------------------------------------------------------------
-- register class + constructor
--
local NeuFlow = torch.class('neuflow.NeuFlow')

-- Construct a NeuFlow front-end.
-- args (all optional):
--   prog_name        - name used for generated files (default 'temp')
--   mode             - 'runtime' (default), 'simulation' or 'rom'
--   use_ethernet     - NOTE(review): this arg is read but then unconditionally
--                      overwritten by (mode == 'runtime') a few lines below —
--                      confirm whether the arg is meant to be honored
--   serial_device    - tty path for the serial console (default: none)
--   network_if_name  - network interface to bind the ethernet driver to
--   *_msg_level      - per-component verbosity, default global_msg_level
function NeuFlow:__init(args)
   -- parse args
   args = args or {}
   self.prog_name = args.prog_name or 'temp'
   self.use_ethernet = args.use_ethernet or false
   self.serial_device = args.serial_device or false
   self.global_msg_level = args.global_msg_level or 'none'
   self.mode = args.mode or 'runtime' -- or 'simulation' or 'rom'
   self.use_ethernet = (self.mode == 'runtime')
   if(args.network_if_name) then
      self.network_if_name = args.network_if_name
   end

   -- default offsets, for conveniency
   args.offset_code = args.offset_code or bootloader.entry_point_b
   -- in simul, bypass header
   if self.mode == 'simulation' then
      args.offset_code = 0
   end

   -- instantiate core, with all args
   args.msg_level = args.core_msg_level or self.global_msg_level
   self.core = neuflow.Core(args)

   -- instantiate the compiler, relies on the core
   self.compiler = neuflow.Compiler {
      optimize_across_layers = true,
      core = self.core,
      msg_level = args.compiler_msg_level or self.global_msg_level
   }

   -- use a profiler
   self.profiler = neuflow.Profiler()

   -- instantiate the interface: DMA ethernet for pico/tbsp platforms
   -- (no handshake), regular ethernet (with handshake) otherwise
   if (self.core.platform == 'pico_m503') or (self.core.platform == 'xilinx_ml605_tbsp') then
      self.handshake = false
      self.ethernet = neuflow.DmaEthernet {
         msg_level = args.ethernet_msg_level or self.global_msg_level,
         core = self.core,
         nf = self
      }
   else
      self.handshake = true
      self.ethernet = neuflow.Ethernet {
         msg_level = args.ethernet_msg_level or self.global_msg_level,
         core = self.core,
         nf = self
      }
   end

   if self.core.platform == 'pico_m503' then
      self.camera = neuflow.Camera {
         msg_level = args.camera_msg_level or self.global_msg_level,
         nf = self
      }
   end

   -- for loops: this retains a list of jump locations
   self.loopTags = {}

   -- ethernet socket (auto found for now); fall back to no-ethernet on failure
   if self.use_ethernet then
      print ' loading ethernet driver'
      if self.ethernet:open(self.network_if_name) ~= 0 then
         self.use_ethernet = false
      end
   end

   -- serial dev
   if self.serial_device then
      self.tty = neuflow.Serial(self.serial_device, '57600')
   end

   -- bytecode has a constant size (oFlower bios)
   self.bytecodesize = bootloader.load_size

   -- and finally initialize hardware
   self:initialize()
end

----------------------------------------------------------------------
-- ending functions: this is not clean for now, but ensures that
-- the hardware stays in sync.
96 | -- 97 | function NeuFlow:cleanup() 98 | if self.use_ethernet then 99 | self.ethernet:close() 100 | end 101 | if self.tty then 102 | self.tty:cleanup() 103 | end 104 | end 105 | 106 | ---------------------------------------------------------------------- 107 | -- print messages / send message 108 | -- 109 | function NeuFlow:printMessage() 110 | if self.tty then 111 | print(self.tty:read()) 112 | end 113 | end 114 | 115 | function NeuFlow:sendMessage(message) 116 | if self.tty then 117 | self.tty:write(message) 118 | end 119 | end 120 | 121 | ---------------------------------------------------------------------- 122 | -- initialize system 123 | -- 124 | function NeuFlow:initialize(args) 125 | -- args 126 | if args and args.selftest then 127 | self.core:bootSequence{selftest=true} 128 | else 129 | self.core:bootSequence{selftest=false} 130 | end 131 | end 132 | 133 | ---------------------------------------------------------------------- 134 | -- high-level memory functions 135 | -- 136 | function NeuFlow:allocHeap(tensor) 137 | local alloc_list = {} 138 | if type(tensor) == 'table' then 139 | local first = true 140 | for i = 1,#tensor do 141 | if tensor[i]:nDimension() ~= 2 then 142 | xlua.error('only supports list of 2D tensors','NeuFlow.allocHeap') 143 | end 144 | local segment = self.core.mem:allocManagedData(tensor[i]) 145 | table.insert(alloc_list, segment) 146 | first = false 147 | end 148 | else 149 | local dims = tensor:nDimension() 150 | if dims == 2 then 151 | local segment = self.core.mem:allocManagedData(tensor) 152 | table.insert(alloc_list, segment) 153 | elseif dims == 3 then 154 | local first = true 155 | for i = 1,tensor:size(1) do 156 | local segment = self.core.mem:allocManagedData(tensor[i]) 157 | table.insert(alloc_list, segment) 158 | first = false 159 | end 160 | else 161 | error('tensors must have 2 or 3 dimensions') 162 | end 163 | end 164 | return alloc_list 165 | end 166 | 167 | function NeuFlow:allocDataPacked(tensor,bias) 168 | local 
alloc_list = {}
   if type(tensor) == 'table' then
      for i = 1,#tensor do
         if tensor[i]:nDimension() ~= 2 then
            xlua.error('only supports list of 2D tensors','NeuFlow.allocHeap')
         end
         local segment
         if bias then
            segment = self.core.mem:allocEmbeddedData(tensor[i], bias[i])
         else
            segment = self.core.mem:allocEmbeddedData(tensor[i])
         end
         table.insert(alloc_list, segment)
      end
   else
      local dims = tensor:nDimension()
      if dims == 2 then
         local segment
         if bias then
            segment = self.core.mem:allocEmbeddedData(tensor, bias)
         else
            segment = self.core.mem:allocEmbeddedData(tensor)
         end
         table.insert(alloc_list, segment)
      elseif dims == 3 then
         -- one embedded allocation per plane, each with its own bias slice
         for i = 1,tensor:size(1) do
            local segment
            if bias then
               segment = self.core.mem:allocEmbeddedData(tensor[i], bias:narrow(1,i,1))
            else
               segment = self.core.mem:allocEmbeddedData(tensor[i])
            end
            table.insert(alloc_list, segment)
         end
      else
         error('tensors must have 2 or 3 dimensions')
      end
   end
   return alloc_list
end

-- Allocate read-only data for a 2D tensor, a 3D tensor or a table of 2D
-- tensors. In 'simulation' mode the data goes into the embedded area (so it
-- is part of the bytecode image); otherwise into the persistent area.
-- Returns a list of allocation segments.
function NeuFlow:allocData(tensor)
   local alloc_list = {}
   if type(tensor) == 'table' then
      for i = 1,#tensor do
         if tensor[i]:nDimension() ~= 2 then
            xlua.error('only supports list of 2D tensors','NeuFlow.allocPersistentData')
         end
         if self.mode == 'simulation' then
            local segment = self.core.mem:allocEmbeddedData(tensor[i], nil, '1D')
            table.insert(alloc_list, segment)
         else
            local segment = self.core.mem:allocPersistentData(tensor[i])
            table.insert(alloc_list, segment)
         end
      end
   else
      local dims = tensor:nDimension()
      if dims == 2 then
         if self.mode == 'simulation' then
            -- BUG FIX: was tensor[i] — 'i' is undefined in this branch
            -- (tensor is a single 2D tensor here, not a list)
            local segment = self.core.mem:allocEmbeddedData(tensor, nil, '1D')
            table.insert(alloc_list, segment)
         else
            local segment = self.core.mem:allocPersistentData(tensor)
            table.insert(alloc_list, segment)
         end
      elseif dims == 3 then
         for i = 1,tensor:size(1) do
            if self.mode == 'simulation' then
               local segment = self.core.mem:allocEmbeddedData(tensor[i], nil, '1D')
               table.insert(alloc_list, segment)
            else
               local segment = self.core.mem:allocPersistentData(tensor[i])
               table.insert(alloc_list, segment)
            end
         end
      else
         error('tensors must have 2 or 3 dimensions')
      end
   end
   return alloc_list
end

-- Device-to-device copy. source/dest are streams or lists of streams;
-- when dest is nil, heap space is allocated for it. Returns dest.
function NeuFlow:copy(source, dest)
   -- check if source/dest are lists of streams, or streams
   if #source == 0 then
      source = {source}
      if dest then
         dest = {dest}
      end
   end

   -- if no dest, create it
   if not dest then
      dest = self:allocHeap(source)
   end

   -- process a list of streams
   for i = 1,#source do
      self.core:copy(source[i],dest[i])
   end

   -- return result
   return dest
end

-- Copy data from the host into device memory (ethernet transfer), or a plain
-- device copy from embedded data when in 'simulation' mode. Returns dest.
function NeuFlow:copyFromHost(source, dest)
   -- if no dest, create it
   if not dest then
      dest = self:allocHeap(source)
   end
   -- check if dest is a list of streams, or a stream
   local ldest
   if #dest == 0 then
      ldest = {dest}
   else
      ldest = dest
   end
   -- if simulation, we replace this transfer by a plain copy
   if self.mode == 'simulation' then
      -- alloc in constant data:
      source = self:allocData(source)
      print(' copy host->dev [simul]: ' .. #ldest .. 'x' .. ldest[1].orig_h .. 'x' .. ldest[1].orig_w)
      self:copy(source,ldest)
   else
      -- process list of streams
      print(' copy host->dev: ' .. #ldest .. 'x' .. ldest[1].orig_h .. 'x' ..
ldest[1].orig_w) 295 | 296 | self.ethernet:dev_copyFromHost(ldest) 297 | end 298 | 299 | return dest 300 | end 301 | 302 | function NeuFlow:copyToHost(source, dest) 303 | -- no ack in simulation 304 | local ack 305 | if self.mode == 'simulation' or (not self.handshake) then 306 | ack = 'no-ack' 307 | end 308 | 309 | -- check if source is a list of streams, or a stream 310 | local lsource 311 | if #source == 0 then 312 | lsource = {source} 313 | else 314 | lsource = source 315 | end 316 | 317 | -- record original sizes 318 | local orig_h = lsource[1].orig_h 319 | local orig_w = lsource[1].orig_w 320 | 321 | -- process list of streams 322 | print(' copy dev->host: ' .. #lsource .. 'x' .. lsource[1].orig_h .. 'x' .. lsource[1].orig_w) 323 | 324 | self.ethernet:dev_copyToHost(lsource, ack) 325 | 326 | -- create/resize dest 327 | if not dest then 328 | dest = torch.Tensor() 329 | end 330 | dest:resize(#lsource, orig_h, orig_w) 331 | return dest 332 | end 333 | 334 | ---------------------------------------------------------------------- 335 | -- wrappers for compilers 336 | -- 337 | function NeuFlow:compile(network, input) 338 | -- retrieve IDs 339 | local inputs 340 | if #input == 0 then 341 | inputs = { input } 342 | else 343 | inputs = input 344 | end 345 | 346 | local outputs 347 | outputs, self.gops = self.compiler:processNetwork(network, inputs) 348 | 349 | return outputs 350 | end 351 | 352 | ---------------------------------------------------------------------- 353 | -- high-level GOTO functions 354 | -- 355 | function NeuFlow:beginLoop(tag) 356 | self.loopTags.tag = self.core:makeGotoTag() 357 | self.loopTags.tag.offset = 1 358 | end 359 | 360 | function NeuFlow:endLoop(tag) 361 | self.core:defaults() 362 | self.core:gotoTag(self.loopTags.tag) 363 | end 364 | 365 | function NeuFlow:term() 366 | self.core:terminate() 367 | end 368 | 369 | ---------------------------------------------------------------------- 370 | -- write bytecode in binary/hex mode 371 | -- 372 
-- Dump the linked bytecode to a tensor, and optionally to files.
-- args is a list of output specs: {format='bin'|'hex'|'rom', width=N, length=L}.
-- With an empty list ({}), no file is written and only the tensor is returned.
function NeuFlow:writeBytecode(args)
   local tensor = torch.ByteTensor(self.bytecodesize):zero()

   -- generate binary once
   local tensor_size = self.core.linker:dump(
      {
         tensor = tensor,
      },
      self.core.mem
   )

   local filepath
   if next(args) ~= nil then -- called with arguments passed in
      filepath = '/tmp/' .. self.prog_name .. '-' .. os.date("%Y_%m_%d_%H_%M_%S") .. '.bin'
      local file = assert(torch.DiskFile(filepath,'w'):binary())
      file:writeString(tensor:storage():string():sub(1, tensor_size))
      assert(file:close())
   end

   -- generate all outputs ('out' instead of shadowing the 'args' list)
   for _,out in ipairs(args) do
      -- args
      local format = out.format or 'bin' -- or 'hex'
      local width = out.width or 8
      local length = out.length

      if format == 'bin' then
         -- simple copy
         -- BUG FIX: a space was missing after '-v', producing 'cp -v/tmp/...'
         os.execute('cp -v ' .. filepath .. ' ' .. self.prog_name .. '.bin')
      elseif format == 'hex' then
         local filehex = self.prog_name ..'.hex'..tostring(width)
         neuflow.tools.readBinWriteHex(filepath, filehex, width, length)
      elseif format == 'rom' then
         local filev = self.prog_name ..'.v'
         neuflow.tools.readBinWriteRom(filepath, filev, width, 'flow_rom')
      else
         -- BUG FIX: 'rom' is a supported format, list it in the message
         error('format should be one of: bin | hex | rom')
      end
   end

   return tensor
end

----------------------------------------------------------------------
-- execute simulation (testbench)
--
-- Export the compiled code as hex images, move them where the testbench
-- expects them, and run the testbench script.
-- NOTE(review): relies on a global 'options.tb_args' — confirm it is set by
-- the calling script before this runs.
function NeuFlow:execSimulation(args)
   local testbench = args.testbench or error('please provide a testbench script')
   local cache_hex = args.cache_hex or error('please provide path for cache hex mask')
   local mem_hex = args.mem_hex or error('please provide path for mem hex mask')

   print(' exporting compiled code [hex]')
   self:writeBytecode{{format='hex', width=oFlower.bus_, length=oFlower.cache_size_b},
                      {format='hex', width=streamer.mem_bus_}}

   -- platform-dependent memories:
   if self.core.platform == 'ibm_asic' then
      os.execute('mv '..self.prog_name..'.hex64 '..cache_hex)
      -- the 256-bit memory image is split into 8 byte-column files
      for subidx = 0,7 do
         os.execute('cut -c'..(subidx*8+1)..'-'..(subidx*8+8)..' '
            ..self.prog_name..'.hex256 > '..mem_hex..'.'..(subidx+1))
      end
      os.execute('rm '..self.prog_name..'.hex256 ')
   else
      os.execute('mv '..self.prog_name..'.hex64 '..cache_hex)
      os.execute('mv '..self.prog_name..'.hex256 '..mem_hex)
   end

   local c = sys.COLORS
   print(c._cyan)
   print(' running compiled bytecode in simulation')
   local path = paths.dirname(testbench)
   local script = paths.basename(testbench)
   os.execute('cd ' .. path .. '; ./' .. script .. ' ' .. options.tb_args)
   print(c.none)
end

----------------------------------------------------------------------
-- transmit reset
--
function NeuFlow:sendReset()
   self.ethernet:sendReset()
end

----------------------------------------------------------------------
-- tell device to wait for the bytecode to be sent from the host
--
function NeuFlow:receiveBytecode()
   self.ethernet:dev_receiveBytecode()
end

----------------------------------------------------------------------
-- send bytecode to device
--
function NeuFlow:sendBytecode(bytecode)
   self:loadBytecode(bytecode)
end

----------------------------------------------------------------------
-- transmit bytecode
--
-- Send a bytecode tensor to the device; when called without an argument,
-- generate the bytecode first via writeBytecode{}.
function NeuFlow:loadBytecode(bytecode)
   if bytecode then
      -- then transmit bytecode
      print(' transmitting bytecode')
      self.ethernet:host_sendBytecode(bytecode)
   else
      -- if no bytecode given, first dump it to file, then load it from there
      self:loadBytecode(self:writeBytecode{})
   end
end

----------------------------------------------------------------------
-- transmit bytecode (from file)
--
-- Read a raw bytecode file and transmit it to the device.
function NeuFlow:loadBytecodeFromFile(filename)
   local file = assert(io.open(filename, "r"))
   local tensor = self:convertBytecodeString(file:read("*all"))
   file:close()

   self:loadBytecode(tensor)
end

-- Convert a raw byte string into a ByteTensor of self.bytecodesize.
-- NOTE(review): string.gfind is the pre-Lua-5.1 name of string.gmatch
-- (deprecated since 5.1) — works under Torch7's LuaJIT compat, but gmatch
-- would be the portable spelling.
function NeuFlow:convertBytecodeString(bytes)
   local tensor = torch.ByteTensor(self.bytecodesize)
   local i = 1
   for b in string.gfind(bytes, ".") do
      tensor[i] = string.byte(b)
      i = i+1
   end

   return tensor
end

----------------------------------------------------------------------
-- transmit tensor
--
function NeuFlow:copyToDev(tensor)
   self.ethernet:host_copyToDev(tensor)
end

----------------------------------------------------------------------
-- receive tensor
--
function NeuFlow:copyFromDev(tensor)
   self.ethernet:host_copyFromDev(tensor, self.handshake)
end

--------------------------------------------------------------------------------
-- /src/Profiler.lua
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
-- Profiler: a simple class to help profiling code
--------------------------------------------------------------------------------
local Profiler = torch.class('neuflow.Profiler')

-- mode == 'off' disables printing; verbose echoes event markers as they occur.
function Profiler:__init(mode,verbose)
   self.events = {}   -- events by name
   self.list = {}     -- events in creation order
   self.off = (mode == 'off') or false
   self.verbose = verbose or false
end

-- Start (or restart) the named timer; pass 'fps' to also report a frame rate.
function Profiler:start(name, fps)
   if self.events[name] then
      -- update
      self.events[name].cpu = os.clock()
      self.events[name].real = sys.clock()
   else
      -- create
      self.events[name] = {cpu=os.clock(), real=sys.clock(), name=name}
      self.list[#self.list+1] = self.events[name]
   end
   if fps and fps == 'fps' then
      self.events[name].fps = true
   end
   if self.verbose then io.write('<' .. name .. '>') io.flush() end
end

-- Set the display color of an existing event (used by displayAll).
function Profiler:setColor(name, color)
   if self.events[name] then
      -- update
      self.events[name].color = color
   else
      error('# ERROR: There is no such profiler - '.. name..', create it first')
   end
end


-- Elapsed CPU time since start(name), optionally divided (e.g. per iteration).
function Profiler:cpu(name,divider)
   local delta = os.clock() - self.events[name].cpu
   if divider then delta = delta / divider end
   self.events[name].cpud = delta
   return delta
end

-- Elapsed wall-clock time since start(name), optionally divided.
function Profiler:real(name,divider)
   local delta = sys.clock() - self.events[name].real
   if divider then delta = delta / divider end
   self.events[name].reald = delta
   return delta
end

-- Record both real and cpu laps; returns real, cpu.
function Profiler:lap(name,divider)
   local r = self:real(name,divider)
   local c = self:cpu(name,divider)
   if self.verbose then io.write('\r') self:print(name) end
   return r,c
end

function Profiler:format(name)
   return string.format('$ real | cpu: %f | %f <%s>',
      self.events[name].reald or -1, self.events[name].cpud or -1, name)
end

function Profiler:print(name)
   if not self.off then
      print(self:format(name))
   end
end

-- Build a multi-line report for all events, in creation order.
function Profiler:formatAll()
   local str = '$ profiler report:'
   for i = 1,#self.list do
      if self.list[i].fps then
         str = str .. '\n' .. string.format('$ real %f | cpu %f <%s> = %f fps',
            self.list[i].reald or -1,
            self.list[i].cpud or -1,
            self.list[i].name,
            1/self.list[i].reald)
      else
         str = str .. '\n' ..
string.format('$ real %f | cpu %f <%s>',
            self.list[i].reald or -1,
            self.list[i].cpud or -1,
            self.list[i].name)
      end
   end
   return str
end

function Profiler:printAll()
   if not self.off then
      print(self:formatAll())
   end
end

-- Render all events onto a qt painter.
-- args: x, y (origin), zoom, painter (or win), font (size, default 24*zoom).
function Profiler:displayAll(args)
   -- args
   local x = args.x or 0
   local y = args.y or 0
   local zoom = args.zoom or 1
   local painter = args.painter or args.win
   local font = args.font or 24*zoom
   if not painter then error('# ERROR: Profiler.displayAll() needs a painter') end

   painter:setfont(qt.QFont{serif=false,italic=false,size=font})
   if not self.off then
      for i = 1,#self.list do
         painter:setcolor(self.list[i].color or "black")
         local str
         if self.list[i].fps then
            str = string.format('$ real %f | cpu %f <%s> = %f fps',
               self.list[i].reald or -1,
               self.list[i].cpud or -1,
               self.list[i].name,
               1/self.list[i].reald)
         else
            str = string.format('$ real %f | cpu %f <%s>',
               self.list[i].reald or -1,
               self.list[i].cpud or -1,
               self.list[i].name)
         end
         -- disp line:
         painter:moveto(x,y); y = y + font*1.5
         painter:show(str)
      end
   end
end

--------------------------------------------------------------------------------
-- /src/Serial.lua
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
-- Serial
-- a class to read/write through serial port
--------------------------------------------------------------------------------

----------------------------------------------------------------------
-- register class + constructor
--
local Serial = torch.class('neuflow.Serial')

-- Open the serial device at the given baud rate and spawn a background
-- thread that echoes everything the device prints.
-- dev  - tty path (default '/dev/tty')
-- baud - line speed (default 57600)
function Serial:__init(dev,baud)
   -- device + speed
   -- BUG FIX: defaults are applied before any use of dev/baud; the original
   -- concatenated the raw 'dev' into the warning string first, which raised
   -- a concat error when dev was nil
   self.dev = dev or '/dev/tty'
   self.baud = baud or 57600

   -- error messages
   self.WARNING_NOTFOUND = '# serial: warning, device ' .. self.dev .. ' not found'

   -- dev exists ?
   if not paths.filep(self.dev) then
      print(self.WARNING_NOTFOUND)
      return
   end

   -- this is linux dependent ?
   local ret = sys.execute('stty -F ' .. self.dev .. ' ' .. self.baud .. ' min 0 time 1')

   -- dev exists ?
   if ret ~= '' then
      print(self.WARNING_NOTFOUND)
      return
   end

   -- file descriptors
   self.devr = io.open(self.dev, 'r')
   self.devw = io.open(self.dev, 'w')

   -- background reader
   require 'thread'
   local function dumpTTY ()
      local c = sys.COLORS
      local highlight = c._cyan
      local none = c.none
      while true do
         local fromTTY = self:read()
         if fromTTY then print(fromTTY) end
      end
   end
   thread.newthread(dumpTTY, {})
end

-- Close the open file handles.
-- BUG FIX: the original called self.dev:close(), but self.dev is the device
-- *path* (a string) — the open handles are self.devr/self.devw, and they may
-- be nil when __init bailed out early.
function Serial:cleanup()
   if self.devr then self.devr:close() end
   if self.devw then self.devw:close() end
end

-- Read one line from the device (non-blocking thanks to the stty settings).
function Serial:read()
   return self.devr:read('*l')
end

-- Write a line to the device.
function Serial:write(line)
   return self.devw:write(line)
end

--------------------------------------------------------------------------------
-- /src/defines.lua
--------------------------------------------------------------------------------

-- -*- lua -*-

----------------------------------------------------------------------
--- Useful abbrevs
--
kB = 1024
MB = 1024*1024
GB = 1024*1024*1024
kHz = 1000
MHz = 1000*1000
GHz = 1000*1000*1000

----------------------------------------------------------------------
--- Blast Bus parameters
--
blast_bus = {
   -- Addressing :
   area_streamer = 1,
   area_tile = 2,
   area_memctrl = 3,
   area_dma = 4,
   --
   addr_broadcast = 0,
   addr_conv_0 = 1,
   addr_conv_1 = 2,
   addr_comb_0 = 16,
   addr_mapp_0 = 24,
   addr_div_0 = 28,
   addr_grid_0 = 256,
   addr_mem_streamer_0 = 1,
31 | addr_mem_streamer_1 = 2, 32 | addr_mem_streamer_2 = 3, 33 | addr_mem_streamer_3 = 4, 34 | addr_mem_streamer_4 = 5, 35 | addr_mem_streamer_5 = 6, 36 | addr_mem_streamer_6 = 7, 37 | addr_mem_streamer_7 = 8, 38 | addr_dma = 0, 39 | addr_memctrl = 0, 40 | -- 41 | subAddr_router = 0, 42 | subAddr_operator = 1, 43 | subAddr_cacher = 2, 44 | subAddr_IO = 3, 45 | subAddr_none = 0, 46 | subAddr_memTimeouts = 0, 47 | subAddr_memGlobals = 1, 48 | subAddr_memLocals = 2, 49 | 50 | -- Content: 51 | content_nothing = 0, 52 | content_command = 1, 53 | content_instruc = 2, 54 | content_config = 3, 55 | content_valid = 1, 56 | 57 | -- Instructions 58 | instruc_config = 0, 59 | instruc_setAdd = 1, 60 | instruc_activate = 2, 61 | instruc_deActivate = 3, 62 | instruc_reset = 4, 63 | instruc_RESERVED_1 = 5, 64 | instruc_control_0 = 6, 65 | instruc_control_1 = 7, 66 | instruc_control_2 = 8, 67 | instruc_control_3 = 9, 68 | instruc_control_4 = 10, 69 | instruc_control_5 = 11, 70 | instruc_control_6 = 12, 71 | instruc_control_7 = 13, 72 | instruc_cacheStart = 14, 73 | instruc_cacheFinish = 15, 74 | 75 | -- Status 76 | status_notAddressed = 0, 77 | status_idle = 1, 78 | status_busy = 2, 79 | status_done = 3, 80 | status_primed = 4, 81 | status_unconfigured = 5, 82 | status_misconfigured = 6 83 | } 84 | 85 | 86 | ---------------------------------------------------------------------- 87 | --- OpenFlower Instruction Set. 
88 | -- 89 | oFlower = { 90 | -- Opcodes 91 | op_writeConfig = 0, 92 | op_getStatus = 1, 93 | op_writeStream = 2, 94 | op_routeStream = 3, 95 | op_writeWord = 4, 96 | op_readWord = 5, 97 | op_setReg = 6, 98 | op_goto = 7, 99 | op_add = 8, 100 | op_control = 9, 101 | op_and = 10, 102 | op_or = 11, 103 | op_comp = 12, 104 | op_shr = 13, 105 | op_nop = 14, 106 | op_term = 15, 107 | 108 | -- Register map 109 | reg_operation = 0, 110 | reg_size = 1, 111 | reg_type = 2, 112 | reg_state = 3, 113 | reg_counter = 4, 114 | reg_loops = 5, 115 | reg_status = 6, 116 | reg_sys_A = 7, 117 | reg_sys_B = 8, 118 | reg_sys_C = 9, 119 | reg_A = 10, 120 | reg_B = 11, 121 | reg_C = 12, 122 | reg_D = 13, 123 | reg_E = 14, 124 | reg_F = 15, 125 | 126 | -- ctrl map 127 | ctrl_lock_config_bus = 0, 128 | 129 | -- I/O Map 130 | io_uart = 0, 131 | io_uart_status = 1, 132 | io_dma = 2, 133 | io_dma_status = 3, 134 | io_ethernet = 4, 135 | io_ethernet_status = 5, 136 | io_iic = 6, 137 | io_iic_status = 7, 138 | io_spi = 8, 139 | io_spi_status = 8, 140 | io_gpios = 10, 141 | io_timer = 11, 142 | io_timer_ctrl = 12, 143 | 144 | -- CPU types 145 | type_uint8 = 8, 146 | type_uint16 = 4, 147 | type_uint32 = 2, 148 | type_uint64 = 1, 149 | 150 | -- clock 151 | clock_freq = 100*MHz, 152 | uart_freq = 57600, 153 | 154 | -- nb of dmas (this includes instruction path) 155 | nb_dmas = 2 156 | } 157 | do 158 | -- Cache 159 | oFlower.cache_size_b = 64*kB 160 | oFlower.page_size_b = oFlower.cache_size_b/2 161 | oFlower.bus_ = 64 162 | oFlower.bus_b = oFlower.bus_/8 163 | end 164 | 165 | 166 | ---------------------------------------------------------------------- 167 | --- Grid parameters 168 | -- 169 | grid = {} 170 | do 171 | -- nb of grids 172 | grid.nb_grids = 1 173 | -- global IOs 174 | grid.nb_ios = 6 175 | -- conv 176 | grid.nb_convs = 4 177 | grid.kernel_width = 10 178 | grid.kernel_height = 10 179 | -- mapper 180 | grid.nb_mappers = 4 181 | grid.mapper_segs = 8 182 | -- generic ALUs 183 | grid.nb_alus 
= 4 184 | -- clock: 185 | grid.clock_freq = 200*MHz 186 | end 187 | 188 | 189 | ---------------------------------------------------------------------- 190 | --- General DMAs 191 | -- 192 | dma = {} 193 | do 194 | -- global DMA IOs 195 | dma.nb_ios = 2 196 | dma.ethernet_write_port_id = 2 197 | dma.ethernet_read_port_id = 3 198 | end 199 | 200 | 201 | ---------------------------------------------------------------------- 202 | --- Streamer parameters 203 | -- 204 | -- Units: 205 | -- _: bits 206 | -- _b: bytes 207 | -- _w: words (1 word = word_b bytes) 208 | -- _r: memory rows (1 row = size_b bytes) 209 | -- _i: integers (1 int = 4 bytes) 210 | -- 211 | streamer = {} 212 | do 213 | -- physical params 214 | streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios 215 | -- geometry 216 | streamer.mem_bus_ = 256 217 | streamer.mem_bus_b = 256 / 8 218 | streamer.stride_b = 2048 219 | streamer.word_b = 2 220 | streamer.align_b = streamer.mem_bus_ / 8 221 | streamer.stride_w = streamer.stride_b / streamer.word_b 222 | streamer.align_w = streamer.align_b / streamer.word_b 223 | -- clock 224 | streamer.clock_freq = 200*MHz 225 | end 226 | 227 | 228 | ---------------------------------------------------------------------- 229 | --- Memory parameters 230 | -- 231 | -- the parameters are expressed in different units: 232 | -- _: bits 233 | -- _b: bytes 234 | -- _w: words (1 word = word_b bytes) 235 | -- _r: memory rows (1 row = size_b bytes) 236 | -- _i: integers (1 int = 4 bytes) 237 | -- 238 | memory = {} 239 | do 240 | -- size: 241 | memory.size_b = 512*MB 242 | memory.size_w = memory.size_b / streamer.word_b 243 | memory.size_r = memory.size_b / streamer.stride_b 244 | -- clock: 245 | memory.clock_freq = 400*MHz 246 | -- bandwidth 247 | memory.bus_ = 32 248 | memory.is_ddr = true 249 | memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1) 250 | memory.bandwidth_b = memory.bandwidth_ / 8 251 | memory.bandwidth_w = memory.bandwidth_b / 
streamer.word_b 252 | 253 | memory.offset_text = 0 254 | end 255 | 256 | 257 | ---------------------------------------------------------------------- 258 | --- Extra Streamer parameters 259 | -- 260 | do 261 | -- parallel streams: this is application dependent 262 | streamer.max_parallel_rd_streams = grid.nb_convs + 1 263 | streamer.max_parallel_wr_streams = 1 264 | streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams 265 | -- bandwidth per stream: 266 | streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b 267 | streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams 268 | streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor 269 | -- bandwidth first check 270 | if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then 271 | print('ERROR internal bandwidth too high: ' 272 | .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s' 273 | .. ' > external bandwidth available: ' 274 | .. streamer.mem_bandwidth_b/1e9 ..'GB/s') 275 | os.exit() 276 | end 277 | -- continous streaming per rd port: 278 | -- this is based on the observation that: 279 | -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b 280 | local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams 281 | local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams 282 | streamer.min_timeout_rd = math.ceil(dead_cycles_rd / 283 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 284 | - streamer.max_parallel_streams)) 285 | streamer.min_timeout_wr = math.ceil(dead_cycles_wr / 286 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 287 | - streamer.max_parallel_streams)) 288 | --print('# streamer min timeouts: wr='.. streamer.min_timeout_wr 289 | -- .. ' and rd=' .. 
streamer.min_timeout_rd) 290 | -- for these timeouts, we compute necessary buffers to insure no one is starving 291 | streamer.min_cache_rd = (math.ceil(streamer.word_b * (dead_cycles_rd 292 | + streamer.min_timeout_rd 293 | *(streamer.max_parallel_streams-1)) 294 | / streamer.mem_bus_b)) 295 | streamer.min_cache_wr = (math.ceil(streamer.word_b * (dead_cycles_wr 296 | + streamer.min_timeout_wr 297 | *(streamer.max_parallel_streams-1)) 298 | / streamer.mem_bus_b)) 299 | --print('# streamer min cache sizes: wr='..streamer.min_cache_wr 300 | -- ..' and rd='..streamer.min_cache_rd) 301 | end 302 | 303 | 304 | ---------------------------------------------------------------------- 305 | --- Num parameters 306 | -- 307 | num = {} 308 | do 309 | num.size_b = 2 310 | num.size_ = 16 311 | num.frac_ = 8 312 | num.int_ = num.size_-num.frac_ 313 | num.max = (2^(num.size_-1)-1) / 2^num.frac_ 314 | num.min = -(2^(num.size_-1)) / 2^num.frac_ 315 | num.one = 2^num.frac_ 316 | num.res = 1 / 2^num.frac_ 317 | num.precision = num.res 318 | num.mask = 0xFFFF 319 | end 320 | 321 | 322 | ---------------------------------------------------------------------- 323 | --- System Banner 324 | -- 325 | banner = 326 | '------------------------------------------------------------\r\n' .. 327 | '-- _ _ __ neuFlow [v.1.0] --\r\n' .. 328 | '-- ( | )/_/ --\r\n' .. 329 | '-- __( >O< ) This code runs on --\r\n' .. 330 | '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' .. 331 | '-- --\r\n' .. 332 | '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' .. 
333 | '------------------------------------------------------------' 334 | 335 | 336 | ---------------------------------------------------------------------- 337 | --- BootLoader parameters 338 | -- 339 | bootloader = {} 340 | do 341 | bootloader.entry_point_b = oFlower.cache_size_b 342 | bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b 343 | bootloader.load_size = 32*MB 344 | end 345 | -------------------------------------------------------------------------------- /src/defines_ibm_asic.lua: -------------------------------------------------------------------------------- 1 | -- -*- lua -*- 2 | 3 | ---------------------------------------------------------------------- 4 | --- Useful abbrevs 5 | -- 6 | kB = 1024 7 | MB = 1024*1024 8 | GB = 1024*1024*1024 9 | kHz = 1000 10 | MHz = 1000*1000 11 | GHz = 1000*1000*1000 12 | 13 | ---------------------------------------------------------------------- 14 | --- Blast Bus parameters 15 | -- 16 | blast_bus = { 17 | -- Addressing : 18 | area_streamer = 1, 19 | area_tile = 2, 20 | area_memctrl = 3, 21 | area_dma = 4, 22 | -- 23 | addr_broadcast = 0, 24 | addr_conv_0 = 1, 25 | addr_conv_1 = 2, 26 | addr_comb_0 = 16, 27 | addr_mapp_0 = 24, 28 | addr_div_0 = 28, 29 | addr_grid_0 = 256, 30 | addr_mem_streamer_0 = 1, 31 | addr_mem_streamer_1 = 2, 32 | addr_mem_streamer_2 = 3, 33 | addr_mem_streamer_3 = 4, 34 | addr_mem_streamer_4 = 5, 35 | addr_mem_streamer_5 = 6, 36 | addr_mem_streamer_6 = 7, 37 | addr_mem_streamer_7 = 8, 38 | addr_dma = 0, 39 | addr_memctrl = 0, 40 | -- 41 | subAddr_router = 0, 42 | subAddr_operator = 1, 43 | subAddr_cacher = 2, 44 | subAddr_IO = 3, 45 | subAddr_none = 0, 46 | subAddr_memTimeouts = 0, 47 | subAddr_memGlobals = 1, 48 | subAddr_memLocals = 2, 49 | 50 | -- Content: 51 | content_nothing = 0, 52 | content_command = 1, 53 | content_instruc = 2, 54 | content_config = 3, 55 | content_valid = 1, 56 | 57 | -- Instructions 58 | instruc_config = 0, 59 | instruc_setAdd = 1, 60 | 
instruc_activate = 2, 61 | instruc_deActivate = 3, 62 | instruc_reset = 4, 63 | instruc_RESERVED_1 = 5, 64 | instruc_control_0 = 6, 65 | instruc_control_1 = 7, 66 | instruc_control_2 = 8, 67 | instruc_control_3 = 9, 68 | instruc_control_4 = 10, 69 | instruc_control_5 = 11, 70 | instruc_control_6 = 12, 71 | instruc_control_7 = 13, 72 | instruc_cacheStart = 14, 73 | instruc_cacheFinish = 15, 74 | 75 | -- Status 76 | status_notAddressed = 0, 77 | status_idle = 1, 78 | status_busy = 2, 79 | status_done = 3, 80 | status_primed = 4, 81 | status_unconfigured = 5, 82 | status_misconfigured = 6 83 | } 84 | 85 | 86 | ---------------------------------------------------------------------- 87 | --- OpenFlower Instruction Set. 88 | -- 89 | oFlower = { 90 | -- Opcodes 91 | op_writeConfig = 0, 92 | op_getStatus = 1, 93 | op_writeStream = 2, 94 | op_routeStream = 3, 95 | op_writeWord = 4, 96 | op_readWord = 5, 97 | op_setReg = 6, 98 | op_goto = 7, 99 | op_add = 8, 100 | op_control = 9, 101 | op_and = 10, 102 | op_or = 11, 103 | op_comp = 12, 104 | op_shr = 13, 105 | op_nop = 14, 106 | op_term = 15, 107 | 108 | -- Register map 109 | reg_operation = 0, 110 | reg_size = 1, 111 | reg_type = 2, 112 | reg_state = 3, 113 | reg_counter = 4, 114 | reg_loops = 5, 115 | reg_status = 6, 116 | reg_sys_A = 7, 117 | reg_sys_B = 8, 118 | reg_sys_C = 9, 119 | reg_A = 10, 120 | reg_B = 11, 121 | reg_C = 12, 122 | reg_D = 13, 123 | reg_E = 14, 124 | reg_F = 15, 125 | 126 | -- ctrl map 127 | ctrl_lock_config_bus = 0, 128 | 129 | -- I/O Map 130 | io_uart = 0, 131 | io_uart_status = 1, 132 | io_dma = 2, 133 | io_dma_status = 3, 134 | io_ethernet = 4, 135 | io_ethernet_status = 5, 136 | io_iic = 6, 137 | io_iic_status = 7, 138 | io_spi = 8, 139 | io_spi_status = 8, 140 | io_gpios = 10, 141 | io_timer = 11, 142 | io_timer_ctrl = 12, 143 | 144 | -- CPU types 145 | type_uint8 = 8, 146 | type_uint16 = 4, 147 | type_uint32 = 2, 148 | type_uint64 = 1, 149 | 150 | -- clock 151 | clock_freq = 200*MHz, 152 | 
uart_freq = 57600, 153 | 154 | -- nb of dmas (this includes instruction path) 155 | nb_dmas = 2 156 | } 157 | do 158 | -- Cache 159 | oFlower.cache_size_b = 64*kB 160 | oFlower.page_size_b = oFlower.cache_size_b/2 161 | oFlower.bus_ = 64 162 | oFlower.bus_b = oFlower.bus_/8 163 | end 164 | 165 | 166 | ---------------------------------------------------------------------- 167 | --- Grid parameters 168 | -- 169 | grid = {} 170 | do 171 | -- nb of grids 172 | grid.nb_grids = 1 173 | -- global IOs 174 | grid.nb_ios = 7 175 | -- conv 176 | grid.nb_convs = 4 177 | grid.kernel_width = 10 178 | grid.kernel_height = 10 179 | -- mapper 180 | grid.nb_mappers = 4 181 | grid.mapper_segs = 8 182 | -- generic ALUs 183 | grid.nb_alus = 4 184 | -- clock: 185 | grid.clock_freq = 400*MHz 186 | end 187 | 188 | 189 | ---------------------------------------------------------------------- 190 | --- General DMAs 191 | -- 192 | dma = {} 193 | do 194 | -- global DMA IOs 195 | dma.nb_ios = 2 196 | end 197 | 198 | 199 | ---------------------------------------------------------------------- 200 | --- Streamer parameters 201 | -- 202 | -- Units: 203 | -- _: bits 204 | -- _b: bytes 205 | -- _w: words (1 word = word_b bytes) 206 | -- _r: memory rows (1 row = size_b bytes) 207 | -- _i: integers (1 int = 4 bytes) 208 | -- 209 | streamer = {} 210 | do 211 | -- physical params 212 | streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios * grid.nb_grids 213 | -- geometry 214 | streamer.mem_bus_ = 256 215 | streamer.mem_bus_b = 256 / 8 216 | streamer.stride_b = 2048 217 | streamer.word_b = 2 218 | streamer.align_b = streamer.mem_bus_ / 8 219 | streamer.stride_w = streamer.stride_b / streamer.word_b 220 | streamer.align_w = streamer.align_b / streamer.word_b 221 | -- clock 222 | streamer.clock_freq = 400*MHz 223 | end 224 | 225 | 226 | ---------------------------------------------------------------------- 227 | --- Memory parameters 228 | -- 229 | -- the parameters are expressed in different 
units: 230 | -- _: bits 231 | -- _b: bytes 232 | -- _w: words (1 word = word_b bytes) 233 | -- _r: memory rows (1 row = size_b bytes) 234 | -- _i: integers (1 int = 4 bytes) 235 | -- 236 | memory = {} 237 | do 238 | -- size: 239 | memory.size_b = 16*MB 240 | memory.size_w = memory.size_b / streamer.word_b 241 | memory.size_r = memory.size_b / streamer.stride_b 242 | -- clock: 243 | memory.clock_freq = 400*MHz 244 | -- bandwidth 245 | memory.bus_ = 64 246 | memory.is_ddr = true 247 | memory.is_dual = true 248 | memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1) 249 | memory.bandwidth_b = memory.bandwidth_ / 8 250 | memory.bandwidth_w = memory.bandwidth_b / streamer.word_b 251 | 252 | memory.offset_text = 0 253 | end 254 | 255 | 256 | ---------------------------------------------------------------------- 257 | --- Extra Streamer parameters 258 | -- 259 | do 260 | -- parallel streams: this is application dependent 261 | streamer.max_parallel_rd_streams = grid.nb_convs + 1 262 | streamer.max_parallel_wr_streams = 1 263 | streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams 264 | -- bandwidth per stream: 265 | streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b 266 | streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams 267 | streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor 268 | -- bandwidth first check 269 | if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then 270 | print('ERROR internal bandwidth too high: ' 271 | .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s' 272 | .. ' > external bandwidth available: ' 273 | .. 
streamer.mem_bandwidth_b/1e9 ..'GB/s') 274 | os.exit() 275 | end 276 | -- continous streaming per rd port: 277 | -- this is based on the observation that: 278 | -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b 279 | local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams 280 | local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams 281 | streamer.min_timeout_rd = math.ceil(dead_cycles_rd / 282 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 283 | - streamer.max_parallel_streams)) 284 | streamer.min_timeout_wr = math.ceil(dead_cycles_wr / 285 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 286 | - streamer.max_parallel_streams)) 287 | --print('# streamer min timeouts: wr='.. streamer.min_timeout_wr 288 | -- .. ' and rd=' .. streamer.min_timeout_rd) 289 | -- for these timeouts, we compute necessary buffers to insure no one is starving 290 | streamer.min_cache_rd = (math.ceil(streamer.word_b * (dead_cycles_rd 291 | + streamer.min_timeout_rd 292 | *(streamer.max_parallel_streams-1)) 293 | / streamer.mem_bus_b)) 294 | streamer.min_cache_wr = (math.ceil(streamer.word_b * (dead_cycles_wr 295 | + streamer.min_timeout_wr 296 | *(streamer.max_parallel_streams-1)) 297 | / streamer.mem_bus_b)) 298 | --print('# streamer min cache sizes: wr='..streamer.min_cache_wr 299 | -- ..' 
end


----------------------------------------------------------------------
--- Num parameters (fixed-point format)
--
num = {}
do
   num.size_b = 2
   num.size_ = 16
   num.frac_ = 8
   num.int_ = num.size_-num.frac_
   num.max = (2^(num.size_-1)-1) / 2^num.frac_
   num.min = -(2^(num.size_-1)) / 2^num.frac_
   num.one = 2^num.frac_
   num.res = 1 / 2^num.frac_
   num.precision = num.res
   num.mask = 0xFFFF
end


----------------------------------------------------------------------
--- System Banner
--
banner =
   '------------------------------------------------------------\r\n' ..
   '-- _ _ __ neuFlow [v.1.0] --\r\n' ..
   '-- ( | )/_/ --\r\n' ..
   '-- __( >O< ) This code runs on --\r\n' ..
   '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' ..
   '-- --\r\n' ..
   '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' ..
   '------------------------------------------------------------'


----------------------------------------------------------------------
--- BootLoader parameters
--
bootloader = {}
do
   bootloader.entry_point_b = oFlower.cache_size_b
   bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b
   bootloader.load_size = 32*MB
end

----------------------------------------------------------------------
-- /src/defines_pico_m503.lua
----------------------------------------------------------------------
-- -*- lua -*-

----------------------------------------------------------------------
--- Useful abbrevs
--
kB  = 1024
MB  = 1024*1024
GB  = 1024*1024*1024
kHz = 1000
MHz = 1000*1000
GHz = 1000*1000*1000

----------------------------------------------------------------------
--- Blast Bus parameters
--
blast_bus = {
   -- addressing: functional areas
   area_streamer = 1, area_tile = 2, area_memctrl = 3, area_dma = 4,
   -- tile/grid addresses
   addr_broadcast = 0, addr_conv_0 = 1, addr_conv_1 = 2,
   addr_comb_0 = 16, addr_mapp_0 = 24, addr_div_0 = 28, addr_grid_0 = 256,
   addr_mem_streamer_0 = 1, addr_mem_streamer_1 = 2, addr_mem_streamer_2 = 3,
   addr_mem_streamer_3 = 4, addr_mem_streamer_4 = 5, addr_mem_streamer_5 = 6,
   addr_mem_streamer_6 = 7, addr_mem_streamer_7 = 8,
   addr_dma = 0, addr_memctrl = 0,
   -- sub-addresses within an area
   subAddr_router = 0, subAddr_operator = 1, subAddr_cacher = 2, subAddr_IO = 3,
   subAddr_none = 0, subAddr_memTimeouts = 0, subAddr_memGlobals = 1, subAddr_memLocals = 2,

   -- content carried on the bus
   content_nothing = 0, content_command = 1, content_instruc = 2,
   content_config = 3, content_valid = 1,

   -- instructions
   instruc_config = 0, instruc_setAdd = 1, instruc_activate = 2,
   instruc_deActivate = 3, instruc_reset = 4, instruc_RESERVED_1 = 5,
   instruc_control_0 = 6, instruc_control_1 = 7, instruc_control_2 = 8,
   instruc_control_3 = 9, instruc_control_4 = 10, instruc_control_5 = 11,
   instruc_control_6 = 12, instruc_control_7 = 13,
   instruc_cacheStart = 14, instruc_cacheFinish = 15,

   -- status codes reported back
   status_notAddressed = 0, status_idle = 1, status_busy = 2, status_done = 3,
   status_primed = 4, status_unconfigured = 5, status_misconfigured = 6
}


----------------------------------------------------------------------
--- OpenFlower Instruction Set.
--
oFlower = {
   -- opcodes
   op_writeConfig = 0, op_getStatus = 1, op_writeStream = 2, op_routeStream = 3,
   op_writeWord = 4, op_readWord = 5, op_setReg = 6, op_goto = 7,
   op_add = 8, op_control = 9, op_and = 10, op_or = 11,
   op_comp = 12, op_shr = 13, op_nop = 14, op_term = 15,

   -- register map
   reg_operation = 0, reg_size = 1, reg_type = 2, reg_state = 3,
   reg_counter = 4, reg_loops = 5, reg_status = 6,
   reg_sys_A = 7, reg_sys_B = 8, reg_sys_C = 9,
   reg_A = 10, reg_B = 11, reg_C = 12, reg_D = 13, reg_E = 14, reg_F = 15,

   -- ctrl map
   ctrl_lock_config_bus = 0,

   -- I/O map
   io_uart = 0, io_uart_status = 1, io_dma = 2, io_dma_status = 3,
   io_ethernet = 4, io_ethernet_status = 5, io_iic = 6, io_iic_status = 7,
   -- NOTE(review): io_spi and io_spi_status share id 8 and id 9 is unused --
   -- looks like a typo for 9, but kept as-is; verify against the RTL I/O map
   io_spi = 8, io_spi_status = 8,
   io_gpios = 10, io_timer = 11, io_timer_ctrl = 12,

   -- CPU transfer type codes
   type_uint8 = 8, type_uint16 = 4, type_uint32 = 2, type_uint64 = 1,

   -- clocks
   clock_freq = 100*MHz,
   uart_freq = 57600,

   -- nb of dmas (this includes the instruction path)
   nb_dmas = 2
}
do
   -- instruction cache geometry
   oFlower.cache_size_b = 64*kB
   oFlower.page_size_b = oFlower.cache_size_b/2
   oFlower.bus_ = 64
   oFlower.bus_b = oFlower.bus_/8
end

----------------------------------------------------------------------
--- General DMAs (this target adds two camera ports)
--
dma = {}
do
   dma.nb_ios = 4
   dma.ethernet_write_port_id = 2
   dma.ethernet_read_port_id = 3
   dma.camera_A_port_id = 4
   dma.camera_B_port_id = 5
end

----------------------------------------------------------------------
--- Grid parameters
--
grid = {}
do
   grid.nb_grids = 1          -- nb of grids
   grid.nb_ios = 6            -- global IOs
   grid.nb_convs = 4          -- convolvers
   grid.kernel_width = 10
   grid.kernel_height = 10
   grid.nb_mappers = 4        -- mappers
   grid.mapper_segs = 8
   grid.nb_alus = 4           -- generic ALUs
   grid.clock_freq = 200*MHz
end


----------------------------------------------------------------------
--- Streamer parameters
-- Units: _ bits | _b bytes | _w words (word_b bytes) | _r rows (stride_b bytes) | _i ints (4 bytes)
--
streamer = {}
do
   -- physical params
   streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios
   -- geometry
   streamer.mem_bus_ = 256
   streamer.mem_bus_b = 256 / 8
   streamer.stride_b = 2048
   streamer.word_b = 2
   streamer.align_b = streamer.mem_bus_ / 8
   streamer.stride_w = streamer.stride_b / streamer.word_b
   streamer.align_w = streamer.align_b / streamer.word_b
   -- clock
   streamer.clock_freq = 200*MHz
end


----------------------------------------------------------------------
--- Memory parameters (same unit suffixes as the streamer)
--
memory = {}
do
   memory.size_b = 512*MB
   memory.size_w = memory.size_b / streamer.word_b
   memory.size_r = memory.size_b / streamer.stride_b
   memory.clock_freq = 400*MHz
   memory.bus_ = 32
   memory.is_ddr = true
   memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1)
   memory.bandwidth_b = memory.bandwidth_ / 8
   memory.bandwidth_w = memory.bandwidth_b / streamer.word_b

   memory.offset_text = 0
end


----------------------------------------------------------------------
--- Extra Streamer parameters (derived)
--
do
   -- parallel streams: application dependent
   streamer.max_parallel_rd_streams = grid.nb_convs + 1
   streamer.max_parallel_wr_streams = 1
   streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams
   -- bandwidth per stream
   streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b
   streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams
   streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor
   -- sanity check: grid demand must fit external bandwidth
   if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then
      print('ERROR internal bandwidth too high: '
            .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s'
            .. ' > external bandwidth available: '
            .. streamer.mem_bandwidth_b/1e9 ..'GB/s')
      os.exit()
   end
   -- min timeouts for continuous streaming, from:
   -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b
   local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams
   local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams
   streamer.min_timeout_rd = math.ceil(dead_cycles_rd /
      ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) - streamer.max_parallel_streams))
   streamer.min_timeout_wr = math.ceil(dead_cycles_wr /
      ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) - streamer.max_parallel_streams))
   -- buffers needed so no port starves at those timeouts
   streamer.min_cache_rd = math.ceil(streamer.word_b *
      (dead_cycles_rd + streamer.min_timeout_rd*(streamer.max_parallel_streams-1))
      / streamer.mem_bus_b)
   streamer.min_cache_wr = math.ceil(streamer.word_b *
      (dead_cycles_wr + streamer.min_timeout_wr*(streamer.max_parallel_streams-1))
      / streamer.mem_bus_b)
end


----------------------------------------------------------------------
--- Num parameters (fixed-point format)
--
num = {}
do
   num.size_b = 2
   num.size_ = 16
   num.frac_ = 8
   num.int_ = num.size_-num.frac_
   num.max = (2^(num.size_-1)-1) / 2^num.frac_
   num.min = -(2^(num.size_-1)) / 2^num.frac_
   num.one = 2^num.frac_
   num.res = 1 / 2^num.frac_
   num.precision = num.res
   num.mask = 0xFFFF
end


----------------------------------------------------------------------
--- System Banner
--
banner =
   '------------------------------------------------------------\r\n' ..
   '-- _ _ __ neuFlow [v.1.0] --\r\n' ..
   '-- ( | )/_/ --\r\n' ..
   '-- __( >O< ) This code runs on --\r\n' ..
   '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' ..
   '-- --\r\n' ..
   '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' ..
333 | '------------------------------------------------------------' 334 | 335 | 336 | ---------------------------------------------------------------------- 337 | --- BootLoader parameters 338 | -- 339 | bootloader = {} 340 | do 341 | bootloader.entry_point_b = oFlower.cache_size_b 342 | bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b 343 | bootloader.load_size = 32*MB 344 | end 345 | -------------------------------------------------------------------------------- /src/defines_xilinx_ml605.lua: -------------------------------------------------------------------------------- 1 | -- -*- lua -*- 2 | 3 | ---------------------------------------------------------------------- 4 | --- Useful abbrevs 5 | -- 6 | kB = 1024 7 | MB = 1024*1024 8 | GB = 1024*1024*1024 9 | kHz = 1000 10 | MHz = 1000*1000 11 | GHz = 1000*1000*1000 12 | 13 | ---------------------------------------------------------------------- 14 | --- Blast Bus parameters 15 | -- 16 | blast_bus = { 17 | -- Addressing : 18 | area_streamer = 1, 19 | area_tile = 2, 20 | area_memctrl = 3, 21 | area_dma = 4, 22 | -- 23 | addr_broadcast = 0, 24 | addr_conv_0 = 1, 25 | addr_conv_1 = 2, 26 | addr_comb_0 = 16, 27 | addr_mapp_0 = 24, 28 | addr_div_0 = 28, 29 | addr_grid_0 = 256, 30 | addr_mem_streamer_0 = 1, 31 | addr_mem_streamer_1 = 2, 32 | addr_mem_streamer_2 = 3, 33 | addr_mem_streamer_3 = 4, 34 | addr_mem_streamer_4 = 5, 35 | addr_mem_streamer_5 = 6, 36 | addr_mem_streamer_6 = 7, 37 | addr_mem_streamer_7 = 8, 38 | addr_dma = 0, 39 | addr_memctrl = 0, 40 | -- 41 | subAddr_router = 0, 42 | subAddr_operator = 1, 43 | subAddr_cacher = 2, 44 | subAddr_IO = 3, 45 | subAddr_none = 0, 46 | subAddr_memTimeouts = 0, 47 | subAddr_memGlobals = 1, 48 | subAddr_memLocals = 2, 49 | 50 | -- Content: 51 | content_nothing = 0, 52 | content_command = 1, 53 | content_instruc = 2, 54 | content_config = 3, 55 | content_valid = 1, 56 | 57 | -- Instructions 58 | instruc_config = 0, 59 | instruc_setAdd = 1, 60 | 
instruc_activate = 2, 61 | instruc_deActivate = 3, 62 | instruc_reset = 4, 63 | instruc_RESERVED_1 = 5, 64 | instruc_control_0 = 6, 65 | instruc_control_1 = 7, 66 | instruc_control_2 = 8, 67 | instruc_control_3 = 9, 68 | instruc_control_4 = 10, 69 | instruc_control_5 = 11, 70 | instruc_control_6 = 12, 71 | instruc_control_7 = 13, 72 | instruc_cacheStart = 14, 73 | instruc_cacheFinish = 15, 74 | 75 | -- Status 76 | status_notAddressed = 0, 77 | status_idle = 1, 78 | status_busy = 2, 79 | status_done = 3, 80 | status_primed = 4, 81 | status_unconfigured = 5, 82 | status_misconfigured = 6 83 | } 84 | 85 | 86 | ---------------------------------------------------------------------- 87 | --- OpenFlower Instruction Set. 88 | -- 89 | oFlower = { 90 | -- Opcodes 91 | op_writeConfig = 0, 92 | op_getStatus = 1, 93 | op_writeStream = 2, 94 | op_routeStream = 3, 95 | op_writeWord = 4, 96 | op_readWord = 5, 97 | op_setReg = 6, 98 | op_goto = 7, 99 | op_add = 8, 100 | op_control = 9, 101 | op_and = 10, 102 | op_or = 11, 103 | op_comp = 12, 104 | op_shr = 13, 105 | op_nop = 14, 106 | op_term = 15, 107 | 108 | -- Register map 109 | reg_operation = 0, 110 | reg_size = 1, 111 | reg_type = 2, 112 | reg_state = 3, 113 | reg_counter = 4, 114 | reg_loops = 5, 115 | reg_status = 6, 116 | reg_sys_A = 7, 117 | reg_sys_B = 8, 118 | reg_sys_C = 9, 119 | reg_A = 10, 120 | reg_B = 11, 121 | reg_C = 12, 122 | reg_D = 13, 123 | reg_E = 14, 124 | reg_F = 15, 125 | 126 | -- ctrl map 127 | ctrl_lock_config_bus = 0, 128 | 129 | -- I/O Map 130 | io_uart = 0, 131 | io_uart_status = 1, 132 | io_dma = 2, 133 | io_dma_status = 3, 134 | io_ethernet = 4, 135 | io_ethernet_status = 5, 136 | io_iic = 6, 137 | io_iic_status = 7, 138 | io_spi = 8, 139 | io_spi_status = 8, 140 | io_gpios = 10, 141 | io_timer = 11, 142 | io_timer_ctrl = 12, 143 | 144 | -- CPU types 145 | type_uint8 = 8, 146 | type_uint16 = 4, 147 | type_uint32 = 2, 148 | type_uint64 = 1, 149 | 150 | -- clock 151 | clock_freq = 100*MHz, 152 | 
uart_freq = 57600, 153 | 154 | -- nb of dmas (this includes instruction path) 155 | nb_dmas = 2 156 | } 157 | do 158 | -- Cache 159 | oFlower.cache_size_b = 64*kB 160 | oFlower.page_size_b = oFlower.cache_size_b/2 161 | oFlower.bus_ = 64 162 | oFlower.bus_b = oFlower.bus_/8 163 | end 164 | 165 | 166 | ---------------------------------------------------------------------- 167 | --- Grid parameters 168 | -- 169 | grid = {} 170 | do 171 | -- nb of grids 172 | grid.nb_grids = 1 173 | -- global IOs 174 | grid.nb_ios = 6 175 | -- conv 176 | grid.nb_convs = 4 177 | grid.kernel_width = 10 178 | grid.kernel_height = 10 179 | -- mapper 180 | grid.nb_mappers = 4 181 | grid.mapper_segs = 8 182 | -- generic ALUs 183 | grid.nb_alus = 4 184 | -- clock: 185 | grid.clock_freq = 200*MHz 186 | end 187 | 188 | 189 | ---------------------------------------------------------------------- 190 | --- General DMAs 191 | -- 192 | dma = {} 193 | do 194 | -- global DMA IOs 195 | dma.nb_ios = 0 196 | end 197 | 198 | 199 | ---------------------------------------------------------------------- 200 | --- Streamer parameters 201 | -- 202 | -- Units: 203 | -- _: bits 204 | -- _b: bytes 205 | -- _w: words (1 word = word_b bytes) 206 | -- _r: memory rows (1 row = size_b bytes) 207 | -- _i: integers (1 int = 4 bytes) 208 | -- 209 | streamer = {} 210 | do 211 | -- physical params 212 | streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios 213 | -- geometry 214 | streamer.mem_bus_ = 256 215 | streamer.mem_bus_b = 256 / 8 216 | streamer.stride_b = 2048 217 | streamer.word_b = 2 218 | streamer.align_b = streamer.mem_bus_ / 8 219 | streamer.stride_w = streamer.stride_b / streamer.word_b 220 | streamer.align_w = streamer.align_b / streamer.word_b 221 | -- clock 222 | streamer.clock_freq = 200*MHz 223 | end 224 | 225 | 226 | ---------------------------------------------------------------------- 227 | --- Memory parameters 228 | -- 229 | -- the parameters are expressed in different units: 230 | -- _: 
bits 231 | -- _b: bytes 232 | -- _w: words (1 word = word_b bytes) 233 | -- _r: memory rows (1 row = size_b bytes) 234 | -- _i: integers (1 int = 4 bytes) 235 | -- 236 | memory = {} 237 | do 238 | -- size: 239 | memory.size_b = 512*MB 240 | memory.size_w = memory.size_b / streamer.word_b 241 | memory.size_r = memory.size_b / streamer.stride_b 242 | -- clock: 243 | memory.clock_freq = 400*MHz 244 | -- bandwidth 245 | memory.bus_ = 32 246 | memory.is_ddr = true 247 | memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1) 248 | memory.bandwidth_b = memory.bandwidth_ / 8 249 | memory.bandwidth_w = memory.bandwidth_b / streamer.word_b 250 | 251 | memory.offset_text = 0 252 | end 253 | 254 | 255 | ---------------------------------------------------------------------- 256 | --- Extra Streamer parameters 257 | -- 258 | do 259 | -- parallel streams: this is application dependent 260 | streamer.max_parallel_rd_streams = grid.nb_convs + 1 261 | streamer.max_parallel_wr_streams = 1 262 | streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams 263 | -- bandwidth per stream: 264 | streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b 265 | streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams 266 | streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor 267 | -- bandwidth first check 268 | if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then 269 | print('ERROR internal bandwidth too high: ' 270 | .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s' 271 | .. ' > external bandwidth available: ' 272 | .. 
streamer.mem_bandwidth_b/1e9 ..'GB/s') 273 | os.exit() 274 | end 275 | -- continuous streaming per rd port: 276 | -- this is based on the observation that: 277 | -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b 278 | local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams 279 | local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams 280 | streamer.min_timeout_rd = math.ceil(dead_cycles_rd / 281 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 282 | - streamer.max_parallel_streams)) 283 | streamer.min_timeout_wr = math.ceil(dead_cycles_wr / 284 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 285 | - streamer.max_parallel_streams)) 286 | --print('# streamer min timeouts: wr='.. streamer.min_timeout_wr 287 | -- .. ' and rd=' .. streamer.min_timeout_rd) 288 | -- for these timeouts, we compute necessary buffers to ensure no one is starving 289 | streamer.min_cache_rd = (math.ceil(streamer.word_b * (dead_cycles_rd 290 | + streamer.min_timeout_rd 291 | *(streamer.max_parallel_streams-1)) 292 | / streamer.mem_bus_b)) 293 | streamer.min_cache_wr = (math.ceil(streamer.word_b * (dead_cycles_wr 294 | + streamer.min_timeout_wr 295 | *(streamer.max_parallel_streams-1)) 296 | / streamer.mem_bus_b)) 297 | --print('# streamer min cache sizes: wr='..streamer.min_cache_wr 298 | -- ..'
and rd='..streamer.min_cache_rd) 299 | end 300 | 301 | 302 | ---------------------------------------------------------------------- 303 | --- Num parameters 304 | -- 305 | num = {} 306 | do 307 | num.size_b = 2 308 | num.size_ = 16 309 | num.frac_ = 8 310 | num.int_ = num.size_-num.frac_ 311 | num.max = (2^(num.size_-1)-1) / 2^num.frac_ 312 | num.min = -(2^(num.size_-1)) / 2^num.frac_ 313 | num.one = 2^num.frac_ 314 | num.res = 1 / 2^num.frac_ 315 | num.precision = num.res 316 | num.mask = 0xFFFF 317 | end 318 | 319 | 320 | ---------------------------------------------------------------------- 321 | --- System Banner 322 | -- 323 | banner = 324 | '------------------------------------------------------------\r\n' .. 325 | '-- _ _ __ neuFlow [v.1.0] --\r\n' .. 326 | '-- ( | )/_/ --\r\n' .. 327 | '-- __( >O< ) This code runs on --\r\n' .. 328 | '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' .. 329 | '-- --\r\n' .. 330 | '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' .. 331 | '------------------------------------------------------------' 332 | 333 | 334 | ---------------------------------------------------------------------- 335 | --- BootLoader parameters 336 | -- 337 | bootloader = {} 338 | do 339 | bootloader.entry_point_b = oFlower.cache_size_b 340 | bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b 341 | bootloader.load_size = 32*MB 342 | end 343 | -------------------------------------------------------------------------------- /src/defines_xilinx_ml605_tbsp.lua: -------------------------------------------------------------------------------- 1 | -- -*- lua -*- 2 | 3 | ---------------------------------------------------------------------- 4 | --- Useful abbrevs 5 | -- 6 | kB = 1024 7 | MB = 1024*1024 8 | GB = 1024*1024*1024 9 | kHz = 1000 10 | MHz = 1000*1000 11 | GHz = 1000*1000*1000 12 | 13 | ---------------------------------------------------------------------- 14 | --- Blast Bus parameters 15 | -- 16 | blast_bus = { 17 | -- 
Addressing : 18 | area_streamer = 1, 19 | area_tile = 2, 20 | area_memctrl = 3, 21 | area_dma = 4, 22 | -- 23 | addr_broadcast = 0, 24 | addr_conv_0 = 1, 25 | addr_conv_1 = 2, 26 | addr_comb_0 = 16, 27 | addr_mapp_0 = 24, 28 | addr_div_0 = 28, 29 | addr_grid_0 = 256, 30 | addr_mem_streamer_0 = 1, 31 | addr_mem_streamer_1 = 2, 32 | addr_mem_streamer_2 = 3, 33 | addr_mem_streamer_3 = 4, 34 | addr_mem_streamer_4 = 5, 35 | addr_mem_streamer_5 = 6, 36 | addr_mem_streamer_6 = 7, 37 | addr_mem_streamer_7 = 8, 38 | addr_dma = 0, 39 | addr_memctrl = 0, 40 | -- 41 | subAddr_router = 0, 42 | subAddr_operator = 1, 43 | subAddr_cacher = 2, 44 | subAddr_IO = 3, 45 | subAddr_none = 0, 46 | subAddr_memTimeouts = 0, 47 | subAddr_memGlobals = 1, 48 | subAddr_memLocals = 2, 49 | 50 | -- Content: 51 | content_nothing = 0, 52 | content_command = 1, 53 | content_instruc = 2, 54 | content_config = 3, 55 | content_valid = 1, 56 | 57 | -- Instructions 58 | instruc_config = 0, 59 | instruc_setAdd = 1, 60 | instruc_activate = 2, 61 | instruc_deActivate = 3, 62 | instruc_reset = 4, 63 | instruc_RESERVED_1 = 5, 64 | instruc_control_0 = 6, 65 | instruc_control_1 = 7, 66 | instruc_control_2 = 8, 67 | instruc_control_3 = 9, 68 | instruc_control_4 = 10, 69 | instruc_control_5 = 11, 70 | instruc_control_6 = 12, 71 | instruc_control_7 = 13, 72 | instruc_cacheStart = 14, 73 | instruc_cacheFinish = 15, 74 | 75 | -- Status 76 | status_notAddressed = 0, 77 | status_idle = 1, 78 | status_busy = 2, 79 | status_done = 3, 80 | status_primed = 4, 81 | status_unconfigured = 5, 82 | status_misconfigured = 6 83 | } 84 | 85 | 86 | ---------------------------------------------------------------------- 87 | --- OpenFlower Instruction Set. 
88 | -- 89 | oFlower = { 90 | -- Opcodes 91 | op_writeConfig = 0, 92 | op_getStatus = 1, 93 | op_writeStream = 2, 94 | op_routeStream = 3, 95 | op_writeWord = 4, 96 | op_readWord = 5, 97 | op_setReg = 6, 98 | op_goto = 7, 99 | op_add = 8, 100 | op_control = 9, 101 | op_and = 10, 102 | op_or = 11, 103 | op_comp = 12, 104 | op_shr = 13, 105 | op_nop = 14, 106 | op_term = 15, 107 | 108 | -- Register map 109 | reg_operation = 0, 110 | reg_size = 1, 111 | reg_type = 2, 112 | reg_state = 3, 113 | reg_counter = 4, 114 | reg_loops = 5, 115 | reg_status = 6, 116 | reg_sys_A = 7, 117 | reg_sys_B = 8, 118 | reg_sys_C = 9, 119 | reg_A = 10, 120 | reg_B = 11, 121 | reg_C = 12, 122 | reg_D = 13, 123 | reg_E = 14, 124 | reg_F = 15, 125 | 126 | -- ctrl map 127 | ctrl_lock_config_bus = 0, 128 | 129 | -- I/O Map 130 | io_uart = 0, 131 | io_uart_status = 1, 132 | io_dma = 2, 133 | io_dma_status = 3, 134 | io_ethernet = 4, 135 | io_ethernet_status = 5, 136 | io_iic = 6, 137 | io_iic_status = 7, 138 | io_spi = 8,
-- NOTE(review): io_spi_status below maps to the same address as io_spi (both 8).
-- Every other device in this I/O map places its status register at addr+1
-- (uart 0/1, dma 2/3, ethernet 4/5, iic 6/7), and address 9 is unused before
-- io_gpios = 10, so this looks like a copy-paste error for 9 — confirm against
-- the openFlower HDL register map before changing.
139 | io_spi_status = 8, 140 | io_gpios = 10, 141 | io_timer = 11, 142 | io_timer_ctrl = 12, 143 | 144 | -- CPU types 145 | type_uint8 = 8, 146 | type_uint16 = 4, 147 | type_uint32 = 2, 148 | type_uint64 = 1, 149 | 150 | -- clock 151 | clock_freq = 100*MHz, 152 | uart_freq = 57600, 153 | 154 | -- nb of dmas (this includes instruction path) 155 | nb_dmas = 2 156 | } 157 | do 158 | -- Cache 159 | oFlower.cache_size_b = 64*kB 160 | oFlower.page_size_b = oFlower.cache_size_b/2 161 | oFlower.bus_ = 64 162 | oFlower.bus_b = oFlower.bus_/8 163 | end 164 | 165 | 166 | ---------------------------------------------------------------------- 167 | --- Grid parameters 168 | -- 169 | grid = {} 170 | do 171 | -- nb of grids 172 | grid.nb_grids = 1 173 | -- global IOs 174 | grid.nb_ios = 6 175 | -- conv 176 | grid.nb_convs = 4 177 | grid.kernel_width = 10 178 | grid.kernel_height = 10 179 | -- mapper 180 | grid.nb_mappers = 4 181 | grid.mapper_segs = 8 182 | -- generic ALUs 183 | grid.nb_alus
= 4 184 | -- clock: 185 | grid.clock_freq = 200*MHz 186 | end 187 | 188 | 189 | ---------------------------------------------------------------------- 190 | --- General DMAs 191 | -- 192 | dma = {} 193 | do 194 | -- global DMA IOs 195 | dma.nb_ios = 2 196 | dma.ethernet_write_port_id = 2 197 | dma.ethernet_read_port_id = 3 198 | end 199 | 200 | 201 | ---------------------------------------------------------------------- 202 | --- Streamer parameters 203 | -- 204 | -- Units: 205 | -- _: bits 206 | -- _b: bytes 207 | -- _w: words (1 word = word_b bytes) 208 | -- _r: memory rows (1 row = size_b bytes) 209 | -- _i: integers (1 int = 4 bytes) 210 | -- 211 | streamer = {} 212 | do 213 | -- physical params 214 | streamer.nb_ports = oFlower.nb_dmas + dma.nb_ios + grid.nb_ios 215 | -- geometry 216 | streamer.mem_bus_ = 256 217 | streamer.mem_bus_b = 256 / 8 218 | streamer.stride_b = 2048 219 | streamer.word_b = 2 220 | streamer.align_b = streamer.mem_bus_ / 8 221 | streamer.stride_w = streamer.stride_b / streamer.word_b 222 | streamer.align_w = streamer.align_b / streamer.word_b 223 | -- clock 224 | streamer.clock_freq = 200*MHz 225 | end 226 | 227 | 228 | ---------------------------------------------------------------------- 229 | --- Memory parameters 230 | -- 231 | -- the parameters are expressed in different units: 232 | -- _: bits 233 | -- _b: bytes 234 | -- _w: words (1 word = word_b bytes) 235 | -- _r: memory rows (1 row = size_b bytes) 236 | -- _i: integers (1 int = 4 bytes) 237 | -- 238 | memory = {} 239 | do 240 | -- size: 241 | memory.size_b = 512*MB 242 | memory.size_w = memory.size_b / streamer.word_b 243 | memory.size_r = memory.size_b / streamer.stride_b 244 | -- clock: 245 | memory.clock_freq = 400*MHz 246 | -- bandwidth 247 | memory.bus_ = 32 248 | memory.is_ddr = true 249 | memory.bandwidth_ = memory.bus_*memory.clock_freq*((memory.is_ddr and 2) or 1) 250 | memory.bandwidth_b = memory.bandwidth_ / 8 251 | memory.bandwidth_w = memory.bandwidth_b / 
streamer.word_b 252 | 253 | memory.offset_text = 0 254 | end 255 | 256 | 257 | ---------------------------------------------------------------------- 258 | --- Extra Streamer parameters 259 | -- 260 | do 261 | -- parallel streams: this is application dependent 262 | streamer.max_parallel_rd_streams = grid.nb_convs + 1 263 | streamer.max_parallel_wr_streams = 1 264 | streamer.max_parallel_streams = streamer.max_parallel_wr_streams+streamer.max_parallel_rd_streams 265 | -- bandwidth per stream: 266 | streamer.stream_bandwidth_b = grid.clock_freq * streamer.word_b 267 | streamer.grid_max_bandwidth_b = streamer.stream_bandwidth_b * streamer.max_parallel_streams 268 | streamer.mem_bandwidth_b = memory.bandwidth_b * 0.85 -- 0.85 is an empirical throughput factor 269 | -- bandwidth first check 270 | if streamer.mem_bandwidth_b < streamer.grid_max_bandwidth_b then 271 | print('ERROR internal bandwidth too high: ' 272 | .. streamer.grid_max_bandwidth_b/1e9 ..'GB/s' 273 | .. ' > external bandwidth available: ' 274 | .. streamer.mem_bandwidth_b/1e9 ..'GB/s') 275 | os.exit() 276 | end 277 | -- continuous streaming per rd port: 278 | -- this is based on the observation that: 279 | -- (timeout/(dead_cycles + timeout*max_parallel_ports))*mem_bandwidth_b > stream_bandwidth_b 280 | local dead_cycles_rd = streamer.nb_ports - streamer.max_parallel_rd_streams 281 | local dead_cycles_wr = streamer.nb_ports - streamer.max_parallel_wr_streams 282 | streamer.min_timeout_rd = math.ceil(dead_cycles_rd / 283 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 284 | - streamer.max_parallel_streams)) 285 | streamer.min_timeout_wr = math.ceil(dead_cycles_wr / 286 | ((streamer.mem_bandwidth_b/streamer.stream_bandwidth_b) 287 | - streamer.max_parallel_streams)) 288 | --print('# streamer min timeouts: wr='.. streamer.min_timeout_wr 289 | -- .. ' and rd=' ..
streamer.min_timeout_rd) 290 | -- for these timeouts, we compute necessary buffers to insure no one is starving 291 | streamer.min_cache_rd = (math.ceil(streamer.word_b * (dead_cycles_rd 292 | + streamer.min_timeout_rd 293 | *(streamer.max_parallel_streams-1)) 294 | / streamer.mem_bus_b)) 295 | streamer.min_cache_wr = (math.ceil(streamer.word_b * (dead_cycles_wr 296 | + streamer.min_timeout_wr 297 | *(streamer.max_parallel_streams-1)) 298 | / streamer.mem_bus_b)) 299 | --print('# streamer min cache sizes: wr='..streamer.min_cache_wr 300 | -- ..' and rd='..streamer.min_cache_rd) 301 | end 302 | 303 | 304 | ---------------------------------------------------------------------- 305 | --- Num parameters 306 | -- 307 | num = {} 308 | do 309 | num.size_b = 2 310 | num.size_ = 16 311 | num.frac_ = 8 312 | num.int_ = num.size_-num.frac_ 313 | num.max = (2^(num.size_-1)-1) / 2^num.frac_ 314 | num.min = -(2^(num.size_-1)) / 2^num.frac_ 315 | num.one = 2^num.frac_ 316 | num.res = 1 / 2^num.frac_ 317 | num.precision = num.res 318 | num.mask = 0xFFFF 319 | end 320 | 321 | 322 | ---------------------------------------------------------------------- 323 | --- System Banner 324 | -- 325 | banner = 326 | '------------------------------------------------------------\r\n' .. 327 | '-- _ _ __ neuFlow [v.1.0] --\r\n' .. 328 | '-- ( | )/_/ --\r\n' .. 329 | '-- __( >O< ) This code runs on --\r\n' .. 330 | '-- \\_\\(_|_) the custom openFlow CPU. --\r\n' .. 331 | '-- --\r\n' .. 332 | '-- Copyright (C) 2009/10 | Farabet/Akselrod/Martini --\r\n' .. 
333 | '------------------------------------------------------------' 334 | 335 | 336 | ---------------------------------------------------------------------- 337 | --- BootLoader parameters 338 | -- 339 | bootloader = {} 340 | do 341 | bootloader.entry_point_b = oFlower.cache_size_b 342 | bootloader.entry_point = bootloader.entry_point_b / oFlower.bus_b 343 | bootloader.load_size = 32*MB 344 | end 345 | -------------------------------------------------------------------------------- /src/init.lua: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------------- 2 | -- 3 | -- Copyright (c) 2010,2011 Clement Farabet, Polina Akselrod, Berin Martini 4 | -- 5 | -- Permission is hereby granted, free of charge, to any person obtaining 6 | -- a copy of this software and associated documentation files (the 7 | -- "Software"), to deal in the Software without restriction, including 8 | -- without limitation the rights to use, copy, modify, merge, publish, 9 | -- distribute, sublicense, and/or sell copies of the Software, and to 10 | -- permit persons to whom the Software is furnished to do so, subject to 11 | -- the following conditions: 12 | -- 13 | -- The above copyright notice and this permission notice shall be 14 | -- included in all copies or substantial portions of the Software. 15 | -- 16 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | -- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | -- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | -- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -- 24 | ---------------------------------------------------------------------- 25 | -- description: 26 | -- neuflow - a compiler toolkit + communication for neuFlow. 27 | -- 28 | -- history: 29 | -- July 16, 2011, 1:51PM - import from Torch5 - Clement Farabet 30 | ---------------------------------------------------------------------- 31 | 32 | -- dependencies 33 | require 'xlua' 34 | require 'os' 35 | require 'torch' 36 | require 'nnx' 37 | require 'bit' 38 | 39 | -- main table 40 | neuflow = {} 41 | 42 | -- load all submodules 43 | torch.include('neuflow', 'defines.lua') 44 | torch.include('neuflow', 'tools.lua') 45 | torch.include('neuflow', 'rom.lua') 46 | torch.include('neuflow', 'Profiler.lua') 47 | torch.include('neuflow', 'Log.lua') 48 | torch.include('neuflow', 'Memory.lua') 49 | torch.include('neuflow', 'Compiler.lua') 50 | torch.include('neuflow', 'Interface.lua') 51 | torch.include('neuflow', 'DmaInterface.lua') 52 | torch.include('neuflow', 'Camera.lua') 53 | torch.include('neuflow', 'Core.lua') 54 | torch.include('neuflow', 'CoreUser.lua') 55 | torch.include('neuflow', 'Linker.lua') 56 | torch.include('neuflow', 'LinkerExtensions.lua') 57 | torch.include('neuflow', 'Serial.lua') 58 | torch.include('neuflow', 'NeuFlow.lua') 59 | 60 | -- shortcut for user interface: 61 | neuflow.init = neuflow.NeuFlow 62 | 63 | -- create a path in home dir to store things 64 | -- like coefficients for example 65 | neuflow.coefpath = os.getenv('HOME')..'/.neuflow/coefs' 66 | os.execute('mkdir -p ' .. neuflow.coefpath) 67 | os.execute('chmod a+rw ' .. neuflow.coefpath) 68 | 69 | -- migrate all the coefficients 70 | os.execute('cp ' .. sys.concat(sys.fpath(), 'coef_*') .. ' ' .. neuflow.coefpath) 71 | os.execute('chmod a+rw ' .. neuflow.coefpath .. 
'/*') 72 | 73 | -- return table 74 | return neuflow 75 | -------------------------------------------------------------------------------- /src/rom.lua: -------------------------------------------------------------------------------- 1 | 2 | neuflow.tools.romTemplate = [[ 3 | /*************************************************************************************************** 4 | * Module: #ROM_NAME 5 | * 6 | * Description: Sync ROM, with registered output. 7 | * This is a template: macros of that kind #*** need to be replaced... 8 | * 9 | * TODO: rst is commented out for now, because not tolerated by XST... 10 | * 11 | * Created: December 13, 2009, 12:11PM 12 | * 13 | * Author: Clement Farabet 14 | **************************************************************************************************/ 15 | `ifndef _#ROM_NAME_ `define _#ROM_NAME_ 16 | 17 | module #ROM_NAME 18 | #(parameter 19 | CPU_ADDR_WIDTH = 32, 20 | ADDR_WIDTH = #ADDR_WIDTH, 21 | DATA_WIDTH = #DATA_WIDTH) 22 | (input wire clk, 23 | input wire rst, 24 | input wire [CPU_ADDR_WIDTH-1:0] address, 25 | output reg [DATA_WIDTH-1:0] data, 26 | input wire en ); 27 | 28 | 29 | /************************************************************************************** 30 | * Internal address 31 | **************************************************************************************/ 32 | wire [ADDR_WIDTH-1:0] addr; 33 | assign addr = address[ADDR_WIDTH-1:0]; 34 | 35 | 36 | /************************************************************************************** 37 | * ROM Storage... a simple case statement. 38 | **************************************************************************************/ 39 | always @ (posedge clk) begin : ROM_STORAGE_ 40 | if (en) begin 41 | case (addr) 42 | #STORAGE 43 | default: data <= #OUTPUT_ON_RESET; 44 | endcase 45 | end 46 | end 47 | 48 | endmodule 49 | 50 | `endif // `ifndef _#ROM_NAME_ 51 | ]] 52 | --------------------------------------------------------------------------------