├── src ├── main │ ├── resources │ │ └── verilog │ │ │ ├── altera_mf.v │ │ │ ├── DualPortBRAM.v │ │ │ ├── macc.v │ │ │ ├── VTASimDPI.v │ │ │ ├── UltraRAM.v │ │ │ └── VTAHostDPI.v │ └── scala │ │ ├── utility │ │ ├── Printf.scala │ │ └── Constants.scala │ │ ├── tensorKernels │ │ ├── macc.scala │ │ ├── UltraRAM.scala │ │ ├── MinArbiter.scala │ │ ├── MergeAdd.scala │ │ ├── WeightedArbiter.scala │ │ ├── Adder.scala │ │ ├── MergeSort.scala │ │ └── VirtualChannel.scala │ │ ├── config │ │ ├── cde │ │ │ ├── Dump.scala │ │ │ └── Config.scala │ │ ├── util.scala │ │ ├── TestConfigs.scala │ │ ├── FPConfigs.scala │ │ └── configurations.scala │ │ ├── verilogmain │ │ └── TypeMemDataflow.scala │ │ ├── dataflow │ │ ├── fuse │ │ │ ├── computeFuse01.scala │ │ │ └── computeFuse04.scala │ │ ├── SharedFPDiv.scala │ │ ├── filter │ │ │ ├── CacheLoader.scala │ │ │ ├── CacheVecLoader.scala │ │ │ ├── BasicLoader.scala │ │ │ ├── VecFilter.scala │ │ │ └── BasicFilter.scala │ │ ├── AllocaTest.scala │ │ ├── tests │ │ │ └── Add01.scala │ │ ├── SharedFPDiv │ │ ├── DataFlow.scala │ │ └── AllocaDF.scala │ │ ├── accel │ │ ├── Config.scala │ │ ├── Accelerator.scala │ │ └── coredf │ │ │ ├── FilterDFCore.scala │ │ │ ├── VecFilterDFCore.scala │ │ │ └── TestCacheDF.scala │ │ ├── memory │ │ ├── MemDRAM.scala │ │ └── MemArbiter.scala │ │ ├── stack │ │ ├── StackMem.scala │ │ └── StackAlloca.scala │ │ ├── muxes │ │ └── TestMux.scala │ │ ├── vta │ │ ├── dpi │ │ │ └── VTASimDPI.scala │ │ └── util │ │ │ └── GenericParameterizedBundle.scala │ │ ├── dnn │ │ ├── modules │ │ │ └── NCycle_CooSCAL.scala │ │ ├── MacNode.scala │ │ ├── memory │ │ │ ├── WriteTensorController.scala │ │ │ ├── ReadTensorController.scala │ │ │ ├── inStreamDMA.scala │ │ │ ├── inDMA_wgt.scala │ │ │ └── inDMA_act_HWC.scala │ │ ├── types │ │ │ ├── FP_GEMV.scala │ │ │ └── GEMM.scala │ │ ├── DGEMVNode.scala │ │ └── CooSCALNode.scala │ │ ├── loop │ │ ├── LoopEnd.scala │ │ ├── LoopStart.scala │ │ ├── LoopElement.scala │ │ ├── Example1.scala │ │ └── 
LoopHeader.scala │ │ ├── regfile │ │ ├── RegFile.scala │ │ └── InputRegFile.scala │ │ ├── dnnnode │ │ ├── CooShapeTransformer.scala │ │ ├── DiffShapeTransformer.scala │ │ └── WeightShapeTransformer.scala │ │ ├── node │ │ ├── CallNode.scala │ │ └── Comparision.scala │ │ ├── shell │ │ ├── VTAShell.scala │ │ └── IntelShell.scala │ │ ├── arbiters │ │ └── ArbiterTree.scala │ │ ├── junctions │ │ └── CombineDecoupled.scala │ │ └── dnn_layers │ │ └── ConvLayer.scala └── test │ └── scala │ ├── dnn │ ├── FXwidth.scala │ ├── GEMM.scala │ ├── FX_Reduction.scala │ ├── FX_SCAL.scala │ ├── FX_Dot.scala │ ├── GEMV.scala │ ├── FX_GEMV.scala │ ├── FX_Systolic.scala │ ├── FP_Systolic.scala │ ├── TLoad.scala │ ├── TStore.scala │ ├── Reduce.scala │ ├── Dot.scala │ └── SCAL.scala │ ├── FPU │ ├── SharedFPDiv.scala │ ├── FPResizeNode.scala │ ├── FPDiv.scala │ ├── FPCompareNode.scala │ └── FPComputeNode.scala │ └── dnn_layers │ └── Mac.scala └── include ├── dmlc ├── build_config_default.h ├── omp.h ├── timer.h ├── endian.h ├── thread_local.h └── common.h ├── meta_data.h ├── runtime ├── util.h └── serializer.h ├── runtime_base.h └── vta ├── module.h └── tsim.h /src/main/resources/verilog/altera_mf.v: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-arch/SpGEMM/HEAD/src/main/resources/verilog/altera_mf.v -------------------------------------------------------------------------------- /src/main/scala/utility/Printf.scala: -------------------------------------------------------------------------------- 1 | package utility 2 | 3 | import chisel3._ 4 | 5 | trait UniformPrintfs { 6 | val printfSigil = "" 7 | 8 | def pp(prefix: String, message: String, args: Bits*): Unit = { 9 | printf(prefix + message, args:_*) } 10 | 11 | def printfInfo (m: String, a: Bits*) { pp("\n[INFO] ", printfSigil++m, a:_*) } 12 | def printfWarn (m: String, a: Bits*) { pp("\n[WARN] ", printfSigil++m, a:_*) } 13 | def printfError(m: String, a: Bits*) { 
pp("\n[ERROR] ", printfSigil++m, a:_*) } 14 | def printfDebug(m: String, a: Bits*) { pp("\n[DEBUG] ", printfSigil++m, a:_*) } 15 | def printfTodo (m: String, a: Bits*) { pp("\n[TODO] ", printfSigil++m, a:_*) } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/tensorKernels/macc.scala: -------------------------------------------------------------------------------- 1 | package tensorKernels 2 | 3 | import Chisel._ 4 | import chisel3.util.HasBlackBoxResource 5 | import chisel3.{BlackBox, Reset} 6 | 7 | class macc(SIZEIN: Int = 16, SIZEOUT: Int = 40) extends BlackBox(Map("SIZEIN" -> SIZEIN, "SIZEOUT" -> SIZEOUT)) with HasBlackBoxResource { 8 | val io = IO(new Bundle { 9 | val clk = Input(Clock()) 10 | val ce = Input(Bool ()) 11 | val sload = Input(Bool ()) 12 | val a = Input(UInt(SIZEIN.W)) 13 | val b = Input(UInt(SIZEIN.W)) 14 | val accum_out = Input(UInt(SIZEOUT.W)) 15 | }) 16 | 17 | // setResource("/home/reza/git/tensorstrainers/src/main/resources/verilog/macc.v") 18 | setResource("/verilog/macc.v") 19 | } 20 | 21 | 22 | -------------------------------------------------------------------------------- /src/test/scala/dnn/FXwidth.scala: -------------------------------------------------------------------------------- 1 | package dnn 2 | 3 | 4 | import chisel3._ 5 | import Chisel.iotesters.{ChiselFlatSpec, SteppedHWIOTester} 6 | import chisel3.iotesters._ 7 | import config._ 8 | import node._ 9 | import org.scalatest.{FlatSpec, Matchers} 10 | 11 | 12 | class WidthTests(df: changeWidth[FXmatNxN])(implicit p: config.Parameters) extends PeekPokeTester(df) { 13 | poke(df.io.in, 0x323232L) 14 | print(peek(df.io.out).toString(16)) 15 | } 16 | 17 | class Width_Tester extends FlatSpec with Matchers { 18 | implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance) 19 | it should "Typ Compute Tester" in { 20 | chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", 
"test_run_dir"), 21 | () => new changeWidth(new FXmatNxN(2, 4), "double")) { 22 | c => new WidthTests(c) 23 | } should be(true) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/config/cde/Dump.scala: -------------------------------------------------------------------------------- 1 | package config.cde 2 | 3 | import scala.collection.mutable 4 | 5 | object Dump { 6 | def apply[T](key:Any,value:T):T = ParameterDump.apply(key, value) 7 | def apply[T](knob:Knob[T]):Knob[T] = ParameterDump.apply(knob) 8 | def apply[T](key_base:String,values:Seq[T]):Seq[T] = { 9 | values.zipWithIndex.foreach{ case(value, i) => Dump(key_base + "__" + i, value) } 10 | Dump(key_base + "__COUNT", values.size) 11 | values 12 | } 13 | } 14 | 15 | object ParameterDump { 16 | val dump = mutable.Set[Tuple2[Any,Any]]() 17 | val knobList = mutable.ListBuffer[Any]() 18 | def apply[T](key:Any,value:T):T = {addToDump(key,value); value} 19 | def apply[T](knob:Knob[T]):Knob[T] = {knobList += knob.name; knob} 20 | def addToDump(key:Any,value:Any) = dump += ((key,value)) 21 | def getDump:String = if (!dump.isEmpty) dump.map(_.toString).reduce(_+"\n"+_) + "\n" else "" 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/tensorKernels/UltraRAM.scala: -------------------------------------------------------------------------------- 1 | package tensorKernels 2 | 3 | import Chisel._ 4 | import chisel3.{BlackBox, Reset, Clock} 5 | import chisel3.util.HasBlackBoxResource 6 | 7 | class UltraRAM(DWIDTH: Int = 72, AWIDTH: Int = 12, NBPIPE: Int = 1) extends BlackBox(Map("DWIDTH" -> DWIDTH, "AWIDTH" -> AWIDTH, "NBPIPE" -> NBPIPE)) with HasBlackBoxResource { 8 | val io = IO(new Bundle { 9 | val clk = Input(Clock()) 10 | val rst = Input(Reset()) 11 | val we = Input(Bool ()) 12 | val regce = Input(Bool ()) 13 | val mem_en = Input(Bool ()) 14 | val din = Input(UInt(DWIDTH.W)) 15 | val raddr = 
Input(UInt(AWIDTH.W)) 16 | val waddr = Input(UInt(AWIDTH.W)) 17 | val dout = Output(UInt(DWIDTH.W)) 18 | }) 19 | setResource("/verilog/UltraRAM.v") 20 | require(DWIDTH <= 72, "In URAM, data width should be equal or less than 72bits") 21 | require(AWIDTH <= 12, "In URAM, address width should be equal or less than 12bits") 22 | 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/main/scala/verilogmain/TypeMemDataflow.scala: -------------------------------------------------------------------------------- 1 | package verilogmain 2 | 3 | // liveIn_R(i).predicate := io.latchEnable.bits.control 4 | //liveIn_R(i).predicate := io.latchEnable.bits.control 5 | import java.io.{File, FileWriter} 6 | 7 | import node._ 8 | import config._ 9 | import interfaces._ 10 | import arbiters._ 11 | import memory._ 12 | import dataflow._ 13 | import config._ 14 | import util._ 15 | import interfaces._ 16 | 17 | 18 | object Main extends App { 19 | val dir = new File(args(0)) ; dir.mkdirs 20 | implicit val p = config.Parameters.root((new MiniConfig).toInstance) 21 | val chirrtl = firrtl.Parser.parse(chisel3.Driver.emit(() => new TypeMemDataFlow())) 22 | 23 | val verilog = new FileWriter(new File(dir, s"${chirrtl.main}.v")) 24 | val compileResult = (new firrtl.VerilogCompiler).compileAndEmit(firrtl.CircuitState(chirrtl, firrtl.ChirrtlForm)) 25 | val compiledStuff = compileResult.getEmittedCircuit 26 | verilog.write(compiledStuff.value) 27 | verilog.close 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/dataflow/fuse/computeFuse01.scala: -------------------------------------------------------------------------------- 1 | package dataflow.fuse 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | import node._ 7 | import config._ 8 | import interfaces._ 9 | import arbiters._ 10 | import memory._ 11 | 12 | class ComputeFuse01DF(implicit val p: Parameters) extends Module with 
CoreParams { 13 | 14 | val io = IO(new Bundle { 15 | val data0 = Flipped(Decoupled(new DataBundle())) 16 | val data1 = Flipped(Decoupled(new DataBundle())) 17 | val data2 = Flipped(Decoupled(new DataBundle())) 18 | val data3 = Flipped(Decoupled(new DataBundle())) 19 | val enable = Flipped(Decoupled(new ControlBundle)) 20 | 21 | val dataOut = Decoupled(new DataBundle()) 22 | 23 | }) 24 | 25 | val m0 = Module(new Chain(NumOps = 3, ID = 0, OpCodes = Array("And","Xor","Add"))(sign = false)(p)) 26 | 27 | m0.io.In(0) <> io.data0 28 | m0.io.In(1) <> io.data1 29 | m0.io.In(2) <> io.data2 30 | m0.io.In(3) <> io.data3 31 | 32 | m0.io.enable <> io.enable 33 | 34 | io.dataOut <> m0.io.Out(2) 35 | 36 | for(i <- 0 until 4) 37 | m0.io.Out(i).ready := io.dataOut.ready 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/main/scala/accel/Config.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE for license details. 2 | 3 | package accel 4 | 5 | import chisel3.Module 6 | import config._ 7 | import junctions._ 8 | import regfile.RFile 9 | import shell._ 10 | 11 | 12 | //class AcceleratorConfig extends MiniConfig() 13 | 14 | class AccelConfig extends MiniConfig( ) { 15 | 16 | } 17 | 18 | class VecFilterDFConfig extends Config((site, here, up) => { 19 | // Core 20 | case XLEN => 32 21 | case TLEN => 32 22 | case GLEN => 16 23 | // # Max bits of cache request tag. 
24 | case MSHRLEN => 8 25 | case TYPSZ => 96 26 | case VERBOSITY => "low" 27 | case COMPONENTS => "TYPLOAD;TYPOP" 28 | // Max size of type memory system may see 29 | case TRACE => true 30 | case BuildRFile => (p: Parameters) => Module(new RFile(32)(p)) 31 | 32 | //------------------------- 33 | // Cache 34 | case NWays => 1 // TODO: set-associative 35 | case NSets => 256 36 | case CacheBlockBytes => 4 * (here(XLEN) >> 3) // 4 x 32 bits = 16B 37 | // NastiIO 38 | case NastiKey => new NastiParameters( 39 | idBits = 12, 40 | dataBits = 32, 41 | addrBits = 32) 42 | } 43 | ) 44 | 45 | -------------------------------------------------------------------------------- /src/main/scala/memory/MemDRAM.scala: -------------------------------------------------------------------------------- 1 | package memory 2 | 3 | 4 | import chisel3._ 5 | import chisel3.Module 6 | import chisel3.util._ 7 | import chisel3.experimental._ 8 | 9 | 10 | // Config 11 | import config._ 12 | import utility._ 13 | import interfaces._ 14 | 15 | 16 | class SinglePortDRAM(DATA: Int = 32, ADDR: Int = 32) extends BlackBox with HasBlackBoxResource { 17 | val io = IO(new Bundle { 18 | val clk = Input(Clock()) 19 | val wr = Input(Bool()) 20 | val addr = Input(UInt(ADDR.W)) 21 | val din = Input(UInt(DATA.W)) 22 | val dout = Output(UInt(DATA.W)) 23 | }) 24 | 25 | setResource("/verilog/SinglePortDRAM.v") 26 | 27 | } 28 | 29 | class FastMem(DATA: Int = 32, ADDR: Int = 32) extends Module { 30 | val io = IO(new Bundle { 31 | val wr = Input(Bool()) 32 | val addr = Input(UInt(ADDR.W)) 33 | val din = Input(UInt(DATA.W)) 34 | val dout = Output(UInt(DATA.W)) 35 | }) 36 | 37 | val memory = Module(new SinglePortDRAM()) 38 | 39 | memory.io.clk := clock 40 | memory.io.wr <> io.wr 41 | memory.io.addr <> io.addr 42 | memory.io.din <> io.din 43 | io.dout <> memory.io.dout 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/resources/verilog/DualPortBRAM.v: 
-------------------------------------------------------------------------------- 1 | // the dual-port BRAM Verilog below is adapted from Dan Strother's example: 2 | // http://danstrother.com/2010/09/11/inferring-rams-in-fpgas/ 3 | 4 | module DualPortBRAM #( 5 | parameter DATA = 72, 6 | parameter ADDR = 10 7 | ) ( 8 | input wire clk, 9 | 10 | // Port A 11 | input wire a_wr, 12 | input wire [ADDR-1:0] a_addr, 13 | input wire [DATA-1:0] a_din, 14 | output reg [DATA-1:0] a_dout, 15 | 16 | // Port B 17 | input wire b_wr, 18 | input wire [ADDR-1:0] b_addr, 19 | input wire [DATA-1:0] b_din, 20 | output reg [DATA-1:0] b_dout 21 | ); 22 | 23 | // Shared memory 24 | reg [DATA-1:0] mem [(2**ADDR)-1:0]; 25 | 26 | // Port A 27 | always @(posedge clk) begin 28 | a_dout <= mem[a_addr]; 29 | if(a_wr) begin 30 | a_dout <= a_din; 31 | mem[a_addr] <= a_din; 32 | end 33 | end 34 | 35 | // Port B 36 | always @(posedge clk) begin 37 | b_dout <= mem[b_addr]; 38 | if(b_wr) begin 39 | b_dout <= b_din; 40 | mem[b_addr] <= b_din; 41 | end 42 | end 43 | 44 | endmodule 45 | -------------------------------------------------------------------------------- /src/test/scala/dnn/GEMM.scala: -------------------------------------------------------------------------------- 1 | package dnn 2 | 3 | import chisel3._ 4 | import chisel3.iotesters.PeekPokeTester 5 | import config._ 6 | import node._ 7 | import org.scalatest.{FlatSpec, Matchers} 8 | 9 | 10 | // Tester. 
11 | class GEMMCompTests(df: GEMMNode[matNxN]) 12 | (implicit p: config.Parameters) extends PeekPokeTester(df) { 13 | poke(df.io.enable.valid, true) 14 | poke(df.io.enable.bits.control, true) 15 | 16 | poke(df.io.LeftIO.bits.data, 0x01020304L) 17 | poke(df.io.LeftIO.valid, true) 18 | poke(df.io.LeftIO.bits.predicate, true) 19 | 20 | 21 | poke(df.io.RightIO.bits.data, 0x04030201L) 22 | poke(df.io.RightIO.valid, true) 23 | poke(df.io.RightIO.bits.predicate, true) 24 | 25 | poke(df.io.Out(0).ready, true.B) 26 | step(20) 27 | } 28 | 29 | 30 | class GEMMCompTester extends FlatSpec with Matchers { 31 | implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance) 32 | it should "Typ Compute Tester" in { 33 | chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"), 34 | () => new GEMMNode(NumOuts = 1, ID = 0)(new matNxN(2))) { 35 | c => new GEMMCompTests(c) 36 | } should be(true) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/stack/StackMem.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE for license details. 
2 | 3 | package accel 4 | 5 | import chisel3._ 6 | import chisel3.util._ 7 | 8 | import junctions._ 9 | import config._ 10 | import interfaces._ 11 | import NastiConstants._ 12 | import accel._ 13 | 14 | class StackMemIO(implicit p: Parameters) extends CoreBundle()(p) with CoreParams { 15 | val req = Flipped(Decoupled(new MemReq)) 16 | val resp = Output(Valid(new MemResp)) 17 | } 18 | 19 | class StackMem(size : Int)(implicit val p: Parameters) extends Module with CoreParams { 20 | val io = IO(new StackMemIO) 21 | 22 | // val mem = Mem(size*(1< io.ReadIn(i) 32 | } 33 | 34 | RMux.io.sel <> io.SEL 35 | RMux.io.en := io.EN 36 | io.ReadOut <> RMux.io.output 37 | 38 | // val EN = RegInit(true.B) 39 | // val SEL = RegInit(1.U(2.W)) 40 | // 41 | // 42 | // val x = io.SEL 43 | // when(io.EN) { 44 | // io.ReadOut := io.ReadIn(x) 45 | // 46 | // }.otherwise { 47 | // io.ReadOut.valid := false.B 48 | // } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/config/util.scala: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | 4 | import Chisel._ 5 | import scala.math._ 6 | 7 | class ParameterizedBundle(implicit p: Parameters) extends Bundle { 8 | override def cloneType = { 9 | try { 10 | this.getClass.getConstructors.head.newInstance(p).asInstanceOf[this.type] 11 | } catch { 12 | case e: java.lang.IllegalArgumentException => 13 | throwException("Unable to use ParamaterizedBundle.cloneType on " + 14 | this.getClass + ", probably because " + this.getClass + 15 | "() takes more than one argument. 
Consider overriding " + 16 | "cloneType() on " + this.getClass, e) 17 | } 18 | } 19 | } 20 | 21 | abstract class GenericParameterizedBundle[+T <: Object](val params: T) extends Bundle 22 | { 23 | override def cloneType = { 24 | try { 25 | this.getClass.getConstructors.head.newInstance(params).asInstanceOf[this.type] 26 | } catch { 27 | case e: java.lang.IllegalArgumentException => 28 | throw new Exception("Unable to use GenericParameterizedBundle.cloneType on " + 29 | this.getClass + ", probably because " + this.getClass + 30 | "() takes more than one argument. Consider overriding " + 31 | "cloneType() on " + this.getClass, e) 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /src/main/scala/accel/Accelerator.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE for license details. 2 | 3 | package accel 4 | 5 | import chisel3._ 6 | import chisel3.util._ 7 | import accel.coredf._ 8 | import config._ 9 | import junctions._ 10 | 11 | abstract class AcceleratorIO(implicit val p: Parameters) extends Module with CoreParams { 12 | val io = IO( 13 | new Bundle { 14 | val h2f = Flipped(new NastiIO) 15 | val f2h = new NastiIO 16 | } 17 | ) 18 | } 19 | 20 | class Accelerator(cNum : Int, sNum : Int, coreDF: => CoreT) (implicit p: Parameters)extends AcceleratorIO()(p) { 21 | 22 | val regs = Module(new DataBundleReg(cNum, sNum)) 23 | val core = Module(coreDF) 24 | val cache = Module(new Cache) 25 | 26 | // Connect HPC AXI Master interface the control/status register block 27 | // AXI Slave interface 28 | regs.io.nasti <> io.h2f 29 | 30 | // Connect the first three control registers and one of the status 31 | // registers to the core logic block 32 | core.io.init <> regs.io.init 33 | core.io.start <> regs.io.start 34 | core.io.ctrl <> regs.io.ctrl 35 | regs.io.stat <> core.io.stat 36 | core.io.ready <> regs.io.ready 37 | core.io.done <> regs.io.done 38 | 39 | // Connect the 
cache CPU interface to the core logic block 40 | core.io.cache <> cache.io.cpu 41 | io.f2h <> cache.io.nasti 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/dataflow/SharedFPDiv.scala: -------------------------------------------------------------------------------- 1 | package dataflow 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | import node._ 7 | import config._ 8 | import interfaces._ 9 | import arbiters._ 10 | import memory._ 11 | import FPU._ 12 | import FType._ 13 | 14 | class FPDivDataFlow(implicit val p: Parameters) extends Module with CoreParams { 15 | 16 | val io = IO(new Bundle { 17 | val dummy = Input(UInt { 18 | 32.W 19 | }) 20 | }) 21 | 22 | val SharedDiv = Module(new SharedFPU(NumOps = 1, 32)(t = S)) 23 | 24 | val FPDiv = Module(new FPDivSqrtNode(NumOuts = 1, ID = 0, RouteID = 0, "SQRT")(t = S)) 25 | 26 | 27 | SharedDiv.io.InData(0) <> FPDiv.io.FUReq 28 | FPDiv.io.FUResp <> SharedDiv.io.OutData(0) 29 | 30 | FPDiv.io.a.bits.data := 0x40800000.U 31 | FPDiv.io.a.bits.predicate := true.B 32 | FPDiv.io.a.valid := true.B 33 | FPDiv.io.a.bits.taskID := 0.U 34 | 35 | FPDiv.io.b.bits.data := 0x40800000.U 36 | FPDiv.io.b.bits.predicate := true.B 37 | FPDiv.io.b.valid := true.B 38 | FPDiv.io.b.bits.taskID := 0.U 39 | 40 | FPDiv.io.enable.bits.control := true.B 41 | FPDiv.io.enable.valid := true.B 42 | FPDiv.io.enable.bits.taskID := 0.U 43 | FPDiv.io.Out(0).ready := true.B 44 | 45 | //printf("\n \"Outputs\": {\"Out\": %x, %x}", FPDiv.io.Out(0).bits.data, FPDiv.io.Out(0).fire( )) 46 | 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/resources/verilog/macc.v: -------------------------------------------------------------------------------- 1 | // Signed 40-bit streaming accumulator with 16-bit inputs 2 | // File: macc.v 3 | 4 | module macc # ( 5 | parameter SIZEIN = 16, SIZEOUT = 40 6 | ) 7 | ( 8 | input clk, ce, sload, 9 | input 
signed [SIZEIN-1:0] a, b, 10 | output signed [SIZEOUT-1:0] accum_out 11 | ); 12 | 13 | // Declare registers for intermediate values 14 | reg signed [SIZEIN-1:0] a_reg, b_reg; 15 | reg sload_reg; 16 | reg signed [2*SIZEIN:0] mult_reg; 17 | reg signed [SIZEOUT-1:0] adder_out, old_result; 18 | 19 | //always @(adder_out or sload_reg) 20 | //begin 21 | // if (sload_reg) 22 | // old_result <= 0; 23 | // else 24 | // // 'sload' is now active (=low) and opens the accumulation loop. 25 | // // The accumulator takes the next multiplier output in 26 | // // the same cycle. 27 | // old_result <= adder_out; 28 | //end 29 | 30 | always @(posedge clk) 31 | if (ce) 32 | begin 33 | a_reg <= a; 34 | b_reg <= b; 35 | mult_reg <= a_reg * b_reg; 36 | sload_reg <= sload; 37 | // Store accumulation result into a register 38 | adder_out <= old_result + mult_reg; 39 | end 40 | 41 | // Output accumulation result 42 | assign accum_out = adder_out; 43 | 44 | endmodule // macc -------------------------------------------------------------------------------- /include/dmlc/build_config_default.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2018 by Contributors 3 | * \file build_config_default.h 4 | * \brief Default detection logic for fopen64 and other symbols. 
5 | * May be overriden by CMake 6 | * \author KOLANICH 7 | */ 8 | #ifndef DMLC_BUILD_CONFIG_DEFAULT_H_ 9 | #define DMLC_BUILD_CONFIG_DEFAULT_H_ 10 | 11 | /* default logic for fopen64 */ 12 | #if DMLC_USE_FOPEN64 && \ 13 | (!defined(__GNUC__) || (defined __ANDROID__) || (defined __FreeBSD__) \ 14 | || (defined __APPLE__) || ((defined __MINGW32__) && !(defined __MINGW64__)) \ 15 | || (defined __CYGWIN__) ) 16 | #define DMLC_EMIT_FOPEN64_REDEFINE_WARNING 17 | #define fopen64 std::fopen 18 | #endif 19 | 20 | /* default logic for stack trace */ 21 | #if (defined(__GNUC__) && !defined(__MINGW32__)\ 22 | && !defined(__sun) && !defined(__SVR4)\ 23 | && !(defined __MINGW64__) && !(defined __ANDROID__))\ 24 | && !defined(__CYGWIN__) && !defined(__EMSCRIPTEN__)\ 25 | && !defined(__RISCV__) && !defined(__hexagon__) 26 | #define DMLC_LOG_STACK_TRACE 1 27 | #define DMLC_LOG_STACK_TRACE_SIZE 10 28 | #define DMLC_EXECINFO_H 29 | #endif 30 | 31 | /* default logic for detecting existence of nanosleep() */ 32 | #if !(defined _WIN32) || (defined __CYGWIN__) 33 | #define DMLC_NANOSLEEP_PRESENT 34 | #endif 35 | 36 | #endif // DMLC_BUILD_CONFIG_DEFAULT_H_ 37 | -------------------------------------------------------------------------------- /include/dmlc/omp.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2015 by Contributors 3 | * \file omp.h 4 | * \brief header to handle OpenMP compatibility issues 5 | */ 6 | #ifndef DMLC_OMP_H_ 7 | #define DMLC_OMP_H_ 8 | 9 | 10 | #if defined(_OPENMP) 11 | #include 12 | #else 13 | 14 | #if defined(__ANDROID__) 15 | #undef __GOMP_NOTHROW 16 | #define __GOMP_NOTHROW 17 | #elif defined(__cplusplus) 18 | #undef __GOMP_NOTHROW 19 | #define __GOMP_NOTHROW throw() 20 | #else 21 | #undef __GOMP_NOTHROW 22 | #define __GOMP_NOTHROW __attribute__((__nothrow__)) 23 | #endif 24 | 25 | //! 
\cond Doxygen_Suppress 26 | #ifdef __cplusplus 27 | extern "C" { 28 | #endif 29 | inline int omp_get_thread_num() __GOMP_NOTHROW { return 0; } 30 | inline int omp_get_num_threads() __GOMP_NOTHROW { return 1; } 31 | inline int omp_get_max_threads() __GOMP_NOTHROW { return 1; } 32 | inline int omp_get_num_procs() __GOMP_NOTHROW { return 1; } 33 | inline void omp_set_num_threads(int nthread) __GOMP_NOTHROW {} 34 | #ifdef __cplusplus 35 | } 36 | #endif // __cplusplus 37 | #endif // _OPENMP 38 | 39 | // loop variable used in openmp 40 | namespace dmlc { 41 | #ifdef _MSC_VER 42 | typedef int omp_uint; 43 | typedef long omp_ulong; // NOLINT(*) 44 | #else 45 | typedef unsigned omp_uint; 46 | typedef unsigned long omp_ulong; // NOLINT(*) 47 | #endif 48 | //! \endcond 49 | } // namespace dmlc 50 | #endif // DMLC_OMP_H_ 51 | -------------------------------------------------------------------------------- /src/main/scala/vta/dpi/VTASimDPI.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package vta.dpi 21 | 22 | import chisel3._ 23 | import chisel3.util._ 24 | import config._ 25 | import shell._ 26 | 27 | /** Sim DPI module. 28 | * 29 | * Wrapper for Sim Verilog DPI module. 30 | */ 31 | class VTASimDPI extends BlackBox with HasBlackBoxResource { 32 | val io = IO(new Bundle { 33 | val clock = Input(Clock()) 34 | val reset = Input(Bool()) 35 | val dpi_wait = Output(Bool()) 36 | }) 37 | setResource("/verilog/VTASimDPI.v") 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/dnn/FX_Reduction.scala: -------------------------------------------------------------------------------- 1 | package dnn 2 | 3 | import chisel3._ 4 | import chisel3.experimental.FixedPoint 5 | import chisel3.util._ 6 | import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester} 7 | import org.scalatest.{FlatSpec, Matchers} 8 | import dataflow._ 9 | import muxes._ 10 | import config._ 11 | import util._ 12 | 13 | 14 | class ReductionTests(df: NCycle_Reduction[FixedPoint])(implicit p: config.Parameters) extends PeekPokeTester(df) { 15 | poke(df.io.activate, false.B) 16 | // left * right 17 | df.io.input_vec.zipWithIndex.foreach { case (io, i) => poke(io, (0x20).U) } 18 | poke(df.io.activate, true.B) 19 | step(1) 20 | poke(df.io.activate, false.B) 21 | for (i <- 0 until df.latency( ) - 1) { 22 | print(peek(df.io.output)) 23 | print("," + peek(df.io.valid)) 24 | print("\n") 25 | step(1) 26 | } 27 | } 28 | 29 | 30 | class Reduction_Tester extends FlatSpec with Matchers { 31 | implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance) 32 | it should "Typ Compute Tester" in { 33 | chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"), 34 | () => new NCycle_Reduction(FixedPoint(p(XLEN).W, 4.BP), N = 4, pipelined = true, opcode = "Add")) { 35 | c => new ReductionTests(c) 36 | } should be(true) 37 | } 38 | } 39 | 
-------------------------------------------------------------------------------- /src/test/scala/dnn/FX_SCAL.scala: -------------------------------------------------------------------------------- 1 | package dnn 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester} 7 | import org.scalatest.{Matchers, FlatSpec} 8 | 9 | import dataflow._ 10 | import muxes._ 11 | import config._ 12 | import util._ 13 | 14 | 15 | class SCALTests(df: NCycle_SCAL[UInt])(implicit p: config.Parameters) extends PeekPokeTester(df) { 16 | poke(df.io.activate, false.B) 17 | // left * right 18 | df.io.input_vec.zipWithIndex.foreach { case (io, i) => poke(io, (i).U) } 19 | poke(df.io.scalar, 1.U) 20 | poke(df.io.activate, true.B) 21 | step(1) 22 | poke(df.io.activate, false.B) 23 | for (i <- 0 until df.latency( ) - 1) { 24 | for (j <- 0 until df.lanes) { 25 | // require(i * df.lanes + j + 1 == peek(df.io.output(j))) 26 | print(peek(df.io.output(j)) + ",") 27 | } 28 | print("\n") 29 | step(1) 30 | } 31 | } 32 | 33 | class SCAL_Tester extends FlatSpec with Matchers { 34 | implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance) 35 | it should "Typ Compute Tester" in { 36 | chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"), 37 | () => new NCycle_SCAL(UInt(p(XLEN).W), N = 8, lanes = 1, "add")) { 38 | c => new SCALTests(c) 39 | } should be(true) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/test/scala/FPU/SharedFPDiv.scala: -------------------------------------------------------------------------------- 1 | package FPU 2 | 3 | /** 4 | * Created by vnaveen0 on 8/7/17. 
package FPU

/**
 * Created by vnaveen0 on 8/7/17.
 */

import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester}
import org.scalatest.{Matchers, FlatSpec}

import config._
import FType._


/** Drives the shared FPU: route 0 gets a valid half-precision operand pair,
  * route 1 is driven with data but held invalid; then the pipeline is clocked
  * long enough for the operation to complete. */
class SharedFPUTests(c: SharedFPU)
                    (implicit p: config.Parameters)
  extends PeekPokeTester(c) {

  poke(c.io.InData(0).bits.RouteID, 0)
  poke(c.io.InData(1).bits.RouteID, 1)
  // Route 0: half-precision operands 0x6C00 and 0x4C00, selector field 0.
  poke(c.io.InData(0).bits.data("field0").data, 0x6C00)
  poke(c.io.InData(0).bits.data("field1").data, 0x4C00)
  poke(c.io.InData(0).bits.data("field2").data, 0)
  poke(c.io.InData(0).valid, 1)
  // FIX: the original poked InData(1).valid to 1 here and then back to 0 a few
  // lines below with no step() in between — the first poke was dead code (the
  // last poke before a step wins). The dead poke is removed; route 1 is driven
  // with data but deliberately never presented as valid.
  poke(c.io.InData(1).bits.data("field0").data, 0x6C00)
  poke(c.io.InData(1).bits.data("field1").data, 0x4C00)
  poke(c.io.InData(1).bits.data("field2").data, 1)
  poke(c.io.InData(1).valid, 0)
  // Clock the design long enough for the pipelined operation to drain.
  for (i <- 0 to 68) {
    step(1)
  }
}


class SharedFPUTester extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new HALFPrecisionFPConfig).toInstance)
  it should "Memory Controller tester" in {
    chisel3.iotesters.Driver(() => new SharedFPU(NumOps = 2, PipeDepth = 5)(t = p(FTYP))) {
      c => new SharedFPUTests(c)
    } should be(true)
  }
}

package dnn.modules

import chisel3.{Module, _}
import chisel3.util._
import config._
import dnn.TwoOperand_PE
import dnn.types.TwoOperand
import utility.UniformPrintfs


/** Scales the first `lanes` elements of `input_vec` by `scalar`, one
  * TwoOperand_PE per lane running `opcode`. */
class NCycle_CooSCAL[T <: Data : TwoOperand.OperatorTwoOperand](val gen: T, val N: Int, val lanes: Int, val opcode: String)(implicit val p: Parameters)
  extends Module with config.CoreParams with UniformPrintfs {
  val io = IO(new Bundle {
    val input_vec = Input(Vec(N, UInt(xlen.W)))
    val scalar = Input(UInt(xlen.W))
    val output = Output(Vec(lanes, UInt(xlen.W)))
  })

  require(gen.getWidth == xlen, "Size of element does not match xlen OR Size of vector does not match shape")
  require(N % lanes == 0, "Size of vector should be multiple of lanes")

  /** Cycles from activation to the final output. */
  def latency(): Int = {
    N / lanes + 1
  }

  // One processing element per lane, all configured with the same opcode.
  val PEs =
    for (i <- 0 until lanes) yield {
      Module(new TwoOperand_PE(gen, opcode))
    }

  for (i <- 0 until lanes) {
    PEs(i).io.left.bits := io.input_vec(i)
    PEs(i).io.right.bits := io.scalar
    // Operands are presented as always-valid.
    PEs(i).io.left.valid := true.B //io.input_vec(i).valid
    PEs(i).io.right.valid := true.B //io.scalar.valid
  }

  for (i <- 0 until lanes) {
    io.output(i) <> PEs(i).io.out.bits
    // NOTE(review): the PEs are held out of reset permanently — confirm
    // TwoOperand_PE has no state requiring initialisation.
    PEs(i).reset := false.B
  }
}
package dnn_layers


import chisel3._
import chisel3.iotesters.PeekPokeTester
import config._
import node._
import org.scalatest.{FlatSpec, Matchers}
import FPU._
import dnn.MacNode
//import dnn.DotNode

/** Drives a fixed-point MacNode with constant packed operands for 20 cycles. */
class FXMacCompTests(dut: MacNode[FXmatNxN])
                    (implicit p: config.Parameters) extends PeekPokeTester(dut) {
  poke(dut.io.enable.valid, true)
  poke(dut.io.enable.bits.control, true)

  // 0x32 0011.0010 . Fixed point 3.125 in fixed point 4 BP.
  poke(dut.io.LeftIO.bits.data, 0x49494949L)
  poke(dut.io.LeftIO.valid, true)
  poke(dut.io.LeftIO.bits.predicate, true)

  // 0x32 (3.125) * 0x20 (2.0) = 6.25 (0x64 or 100)
  poke(dut.io.RightIO.bits.data, 0x40404040L)
  poke(dut.io.RightIO.valid, true)
  poke(dut.io.RightIO.bits.predicate, true)

  poke(dut.io.Out(0).ready, true.B)
  step(20)
}


class MacCompTester extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance)
  it should "Typ Compute Tester" in {
    chisel3.iotesters.Driver.execute(
      Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
      () => new MacNode(NumOuts = 1, ID = 0, 4)(new FXmatNxN(2, 4))) {
      c => new FXMacCompTests(c)
    } should be(true)
  }
}

package memory


import chisel3._
import chisel3.Module
import chisel3.util._


// Config
import config._
import utility._
import interfaces._


/** IO: NumPorts CPU-side request/response pairs multiplexed onto one cache port. */
class MemArbiterIO(NumPorts: Int)(implicit val p: Parameters)
  extends Module with CoreParams with UniformPrintfs {
  val io = IO(new Bundle {
    val cpu = new Bundle {
      val MemReq = Vec(NumPorts, Flipped(Decoupled(new MemReq)))
      val MemResp = Vec(NumPorts, Output(Valid(new MemResp)))
    }
    val cache = new Bundle {
      val MemReq = Decoupled(new MemReq)
      val MemResp = Input(Valid(new MemResp))
      val chosen = Output(UInt(log2Ceil(NumPorts).W))
    }
  })
}

/** Round-robin arbitration of CPU memory requests onto a single cache port,
  * routing each response back to the port whose request last fired. */
class MemArbiter(NumPorts: Int)(implicit p: Parameters) extends MemArbiterIO(NumPorts)(p) {

  // Round-robin selection among the CPU request ports.
  val reqArb = Module(new RRArbiter(new MemReq, NumPorts))
  reqArb.io.in <> io.cpu.MemReq

  // Latch the winning port index when a request is accepted so the response
  // (which arrives later) can be steered back to the right CPU port.
  val chosen_reg = RegInit(0.U)
  when(reqArb.io.out.fire()) {
    chosen_reg := reqArb.io.chosen
  }

  io.cache.MemReq <> reqArb.io.out
  io.cache.chosen := reqArb.io.chosen

  // Response demux: everything defaults to invalid; only the latched winner
  // sees the cache's valid.
  io.cpu.MemResp.foreach { resp =>
    resp.bits := io.cache.MemResp.bits
    resp.valid := false.B // default
  }
  io.cpu.MemResp(chosen_reg).valid := io.cache.MemResp.valid

}

package dnn

import chisel3._
import chisel3.util._

import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester}
import org.scalatest.{Matchers, FlatSpec}

import dataflow._
import muxes._
import config._
import util._


/** Drives NCycle_Dot with ramp operands on both vectors and prints lane
  * outputs plus the valid flag each cycle while the unit drains. */
class DotTests(dut: NCycle_Dot[UInt])(implicit p: config.Parameters) extends PeekPokeTester(dut) {
  poke(dut.io.activate, false.B)
  // Both operand vectors carry the ramp 0..N-1 (left * right).
  for ((port, idx) <- dut.io.input_left_vec.zipWithIndex) poke(port, idx.U)
  for ((port, idx) <- dut.io.input_right_vec.zipWithIndex) poke(port, idx.U)
  // One-cycle activate pulse.
  poke(dut.io.activate, true.B)
  step(1)
  poke(dut.io.activate, false.B)
  for (_ <- 0 until dut.latency()) {
    for (lane <- 0 until dut.lanes) {
      // require(i * dut.lanes + lane + 1 == peek(dut.io.output(lane)))
      print(peek(dut.io.output(lane)) + ",")
    }
    print("Valid" + peek(dut.io.valid) + "\n")
    step(1)
  }
}

class Dot_Tester extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance)
  it should "Typ Compute Tester" in {
    chisel3.iotesters.Driver.execute(
      Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
      () => new NCycle_Dot(UInt(p(XLEN).W), N = 8, lanes = 2, "add")) {
      c => new DotTests(c)
    } should be(true)
  }
}
/*!
 * Copyright (c) 2015 by Contributors
 * \file timer.h
 * \brief cross platform timer for timing
 * \author Tianqi Chen
 */
#ifndef DMLC_TIMER_H_
#define DMLC_TIMER_H_

#include "base.h"

/* FIX: the bracketed header names and template arguments were stripped from
 * this copy of the file (bare `#include` lines and `std::chrono::duration(`
 * do not compile); restored from upstream dmlc-core. */
#if DMLC_USE_CXX11
#include <chrono>
#endif

#include <time.h>
#ifdef __MACH__
#include <mach/clock.h>
#include <mach/mach.h>
#endif
#include "./logging.h"

namespace dmlc {
/*!
 * \brief return time in seconds
 */
inline double GetTime(void) {
#if DMLC_USE_CXX11
  // duration<double> counts in seconds by default.
  return std::chrono::duration<double>(
      std::chrono::high_resolution_clock::now().time_since_epoch()).count();
#elif defined __MACH__
  // macOS without clock_gettime: query the Mach calendar clock service.
  clock_serv_t cclock;
  mach_timespec_t mts;
  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
  CHECK(clock_get_time(cclock, &mts) == 0) << "failed to get time";
  mach_port_deallocate(mach_task_self(), cclock);
  return static_cast<double>(mts.tv_sec) + static_cast<double>(mts.tv_nsec) * 1e-9;
#else
#if defined(__unix__) || defined(__linux__)
  timespec ts;
  CHECK(clock_gettime(CLOCK_REALTIME, &ts) == 0) << "failed to get time";
  return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1e-9;
#else
  // Fallback: one-second resolution wall clock.
  return static_cast<double>(time(NULL));
#endif
#endif
}
}  // namespace dmlc
#endif  // DMLC_TIMER_H_
package dnn

import chisel3._
import chisel3.util._
import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester}
import org.scalatest.{FlatSpec, Matchers}
import dataflow._
import muxes._
import config._
import util._
import interfaces._
import node._


/** Drives GEMV_1Cycle with identical packed 4x16-bit operands on both ports. */
class GEMVCompTests(dut: GEMV_1Cycle[matNxN, vecN, matNxN])
                   (implicit p: config.Parameters) extends PeekPokeTester(dut) {
  // Enable the node.
  poke(dut.io.enable.valid, true)
  poke(dut.io.enable.bits.control, true)

  // Left operand: four 16-bit lanes, each 0x0013.
  poke(dut.io.LeftIO.bits.data, 0x0013001300130013L)
  poke(dut.io.LeftIO.valid, true)
  poke(dut.io.LeftIO.bits.predicate, true)

  // Right operand: same packed pattern.
  poke(dut.io.RightIO.bits.data, 0x0013001300130013L)
  poke(dut.io.RightIO.valid, true)
  poke(dut.io.RightIO.bits.predicate, true)

  poke(dut.io.Out(0).ready, true.B)
  step(10)
}


/** Elaborates a 2x2 integer GEMV node and runs GEMVCompTests under Verilator. */
class GEMVCompTester extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance)
  it should "Typ Compute Tester" in {
    chisel3.iotesters.Driver.execute(
      Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
      () => new GEMV_1Cycle(NumOuts = 1, ID = 0, opCode = "Add")(sign = false)(new matNxN(2), new vecN(2))(new matNxN(2))) {
      c => new GEMVCompTests(c)
    } should be(true)
  }
}
package dnn

import chisel3._
import chisel3.util._
import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester}
import org.scalatest.{FlatSpec, Matchers}
import dataflow._
import muxes._
import config._
import util._
import interfaces._
import node._


/** Fixed-point variant of the GEMV tester: identical packed operands on both ports. */
class FXGEMVCompTests(dut: GEMV_1Cycle[FXmatNxN, FXvecN, FXmatNxN])
                     (implicit p: config.Parameters) extends PeekPokeTester(dut) {
  // Enable the node.
  poke(dut.io.enable.valid, true)
  poke(dut.io.enable.bits.control, true)

  // Left operand: four 16-bit lanes, each 0x0013.
  poke(dut.io.LeftIO.bits.data, 0x0013001300130013L)
  poke(dut.io.LeftIO.valid, true)
  poke(dut.io.LeftIO.bits.predicate, true)

  // Right operand: same packed pattern.
  poke(dut.io.RightIO.bits.data, 0x0013001300130013L)
  poke(dut.io.RightIO.valid, true)
  poke(dut.io.RightIO.bits.predicate, true)

  poke(dut.io.Out(0).ready, true.B)
  step(10)
}


/** Elaborates a 2x2, 4-BP fixed-point GEMV node and runs FXGEMVCompTests. */
class FXGEMVCompTester extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance)
  it should "Typ Compute Tester" in {
    chisel3.iotesters.Driver.execute(
      Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
      () => new GEMV_1Cycle(NumOuts = 1, ID = 0, opCode = "Add")(sign = false)(new FXmatNxN(2, 4), new FXvecN(2, 4))(new FXmatNxN(2, 4))) {
      c => new FXGEMVCompTests(c)
    } should be(true)
  }
}
package dataflow.filter

import chisel3._
import chisel3.util._

import accel._
import node._
import config._
import interfaces._
import arbiters._
import memory._

/** Bank of FilterSize untyped load units sharing one unified cache controller:
  * each `ptr(i)` address is fetched and returned on `data(i)`. */
class CacheLoader(FilterSize: Int)(implicit val p: Parameters) extends Module with CoreParams {

  val io = IO(new Bundle {
    val enable = Flipped(Decoupled(new ControlBundle()))
    val ptr = Vec(FilterSize, Flipped(Decoupled(new DataBundle())))
    val cache = Flipped(new CacheIO)
    val data = Vec(FilterSize, Decoupled(new DataBundle()))
  })

  // Unified controller: FilterSize read ports and FilterSize write ports
  // arbitrated onto the single cache interface.
  val CacheMem = Module(
    new UnifiedController(ID = 0, Size = 32, NReads = FilterSize, NWrites = FilterSize)(
      WControl = new WriteTypMemoryController(NumOps = FilterSize, BaseSize = 2, NumEntries = 2))(
      RControl = new ReadTypMemoryController(NumOps = FilterSize, BaseSize = 2, NumEntries = 2))(
      RWArbiter = new ReadWriteArbiter()))

  // One untyped load unit per filter tap; RouteID i matches read port i.
  val Load = Seq.tabulate(FilterSize) { i =>
    Module(new UnTypLoad(NumPredOps = 0, NumSuccOps = 0, NumOuts = 1, ID = i, RouteID = i))
  }

  Load.zipWithIndex.foreach { case (ld, i) =>
    ld.io.enable <> io.enable
    ld.io.GepAddr <> io.ptr(i)
    CacheMem.io.ReadIn(i) <> ld.io.memReq
    ld.io.memResp <> CacheMem.io.ReadOut(i)
    io.data(i) <> ld.io.Out(0)
  }

  // Never abort; forward the controller's request/response to the cache port.
  io.cache.abort := false.B
  io.cache.req <> CacheMem.io.MemReq
  CacheMem.io.MemResp <> io.cache.resp
}
/*==================================================
=            Errata MT_D accesses not supported    =
===================================================*/

package utility

import chisel3._

/**
  * Memory access-type encodings shared by the load/store units.
  * @todo MT_D double access not supported yet.
  */
trait MemoryOpConstants {
  val MT_X  = 0.U(3.W)  // no access / don't care
  val MT_B  = 1.U(3.W)  // signed byte
  val MT_H  = 2.U(3.W)  // signed half-word
  val MT_W  = 3.U(3.W)  // signed word
  val MT_D  = 4.U(3.W)  // double word (unsupported — see errata banner)
  val MT_BU = 5.U(3.W)  // unsigned byte
  val MT_HU = 6.U(3.W)  // unsigned half-word
  val MT_WU = 7.U(3.W)  // unsigned word
  // Maximum size of access type
  val MT_MAX_SIZE = 2

  // NOTE(review): scala.Enumeration is discouraged (a sealed ADT is preferred),
  // but it is kept to preserve the existing interface.
  object Margin extends Enumeration {
    type Margin = Value
    val MT_X, MT_B, MT_H, MT_W, MT_D, MT_BU, MT_HU, MT_WU, MT_2x2 = Value
  }
  import Margin._

  // FIX: was declared `var`; this is a constant lookup table and should not be
  // rebindable. Kept as a mutable.Map instance so any caller mutating entries
  // still works; only reassignment of `Type` itself is now disallowed.
  // NOTE(review): template members shadow the wildcard import, so keys
  // MT_X..MT_WU here are the chisel UInt constants above while MT_2x2 (defined
  // only inside Margin) is an Enumeration value — the map is heterogeneously
  // keyed. Confirm callers look entries up with the intended key type.
  val Type = scala.collection.mutable.Map(
    MT_X  -> 1,
    MT_B  -> 1,
    MT_H  -> 1,
    MT_W  -> 1,
    MT_D  -> 2,
    MT_BU -> 1,
    MT_HU -> 1,
    MT_WU -> 1,
    MT_2x2 -> 4
  )

}

object Constants extends MemoryOpConstants {

}
package loop

import chisel3._
import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester}
import chisel3.Module
import chisel3.testers._
import chisel3.util._
import org.scalatest.{FlatSpec, Matchers}
import config._
import interfaces._
import muxes._
import util._
import node._
import utility.UniformPrintfs


/**
  * @note Loop end IO: live-out values enter on `inputArg`, leave on `outputArg`.
  * @param NumInputs Number of live-out inputs
  * @param NumOuts   Number of forwarded outputs
  */
class LoopEndIO(val NumInputs: Int, val NumOuts: Int)
               (implicit p: Parameters) extends CoreBundle()(p) {

  val inputArg = Vec(NumInputs, Flipped(Decoupled(new DataBundle())))
  val outputArg = Vec(NumOuts, Decoupled(new DataBundle()))

  val enableSignal = Vec(NumInputs, Flipped(Decoupled(new ControlBundle)))

}


/** Collects a loop's live-out values: one LiveOutNode per input, with output i
  * forwarding live-out node i. */
class LoopEnd(val NumInputs: Int, val NumOuts: Int, val ID: Int)
             (implicit val p: Parameters) extends Module with CoreParams with UniformPrintfs {

  // FIX: the output loop below indexes Args (sized NumInputs) with i < NumOuts;
  // NumOuts > NumInputs previously crashed at elaboration with an opaque
  // IndexOutOfBoundsException. Fail fast with a clear message instead.
  require(NumOuts <= NumInputs,
    "LoopEnd: NumOuts must not exceed NumInputs — outputArg(i) is driven by live-out node i")

  override lazy val io = IO(new LoopEndIO(NumInputs, NumOuts))

  // One live-out node per input value.
  val Args = for (i <- 0 until NumInputs) yield {
    val arg = Module(new LiveOutNode(NumOuts = 1, ID = i))
    arg
  }

  // Connect every live-out node to its data and enable inputs.
  for (i <- 0 until NumInputs) {
    Args(i).io.InData <> io.inputArg(i)
    Args(i).io.enable <> io.enableSignal(i)
  }

  // Forward the first NumOuts live-out results.
  for (i <- 0 until NumOuts) {
    io.outputArg(i) <> Args(i).io.Out(0)
  }

}
package dnn

import chisel3._
import chisel3.core.FixedPoint
import chisel3.util._
import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester}
import org.scalatest.{FlatSpec, Matchers}
import dataflow._
import muxes._
import config._
import util._

/** Drives a 2x2 fixed-point systolic array with all-constant operands. */
class FX_SystolicTests(dut: SystolicSquare[FixedPoint])(implicit p: config.Parameters) extends PeekPokeTester(dut) {
  poke(dut.io.activate, false.B)
  // Every element of both operand matrices is the same constant (left * right).
  for (i <- 0 until 4) {
    poke(dut.io.left(i), 0x140)
    poke(dut.io.right(i), 0x140)
  }
  // One-cycle activate pulse, then let the array drain.
  poke(dut.io.activate, true.B)
  step(1)
  poke(dut.io.activate, false.B)
  step(6)
  // Output dumping is disabled; re-enable to inspect results.
  // for (i <- 0 until dut.N * dut.N) print(peek(dut.io.output(i)) + ",")
}

class FX_Systolic_Tester extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance)
  it should "Typ Compute Tester" in {
    chisel3.iotesters.Driver.execute(
      Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
      () => new SystolicSquare(FixedPoint(p(XLEN).W, 8.BP), 2)) {
      c => new FX_SystolicTests(c)
    } should be(true)
  }
}

package dnn

import FPU.FloatingPoint
import chisel3._
import chisel3.util._
import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester}
import org.scalatest.{FlatSpec, Matchers}
import dataflow._
import muxes._
import config._
import util._

/** Floating-point twin of FX_SystolicTests: constant half-precision operands. */
class FP_SystolicTests(dut: SystolicSquare[FloatingPoint])(implicit p: config.Parameters) extends PeekPokeTester(dut) {
  poke(dut.io.activate, false.B)
  // Every element of both operand matrices is the half-precision constant 0x4400.
  for (i <- 0 until 4) {
    poke(dut.io.left(i), 0x4400)
    poke(dut.io.right(i), 0x4400)
  }
  // One-cycle activate pulse, then let the array drain.
  poke(dut.io.activate, true.B)
  step(1)
  poke(dut.io.activate, false.B)
  step(6)
  // Output dumping is disabled; re-enable to inspect results.
  // for (i <- 0 until dut.N * dut.N) print(peek(dut.io.output(i)) + ",")
}

class FP_Systolic_Tester extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new HALFPrecisionFPConfig).toInstance)
  it should "Typ Compute Tester" in {
    chisel3.iotesters.Driver.execute(
      Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
      () => new SystolicSquare(new FloatingPoint(t = p(FTYP)), 2)) {
      c => new FP_SystolicTests(c)
    } should be(true)
  }
}
package dnn

import chisel3.util.Decoupled
import chisel3.{Flipped, Module, UInt, _}
import config.{Parameters, XLEN}
import dnn.types.{OperatorDot, OperatorReduction}
import interfaces.CustomDataBundle
import node.{HandShakingIONPS, HandShakingNPS, Shapes}

/** IO for [[MacNode]]: two flattened `shape`-wide operand streams in, the
  * handshaking outputs inherited from HandShakingIONPS out. */
class MacIO[gen <: Shapes](NumOuts: Int)(shape: => gen)(implicit p: Parameters)
  extends HandShakingIONPS(NumOuts)(new CustomDataBundle(UInt(p(XLEN).W))) {
  // Left/right operands, each carrying the whole shape packed into one UInt.
  val LeftIO = Flipped(Decoupled(new CustomDataBundle(UInt(shape.getWidth.W))))
  val RightIO = Flipped(Decoupled(new CustomDataBundle(UInt(shape.getWidth.W))))
  override def cloneType = new MacIO(NumOuts)(shape).asInstanceOf[this.type]
}

/** Multiply-accumulate node: an elementwise multiply (DotNode, `lanes` wide)
  * feeding an add-reduction (ReduceNode), i.e. out = sum(left * right). */
class MacNode[L <: Shapes : OperatorDot : OperatorReduction](NumOuts: Int, ID: Int, lanes: Int)(shape: => L)(implicit p: Parameters)
  extends HandShakingNPS(NumOuts, ID)(new CustomDataBundle(UInt(p(XLEN).W)))(p) {
  override lazy val io = IO(new MacIO(NumOuts)(shape))

  // Stage 1: elementwise multiply. Stage 2: add-reduce the products.
  val dotNode = Module(new DotNode(NumOuts = 1, ID = ID, lanes, "Mul")(shape))
  val reduceNode = Module(new ReduceNode(NumOuts = 1, ID = ID, false, "Add")(shape))

  // Connect IO to dotNode
  dotNode.io.enable <> io.enable
  dotNode.io.LeftIO <> io.LeftIO
  dotNode.io.RightIO <> io.RightIO

  reduceNode.io.LeftIO <> dotNode.io.Out(0)
  // NOTE(review): io.enable is bulk-connected to BOTH dotNode and reduceNode;
  // with Chisel last-connect semantics the ready seen by the enable producer
  // comes from this second connection — confirm this is intended.
  reduceNode.io.enable <> io.enable

  // Wire up Outputs
  for (i <- 0 until NumOuts) {
    io.Out(i) <> reduceNode.io.Out(i)
  }
  // printf(p"\n Left ${io.LeftIO.bits.data} Right: ${io.RightIO.bits.data} Output: ${reduceNode.io.Out(0).bits.data}")
}
-------------------------------------------------------------------------------- /src/main/scala/dataflow/AllocaTest.scala: -------------------------------------------------------------------------------- 1 | // package dataflow 2 | 3 | // import chisel3._ 4 | // import chisel3.util._ 5 | // import chisel3.Module 6 | // import chisel3.testers._ 7 | // import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester} 8 | // import org.scalatest.{Matchers, FlatSpec} 9 | 10 | // import muxes._ 11 | // import config._ 12 | // import util._ 13 | // import interfaces._ 14 | // import regfile._ 15 | // import node._ 16 | // import alloca._ 17 | 18 | 19 | // //TODO uncomment if you remove StackCentral.scala file 20 | // // 21 | // abstract class AllocaTestIO(implicit val p: Parameters) extends Module with CoreParams { 22 | // val io = IO(new Bundle { 23 | // val result = Output(xlen.U) 24 | // val resultReady = Input(Bool()) 25 | // val resultValid = Output(Bool()) 26 | // }) 27 | // } 28 | 29 | // class AllocaTest(implicit p: Parameters) extends AllocaTestIO()(p){ 30 | 31 | 32 | // // Containig number of bytes Alloca require 33 | // val reg1 = Module(new InputRegFile(Array(1.U, 4.U, 3.U, 4.U))(p)) 34 | 35 | // val m0 = Module(new AllocaNode(0)(p)) 36 | // val m1 = Module(new newCentralStack()(p)) 37 | 38 | // m0.io.sizeinput <> reg1.io.Data 39 | // printf(p"Reg data: ${reg1.io.Data}\n") 40 | // m0.io.allocareq <> m1.io.AllocaIn(0) 41 | // printf(p"Alloca req: ${m0.io.allocareq}\n") 42 | // m0.io.allocaresp.allocaaddr <> m1.io.AllocaOut(0) 43 | // printf(p"Alloca resp: ${m0.io.allocaresp}\n") 44 | // m0.io.allocaresp.valid := m1.io.Valids(0) 45 | 46 | 47 | // m0.io.addressout.ready := io.resultReady 48 | // io.result := m0.io.addressout.bits 49 | // io.resultValid := m0.io.addressout.valid 50 | 51 | // } 52 | -------------------------------------------------------------------------------- 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package vta.util.genericbundle

// taken from the rocket-chip project

import chisel3._

/** Bundle whose cloneType re-invokes the subclass's single-argument
  * constructor with `params`, so parameterized bundles clone correctly.
  * NOTE(review): relies on runtime reflection — subclasses must have exactly
  * one constructor taking exactly `params`, otherwise the catch below fires. */
abstract class GenericParameterizedBundle[+T <: Object](val params: T) extends Bundle
{
  override def cloneType = {
    try {
      this.getClass.getConstructors.head.newInstance(params).asInstanceOf[this.type]
    } catch {
      case e: java.lang.IllegalArgumentException =>
        throw new Exception("Unable to use GenericParameterizedBundle.cloneType on " +
          this.getClass + ", probably because " + this.getClass +
          "() takes more than one argument. Consider overriding " +
          "cloneType() on " + this.getClass, e)
    }
  }
}

package config.cde

/** Composable CDE configuration: a set of parameter definitions, a list of
  * constraints, and a knob-value resolver. Defaults throw CDEMatchError so
  * that `++` composition can fall through to the next Config. */
class Config(
  val topDefinitions: World.TopDefs = { (a,b,c) => throw new CDEMatchError(a) },
  val topConstraints: List[ViewSym=>Ex[Boolean]] = List( ex => ExLit[Boolean](true) ),
  val knobValues: Any => Any = { case x => throw new CDEMatchError(x) }
) {
  import Implicits._
  type Constraint = ViewSym=>Ex[Boolean]

  /** Copy constructor. */
  def this(that: Config) = this(that.topDefinitions,
    that.topConstraints,
    that.knobValues)

  /** Left-biased composition: `this` is consulted first; on a match failure
    * the lookup falls through to `that`. Constraints simply accumulate. */
  def ++(that: Config) = {
    new Config(this.addDefinitions(that.topDefinitions),
      this.addConstraints(that.topConstraints),
      this.addKnobValues(that.knobValues))
  }

  // Chain definition lookup: fall through to `that` on MatchError/CDEMatchError.
  def addDefinitions(that: World.TopDefs): World.TopDefs = {
    (pname,site,here) => {
      try this.topDefinitions(pname, site, here)
      catch {
        case e: scala.MatchError => that(pname, site, here)
        case e: CDEMatchError => that(pname, site, here)
      }
    }
  }

  // Constraints from both configs must all hold.
  def addConstraints(that: List[Constraint]):List[Constraint] = {
    this.topConstraints ++ that
  }

  // Same fall-through chaining as addDefinitions, for knob values.
  def addKnobValues(that: Any=>Any): Any=>Any = { case x =>
    try this.knobValues(x)
    catch {
      case e: scala.MatchError => that(x)
      case e: CDEMatchError => that(x)
    }
  }

  def toCollector = new Collector(this.topDefinitions, this.knobValues)
  def toInstance = new Instance(this.topDefinitions, this.knobValues)
  override def toString = this.getClass.getSimpleName
}
package dataflow.tests

/**
 * Created by vnaveen0 on 26/6/17.
 */

import chisel3._
import chisel3.util._
import chisel3.Module
import chisel3.testers._
import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester}
import org.scalatest.{FlatSpec, Matchers}
import muxes._
import config._
import control.{BasicBlockNoMaskNode, BasicBlockNode}
import util._
import interfaces._
import regfile._
import node._


/** IO for the minimal add dataflow: two data operands in, a predicate and the
  * add result out, gated by `start`. */
abstract class Add01DFIO(implicit val p: Parameters) extends Module with CoreParams {
  val io = IO(new Bundle {
    val Data0 = Flipped(Decoupled(new DataBundle))
    val Data1 = Flipped(Decoupled(new DataBundle))
    val pred = Decoupled(new ControlBundle)
    val start = Input(new Bool())
    val result = Decoupled(new DataBundle)
  })
}


/** Minimal dataflow graph: one entry basic block gating a single unsigned
  * integer adder. */
class Add01DF(implicit p: Parameters) extends Add01DFIO() {

  // Entry basic block; its single output enables the adder.
  val b0_entry = Module(new BasicBlockNoMaskNode(NumInputs = 1, NumOuts = 1, BID = 0))
  // Unsigned "Add" compute node.
  val m0 = Module(new ComputeNode(NumOuts = 1, ID = 0, opCode = "Add")(sign = false))

  // The entry block's predicate is tied true, so it fires as soon as `start`
  // raises the predicate valid below.
  b0_entry.io.predicateIn.bits.control := true.B
  b0_entry.io.predicateIn.bits.taskID := 0.U

  // The ALU runs only once the basic block has fired.
  m0.io.enable <> b0_entry.io.Out(0)

  // External operands feed the adder directly.
  m0.io.LeftIO <> io.Data0
  m0.io.RightIO <> io.Data1
  b0_entry.io.predicateIn.valid := io.start

  io.pred <> b0_entry.io.Out(0)
  io.result <> m0.io.Out(0)
}
package dataflow

import chisel3._
import chisel3.util._

import node._
import config._
import interfaces._
import arbiters._
import memory._

/** Smoke-test dataflow: a typed store writes a constant to stack address 8,
  * then — ordered through a Pred/Succ token — a typed load reads it back
  * through the same TypeStackFile. All handshakes are tied unconditionally
  * valid/ready so the graph free-runs. */
class TypeLoadDataFlow(implicit val p: Parameters) extends Module with CoreParams{

  // Dummy port keeps the module IO non-empty for elaboration.
  // NOTE: `UInt{32.W}` is brace-application — equivalent to `UInt(32.W)`.
  val io = IO(new Bundle{val dummy = Input(UInt{32.W})})

  // Stack-backed memory with one read controller and one write controller.
  val StackFile = Module(new TypeStackFile(ID=0,Size=32,NReads=1,NWrites=1)
    (WControl=new WriteTypMemoryController(NumOps=1,BaseSize=2,NumEntries=1))
    (RControl=new ReadTypMemoryController(NumOps=1,BaseSize=2,NumEntries=1)))
  // Store carries one successor token; Load waits on one predecessor token.
  val Store = Module(new TypStore(NumPredOps=0,NumSuccOps=1,NumOuts=1,ID=0,RouteID=0))
  val Load = Module(new TypLoad(NumPredOps=1,NumSuccOps=0,NumOuts=1,ID=0,RouteID=0))

  // Memory request/response plumbing for the read side...
  StackFile.io.ReadIn(0) <> Load.io.memReq
  Load.io.memResp <> StackFile.io.ReadOut(0)

  // ...and the write side.
  StackFile.io.WriteIn(0) <> Store.io.memReq
  Store.io.memResp <> StackFile.io.WriteOut(0)

  // Store a constant to address 8; always predicated true.
  Store.io.GepAddr.bits.data := 8.U
  Store.io.GepAddr.bits.predicate := true.B
  Store.io.GepAddr.valid := true.B

  Store.io.inData.bits.data := 0x1eadbeefbeefbeefL.U
  Store.io.inData.bits.predicate := true.B
  Store.io.inData.valid := true.B

  Store.io.enable.bits.control := true.B
  Store.io.enable.valid := true.B
  Store.io.Out(0).ready := true.B

  // Load from the same address, also unconditionally enabled.
  Load.io.GepAddr.bits.data := 8.U
  Load.io.GepAddr.bits.predicate := true.B
  Load.io.GepAddr.valid := true.B

  Load.io.enable.bits.control := true.B
  Load.io.enable.valid := true.B
  Load.io.Out(0).ready := true.B

  // Order the load strictly after the store completes.
  Load.io.PredOp(0) <> Store.io.SuccOp(0)

}
package tensorKernels

import chisel3.util.{Decoupled, log2Ceil}
import chisel3.{Flipped, Module, UInt, _}
import config.{Parameters, XLEN}
import interfaces.CooDataBundle


/** IO for [[MinArbiter]]: n COO element streams in, the selected element out.
  * `chosen` reports the granted input index; `active` gates all activity. */
class MinArbiterIO(n: Int)(implicit val p: Parameters) extends Module {
  val io = IO(new Bundle {

    val in = Flipped(Vec(n, Decoupled(new CooDataBundle(UInt(p(XLEN).W)))))
    val out = Decoupled(new CooDataBundle(UInt(p(XLEN).W)))
    val chosen = Output(UInt(log2Ceil(n).W))
    val active = Input(Bool ())

    // val eopIn = Vec(n, Input(Bool( )))
    // val lastIn = Vec(n, Input(Bool( )))
    //
    // val eopOut = Output(Bool( ))
    // val lastOut = Output(Bool( ))
  })
}

/** Grants, among the valid inputs, the one with the smallest `row` field,
  * forwarding its bits on `out` and consuming only the granted input. */
class MinArbiter(n: Int)(implicit p: Parameters)
  extends MinArbiterIO(n)(p) {

  // `chosen` is a *Scala elaboration-time* variable, not hardware: each loop
  // iteration below emits a `when` block whose comparison references the input
  // indexed by the value `chosen` held at that iteration, and last-connect
  // semantics give lower indices override priority.
  var chosen = n-1
  // Defaults: grant the highest-index input.
  io.chosen := (n-1).asUInt
  // io.eopOut := io.eopIn(n-1)
  // io.lastOut := io.lastIn(n-1)
  io.out.bits := io.in(n-1).bits




  val grant = Wire(Vec(n, Bool( )))
  grant.foreach(a => a := false.B)
  grant(n-1) := true.B

  // NOTE(review): because `chosen = i` executes unconditionally at elaboration
  // on every iteration, iteration i's comparison is in(i).row vs in(i+1).row —
  // adjacent pairs, not a running hardware minimum. Confirm this selection is
  // the intended behaviour for n > 2.
  for (i <- n-1 to 0 by -1) {
    when (io.active && io.in(i).valid &&
      ((io.in(chosen).valid && io.in(i).bits.row < io.in(chosen).bits.row) || (!io.in(chosen).valid)) ) {
      grant.foreach(a => a := false.B)
      grant(i) := true.B
      chosen = i
      io.chosen := i.asUInt
      io.out.bits := io.in(i).bits
      // io.eopOut := io.eopIn(i)
      // io.lastOut := io.lastIn(i)
    }
  }

  // An input is consumed only when it holds the grant, the consumer is ready,
  // the arbiter is active, and at least one input is valid.
  for ((in, g) <- io.in zip grant)
    in.ready := g && io.out.ready && io.active && io.in.map(_.valid).reduceLeft(_||_)

  io.out.valid := io.active && io.in.map(_.valid).reduceLeft(_||_)

}
// See LICENSE for license details.

package FPU

import chisel3._
import chisel3.util._
import FPU._
import FType._

import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester}
import org.scalatest.{Matchers, FlatSpec}

import node._
import dataflow._
import muxes._
import config._
import util._
import interfaces._


/** Drives an FNtoFNNode resize: one cycle with everything deasserted, then a
  * valid enabled input, printing the output port for ten further cycles. */
class FPResizeTester(dut: FNtoFNNode)
                    (implicit p: config.Parameters) extends PeekPokeTester(dut) {

  // Cycle 0: data present but nothing valid or enabled yet.
  poke(dut.io.Input.bits.data, 0x43000000)
  poke(dut.io.Input.valid, false.B)
  poke(dut.io.Input.bits.predicate, false.B)
  poke(dut.io.enable.bits.control, false.B)
  poke(dut.io.enable.valid, false.B)
  poke(dut.io.Out(0).ready, false.B)
  println(s"Output: ${peek(dut.io.Out(0))}\n")

  step(1)

  // Cycle 1: enable the node and present a valid, predicated input.
  poke(dut.io.enable.bits.control, true.B)
  poke(dut.io.enable.valid, true.B)
  poke(dut.io.Out(0).ready, true.B)
  poke(dut.io.Input.valid, true.B)
  poke(dut.io.Input.bits.predicate, true.B)

  println(s"Output: ${peek(dut.io.Out(0))}\n")
  println(s"t: -1\n -------------------------------------")
  step(1)

  // Watch the pipeline for ten more cycles.
  for (i <- 0 until 10) {
    println(s"Output: ${peek(dut.io.Out(0))}\n")
    println(s"t: ${i}\n -------------------------------------")
    step(1)
  }
}

class FPResizeTests extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new SinglePrecisionFPConfig).toInstance)
  it should "Dataflow tester" in {
    chisel3.iotesters.Driver(() => new FNtoFNNode(S, H, NumOuts = 1, ID = 0)) {
      c => new FPResizeTester(c)
    } should be(true)
  }
}
| * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /*! 21 | * Copyright (c) 2017 by Contributors 22 | * \file meta_data.h 23 | * \brief Meta data related utilities 24 | */ 25 | #ifndef TVM_RUNTIME_META_DATA_H_ 26 | #define TVM_RUNTIME_META_DATA_H_ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include "runtime_base.h" 34 | 35 | namespace tvm 36 | { 37 | namespace runtime 38 | { 39 | 40 | /*! 
\brief function information needed by device */ 41 | struct FunctionInfo 42 | { 43 | std::string name; 44 | std::vector arg_types; 45 | std::vector thread_axis_tags; 46 | 47 | void Save(dmlc::JSONWriter *writer) const; 48 | void Load(dmlc::JSONReader *reader); 49 | void Save(dmlc::Stream *writer) const; 50 | bool Load(dmlc::Stream *reader); 51 | }; 52 | } // namespace runtime 53 | } // namespace tvm 54 | 55 | namespace dmlc 56 | { 57 | DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::FunctionInfo, true); 58 | } // namespace dmlc 59 | #endif // TVM_RUNTIME_META_DATA_H_ 60 |
--------------------------------------------------------------------------------
/src/main/scala/dataflow/filter/CacheVecLoader.scala:
--------------------------------------------------------------------------------
package dataflow.filter

import chisel3._
import chisel3.util._

import accel._
import node._
import config._
import interfaces._
import arbiters._
import memory._

/** Cache-backed vector loader with one write-back lane.
  *
  * Instantiates `FilterSize` load nodes and one store node behind a unified
  * cache controller.  Each address on `ptr(i)` is loaded and returned on
  * `data(i)`; `sum` is stored back through the address on `ptr(0)`.
  *
  * @param FilterSize number of parallel load lanes (= read ports on the
  *                   unified controller).
  */
class CacheVecLoader(FilterSize: Int)(implicit val p: Parameters) extends Module with CoreParams {

  val io = IO(new Bundle {
    val enable = Flipped(Decoupled(new ControlBundle()))
    val ptr    = Vec(FilterSize, Flipped(Decoupled(new TypBundle())))
    val sum    = Flipped(Decoupled(new TypBundle()))
    val cache  = Flipped(new CacheIO)
    val data   = Vec(FilterSize, Decoupled(new TypBundle()))
  })

  // Unified read/write controller fronting the cache.
  val CacheMem = Module(new UnifiedTypController(ID = 0, Size = 32, NReads = FilterSize, NWrites = 2)
    (WControl = new WriteTypMemoryController(NumOps = 2, BaseSize = 2, NumEntries = 2))
    (RControl = new ReadTypMemoryController(NumOps = FilterSize, BaseSize = 2, NumEntries = 2))
    (RWArbiter = new ReadWriteArbiter()))

  // One load node per filter element; RouteID i pairs it with read port i.
  val Load = for (i <- 0 until FilterSize) yield {
    val ld = Module(new TypLoad(NumPredOps = 0, NumSuccOps = 0, NumOuts = 1, ID = i, RouteID = i))
    ld
  }

  val Store = Module(new TypStore(NumPredOps = 0, NumSuccOps = 0, NumOuts = 1, ID = 0, RouteID = 0))

  // Write-back path: store `sum` through ptr(0).
  Store.io.enable <> io.enable
  Store.io.GepAddr <> io.ptr(0)
  Store.io.inData <> io.sum
  CacheMem.io.WriteIn(0) <> Store.io.memReq
  // BUG FIX: the store's completion must come from the WRITE response port.
  // The original connected CacheMem.io.ReadOut(0), which (a) delivered read
  // completions to a store node and (b) double-drove ReadOut(0) together
  // with Load(0).io.memResp in the loop below.  This mirrors the correct
  // pattern used by TypeLoadDataFlow (Store.io.memResp <> io.WriteOut(0)).
  Store.io.memResp <> CacheMem.io.WriteOut(0)
  Store.io.Out(0).ready := true.B

  // Read path: one load per lane, results fanned out on io.data.
  for (i <- 0 until FilterSize) {
    Load(i).io.enable <> io.enable
    Load(i).io.GepAddr <> io.ptr(i)
    CacheMem.io.ReadIn(i) <> Load(i).io.memReq
    Load(i).io.memResp <> CacheMem.io.ReadOut(i)
    io.data(i) <> Load(i).io.Out(0)
  }

  // Pass the controller's memory side straight to the external cache.
  io.cache.req <> CacheMem.io.MemReq
  CacheMem.io.MemResp <> io.cache.resp
  io.cache.abort := false.B

}
-------------------------------------------------------------------------------- /src/test/scala/FPU/FPDiv.scala: -------------------------------------------------------------------------------- 1 | package FPU 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import FPU._ 6 | import FType._ 7 | 8 | import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester} 9 | import org.scalatest.{Matchers, FlatSpec} 10 | 11 | import node._ 12 | import dataflow._ 13 | import muxes._ 14 | import config._ 15 | import util._ 16 | import interfaces._ 17 | 18 | class FPDivNodeTester(c: FPDivSqrtNode) extends PeekPokeTester(c) { 19 | poke(c.io.a.valid,false) 20 | poke(c.io.b.valid,false) 21 | poke(c.io.FUReq.ready,false) 22 | poke(c.io.FUResp.valid,false) 23 | poke(c.io.Out(0).ready,true) 24 | 25 | step(1) 26 | 27 | 28 | poke(c.io.a.valid, true) 29 | poke(c.io.a.bits.data,0x42800000) 30 | poke(c.io.a.bits.taskID, 22) 31 | poke(c.io.a.bits.predicate, true) 32 | poke(c.io.b.valid, true) 33 | poke(c.io.b.bits.data, 0x41800000) 34 | poke(c.io.b.bits.predicate,true) 35 | poke(c.io.b.bits.taskID, 22) 36 | poke(c.io.enable.bits.control,true) 37 | poke(c.io.enable.valid,true) 38 | step(1) 39 | step(1) 40 | 41 | poke(c.io.FUReq.ready,true) 42 | 43 | step(1) 44 | step(1) 45 | step(1) 46 | print(s"t: ${t} io.field0:
${peek(c.io.FUReq.bits.data("field0"))} io.field1: ${peek(c.io.FUReq.bits.data("field1"))} io.field2: ${peek(c.io.FUReq.bits.data("field2"))} \n") 47 | poke(c.io.FUResp.data,100) 48 | poke(c.io.FUResp.valid,true) 49 | 50 | 51 | step(1) 52 | step(1) 53 | step(1) 54 | // } 55 | 56 | 57 | } 58 | 59 | 60 | 61 | class FPDivNodeTests extends FlatSpec with Matchers { 62 | implicit val p = config.Parameters.root((new MiniConfig).toInstance) 63 | it should "FPDivSqrt Node tester" in { 64 | chisel3.iotesters.Driver(() => new FPDivSqrtNode(NumOuts=1,ID=1,RouteID=0,opCode = "SQRT")(t = S)) { c => 65 | new FPDivNodeTester(c) 66 | } should be(true) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/regfile/RegFile.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE for license details. 2 | 3 | package regfile 4 | import scala.math._ 5 | import chisel3._ 6 | import chisel3.util._ 7 | import config._ 8 | 9 | /** 10 | * @brief IO interface to register file 11 | * @details 12 | * raddr1: Read address (word aligned) 13 | * rdata1: Read data (word granularity) 14 | * wen : Write enable 15 | * waddr : write address (word aligned) 16 | * wdata : write data (word granularity) 17 | * 18 | * @param size: Number of registers 19 | * 20 | */ 21 | class RegFileBundle(size: Int)(implicit p: Parameters) extends CoreBundle()(p) { 22 | val raddr1 = Input(UInt(max(1,log2Ceil(size)).W)) 23 | val rdata1 = Output(UInt(xlen.W)) 24 | val raddr2 = Input(UInt(max(1,log2Ceil(size)).W)) 25 | val rdata2 = Output(UInt(xlen.W)) 26 | val wen = Input(Bool()) 27 | val waddr = Input(UInt(max(1,log2Ceil(size)).W)) 28 | val wdata = Input(UInt(xlen.W)) 29 | val wmask = Input(UInt((xlen/8).W)) 30 | 31 | override def cloneType = new RegFileBundle(size).asInstanceOf[this.type] 32 | 33 | } 34 | 35 | 36 | abstract class AbstractRFile(size: Int)(implicit val p: Parameters) extends Module with CoreParams { 
37 | val io = IO(new RegFileBundle(size)) 38 | } 39 | /** 40 | * @brief Scratchpad registerfile 41 | * @details [long description] 42 | * 43 | * @param size : Number of registers. 44 | * @return [description] 45 | */ 46 | class RFile(size: Int)(implicit p: Parameters) extends AbstractRFile(size)(p) { 47 | val regs = SyncReadMem(size,Vec(xlen/8, UInt(8.W))) 48 | // I am reading a vector of bytes and then converting to a UInt before returning it. 49 | io.rdata1 := regs.read(io.raddr1).asUInt() 50 | io.rdata2 := regs.read(io.raddr2).asUInt() 51 | when(io.wen) { 52 | // I am writing a vector of bytes. Need to also feed the bytemask. 53 | regs.write(io.waddr, VecInit.tabulate(xlen/8)(i => io.wdata(8*(i+1)-1,8*i)),io.wmask.toBools) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/config/TestConfigs.scala: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import config._ 6 | import util._ 7 | import regfile._ 8 | import junctions._ 9 | import accel._ 10 | 11 | class TypeStackFileVerilog16bConfig extends Config((site, here, up) => { 12 | // Core 13 | case XLEN => 16 14 | case TLEN => 32 15 | case GLEN => 16 16 | // # Max bits of cache request tag. 
17 | case MSHRLEN => 8 18 | case TYPSZ => 32 19 | case VERBOSITY => "low" 20 | case COMPONENTS => "TYPLOAD;TYPOP" 21 | // Max size of type memory system may see 22 | case TRACE => true 23 | case BuildRFile => (p: Parameters) => Module(new RFile(32)(p)) 24 | 25 | //------------------------- 26 | // Cache 27 | case NWays => 1 // TODO: set-associative 28 | case NSets => 256 29 | case CacheBlockBytes => 4 * (here(XLEN) >> 3) // 4 x 32 bits = 16B 30 | // NastiIO 31 | case NastiKey => new NastiParameters( 32 | idBits = 12, 33 | dataBits = 32, 34 | addrBits = 32) 35 | } 36 | ) 37 | 38 | class MixedDataflowConfig extends Config((site, here, up) => { 39 | // Core 40 | case XLEN => 16 41 | case TLEN => 32 42 | case GLEN => 16 43 | // # Max bits of cache request tag. 44 | case MSHRLEN => 8 45 | case TYPSZ => 32 46 | case VERBOSITY => "low" 47 | case COMPONENTS => "OP" 48 | // Max size of type memory system may see 49 | case TRACE => true 50 | case BuildRFile => (p: Parameters) => Module(new RFile(32)(p)) 51 | 52 | //------------------------- 53 | // Cache 54 | case NWays => 1 // TODO: set-associative 55 | case NSets => 256 56 | case CacheBlockBytes => 4 * (here(XLEN) >> 3) // 4 x 32 bits = 16B 57 | // NastiIO 58 | case NastiKey => new NastiParameters( 59 | idBits = 12, 60 | dataBits = 32, 61 | addrBits = 32) 62 | } 63 | ) 64 | -------------------------------------------------------------------------------- /src/main/scala/dnnnode/CooShapeTransformer.scala: -------------------------------------------------------------------------------- 1 | 2 | package dnnnode 3 | 4 | import chisel3._ 5 | import chisel3.util._ 6 | import config._ 7 | import dnn.memory.{TensorMaster, TensorParams} 8 | import interfaces.{CooDataBundle} 9 | import node.Shapes 10 | import shell._ 11 | 12 | 13 | /** Coordinate Shape Transformer. 14 | * 15 | * Load 1D and 2D tensors from main memory (DRAM) to input/weight 16 | * scratchpads (SRAM). 
Also, there is support for zero padding, while 17 | * doing the load. Zero-padding works on the y and x axis, and it is 18 | * managed by TensorPadCtrl. The TensorDataCtrl is in charge of 19 | * handling the way tensors are stored on the scratchpads. 20 | */ 21 | class CooShapeTransformerIO[gen <: Shapes](memTensorType: String = "none")(outShape: => gen)(implicit val p: Parameters) 22 | extends Module { 23 | val tp = new TensorParams(memTensorType) 24 | val mp = p(ShellKey).memParams 25 | val io = IO(new Bundle { 26 | val ind = Flipped(Decoupled(UInt(p(ROWLEN).W))) 27 | val value = Flipped(Decoupled(UInt(p(XLEN).W))) 28 | val out = Decoupled(new CooDataBundle(UInt(p(XLEN).W))) 29 | }) 30 | } 31 | 32 | class CooShapeTransformer[L <: Shapes](rowBased: Boolean, memTensorType: String = "none") 33 | (outShape: => L)(implicit p: Parameters) 34 | extends CooShapeTransformerIO(memTensorType)(outShape)(p) { 35 | 36 | if (rowBased) { 37 | io.out.bits.data := io.value.bits 38 | io.out.bits.row := io.ind.bits 39 | io.out.bits.col := 0.U 40 | io.out.bits.valid := true.B 41 | } else { 42 | io.out.bits.data := io.value.bits 43 | io.out.bits.col := io.ind.bits 44 | io.out.bits.row := 0.U 45 | io.out.bits.valid := true.B 46 | } 47 | 48 | io.out.valid := io.ind.valid && io.value.valid 49 | 50 | io.ind.ready := io.out.ready && io.value.valid && io.ind.valid 51 | io.value.ready := io.out.ready && io.value.valid && io.ind.valid 52 | 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/dnn/memory/WriteTensorController.scala: -------------------------------------------------------------------------------- 1 | package dnn.memory 2 | 3 | import chisel3.util._ 4 | import chisel3.{Module, _} 5 | import config._ 6 | import interfaces._ 7 | import muxes.Demux 8 | import node._ 9 | 10 | abstract class WTController[gen <: Shapes](NumOps: Int, tensorType: String = "none")(shape: => gen)(implicit val p: Parameters) 11 | extends Module { 
12 | val io = IO(new Bundle { 13 | val WriteIn = Vec(NumOps, Flipped(Decoupled(new TensorWriteReq(shape.getWidth)))) 14 | val WriteOut = Vec(NumOps, Output(new TensorWriteResp())) 15 | val tensor = new TensorMaster(tensorType) 16 | }) 17 | } 18 | 19 | 20 | class WriteTensorController[L <: Shapes] (NumOps: Int, tensorType: String = "none")(shape: => L)(implicit p: Parameters) 21 | extends WTController(NumOps, tensorType)(shape)(p) { 22 | 23 | val arbiter = Module(new RRArbiter(new TensorWriteReq(shape.getWidth), NumOps)) 24 | val demux = Module(new Demux(new TensorWriteResp, NumOps)) 25 | 26 | // Wire up inputs with the arbiter and outputs with demux 27 | for (i <- 0 until NumOps) { 28 | arbiter.io.in(i) <> io.WriteIn(i) 29 | io.WriteOut(i) <> demux.io.outputs(i) 30 | } 31 | 32 | val arb_valid_r = RegInit(false.B) 33 | 34 | val chosen_reg = RegInit(0.U) 35 | when(arbiter.io.out.fire){ 36 | chosen_reg := arbiter.io.chosen 37 | arb_valid_r := true.B 38 | } 39 | 40 | io.tensor.wr.valid := arbiter.io.out.valid 41 | io.tensor.wr.bits.data := arbiter.io.out.bits.data.asTypeOf(io.tensor.wr.bits.data) 42 | io.tensor.wr.bits.idx := arbiter.io.out.bits.index 43 | io.tensor.rd <> DontCare 44 | 45 | arbiter.io.out.ready := true.B 46 | 47 | demux.io.sel := chosen_reg 48 | demux.io.en := arb_valid_r //arbiter.io.out.valid 49 | demux.io.input.valid := arb_valid_r //arbiter.io.out.valid 50 | demux.io.input.RouteID := io.WriteIn(arbiter.io.chosen).bits.RouteID 51 | demux.io.input.done := arb_valid_r //arbiter.io.out.valid 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /src/main/scala/dataflow/filter/BasicLoader.scala: -------------------------------------------------------------------------------- 1 | package dataflow.filter 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | import node._ 7 | import config._ 8 | import interfaces._ 9 | import arbiters._ 10 | import memory._ 11 | 12 | class BasicLoader(implicit val p: 
Parameters) extends Module with CoreParams { 13 | 14 | val FilterSize = 3*3 15 | 16 | val io = IO(new Bundle { 17 | val enable = Flipped(Decoupled(Bool())) 18 | val ptr = Vec(FilterSize,Flipped(Decoupled(new DataBundle()))) 19 | val data = Vec(FilterSize,Decoupled(new DataBundle())) 20 | }) 21 | 22 | val StackFile = Module(new TypeStackFile(ID=0,Size=32,NReads=9,NWrites=9) 23 | (WControl=new WriteMemoryController(NumOps=9,BaseSize=2,NumEntries=2)) 24 | (RControl=new ReadMemoryController(NumOps=9,BaseSize=2,NumEntries=2))) 25 | 26 | val Load = for (i <- 0 until FilterSize) yield { 27 | val ld = Module(new UnTypLoad(NumPredOps=0, NumSuccOps=0, NumOuts=1,ID=i,RouteID=i)) 28 | ld 29 | } 30 | 31 | for (i <- 0 until FilterSize) { 32 | Load(i).io.enable <> io.enable 33 | Load(i).io.GepAddr <> io.ptr(i) 34 | StackFile.io.ReadIn(i) <> Load(i).io.memReq 35 | Load(i).io.memResp <> StackFile.io.ReadOut(i) 36 | io.data(i) <> Load(i).io.Out(0) 37 | } 38 | 39 | } 40 | 41 | /* 42 | val GEP = for (i <- 0 until FilterSize) yield { 43 | val gp = Module (new GepOneNode(NumOuts = 1, ID = i)(numByte1 = 0)(p)) 44 | gp 45 | } 46 | 47 | // Wiring GEP instruction to the function argument 48 | GEP(i).io.baseAddress <> io.ptr(i) 49 | GEP(i).io.idx1.valid := true.B 50 | GEP(i).io.idx1.bits.predicate := true.B 51 | GEP(i).io.idx1.bits.data := 0.U 52 | GEP(i).io.idx2.valid := true.B 53 | GEP(i).io.idx2.bits.predicate := true.B 54 | GEP(i).io.idx2.bits.data := 0.U 55 | 56 | Load(i).io.GepAddr.bits.data := (4 + (xlen / 8) * i).U 57 | Load(i).io.GepAddr.bits.predicate := true.B 58 | Load(i).io.GepAddr.valid := true.B 59 | */ 60 | -------------------------------------------------------------------------------- /src/main/scala/loop/LoopStart.scala: -------------------------------------------------------------------------------- 1 | package loop 2 | 3 | import chisel3._ 4 | import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester} 5 | import chisel3.Module 6 | 
import chisel3.testers._ 7 | import chisel3.util._ 8 | import org.scalatest.{FlatSpec, Matchers} 9 | import config._ 10 | import interfaces._ 11 | import muxes._ 12 | import util._ 13 | import node._ 14 | import utility.UniformPrintfs 15 | 16 | 17 | /** 18 | * @note Loop header IO 19 | * @param NumInputs Number of inputs 20 | */ 21 | class LoopStartIO(val NumInputs: Int, val NumOuts: Int) 22 | (implicit p: Parameters) extends CoreBundle()(p) { 23 | 24 | val inputArg = Vec(NumInputs, Flipped(Decoupled(new DataBundle()))) 25 | val outputArg = Vec(NumOuts, Decoupled(new DataBundle())) 26 | 27 | val enableSignal = Vec(NumInputs, Flipped(Decoupled(new ControlBundle))) 28 | 29 | /** 30 | * Finish signal comes from Ret instruction 31 | */ 32 | val Finish = Vec(NumInputs, Flipped(Decoupled(new ControlBundle()))) 33 | 34 | override def cloneType = new LoopStartIO(NumInputs, NumOuts).asInstanceOf[this.type] 35 | 36 | } 37 | 38 | 39 | 40 | class LoopStart(val NumInputs: Int, val NumOuts: Int, val ID: Int) 41 | (implicit val p: Parameters) extends Module with CoreParams with UniformPrintfs { 42 | 43 | override lazy val io = IO(new LoopStartIO(NumInputs, NumOuts)) 44 | 45 | val Args = for (i <- 0 until NumInputs) yield { 46 | val arg = Module(new LiveInNode(NumOuts = 1, ID = i)) 47 | arg 48 | } 49 | 50 | //Iterating over each loopelement and connect them to the IO 51 | for (i <- 0 until NumInputs) { 52 | Args(i).io.InData <> io.inputArg(i) 53 | //Args(i).io.Finish <> io.Finish(i) 54 | Args(i).io.enable <> io.enableSignal(i) 55 | io.Finish(i).ready := true.B // just to shut simulation up 56 | } 57 | 58 | for (i <- 0 until NumOuts) { 59 | io.outputArg(i) <> Args(i).io.Out(0) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/regfile/InputRegFile.scala: -------------------------------------------------------------------------------- 1 | package regfile 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 
import chisel3.Module 6 | import chisel3.testers._ 7 | import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester} 8 | import org.scalatest.{Matchers, FlatSpec} 9 | 10 | //import examples._ 11 | import muxes._ 12 | import config._ 13 | import util._ 14 | import interfaces._ 15 | 16 | 17 | abstract class InRegFile(implicit val p: Parameters) extends Module with CoreParams { 18 | val io = IO(new Bundle{ 19 | val Data = Decoupled(UInt(xlen.W)) 20 | }) 21 | } 22 | 23 | /** 24 | * Custom counter which counts indexes 25 | * @param inc if it's enable start counting 26 | * @param maxN Max number which counter sets to zero after that 27 | * @param indx Counter output 28 | */ 29 | class IndexCounter(implicit val p: Parameters) extends Module with CoreParams{ 30 | val io = IO(new Bundle{ 31 | val inc = Input(Bool()) 32 | val maxN = Input(UInt(xlen.W)) 33 | val indx = Output(UInt(xlen.W)) 34 | }) 35 | val in_reg = RegInit(0.U(10.W)) 36 | 37 | io.indx := in_reg 38 | 39 | when(io.inc){ 40 | in_reg := Mux( io.maxN > in_reg, in_reg + 1.U, 0.U ) 41 | } 42 | 43 | } 44 | 45 | /** 46 | * This class generates local input register file for each node 47 | * @param inData An array of input data 48 | */ 49 | 50 | class InputRegFile (val inData : Array[UInt])(implicit p: Parameters) extends InRegFile()(p){ 51 | 52 | val ROM = VecInit(inData) 53 | val Valids = RegInit(VecInit(Seq.fill(inData.size)(true.B))) 54 | 55 | val counter = Module(new IndexCounter()) 56 | 57 | counter.io.inc := io.Data.ready 58 | counter.io.maxN := inData.size.U - 1.U 59 | 60 | io.Data.bits := ROM(counter.io.indx) 61 | io.Data.valid := Valids(counter.io.indx) 62 | 63 | // Set valid signal to false if you read the data 64 | when(io.Data.ready){ 65 | Valids(counter.io.indx) := false.B 66 | } 67 | 68 | } 69 | 70 | -------------------------------------------------------------------------------- /src/main/scala/stack/StackAlloca.scala: 
-------------------------------------------------------------------------------- 1 | package stack 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import chisel3.Module 6 | import chisel3.testers._ 7 | import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester} 8 | import org.scalatest.{FlatSpec, Matchers} 9 | import config._ 10 | import interfaces._ 11 | import arbiters._ 12 | import util._ 13 | import utility.UniformPrintfs 14 | import muxes._ 15 | import node._ 16 | 17 | class StackIO(NumOps: Int) 18 | (implicit p: Parameters) extends CoreBundle()(p) { 19 | val InData = Vec(NumOps, Flipped(Decoupled(new AllocaReq))) 20 | val OutData = Output(Vec(NumOps, (new AllocaResp))) 21 | 22 | override def cloneType = new StackIO(NumOps).asInstanceOf[this.type] 23 | } 24 | 25 | class Stack(NumOps: Int) 26 | (implicit val p: Parameters) extends Module with CoreParams with UniformPrintfs{ 27 | override lazy val io = IO(new StackIO(NumOps)) 28 | 29 | /** 30 | * Instantiating Arbiter module and connecting inputs to the output 31 | * @note we fix the base size to 8 32 | */ 33 | val in_arbiter = Module(new Arbiter(new AllocaReq, NumOps)) 34 | for( i <- 0 until NumOps){ 35 | in_arbiter.io.in(i) <> io.InData(i) 36 | } 37 | 38 | /** 39 | * Arbiter's output is always ready 40 | */ 41 | 42 | in_arbiter.io.out.ready := true.B 43 | 44 | /** 45 | * Stack pointer Update 46 | */ 47 | val SP = RegInit(0.U) 48 | //val old_SP = RegInit(0.U) 49 | 50 | when(in_arbiter.io.out.fire){ 51 | SP := SP + (in_arbiter.io.out.bits.numByte * in_arbiter.io.out.bits.size) 52 | } 53 | 54 | // Copy arbiter output and pointer to all outputs. 
55 | // Assert valid to the output corresponding to the arbiter grant 56 | for (i <- 0 until NumOps) { 57 | io.OutData(i).ptr := SP 58 | io.OutData(i).RouteID := in_arbiter.io.out.bits.RouteID 59 | io.OutData(i).valid := false.B 60 | } 61 | io.OutData(in_arbiter.io.chosen).valid := in_arbiter.io.out.valid 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /src/main/scala/node/CallNode.scala: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import chisel3._ 4 | import chisel3.Module 5 | import junctions._ 6 | 7 | import config._ 8 | import interfaces._ 9 | import util._ 10 | import utility.UniformPrintfs 11 | 12 | class CallNodeIO(val argTypes: Seq[Int], val retTypes: Seq[Int])(implicit p: Parameters) 13 | extends Bundle 14 | { 15 | val In = Flipped(new CallDecoupled(argTypes)) // Requests from calling block(s) 16 | val callOut = Decoupled(new Call(argTypes)) // To task 17 | val retIn = Flipped(Decoupled(new Call(retTypes))) // From task 18 | val Out = new CallDecoupled(retTypes) // Returns to calling block(s) 19 | override def cloneType = new CallNodeIO(argTypes,retTypes).asInstanceOf[this.type] 20 | } 21 | 22 | class CallNode(ID: Int, argTypes: Seq[Int], retTypes: Seq[Int]) 23 | (implicit p: Parameters, 24 | name: sourcecode.Name, 25 | file: sourcecode.File) extends Module 26 | with UniformPrintfs { 27 | override lazy val io = IO(new CallNodeIO(argTypes, retTypes)(p)) 28 | 29 | val node_name = name.value 30 | val module_name = file.value.split("/").tail.last.split("\\.").head.capitalize 31 | val (cycleCount,_) = Counter(true.B,32*1024) 32 | override val printfSigil = module_name + ": " + node_name + ID + " " 33 | 34 | // Combine individually decoupled enable and data into single decoupled call 35 | val CombineIn = Module(new CombineCall(argTypes)) 36 | CombineIn.io.In <> io.In 37 | io.callOut <> CombineIn.io.Out 38 | 39 | // Split return enable and 
arguments into individually decoupled enable and data 40 | val SplitOut = Module(new SplitCall(retTypes)) 41 | SplitOut.io.In <> io.retIn 42 | io.Out <> SplitOut.io.Out 43 | 44 | when(CombineIn.io.Out.fire) { 45 | when (CombineIn.io.Out.bits.enable.control) 46 | { 47 | printf("[LOG] " + "[" + module_name + "] [TID->%d] " + node_name + ": Output fired @ %d\n", CombineIn.io.Out.bits.enable.taskID, cycleCount) 48 | } 49 | } 50 | }
--------------------------------------------------------------------------------
/src/main/scala/loop/LoopElement.scala:
--------------------------------------------------------------------------------
package loop

import chisel3._
import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester}
import chisel3.Module
import chisel3.testers._
import chisel3.util._
import org.scalatest.{FlatSpec, Matchers}
import config._
import interfaces._
import muxes._
import util._
import node._
import utility.UniformPrintfs


/** Holds a single loop input argument; behaves like a one-entry register file. */
class LoopElementIO()(implicit p: Parameters) extends CoreBundle() {

  // Module inputs.
  val inData = Flipped(Decoupled(CustomDataBundle(UInt(16.W))))
  val Finish = Input(Bool())

  // Module outputs.
  val outData = new Bundle {
    val data  = Output(CustomDataBundle(UInt(32.W))) // Defaults to UInt(32.W)
    val valid = Output(Bool())
  }
}


/** Latches one incoming value and holds it valid until `Finish` resets it. */
class LoopElement(val ID: Int)(implicit val p: Parameters)
  extends Module with CoreParams with UniformPrintfs {

  override lazy val io = IO(new LoopElementIO())

  // Printf debugging.
  override val printfSigil = "Node ID: " + ID + " "

  // Always latch the input data.
  val data_R = RegNext(io.inData.bits)

  io.outData.data <> data_R

  // Two states: waiting for a value (s_INIT) vs. holding it (s_LATCH).
  val s_INIT :: s_LATCH :: Nil = Enum(2)
  val state = RegInit(s_INIT)

  // State transition: ready/valid outputs depend only on the current state.
  when(state === s_INIT) {
    io.inData.ready := true.B
    io.outData.valid := false.B
  }.otherwise {
    // }.elsewhen( state === s_LATCH){
    io.inData.ready := false.B
    io.outData.valid := true.B
  }

  // Capture on a successful handshake; Finish returns us to the idle state.
  when(io.inData.fire()) {
    state := s_LATCH
  }

  when(io.Finish) {
    state := s_INIT
  }

  // Debugging info.
  // printfInfo(" State: %x\n", state)

}
-------------------------------------------------------------------------------- /src/main/scala/dataflow/DataFlow.scala: -------------------------------------------------------------------------------- 1 | package dataflow 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | import node._ 7 | import config._ 8 | import interfaces._ 9 | import arbiters._ 10 | import memory._ 11 | 12 | class TypeLoadDataFlow(implicit val p: Parameters) extends Module with CoreParams { 13 | 14 | val io = IO(new Bundle { 15 | val dummy = Input(UInt { 16 | 32.W 17 | }) 18 | }) 19 | 20 | val StackFile = Module(new TypeStackFile(ID = 0, Size = 32, NReads = 1, NWrites = 1) 21 | (WControl = new WriteTypMemoryController(NumOps = 1, BaseSize = 2, NumEntries = 1)) 22 | (RControl = new ReadTypMemoryController(NumOps = 1, BaseSize = 2, NumEntries = 1))) 23 | val Store = Module(new TypStore(NumPredOps = 0, NumSuccOps = 1, NumOuts = 1, ID = 0, RouteID = 0)) 24 | val Load = Module(new TypLoad(NumPredOps = 1, NumSuccOps = 0, NumOuts = 1, ID = 0, RouteID = 0)) 25 | 26 | 27 | StackFile.io.ReadIn(0) <> Load.io.memReq 28 | Load.io.memResp <> StackFile.io.ReadOut(0) 29 | 30 | StackFile.io.WriteIn(0) <> Store.io.memReq 31 | Store.io.memResp <> StackFile.io.WriteOut(0) 32 | 33 | 34 | Store.io.GepAddr.bits.data := 8.U 35 | Store.io.GepAddr.bits.predicate := true.B 36 | Store.io.GepAddr.valid := true.B 37 | 38 |
Store.io.inData.bits.data := 0x1eadbeefbeefbeefL.U 39 | Store.io.inData.bits.predicate := true.B 40 | Store.io.inData.valid := true.B 41 | 42 | Store.io.enable.bits.control := true.B 43 | Store.io.enable.valid := true.B 44 | Store.io.Out(0).ready := true.B 45 | Store.io.GepAddr.bits.taskID := 0.U 46 | Store.io.inData.bits.taskID := 0.U 47 | Store.io.inData.bits.valid := true.B 48 | Store.io.enable.bits.taskID := 0.U 49 | 50 | 51 | Load.io.GepAddr.bits.data := 8.U 52 | Load.io.GepAddr.bits.predicate := true.B 53 | Load.io.GepAddr.valid := true.B 54 | 55 | Load.io.enable.bits.control := true.B 56 | Load.io.enable.valid := true.B 57 | Load.io.Out(0).ready := true.B 58 | Load.io.enable.bits.taskID := 0.U 59 | Load.io.GepAddr.bits.taskID := 0.U 60 | 61 | Load.io.PredOp(0) <> Store.io.SuccOp(0) 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /src/test/scala/FPU/FPCompareNode.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE for license details. 2 | 3 | package FPU 4 | 5 | import chisel3._ 6 | import chisel3.util._ 7 | import FPU._ 8 | import FType._ 9 | 10 | import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester} 11 | import org.scalatest.{Matchers, FlatSpec} 12 | 13 | import node._ 14 | import dataflow._ 15 | import muxes._ 16 | import config._ 17 | import util._ 18 | import interfaces._ 19 | 20 | 21 | // Tester. 
22 | class FPCompareNodeTester(df: FPCompareNode) 23 | (implicit p: config.Parameters) extends PeekPokeTester(df) { 24 | 25 | 26 | poke(df.io.LeftIO.bits.data, 0x40800000.U) 27 | poke(df.io.LeftIO.valid, false.B) 28 | poke(df.io.LeftIO.bits.predicate, false.B) 29 | 30 | poke(df.io.RightIO.bits.data, 0x40800000.U) 31 | poke(df.io.RightIO.valid, false.B) 32 | poke(df.io.RightIO.bits.predicate, false.B) 33 | 34 | poke(df.io.enable.bits.control, false.B) 35 | poke(df.io.enable.valid, false.B) 36 | poke(df.io.Out(0).ready, false.B) 37 | println(s"Output: ${peek(df.io.Out(0))}\n") 38 | 39 | 40 | step(1) 41 | 42 | poke(df.io.enable.bits.control, true.B) 43 | poke(df.io.enable.valid, true.B) 44 | poke(df.io.Out(0).ready, true.B) 45 | 46 | 47 | poke(df.io.LeftIO.valid, true.B) 48 | poke(df.io.RightIO.valid, true.B) 49 | poke(df.io.LeftIO.bits.predicate, true.B) 50 | poke(df.io.RightIO.bits.predicate, true.B) 51 | 52 | println(s"Output: ${peek(df.io.Out(0))}\n") 53 | 54 | println(s"t: -1\n -------------------------------------") 55 | step(1) 56 | 57 | 58 | for (i <- 0 until 10) { 59 | println(s"Output: ${peek(df.io.Out(0))}\n") 60 | 61 | println(s"t: ${i}\n -------------------------------------") 62 | step(1) 63 | } 64 | } 65 | 66 | class FPCompareTests extends FlatSpec with Matchers { 67 | implicit val p = config.Parameters.root((new SinglePrecisionFPConfig).toInstance) 68 | it should "FP MAC tester" in { 69 | chisel3.iotesters.Driver(() => new FPCompareNode(NumOuts = 1, ID = 0, opCode = "=EQ")(t = S)) { 70 | c => new FPCompareNodeTester(c) 71 | } should be(true) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/dnnnode/DiffShapeTransformer.scala: -------------------------------------------------------------------------------- 1 | 2 | package dnnnode 3 | 4 | import chisel3._ 5 | import chisel3.util._ 6 | import config._ 7 | import dnn.memory.{TensorMaster, TensorParams} 8 | import shell._ 9 | 10 | 11 | /** 
Diff queue. 12 | * 13 | * Load 1D and 2D tensors from main memory (DRAM) to input/weight 14 | * scratchpads (SRAM). Also, there is support for zero padding, while 15 | * doing the load. Zero-padding works on the y and x axis, and it is 16 | * managed by TensorPadCtrl. The TensorDataCtrl is in charge of 17 | * handling the way tensors are stored on the scratchpads. 18 | */ 19 | class DiffShapeTransformerIO(memTensorType: String = "none")(implicit val p: Parameters) 20 | extends Module { 21 | val tp = new TensorParams(memTensorType) 22 | val mp = p(ShellKey).memParams 23 | val io = IO(new Bundle { 24 | val start = Input(Bool()) 25 | val len = Input(UInt(mp.addrBits.W)) 26 | val in = Flipped(Decoupled(UInt(p(XLEN).W))) 27 | val out = Decoupled(UInt(p(XLEN).W)) 28 | }) 29 | } 30 | 31 | class DiffShapeTransformer(bufSize: Int, memTensorType: String = "none")(implicit p: Parameters) 32 | extends DiffShapeTransformerIO(memTensorType)(p) { 33 | 34 | require(bufSize > 1, "DiffQueue must have at least two elements") 35 | 36 | val elemNum = io.len 37 | 38 | val popCnt = Counter(tp.memDepth) 39 | 40 | val sIdle :: sRead :: Nil = Enum(2) 41 | val state = RegInit(sIdle) 42 | 43 | val queue = Module(new DiffQueue(UInt(p(XLEN).W), bufSize, NumIns = 1)) 44 | 45 | queue.io.clear := false.B 46 | queue.io.enq.bits := io.in.bits.asTypeOf(queue.io.enq.bits) 47 | 48 | queue.io.enq.valid := io.in.valid && state === sRead 49 | io.in.ready := queue.io.enq.ready 50 | 51 | io.out <> queue.io.deq 52 | 53 | when(queue.io.deq.fire()) {popCnt.inc()} 54 | 55 | 56 | switch(state){ 57 | is(sIdle){ 58 | when(io.start){ 59 | state := sRead 60 | } 61 | } 62 | is(sRead){ 63 | when((popCnt.value === elemNum - 1.U) && queue.io.deq.fire()){ 64 | popCnt.value := 0.U 65 | queue.io.clear := true.B 66 | state := sIdle 67 | } 68 | } 69 | } 70 | 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/shell/VTAShell.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package shell 21 | 22 | import chisel3._ 23 | import config._ 24 | 25 | /** Shell parameters. */ 26 | case class ShellParams( 27 | hostParams: AXIParams, 28 | memParams: AXIParams, 29 | vcrParams: VCRSimParams, 30 | vmeParams: VMESimParams, 31 | ) 32 | 33 | case object ShellKey extends Field[ShellParams] 34 | 35 | /** VTAShell. 36 | * 37 | * The VTAShell is based on a VME, VCR and core. This creates a complete VTA 38 | * system that can be used for simulation or real hardware. 
39 | */ 40 | class VTAShell(implicit p: Parameters) extends Module { 41 | val io = IO(new Bundle{ 42 | val host = new AXILiteClient(p(ShellKey).hostParams) 43 | val mem = new AXIMaster(p(ShellKey).memParams) 44 | }) 45 | 46 | val vcr = Module(new VCR) 47 | val vme = Module(new VME) 48 | 49 | vcr.io.vcr.ecnt(0) <> DontCare 50 | vcr.io.vcr.finish := false.B 51 | 52 | vme.io.vme.rd(0) <> DontCare 53 | vme.io.vme.wr(0) <> DontCare 54 | // vme.io.vme.wr(0) <> DontCare 55 | // 56 | /* Insert Core Here */ 57 | // val core = Module(new Core) 58 | // 59 | /* Connect Control Status Registers */ 60 | // core.io.vcr <> vcr.io.vcr 61 | // vme.io.vme <> core.io.vme 62 | // 63 | /* Connect AXI */ 64 | io.host <> vcr.io.host 65 | io.mem <> vme.io.mem 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/test/scala/FPU/FPComputeNode.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE for license details. 2 | 3 | package FPU 4 | 5 | import chisel3._ 6 | import chisel3.util._ 7 | import FPU._ 8 | import FType._ 9 | 10 | import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester, OrderedDecoupledHWIOTester} 11 | import org.scalatest.{Matchers, FlatSpec} 12 | 13 | import node._ 14 | import dataflow._ 15 | import muxes._ 16 | import config._ 17 | import util._ 18 | import interfaces._ 19 | 20 | 21 | // Tester. 
22 | class FPComputeNodeTester(df: FPComputeNode) 23 | (implicit p: config.Parameters) extends PeekPokeTester(df) { 24 | 25 | 26 | poke(df.io.LeftIO.bits.data, 0x40800000.U) 27 | poke(df.io.LeftIO.valid, false.B) 28 | poke(df.io.LeftIO.bits.predicate, false.B) 29 | 30 | poke(df.io.RightIO.bits.data, 0x40800000.U) 31 | poke(df.io.RightIO.valid, false.B) 32 | poke(df.io.RightIO.bits.predicate, false.B) 33 | 34 | poke(df.io.enable.bits.control, false.B) 35 | poke(df.io.enable.valid, false.B) 36 | poke(df.io.Out(0).ready, false.B) 37 | println(s"Output: ${peek(df.io.Out(0))}\n") 38 | 39 | 40 | step(1) 41 | 42 | poke(df.io.enable.bits.control, true.B) 43 | poke(df.io.enable.valid, true.B) 44 | poke(df.io.Out(0).ready, true.B) 45 | 46 | 47 | poke(df.io.LeftIO.valid, true.B) 48 | poke(df.io.RightIO.valid, true.B) 49 | poke(df.io.LeftIO.bits.predicate, true.B) 50 | poke(df.io.RightIO.bits.predicate, true.B) 51 | 52 | println(s"Output: ${peek(df.io.Out(0))}\n") 53 | 54 | println(s"t: -1\n -------------------------------------") 55 | step(1) 56 | 57 | 58 | for (i <- 0 until 10) { 59 | println(s"Output: ${peek(df.io.Out(0))}\n") 60 | 61 | println(s"t: ${i}\n -------------------------------------") 62 | step(1) 63 | } 64 | } 65 | 66 | class FPComputeTests extends FlatSpec with Matchers { 67 | implicit val p = config.Parameters.root((new SinglePrecisionFPConfig).toInstance) 68 | it should "FP Add tester" in { 69 | chisel3.iotesters.Driver( 70 | () => new FPComputeNode(NumOuts = 1, ID = 0, opCode = "Add")(t = S)) { 71 | c => new FPComputeNodeTester(c) 72 | } should be(true) 73 | } 74 | } 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/main/resources/verilog/VTASimDPI.v: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | module VTASimDPI 21 | ( 22 | input clock, 23 | input reset, 24 | output logic dpi_wait 25 | ); 26 | 27 | import "DPI-C" function void VTASimDPI 28 | ( 29 | output byte unsigned sim_wait, 30 | output byte unsigned sim_exit 31 | ); 32 | 33 | typedef logic dpi1_t; 34 | typedef logic [7:0] dpi8_t; 35 | 36 | dpi1_t __reset; 37 | dpi8_t __wait; 38 | dpi8_t __exit; 39 | 40 | // reset 41 | always_ff @(posedge clock) begin 42 | __reset <= reset; 43 | end 44 | 45 | // evaluate DPI function 46 | always_ff @(posedge clock) begin 47 | if (reset | __reset) begin 48 | __wait = 0; 49 | __exit = 0; 50 | end 51 | else begin 52 | VTASimDPI( 53 | __wait, 54 | __exit); 55 | end 56 | end 57 | 58 | logic wait_reg; 59 | 60 | always_ff @(posedge clock) begin 61 | if (reset | __reset) begin 62 | wait_reg <= 1'b0; 63 | end else if (__wait == 1) begin 64 | wait_reg <= 1'b1; 65 | end else begin 66 | wait_reg <= 1'b0; 67 | end 68 | end 69 | 70 | assign dpi_wait = wait_reg; 71 | 72 | always_ff @(posedge clock) begin 73 | if (__exit == 1) begin 74 | $finish; 75 | end 76 | end 77 | 78 | endmodule 79 | -------------------------------------------------------------------------------- /src/main/scala/loop/Example1.scala: 
-------------------------------------------------------------------------------- 1 | package loop 2 | 3 | import chisel3._ 4 | import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester} 5 | import chisel3.Module 6 | import chisel3.testers._ 7 | import chisel3.util._ 8 | import org.scalatest.{FlatSpec, Matchers} 9 | import config._ 10 | import interfaces._ 11 | import muxes._ 12 | import util._ 13 | import node._ 14 | import utility.UniformPrintfs 15 | 16 | 17 | class LoopExampleIO[T <: Data](val ID: Int)(gen: T)(implicit p: Parameters) extends CoreBundle(){ 18 | 19 | val Input1 = Flipped(Decoupled(gen)) 20 | val Input2 = Flipped(Decoupled(gen)) 21 | val Input3 = Flipped(Decoupled(gen)) 22 | val Input4 = Flipped(Decoupled(gen)) 23 | 24 | val Enable = Flipped(Decoupled(new ControlBundle)) 25 | 26 | val Finish = Input(Bool()) 27 | 28 | val Result = Decoupled(gen) 29 | 30 | override def cloneType = new LoopExampleIO(ID)(gen).asInstanceOf[this.type] 31 | } 32 | 33 | 34 | class LoopExample(val NumInputs: Int, val ID: Int) 35 | (implicit val p: Parameters) extends Module with CoreParams with UniformPrintfs{ 36 | 37 | lazy val io = IO(new LoopExampleIO(ID)(new DataBundle())) 38 | 39 | val head = Module(new LoopHeader(NumInputs = NumInputs, NumOuts = 4, ID = 0)) 40 | val comp1 = Module(new ComputeNode(NumOuts = 1, ID = 1, opCode = "Add")(sign = false)) 41 | val comp2 = Module(new ComputeNode(NumOuts = 1, ID = 2, opCode = "Add")(sign = false)) 42 | val comp3 = Module(new ComputeNode(NumOuts = 1, ID = 3, opCode = "Add")(sign = false)) 43 | 44 | comp1.io.enable <> io.Enable 45 | comp2.io.enable <> io.Enable 46 | comp3.io.enable <> io.Enable 47 | 48 | head.io.inputArg(0) <> io.Input1 49 | head.io.inputArg(1) <> io.Input2 50 | head.io.inputArg(2) <> io.Input3 51 | head.io.inputArg(3) <> io.Input4 52 | 53 | head.io.Finish <> io.Finish 54 | 55 | comp1.io.LeftIO <> head.io.outputArg(0) 56 | comp1.io.RightIO <> head.io.outputArg(1) 57 | 58 | 
comp2.io.LeftIO <> head.io.outputArg(2) 59 | comp2.io.RightIO <> head.io.outputArg(3) 60 | 61 | comp3.io.LeftIO <> comp1.io.Out(0) 62 | comp3.io.RightIO <> comp2.io.Out(0) 63 | 64 | io.Result <> comp3.io.Out(0) 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/test/scala/dnn/TLoad.scala: -------------------------------------------------------------------------------- 1 | package dnn 2 | 3 | import chisel3.iotesters.PeekPokeTester 4 | import config._ 5 | import dnnnode.TLoad 6 | import node.matNxN 7 | import org.scalatest.{FlatSpec, Matchers} 8 | import utility._ 9 | 10 | class TLoadNodeTests(df: TLoad[matNxN]) (implicit p: config.Parameters) extends PeekPokeTester(df) { 11 | def N = false 12 | def Y = true 13 | val Control = Map( 14 | "Default" -> List(N,N,N,N,N,N,N,N,N,N), 15 | "Active" -> List(N,N, N,N, Y, N,N, Y,Y), 16 | "Input" -> List(Y,Y, Y,Y, Y, N,N, Y,Y), 17 | "~Input" -> List(Y,Y, Y,N, Y, N,N, Y,Y), 18 | "~Control" -> List(Y,N, Y,N, Y, N,N, Y,Y) 19 | ).withDefaultValue(List(N,N,N,N,N,N,N,N,N,N)) 20 | 21 | val sigs = Seq(df.io.enable.valid, df.io.enable.bits.control, 22 | df.io.GepAddr.valid, df.io.GepAddr.bits.predicate, 23 | df.io.PredOp(0).valid, 24 | df.io.tensorReq.ready, 25 | df.io.tensorResp.valid, 26 | df.io.SuccOp(0).ready, 27 | df.io.Out(0).ready 28 | ) 29 | 30 | sigs zip Control("Default") map {case(s,d) => poke(s,d)} 31 | sigs zip Control("Active") map {case(s,d) => poke(s,d)} 32 | 33 | 34 | for (t <- 0 until 20) { 35 | step(1) 36 | 37 | if (peek(df.io.GepAddr.ready) == 1) { 38 | sigs zip Control("~Control") map {case(s,d) => poke(s,d)} 39 | poke(df.io.GepAddr.bits.data, 12) 40 | } 41 | 42 | if((peek(df.io.tensorReq.valid) == 1) && (t > 4)) 43 | { 44 | poke(df.io.tensorReq.ready,true) 45 | } 46 | 47 | if (t > 5 && peek(df.io.tensorReq.ready) == 1) 48 | { 49 | // poke(df.io.tensorResp.data,t) 50 | poke(df.io.tensorResp.data, 0xdeadbeef+t) 51 | poke(df.io.tensorResp.valid,true) 52 | } 53 | } 54 
| 55 | } 56 | 57 | import utility.Constants._ 58 | 59 | class TLoadNodeTester extends FlatSpec with Matchers { 60 | implicit val p = config.Parameters.root((new MiniConfig).toInstance) 61 | it should "TLoad Node tester" in { 62 | chisel3.iotesters.Driver(() => new TLoad(NumPredOps=1,NumSuccOps=1,NumOuts=1,ID=1,RouteID=0)(new matNxN(2, false))) { c => 63 | new TLoadNodeTests(c) 64 | } should be(true) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/test/scala/dnn/TStore.scala: -------------------------------------------------------------------------------- 1 | package dnn 2 | 3 | /** 4 | * Created by nvedula on 15/5/17. 5 | */ 6 | 7 | 8 | import chisel3.iotesters.PeekPokeTester 9 | import config._ 10 | import dnnnode.TStore 11 | import node.matNxN 12 | import org.scalatest.{FlatSpec, Matchers} 13 | import utility._ 14 | 15 | class TStoreNodeTests(df: TStore[matNxN]) (implicit p: config.Parameters) extends PeekPokeTester(df) { 16 | poke(df.io.GepAddr.valid,false) 17 | poke(df.io.enable.valid,false) 18 | poke(df.io.inData.valid,false) 19 | poke(df.io.PredOp(0).valid,true) 20 | poke(df.io.tensorReq.ready,false) 21 | poke(df.io.tensorResp.valid,false) 22 | 23 | 24 | poke(df.io.SuccOp(0).ready,true) 25 | poke(df.io.Out(0).ready,false) 26 | 27 | 28 | for (t <- 0 until 20) { 29 | 30 | step(1) 31 | 32 | //IF ready is set 33 | // send address 34 | if (peek(df.io.GepAddr.ready) == 1) { 35 | poke(df.io.GepAddr.valid, true) 36 | poke(df.io.GepAddr.bits.data, 12) 37 | poke(df.io.GepAddr.bits.predicate, true) 38 | poke(df.io.inData.valid, true) 39 | poke(df.io.inData.bits.data, t+1) 40 | poke(df.io.inData.bits.predicate,true) 41 | // // poke(c.io.inData.bits.valid,true) 42 | poke(df.io.enable.bits.control,true) 43 | poke(df.io.enable.valid,true) 44 | } 45 | 46 | if((peek(df.io.tensorReq.valid) == 1) && (t > 4)) 47 | { 48 | poke(df.io.tensorReq.ready,true) 49 | } 50 | 51 | if (t > 5 && peek(df.io.tensorReq.ready) == 1) 52 | { 
53 | // poke(c.io.memReq.ready,false) 54 | // poke(c.io.memResp.data,t) 55 | poke(df.io.tensorResp.valid,true) 56 | } 57 | printf(s"t: ${t} io.Out: ${peek(df.io.Out(0))} \n") 58 | 59 | } 60 | 61 | 62 | } 63 | 64 | 65 | 66 | import utility.Constants._ 67 | 68 | class TStoreNodeTester extends FlatSpec with Matchers { 69 | implicit val p = config.Parameters.root((new MiniConfig).toInstance) 70 | it should "TStore Node tester" in { 71 | chisel3.iotesters.Driver(() => new TStore(NumPredOps=1,NumSuccOps=1,NumOuts=1,ID=1,RouteID=0)(new matNxN(2, false))) { c => 72 | new TStoreNodeTests(c) 73 | } should be(true) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/dataflow/filter/VecFilter.scala: -------------------------------------------------------------------------------- 1 | package dataflow.filter 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | import node._ 7 | import config._ 8 | import interfaces._ 9 | import arbiters._ 10 | import memory._ 11 | 12 | class VecFilter(implicit val p: Parameters) extends Module with CoreParams { 13 | 14 | val FilterSize = 3 15 | 16 | val io = IO(new Bundle { 17 | val enable = Flipped(Decoupled(new ControlBundle( ))) 18 | val data = Vec(FilterSize, Flipped(Decoupled(new TypBundle( )))) 19 | val kern = Vec(FilterSize, Flipped(Decoupled(new TypBundle( )))) 20 | val sum = Decoupled(new TypBundle( )) 21 | }) 22 | 23 | val Multiplier = for (i <- 0 until FilterSize) yield { 24 | val mul = Module(new TypCompute(NumOuts = 1, ID = 0, opCode = "Mul")(sign = false)(new vecN(3))) 25 | mul 26 | } 27 | 28 | for (i <- 0 until FilterSize) { 29 | Multiplier(i).io.LeftIO <> io.data(i) 30 | Multiplier(i).io.RightIO <> io.kern(i) 31 | Multiplier(i).io.enable <> io.enable 32 | } 33 | 34 | val Adder = for (i <- 0 until FilterSize - 1) yield { 35 | val add = Module(new TypCompute(NumOuts = 1, ID = 0, opCode = "Add")(sign = false)(new vecN(3))) 36 | add 37 | } 38 | 39 | // First row 40 
| Adder(0).io.LeftIO <> Multiplier(0).io.Out(0) 41 | Adder(0).io.RightIO <> Multiplier(1).io.Out(0) 42 | Adder(0).io.enable <> io.enable 43 | // Second Row 44 | Adder(1).io.LeftIO <> Adder(0).io.Out(0) 45 | Adder(1).io.RightIO <> Multiplier(2).io.Out(0) 46 | Adder(1).io.enable <> io.enable 47 | 48 | io.sum <> Adder(1).io.Out(0) 49 | 50 | // Info 51 | val countOn = true.B // increment counter every clock cycle 52 | val (counterValue, counterWrap) = Counter(countOn, 64 * 1024) 53 | 54 | val active = RegInit(init = false.B) 55 | val active_r = RegInit(init = false.B) 56 | active := Multiplier(0).io.Out(0).valid || Multiplier(1).io.Out(0).valid || Multiplier(2).io.Out(0).valid || 57 | Adder(0).io.Out(0).valid || Adder(1).io.Out(0).valid 58 | active_r := active 59 | when(active && !active_r) { 60 | printf("\nCOMPUTE START: %d\n", counterValue) 61 | } 62 | when(!active && active_r) { 63 | printf("\nCOMPUTE END: %d\n", counterValue) 64 | } 65 | 66 | } 67 | 68 | -------------------------------------------------------------------------------- /include/dmlc/endian.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Copyright (c) 2017 by Contributors 3 | * \file endian.h 4 | * \brief Endian testing, need c++11 5 | */ 6 | #ifndef DMLC_ENDIAN_H_ 7 | #define DMLC_ENDIAN_H_ 8 | 9 | #include "./base.h" 10 | 11 | #ifdef DMLC_CMAKE_LITTLE_ENDIAN 12 | // If compiled with CMake, use CMake's endian detection logic 13 | #define DMLC_LITTLE_ENDIAN DMLC_CMAKE_LITTLE_ENDIAN 14 | #else 15 | #if defined(__APPLE__) || defined(_WIN32) 16 | #define DMLC_LITTLE_ENDIAN 1 17 | #elif defined(__GLIBC__) || defined(__GNU_LIBRARY__) \ 18 | || defined(__ANDROID__) || defined(__RISCV__) 19 | #include <endian.h> 20 | #define DMLC_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) 21 | #elif defined(__FreeBSD__) || defined(__OpenBSD__) 22 | #include <sys/endian.h> 23 | #define DMLC_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) 24 | #elif defined(__EMSCRIPTEN__) || defined(__hexagon__) 25 | #define DMLC_LITTLE_ENDIAN 1 26 | #elif defined(__sun) || defined(sun) 27 | #include <sys/isa_defs.h> 28 | #if defined(_LITTLE_ENDIAN) 29 | #define DMLC_LITTLE_ENDIAN 1 30 | #else 31 | #define DMLC_LITTLE_ENDIAN 0 32 | #endif 33 | #else 34 | #error "Unable to determine endianness of your machine; use CMake to compile" 35 | #endif 36 | #endif 37 | 38 | /*! \brief whether serialize using little endian */ 39 | #define DMLC_IO_NO_ENDIAN_SWAP (DMLC_LITTLE_ENDIAN == DMLC_IO_USE_LITTLE_ENDIAN) 40 | 41 | namespace dmlc { 42 | 43 | /*! 44 | * \brief A generic inplace byte swapping function. 45 | * \param data The data pointer. 46 | * \param elem_bytes The number of bytes of the data elements 47 | * \param num_elems Number of elements in the data.
48 | * \note Always try pass in constant elem_bytes to enable 49 | * compiler optimization 50 | */ 51 | inline void ByteSwap(void* data, size_t elem_bytes, size_t num_elems) { 52 | for (size_t i = 0; i < num_elems; ++i) { 53 | uint8_t* bptr = reinterpret_cast<uint8_t*>(data) + elem_bytes * i; 54 | for (size_t j = 0; j < elem_bytes / 2; ++j) { 55 | uint8_t v = bptr[elem_bytes - 1 - j]; 56 | bptr[elem_bytes - 1 - j] = bptr[j]; 57 | bptr[j] = v; 58 | } 59 | } 60 | } 61 | 62 | } // namespace dmlc 63 | #endif // DMLC_ENDIAN_H_ 64 | -------------------------------------------------------------------------------- /include/dmlc/thread_local.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2015 by Contributors 3 | * \file thread_local.h 4 | * \brief Portable thread local storage. 5 | */ 6 | #ifndef DMLC_THREAD_LOCAL_H_ 7 | #define DMLC_THREAD_LOCAL_H_ 8 | 9 | #include <mutex> 10 | #include <memory> 11 | #include <vector> 12 | #include "./base.h" 13 | 14 | namespace dmlc { 15 | 16 | // macro handling for threadlocal variables 17 | #ifdef __GNUC__ 18 | #define MX_THREAD_LOCAL __thread 19 | #elif __STDC_VERSION__ >= 201112L 20 | #define MX_THREAD_LOCAL _Thread_local 21 | #elif defined(_MSC_VER) 22 | #define MX_THREAD_LOCAL __declspec(thread) 23 | #endif 24 | 25 | #if DMLC_CXX11_THREAD_LOCAL == 0 26 | #pragma message("Warning: CXX11 thread_local is not formally supported") 27 | #endif 28 | 29 | /*! 30 | * \brief A threadlocal store to store threadlocal variables. 31 | * Will return a thread local singleton of type T 32 | * \tparam T the type we like to store 33 | */ 34 | template<typename T> 35 | class ThreadLocalStore { 36 | public: 37 | /*!
\return get a thread local singleton */ 38 | static T* Get() { 39 | #if DMLC_CXX11_THREAD_LOCAL 40 | static thread_local T inst; 41 | return &inst; 42 | #else 43 | static MX_THREAD_LOCAL T* ptr = nullptr; 44 | if (ptr == nullptr) { 45 | ptr = new T(); 46 | Singleton()->RegisterDelete(ptr); 47 | } 48 | return ptr; 49 | #endif 50 | } 51 | 52 | private: 53 | /*! \brief constructor */ 54 | ThreadLocalStore() {} 55 | /*! \brief destructor */ 56 | ~ThreadLocalStore() { 57 | for (size_t i = 0; i < data_.size(); ++i) { 58 | delete data_[i]; 59 | } 60 | } 61 | /*! \return singleton of the store */ 62 | static ThreadLocalStore<T> *Singleton() { 63 | static ThreadLocalStore<T> inst; 64 | return &inst; 65 | } 66 | /*! 67 | * \brief register str for internal deletion 68 | * \param str the string pointer 69 | */ 70 | void RegisterDelete(T *str) { 71 | std::unique_lock<std::mutex> lock(mutex_); 72 | data_.push_back(str); 73 | lock.unlock(); 74 | } 75 | /*! \brief internal mutex */ 76 | std::mutex mutex_; 77 | /*! \brief internal data */ 78 | std::vector<T*> data_; 79 | }; 80 | 81 | } // namespace dmlc 82 | 83 | #endif // DMLC_THREAD_LOCAL_H_ 84 | -------------------------------------------------------------------------------- /src/main/scala/dnn/memory/ReadTensorController.scala: -------------------------------------------------------------------------------- 1 | package dnn.memory 2 | 3 | import chisel3.{Module, _} 4 | import chisel3.util._ 5 | import muxes._ 6 | import config._ 7 | import interfaces._ 8 | import node._ 9 | 10 | abstract class RTController[gen <: Shapes](NumOps: Int, tensorType: String = "none")(shape: => gen)(implicit val p: Parameters) 11 | extends Module { 12 | val io = IO(new Bundle { 13 | val ReadIn = Vec(NumOps, Flipped(Decoupled(new TensorReadReq()))) 14 | val ReadOut = Vec(NumOps, Output(new TensorReadResp(shape.getWidth))) 15 | val tensor = new TensorMaster(tensorType) 16 | }) 17 | } 18 | 19 | class ReadTensorController[L <: Shapes] (NumOps: Int, tensorType: String =
"none")(shape: => L)(implicit p: Parameters) 20 | extends RTController(NumOps, tensorType)(shape)(p) { 21 | 22 | val arbiter = Module(new RRArbiter(new TensorReadReq, NumOps)) 23 | val demux = Module(new Demux(new TensorReadResp(shape.getWidth), NumOps)) 24 | 25 | // Wire up inputs with the arbiter and outputs with demux 26 | for (i <- 0 until NumOps) { 27 | arbiter.io.in(i) <> io.ReadIn(i) 28 | io.ReadOut(i) <> demux.io.outputs(i) 29 | } 30 | 31 | val chosen_reg = RegInit(0.U) 32 | when(arbiter.io.out.fire){ 33 | chosen_reg := arbiter.io.chosen 34 | } 35 | 36 | demux.io.sel := chosen_reg 37 | //demux.io.sel := arbiter.io.chosen 38 | demux.io.en := io.tensor.rd.data.valid 39 | demux.io.input.data := io.tensor.rd.data.bits.asUInt() 40 | demux.io.input.valid := io.tensor.rd.data.valid 41 | // We don't have RouteID in TensorMaster interface !!!! 42 | demux.io.input.RouteID := arbiter.io.chosen 43 | 44 | io.tensor.rd.idx.bits := arbiter.io.out.bits.index 45 | io.tensor.rd.idx.valid := arbiter.io.out.valid 46 | arbiter.io.out.ready := true.B 47 | 48 | val s_idle:: s_wait :: Nil = Enum(2) 49 | val state = RegInit(s_idle) 50 | 51 | switch(state){ 52 | is(s_idle){ 53 | arbiter.io.out.ready := true.B 54 | when(arbiter.io.out.fire){ 55 | state := s_wait 56 | } 57 | } 58 | is(s_wait){ 59 | arbiter.io.out.ready := false.B 60 | when(io.tensor.rd.data.valid){ 61 | state := s_idle 62 | } 63 | } 64 | } 65 | 66 | 67 | io.tensor.wr <> DontCare 68 | } 69 | 70 | -------------------------------------------------------------------------------- /include/runtime/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /*! 21 | * \file tvm/runtime/util.h 22 | * \brief Useful runtime util. 23 | */ 24 | #ifndef TVM_RUNTIME_UTIL_H_ 25 | #define TVM_RUNTIME_UTIL_H_ 26 | 27 | #include "c_runtime_api.h" 28 | 29 | namespace tvm { 30 | namespace runtime { 31 | 32 | /*! 33 | * \brief Check whether type matches the given spec. 34 | * \param t The type 35 | * \param code The type code. 36 | * \param bits The number of bits to be matched. 37 | * \param lanes The number of lanes in the type. 38 | */ 39 | inline bool TypeMatch(TVMType t, int code, int bits, int lanes = 1) { 40 | return t.code == code && t.bits == bits && t.lanes == lanes; 41 | } 42 | } // namespace runtime 43 | } // namespace tvm 44 | // Forward declare the intrinsic id we need 45 | // in structure fetch to enable stackvm in runtime 46 | namespace tvm { 47 | namespace ir { 48 | namespace intrinsic { 49 | /*! 
\brief The kind of structure field info used in intrinsic */ 50 | enum TVMStructFieldKind : int { 51 | // array head address 52 | kArrAddr, 53 | kArrData, 54 | kArrShape, 55 | kArrStrides, 56 | kArrNDim, 57 | kArrTypeCode, 58 | kArrTypeBits, 59 | kArrTypeLanes, 60 | kArrByteOffset, 61 | kArrDeviceId, 62 | kArrDeviceType, 63 | kArrKindBound_, 64 | // TVMValue field 65 | kTVMValueContent, 66 | kTVMValueKindBound_ 67 | }; 68 | } // namespace intrinsic 69 | } // namespace ir 70 | } // namespace tvm 71 | #endif // TVM_RUNTIME_UTIL_H_ 72 | -------------------------------------------------------------------------------- /src/main/scala/tensorKernels/MergeAdd.scala: -------------------------------------------------------------------------------- 1 | package tensorKernels 2 | 3 | import chisel3.util.Decoupled 4 | import chisel3.{Flipped, Module, UInt, _} 5 | import config.{Parameters, XLEN} 6 | import dnn.types.OperatorNRSCAL 7 | import interfaces.CooDataBundle 8 | import node.{Shapes, vecN} 9 | 10 | class MergeAddIO(implicit val p: Parameters) extends Module { 11 | val io = IO(new Bundle { 12 | val eopIn = Input(Bool( )) 13 | val in = Flipped(Decoupled(new CooDataBundle(UInt(p(XLEN).W)))) 14 | val out = Decoupled(new CooDataBundle(UInt(p(XLEN).W))) 15 | val eopOut = Output(Bool( )) 16 | }) 17 | } 18 | 19 | class MergeAdd[L <: Shapes : OperatorNRSCAL](maxStreamLen: Int, ID: Int, rowBased: Boolean)(shape: => L)(implicit p: Parameters) 20 | extends MergeAddIO()(p) { 21 | 22 | /*===============================================* 23 | * Connections * 24 | *===============================================*/ 25 | 26 | val merger = Module(new MergeSort(maxStreamLen = maxStreamLen, ID = 1, rowBased = rowBased)) 27 | val adder = Module(new Adder(ID = 1)(shape)) 28 | 29 | val data = RegInit(CooDataBundle.default(0.U(p(XLEN).W))) 30 | val valid = RegInit(false.B) 31 | // val valid = RegNext(io.in.valid) 32 | val lastR = RegInit(false.B) 33 | 34 | when(io.eopIn) { 35 | lastR := true.B 36 
| } 37 | when(merger.io.in.ready && lastR) { 38 | lastR := false.B 39 | } 40 | when(merger.io.in.ready && io.in.valid) { 41 | data <> io.in.bits 42 | // valid := io.in.valid 43 | } 44 | when(merger.io.in.ready) { 45 | valid := io.in.valid 46 | } 47 | 48 | 49 | merger.io.lastIn := merger.io.in.ready && lastR 50 | merger.io.eopIn := false.B 51 | when((io.in.bits.row =/= data.row && io.in.valid) || (merger.io.in.ready && lastR)) { 52 | merger.io.eopIn := true.B 53 | } 54 | 55 | io.in.ready := merger.io.in.ready 56 | merger.io.in.bits := data 57 | merger.io.in.valid := valid 58 | 59 | adder.io.eopIn := merger.io.lastOut 60 | // adder.io.eopIn := merger.io.eopOut 61 | adder.io.in <> merger.io.out 62 | 63 | /* ================================================================== * 64 | * Adder & out * 65 | * ================================================================== */ 66 | 67 | io.out <> adder.io.out 68 | io.eopOut := adder.io.eopOut 69 | 70 | } -------------------------------------------------------------------------------- /src/main/scala/tensorKernels/WeightedArbiter.scala: -------------------------------------------------------------------------------- 1 | package tensorKernels 2 | 3 | import chisel3.util.{ArbiterCtrl, Decoupled, log2Ceil} 4 | import chisel3.{Flipped, Module, UInt, _} 5 | import config.{Parameters, XLEN} 6 | import interfaces.CooDataBundle 7 | 8 | //private object ArbiterController { 9 | // def apply(request: Seq[Bool]): Seq[Bool] = request.length match { 10 | // case 0 => Seq() 11 | // case 1 => Seq(true.B) 12 | // case _ => true.B +: request.tail.init.scanLeft(request.head)(_ || _).map(!_) 13 | // } 14 | //} 15 | 16 | class WeightedArbiterIO(n: Int)(implicit val p: Parameters) extends Module { 17 | val io = IO(new Bundle { 18 | 19 | val in = Flipped(Vec(n, Decoupled(new CooDataBundle(UInt(p(XLEN).W))))) 20 | val out = Decoupled(new CooDataBundle(UInt(p(XLEN).W))) 21 | val chosen = Output(UInt(log2Ceil(n).W)) 22 | 23 | val eopIn = Vec(n, 
Input(Bool( ))) 24 | val lastIn = Vec(n, Input(Bool( ))) 25 | 26 | val eopOut = Output(Bool( )) 27 | val lastOut = Output(Bool( )) 28 | }) 29 | } 30 | 31 | class WeightedArbiter(n: Int)(implicit p: Parameters) 32 | extends WeightedArbiterIO(n)(p) { 33 | 34 | var chosen = n-1 35 | io.chosen := (n-1).asUInt 36 | io.eopOut := io.eopIn(n-1) 37 | io.lastOut := io.lastIn(n-1) 38 | io.out.bits := io.in(n-1).bits 39 | 40 | val isFinished = for (i <- 0 until n) yield { 41 | val eop = RegInit(init = false.B) 42 | eop 43 | } 44 | 45 | for (i <- 0 until n) { 46 | when (io.eopIn(i)) {isFinished(i) := true.B} 47 | } 48 | 49 | when(isFinished.reduceLeft(_&&_)) {isFinished.foreach(a => a := false.B)} 50 | val valid = Wire(Vec(n, Bool( ))) 51 | for (i <- 0 until n) { 52 | valid(i) := isFinished(i) || io.in(i).valid 53 | } 54 | 55 | val grant = Wire(Vec(n, Bool( ))) 56 | grant.foreach(a => a := false.B) 57 | grant(n-1) := true.B 58 | 59 | for (i <- n-1 to 0 by -1) { 60 | when (valid.reduceLeft(_&&_) && io.in(i).bits.row < io.in(chosen).bits.row && !isFinished(i)) { 61 | grant.foreach(a => a := false.B) 62 | grant(i) := true.B 63 | chosen = i 64 | io.chosen := i.asUInt 65 | io.out.bits := io.in(i).bits 66 | io.eopOut := io.eopIn(i) 67 | io.lastOut := io.lastIn(i) 68 | } 69 | } 70 | 71 | for ((in, g) <- io.in zip grant) 72 | in.ready := g && io.out.ready && valid.reduceLeft(_&&_) 73 | 74 | io.out.valid := valid.reduceLeft(_&&_) 75 | 76 | } -------------------------------------------------------------------------------- /src/main/scala/config/FPConfigs.scala: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import config._ 6 | import util._ 7 | import regfile._ 8 | import junctions._ 9 | import accel._ 10 | import FPU._ 11 | import FType._ 12 | 13 | 14 | class SinglePrecisionFPConfig extends Config((site, here, up) => { 15 | // Core 16 | case XLEN => 32 17 | case TLEN => 32 18 | 
case GLEN => 16 19 | // # Max bits of cache request tag. 20 | case MSHRLEN => 8 21 | case TYPSZ => 32 22 | case VERBOSITY => "low" 23 | case COMPONENTS => "TYPLOAD;TYPOP" 24 | // Max size of type memory system may see 25 | case TRACE => true 26 | case CTRACE => false 27 | case BuildRFile => (p: Parameters) => Module(new RFile(32)(p)) 28 | 29 | case FTYP => site(XLEN) match { 30 | case 32 => S 31 | case 64 => D 32 | case 16 => H 33 | case _ => S // Default. Initialization 34 | } 35 | 36 | 37 | //------------------------- 38 | // Cache 39 | case NWays => 1 // TODO: set-associative 40 | case NSets => 256 41 | case CacheBlockBytes => 4 * (here(XLEN) >> 3) // 4 x 32 bits = 16B 42 | // NastiIO 43 | case NastiKey => new NastiParameters( 44 | idBits = 12, 45 | dataBits = 32, 46 | addrBits = 32) 47 | } 48 | ) 49 | 50 | class HALFPrecisionFPConfig extends Config((site, here, up) => { 51 | // Core 52 | case XLEN => 16 53 | case TLEN => 32 54 | case GLEN => 16 55 | // # Max bits of cache request tag. 56 | case MSHRLEN => 8 57 | case TYPSZ => 32 58 | case VERBOSITY => "low" 59 | case COMPONENTS => "TYPLOAD;TYPOP" 60 | // Max size of type memory system may see 61 | case TRACE => true 62 | case CTRACE => false 63 | case BuildRFile => (p: Parameters) => Module(new RFile(32)(p)) 64 | 65 | case FTYP => site(XLEN) match { 66 | case 32 => S 67 | case 64 => D 68 | case 16 => H 69 | case _ => S // Default. 
Initialization 70 | } 71 | 72 | 73 | //------------------------- 74 | // Cache 75 | case NWays => 1 // TODO: set-associative 76 | case NSets => 256 77 | case CacheBlockBytes => 4 * (here(XLEN) >> 3) // 4 x 32 bits = 16B 78 | // NastiIO 79 | case NastiKey => new NastiParameters( 80 | idBits = 12, 81 | dataBits = 32, 82 | addrBits = 32) 83 | } 84 | ) 85 | -------------------------------------------------------------------------------- /src/main/scala/accel/coredf/FilterDFCore.scala: -------------------------------------------------------------------------------- 1 | package accel.coredf 2 | 3 | /** 4 | * Created by nvedula on 28/6/17. 5 | */ 6 | 7 | import accel._ 8 | import chisel3._ 9 | import chisel3.util._ 10 | import config._ 11 | import dataflow._ 12 | import dataflow.filter._ 13 | import interfaces._ 14 | 15 | /** 16 | * The Core class creates contains the dataflow logic for the accelerator. 17 | * This particular core file implements a simple memory test routine to 18 | * validate the register interface and the Nasti bus operation on an SoC FPGA. 19 | * 20 | * @param p Project parameters. Only xlen is used to specify register and 21 | * data bus width. 
22 | * 23 | * @note io.ctrl A control register (from SimpleReg block) to start test 24 | * @note io.addr A control register containing the physical address for 25 | * the test 26 | * @note io.len A control register containing the length of the memory 27 | * test (number of words) 28 | * @note io.stat A status register containing the current state of the test 29 | * @note io.cache A Read/Write request interface to a memory cache block 30 | */ 31 | 32 | 33 | class FilterDFCore(cNum : Int, sNum: Int)(implicit p: Parameters) extends CoreT(cNum,sNum)(p) { 34 | 35 | val FilterSize = 3*3 36 | 37 | val Loader = Module(new CacheLoader(18)(p)) 38 | val Filt = Module(new BasicFilter()(p)) 39 | val done = RegInit(init=false.B) 40 | 41 | Loader.io.enable.bits.control := true.B 42 | Loader.io.enable.bits.taskID := 0.U 43 | Loader.io.enable.valid := true.B 44 | Filt.io.enable.bits.control := true.B 45 | Filt.io.enable.bits.taskID := 0.U 46 | Filt.io.enable.valid := true.B 47 | 48 | Loader.io.ptr <> io.ctrl 49 | 50 | for (i <- 0 until 9) { 51 | Filt.io.data(i) <> Loader.io.data(i) 52 | Filt.io.kern(i) <> Loader.io.data(i+9) 53 | } 54 | 55 | io.stat(0).bits.data := 0x55AA0002.U 56 | // // io.stat(0).bits.valid := true.B 57 | io.stat(0).valid := true.B 58 | io.stat(0).bits.predicate := true.B 59 | io.stat(1) <> Filt.io.sum; 60 | io.stat(2).bits.data := 0.U 61 | // // io.stat(2).bits.valid := true.B 62 | io.stat(2).valid := true.B 63 | io.stat(2).bits.predicate := true.B 64 | 65 | Loader.io.cache <> io.cache 66 | when (io.init) { 67 | done := false.B 68 | } .otherwise { 69 | when (Filt.io.sum.valid) { 70 | done := Filt.io.sum.valid 71 | } 72 | } 73 | io.done := done 74 | io.ready := true.B 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/dnn/types/FP_GEMV.scala: -------------------------------------------------------------------------------- 1 | package dnn.types 2 | 3 | import FPU.{FPMAC, FType} 4 | import chisel3._ 5 | 
import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester} 6 | import chisel3.Module 7 | import chisel3.testers._ 8 | import chisel3.util._ 9 | import org.scalatest.{FlatSpec, Matchers} 10 | import config._ 11 | import interfaces._ 12 | import muxes._ 13 | import util._ 14 | import node._ 15 | import utility.UniformPrintfs 16 | import dnn._ 17 | 18 | object FPOperator_GEMV { 19 | 20 | implicit object FPmatNxN_FPvecN extends OperatorGEMV[FPmatNxN, FPvecN] { 21 | def addition(l: FPmatNxN, r: FPvecN)(implicit p: Parameters): FPmatNxN = { 22 | // require((l.N & (l.N - 1)) == 0, "left operand not a power of 2") 23 | val x = Wire(new FPmatNxN(l.N, l.t)) 24 | for (i <- 0 until l.N) { 25 | for (j <- 0 until l.N) { 26 | val FPadd = Module(new FPMAC(p(XLEN), "Add", l.t)) 27 | FPadd.io.in1 := l.data(i)(j) 28 | if (r.isCol == 0) { 29 | FPadd.io.in2 := r.data(j) 30 | } else { 31 | FPadd.io.in2 := r.data(i) 32 | } 33 | x.data(i)(j) := FPadd.io.out 34 | } 35 | } 36 | x 37 | } 38 | 39 | def subtraction(l: FPmatNxN, r: FPvecN)(implicit p: Parameters): FPmatNxN = { 40 | val x = Wire(new FPmatNxN(l.N, l.t)) 41 | for (i <- 0 until l.N) { 42 | for (j <- 0 until l.N) { 43 | val FPadd = Module(new FPMAC(p(XLEN), "Sub", l.t)) 44 | FPadd.io.in1 := l.data(i)(j) 45 | if (r.isCol == 0) { 46 | FPadd.io.in2 := r.data(j) 47 | } else { 48 | FPadd.io.in2 := r.data(i) 49 | } 50 | x.data(i)(j) := FPadd.io.out 51 | } 52 | } 53 | x 54 | } 55 | 56 | def multiplication(l: FPmatNxN, r: FPvecN)(implicit p: Parameters): FPvecN = { 57 | require(r.isCol == 1, "Right vector should be a column vector") 58 | val x = Wire(new FPvecN(r.N, r.t)) 59 | 60 | 61 | x 62 | } 63 | 64 | def getfns(l: Shapes, r: Shapes)(implicit p: Parameters): Array[(Int, Shapes)] = { 65 | Array( 66 | GEMV_OpCode.Add -> addition(l.asInstanceOf[FPmatNxN], r.asInstanceOf[FPvecN]), 67 | GEMV_OpCode.Sub -> subtraction(l.asInstanceOf[FPmatNxN], r.asInstanceOf[FPvecN]), 68 | GEMV_OpCode.Mul -> 
multiplication(l.asInstanceOf[FPmatNxN], r.asInstanceOf[FPvecN]) 69 | ) 70 | } 71 | 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /src/main/scala/accel/coredf/VecFilterDFCore.scala: -------------------------------------------------------------------------------- 1 | package accel.coredf 2 | 3 | /** 4 | * Created by nvedula on 28/6/17. 5 | */ 6 | 7 | import accel._ 8 | import chisel3._ 9 | import config._ 10 | import dataflow.filter._ 11 | 12 | /** 13 | * The Core class creates contains the dataflow logic for the accelerator. 14 | * This particular core file implements a simple memory test routine to 15 | * validate the register interface and the Nasti bus operation on an SoC FPGA. 16 | * 17 | * @param p Project parameters. Only xlen is used to specify register and 18 | * data bus width. 19 | * 20 | * @note io.ctrl A control register (from SimpleReg block) to start test 21 | * @note io.addr A control register containing the physical address for 22 | * the test 23 | * @note io.len A control register containing the length of the memory 24 | * test (number of words) 25 | * @note io.stat A status register containing the current state of the test 26 | * @note io.cache A Read/Write request interface to a memory cache block 27 | */ 28 | 29 | 30 | class VecFilterDFCore(cNum : Int, sNum: Int)(implicit p: Parameters) extends CoreT(cNum,sNum)(p) { 31 | 32 | val FilterSize = 3 33 | 34 | val Loader = Module(new CacheVecLoader(6)(p)) 35 | val Filt = Module(new VecFilter()(p)) 36 | val done = RegInit(init=false.B) 37 | 38 | Loader.io.enable.bits.control := true.B 39 | Loader.io.enable.bits.taskID := 0.U 40 | Loader.io.enable.valid := true.B 41 | Filt.io.enable.bits.control := true.B 42 | Filt.io.enable.bits.taskID := 0.U 43 | Filt.io.enable.valid := true.B 44 | 45 | Loader.io.ptr <> io.ctrl 46 | 47 | for (i <- 0 until 3) { 48 | Filt.io.data(i) <> Loader.io.data(i) 49 | Filt.io.kern(i) <> Loader.io.data(i+3) 50 | } 51 | 52 | 
Loader.io.sum <> Filt.io.sum 53 | 54 | io.stat(0).bits.data := 0x55AA0003.U 55 | io.stat(0).valid := true.B 56 | 57 | // io.stat(1).bits.data := Filt.io.sum.bits.data(31,0) 58 | // io.stat(1).valid := Filt.io.sum.valid 59 | // io.stat(2).bits.data <> Filt.io.sum.bits.data(63,32) 60 | // io.stat(2).valid := Filt.io.sum.valid 61 | // io.stat(3).bits.data <> Filt.io.sum.bits.data(95,64) 62 | // io.stat(3).valid := Filt.io.sum.valid 63 | 64 | Loader.io.cache <> io.cache 65 | when (io.init) { 66 | done := false.B 67 | } .otherwise { 68 | when (Filt.io.sum.valid) { 69 | done := Filt.io.sum.valid 70 | } 71 | } 72 | io.done := done 73 | io.ready := true.B 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/tensorKernels/Adder.scala: -------------------------------------------------------------------------------- 1 | 2 | package tensorKernels 3 | 4 | import chisel3.util.{Decoupled, Valid} 5 | import chisel3.{Flipped, Module, UInt, _} 6 | import config.{Parameters, XLEN} 7 | import dnn.types.{OperatorNRSCAL} 8 | import interfaces.CooDataBundle 9 | import node.Shapes 10 | 11 | class NRSCALFU[L <: Shapes : OperatorNRSCAL](shape: => L, lanes: Int, opCode: String)(implicit val p: Parameters) extends Module { 12 | val io = IO(new Bundle { 13 | val a = Flipped(Valid(shape)) 14 | val b = Flipped(Valid(UInt(p(XLEN).W))) 15 | val o = Decoupled(shape) 16 | }) 17 | 18 | 19 | val start = io.o.ready 20 | val FU = OperatorNRSCAL.magic(io.a.bits, io.b.bits, start, lanes, opCode) 21 | io.o.bits := FU._1 22 | io.o.valid := FU._2 23 | } 24 | 25 | class AdderIO(implicit val p: Parameters) extends Module { 26 | val io = IO(new Bundle { 27 | val eopIn = Input(Bool( )) 28 | val in = Flipped(Decoupled(new CooDataBundle(UInt(p(XLEN).W)))) 29 | val out = Decoupled(new CooDataBundle(UInt(p(XLEN).W))) 30 | val eopOut = Output(Bool( )) 31 | }) 32 | } 33 | 34 | class Adder[L <: Shapes : OperatorNRSCAL](ID: Int)(shape: => L)(implicit p: Parameters) 
35 | extends AdderIO()(p) { 36 | 37 | /*===============================================* 38 | * Connections * 39 | *===============================================*/ 40 | io.eopOut := RegNext(io.eopIn) 41 | 42 | val data = RegInit(CooDataBundle.default(0.U(p(XLEN).W))) 43 | val dataValid = RegInit(false.B) 44 | 45 | val FU = Module(new NRSCALFU(shape, lanes = shape.getLength(), opCode = "Add")) 46 | 47 | FU.io.a.bits := VecInit(io.in.bits.data.asUInt()).asTypeOf(shape) 48 | FU.io.b.bits := data.data 49 | 50 | FU.io.a.valid := io.in.valid 51 | FU.io.b.valid := io.in.valid 52 | 53 | FU.io.o.ready := true.B 54 | 55 | dataValid := false.B 56 | when(io.in.valid){ 57 | dataValid := true.B 58 | when(data.row =/= io.in.bits.row || data.col =/= io.in.bits.col) { 59 | data <> io.in.bits 60 | }.elsewhen(data.row === io.in.bits.row && data.col === io.in.bits.col){ 61 | data.data := FU.io.o.bits.asUInt() //data.data + io.in.bits.data 62 | data.row := io.in.bits.row 63 | data.col := io.in.bits.col 64 | data.valid := io.in.bits.valid 65 | } 66 | } 67 | 68 | io.out.bits := data 69 | io.out.valid := (dataValid && !(data.row === io.in.bits.row && data.col === io.in.bits.col)) || io.eopIn 70 | io.in.ready := io.out.ready 71 | } -------------------------------------------------------------------------------- /src/main/scala/dataflow/AllocaDF.scala: -------------------------------------------------------------------------------- 1 | package dataflow 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import chisel3.Module 6 | import chisel3.testers._ 7 | import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester} 8 | import org.scalatest.{FlatSpec, Matchers} 9 | import config._ 10 | import control.{BasicBlockNoMaskNode, BasicBlockNode} 11 | import util._ 12 | import interfaces._ 13 | import node._ 14 | import stack._ 15 | 16 | 17 | 18 | //TODO uncomment if you remove StackCentral.scala file 19 | // 20 | abstract class StackDFIO(implicit val p: 
Parameters) extends Module with CoreParams { 21 | val io = IO(new Bundle { 22 | val Data0 = Flipped(Decoupled(new AllocaIO)) 23 | val pred = Decoupled(new Bool()) 24 | val result = Decoupled(new DataBundle) 25 | }) 26 | } 27 | 28 | class StackDF(implicit p: Parameters) extends StackDFIO() { 29 | 30 | 31 | /** 32 | * @note Module's variables they should set during initialization 33 | */ 34 | //BasicBlock 35 | val b0_entry = Module(new BasicBlockNoMaskNode(NumInputs = 1, NumOuts = 1, BID = 0)) 36 | 37 | //Compute 38 | val m0 = Module(new AllocaNode(NumOuts = 1,ID = 0,RouteID=0)) 39 | // val m5 = Module(new ComputeNode(NumOuts = 1, ID = 5, opCode = "Add")(sign = false)) 40 | 41 | //Stack 42 | val stack = Module(new Stack(NumOps = 1)) 43 | 44 | /** 45 | * Wireing control signals from BasicBlock nodes 46 | * to their child 47 | */ 48 | 49 | //Grounding entry BasicBlock 50 | b0_entry.io.predicateIn.bits.control := true.B 51 | b0_entry.io.predicateIn.bits.taskID := 0.U 52 | b0_entry.io.predicateIn.valid := true.B 53 | 54 | /** 55 | * Wireing enable signals to the instructions 56 | */ 57 | //Wiring enable signals 58 | m0.io.enable <> b0_entry.io.Out(0) 59 | 60 | /** 61 | * Connecting Dataflow signals 62 | */ 63 | //dataflow signal 64 | stack.io.InData(0) <> m0.io.allocaReqIO 65 | m0.io.allocaRespIO <> stack.io.OutData(0) 66 | 67 | 68 | /** 69 | * Wireing constants 70 | */ 71 | m0.io.allocaInputIO <> io.Data0 72 | // m0.io.allocaInputIO.bits.size := 3.U 73 | // m0.io.allocaInputIO.bits.numByte := 4.U 74 | // // // m0.io.allocaInputIO.bits.valid := true.B 75 | // m0.io.allocaInputIO.bits.predicate := true.B 76 | // m0.io.allocaInputIO.valid := true.B 77 | 78 | //Output 79 | io.result <> m0.io.Out(0) 80 | io.pred.valid := true.B 81 | io.pred.bits := true.B 82 | 83 | //DEBUG 84 | // io.pred <> b1_then.io.Out(0) 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/dnn/memory/inStreamDMA.scala: 
-------------------------------------------------------------------------------- 1 | 2 | package dnn.memory 3 | 4 | import chisel3._ 5 | import chisel3.util._ 6 | import config._ 7 | import interfaces.CooDataBundle 8 | import shell._ 9 | //import vta.util.config._ 10 | import dnn.memory.ISA._ 11 | 12 | 13 | /** TensorLoad. 14 | * 15 | * Load 1D and 2D tensors from main memory (DRAM) to input/weight 16 | * scratchpads (SRAM). Also, there is support for zero padding, while 17 | * doing the load. Zero-padding works on the y and x axis, and it is 18 | * managed by TensorPadCtrl. The TensorDataCtrl is in charge of 19 | * handling the way tensors are stored on the scratchpads. 20 | */ 21 | class inStreamDMAIO(memTensorType: String = "none")(implicit val p: Parameters) 22 | extends Module { 23 | val tp = new TensorParams(memTensorType) 24 | val mp = p(ShellKey).memParams 25 | val io = IO(new Bundle { 26 | val start = Input(Bool()) 27 | val done = Output(Bool()) 28 | val baddr = Input(UInt(mp.addrBits.W)) 29 | val len = Input(UInt(mp.addrBits.W)) 30 | val vme_rd = new VMEReadMaster 31 | val out = Decoupled(UInt(p(XLEN).W)) 32 | }) 33 | } 34 | 35 | class inStreamDMA(bufSize: Int, memTensorType: String = "none")(implicit p: Parameters) 36 | extends inStreamDMAIO(memTensorType)(p) { 37 | 38 | val strLoad = Module(new StreamLoad(bufSize, memTensorType)) 39 | 40 | 41 | io.done := strLoad.io.done 42 | 43 | // val popCnt = Counter(math.pow(2, p(XLEN)).toInt) 44 | // val width = p(ShellKey).memParams.dataBits / p(XLEN) 45 | 46 | val tl_Inst = Wire(new MemDecode) 47 | val memTensorRows = Mux(io.len % tp.tensorWidth.U === 0.U, 48 | io.len / tp.tensorWidth.U, 49 | (io.len / tp.tensorWidth.U) + 1.U) 50 | 51 | tl_Inst.xpad_0 := 0.U 52 | tl_Inst.xpad_1 := 0.U 53 | tl_Inst.ypad_0 := 0.U 54 | tl_Inst.ypad_1 := 0.U 55 | tl_Inst.xstride := memTensorRows 56 | tl_Inst.xsize := memTensorRows 57 | tl_Inst.ysize := 1.U 58 | tl_Inst.empty_0 := 0.U 59 | tl_Inst.dram_offset := 0.U 60 | 
tl_Inst.sram_offset := 0.U 61 | tl_Inst.id := 3.U 62 | tl_Inst.push_next := 0.U 63 | tl_Inst.push_prev := 0.U 64 | tl_Inst.pop_next := 0.U 65 | tl_Inst.pop_prev := 0.U 66 | tl_Inst.op := 0.U 67 | 68 | strLoad.io.start := io.start 69 | strLoad.io.inst := tl_Inst.asTypeOf(UInt(INST_BITS.W)) 70 | strLoad.io.baddr := io.baddr 71 | io.out <> strLoad.io.out 72 | io.vme_rd <> strLoad.io.vme_rd 73 | 74 | // when(io.out.fire()) {popCnt.inc()} 75 | 76 | // io.out.valid := strLoad.io.out.valid && (popCnt.value < io.len) 77 | 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/arbiters/ArbiterTree.scala: -------------------------------------------------------------------------------- 1 | package arbiters 2 | 3 | import interfaces._ 4 | import chisel3._ 5 | import chisel3.util._ 6 | import chisel3.Module 7 | import config._ 8 | import interfaces._ 9 | 10 | abstract class AbstractArbiterTree[T <: Data](Nops: Int, gen: T)(implicit p: Parameters) 11 | extends Module with CoreParams { 12 | val io = IO(new Bundle { 13 | val in = Vec(Nops, Flipped(Decoupled(gen))) 14 | val out = Decoupled(Output(gen)) 15 | }) 16 | } 17 | 18 | class ArbiterTree[T <: Data](BaseSize: Int, NumOps: Int, gen: T, Locks: Int)(implicit val p: Parameters) 19 | extends AbstractArbiterTree(NumOps, gen)(p) { 20 | require(NumOps > 0) 21 | require(isPow2(BaseSize)) 22 | val ArbiterReady = RegInit(true.B) 23 | var prev = Seq.fill(0) { 24 | Module(new LockingRRArbiter(gen, 4, count = Locks)).io 25 | } 26 | var toplevel = Seq.fill(0) { 27 | Module(new LockingRRArbiter(gen, 4, count = Locks)).io 28 | } 29 | var Arbiters_per_Level = (NumOps + BaseSize - 1) / BaseSize 30 | while (Arbiters_per_Level > 0) { 31 | val arbiters = Seq.fill(Arbiters_per_Level) { 32 | Module(new LockingRRArbiter(gen, BaseSize, count = Locks)).io 33 | } 34 | if (prev.length != 0) { 35 | for (i <- 0 until arbiters.length * BaseSize) { 36 | if (i < prev.length) { 37 | arbiters(i / 
BaseSize).in(indexcalc(i, BaseSize)) <> prev(i).out 38 | } else { 39 | arbiters(i / BaseSize).in(indexcalc(i, BaseSize)).valid := false.B 40 | arbiters(i / BaseSize).in(indexcalc(i, BaseSize)).bits <> DontCare 41 | } 42 | } 43 | } 44 | 45 | if (prev.length == 0) { 46 | toplevel = arbiters 47 | for (i <- 0 until arbiters.length * BaseSize) { 48 | if (i < NumOps) { 49 | arbiters(i / BaseSize).in(indexcalc(i, BaseSize)) <> io.in(i) 50 | // arbiters(i/BaseSize).in(indexcalc(i,BaseSize)).valid := true.B; 51 | } else { 52 | arbiters(i / BaseSize).in(indexcalc(i, BaseSize)).valid := false.B; 53 | arbiters(i / BaseSize).in(indexcalc(i, BaseSize)).bits <> DontCare 54 | } 55 | } 56 | } 57 | prev = arbiters 58 | if (Arbiters_per_Level == 1) { 59 | Arbiters_per_Level = 0 60 | } else { 61 | Arbiters_per_Level = (Arbiters_per_Level + BaseSize - 1) / BaseSize 62 | } 63 | } 64 | if (NumOps > 0) { 65 | io.out <> prev(0).out 66 | } 67 | 68 | object indexcalc { 69 | def apply(i: Int, BaseSize: Int): Int = { 70 | i - ((i / BaseSize) * BaseSize) 71 | } 72 | } 73 | 74 | } 75 | 76 | -------------------------------------------------------------------------------- /src/main/scala/dnn/DGEMVNode.scala: -------------------------------------------------------------------------------- 1 | package dnnnode 2 | 3 | import FPU.FType 4 | import chisel3._ 5 | import chisel3.util._ 6 | import config._ 7 | import dnn.{MacNode, ReduceNode} 8 | import dnn.memory.TensorParams 9 | import dnn.types.{OperatorDot, OperatorReduction} 10 | import interfaces.{ControlBundle, CustomDataBundle, TensorReadReq, TensorReadResp} 11 | import node.{FPvecN, HandShakingIONPS, HandShakingNPS, Shapes, vecN} 12 | import shell._ 13 | //import vta.util.config._ 14 | 15 | 16 | /** TensorLoad. 17 | * 18 | * Load 1D and 2D tensors from main memory (DRAM) to input/weight 19 | * scratchpads (SRAM). Also, there is support for zero padding, while 20 | * doing the load. 
Zero-padding works on the y and x axis, and it is 21 | * managed by TensorPadCtrl. The TensorDataCtrl is in charge of 22 | * handling the way tensors are stored on the scratchpads. 23 | */ 24 | class DGEMVNodeIO[gen <: Shapes](NumRows: Int, NumOuts: Int) 25 | (vecShape: => gen)(implicit p: Parameters) 26 | extends HandShakingIONPS(NumOuts)(new CustomDataBundle(UInt((p(XLEN) * NumRows).W))) { 27 | val mat = Vec(NumRows ,Flipped(Decoupled(new CustomDataBundle(UInt(vecShape.getWidth.W))))) 28 | val vec = Flipped(Decoupled(new CustomDataBundle(UInt(vecShape.getWidth.W)))) 29 | 30 | override def cloneType = new DGEMVNodeIO(NumRows, NumOuts)(vecShape).asInstanceOf[this.type] 31 | } 32 | 33 | class DGEMVNode[L <: Shapes: OperatorDot: OperatorReduction](NumRows: Int, NumOuts: Int)(vecShape: => L)(implicit p: Parameters) 34 | extends HandShakingNPS(NumOuts, 0)(new CustomDataBundle(UInt((p(XLEN) * NumRows).W)))(p) { 35 | override lazy val io = IO(new DGEMVNodeIO(NumRows, NumOuts)(vecShape)) 36 | 37 | val mac = for (i <- 0 until NumRows) yield { 38 | val macNode = Module(new MacNode(NumOuts = 1, ID = 0, lanes = vecShape.getLength())(vecShape)) 39 | macNode 40 | } 41 | 42 | for (i <- 0 until NumRows) { 43 | 44 | mac(i).io.enable.bits <> ControlBundle.active() 45 | mac(i).io.enable.valid := true.B 46 | 47 | mac(i).io.LeftIO <> io.mat(i) 48 | mac(i).io.RightIO <> io.vec 49 | 50 | mac(i).io.Out(0).ready := io.Out.map(_.ready).reduceLeft(_ && _) 51 | } 52 | 53 | for(i <- 0 until NumOuts){ 54 | io.Out(i).bits.data := VecInit(mac.map(_.io.Out(0).bits.data)).asTypeOf(CustomDataBundle(UInt((NumRows * xlen).W))) 55 | io.Out(i).bits.valid := mac.map(_.io.Out(0).bits.valid).reduceLeft(_ && _) 56 | io.Out(i).bits.predicate := true.B 57 | io.Out(i).bits.taskID := 0.U 58 | io.Out(i).valid := mac.map(_.io.Out(0).valid).reduceLeft(_ && _) 59 | } 60 | 61 | } 62 | 63 | -------------------------------------------------------------------------------- /include/dmlc/common.h: 
-------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2015 by Contributors 3 | * \file common.h 4 | * \brief defines some common utility function. 5 | */ 6 | #ifndef DMLC_COMMON_H_ 7 | #define DMLC_COMMON_H_ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "./logging.h" 15 | 16 | namespace dmlc { 17 | /*! 18 | * \brief Split a string by delimiter 19 | * \param s String to be splitted. 20 | * \param delim The delimiter. 21 | * \return a splitted vector of strings. 22 | */ 23 | inline std::vector Split(const std::string& s, char delim) { 24 | std::string item; 25 | std::istringstream is(s); 26 | std::vector ret; 27 | while (std::getline(is, item, delim)) { 28 | ret.push_back(item); 29 | } 30 | return ret; 31 | } 32 | 33 | /*! 34 | * \brief hash an object and combines the key with previous keys 35 | */ 36 | template 37 | inline size_t HashCombine(size_t key, const T& value) { 38 | std::hash hash_func; 39 | return key ^ (hash_func(value) + 0x9e3779b9 + (key << 6) + (key >> 2)); 40 | } 41 | 42 | /*! 43 | * \brief specialize for size_t 44 | */ 45 | template<> 46 | inline size_t HashCombine(size_t key, const size_t& value) { 47 | return key ^ (value + 0x9e3779b9 + (key << 6) + (key >> 2)); 48 | } 49 | 50 | /*! 51 | * \brief OMP Exception class catches, saves and rethrows exception from OMP blocks 52 | */ 53 | class OMPException { 54 | private: 55 | // exception_ptr member to store the exception 56 | std::exception_ptr omp_exception_; 57 | // mutex to be acquired during catch to set the exception_ptr 58 | std::mutex mutex_; 59 | 60 | public: 61 | /*! 62 | * \brief Parallel OMP blocks should be placed within Run to save exception 63 | */ 64 | template 65 | void Run(Function f, Parameters... 
params) { 66 | try { 67 | f(params...); 68 | } catch (dmlc::Error &ex) { 69 | std::lock_guard lock(mutex_); 70 | if (!omp_exception_) { 71 | omp_exception_ = std::current_exception(); 72 | } 73 | } catch (std::exception &ex) { 74 | std::lock_guard lock(mutex_); 75 | if (!omp_exception_) { 76 | omp_exception_ = std::current_exception(); 77 | } 78 | } 79 | } 80 | 81 | /*! 82 | * \brief should be called from the main thread to rethrow the exception 83 | */ 84 | void Rethrow() { 85 | if (this->omp_exception_) std::rethrow_exception(this->omp_exception_); 86 | } 87 | }; 88 | 89 | } // namespace dmlc 90 | 91 | #endif // DMLC_COMMON_H_ 92 | -------------------------------------------------------------------------------- /include/runtime_base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /*! 21 | * Copyright (c) 2016 by Contributors 22 | * \file runtime_base.h 23 | * \brief Base of all C APIs 24 | */ 25 | #ifndef TVM_RUNTIME_RUNTIME_BASE_H_ 26 | #define TVM_RUNTIME_RUNTIME_BASE_H_ 27 | 28 | #include 29 | #include 30 | 31 | /*! 
\brief macro to guard beginning and end section of all functions */ 32 | #define API_BEGIN() \ 33 | try \ 34 | { 35 | /*! \brief every function starts with API_BEGIN(); 36 | and finishes with API_END() or API_END_HANDLE_ERROR */ 37 | #define API_END() \ 38 | } \ 39 | catch (std::runtime_error & _except_) { return TVMAPIHandleException(_except_); } \ 40 | return 0; // NOLINT(*) 41 | /*! 42 | * \brief every function starts with API_BEGIN(); 43 | * and finishes with API_END() or API_END_HANDLE_ERROR 44 | * The finally clause contains procedure to cleanup states when an error happens. 45 | */ 46 | #define API_END_HANDLE_ERROR(Finalize) \ 47 | } \ 48 | catch (std::runtime_error & _except_) \ 49 | { \ 50 | Finalize; \ 51 | return TVMAPIHandleException(_except_); \ 52 | } \ 53 | return 0; // NOLINT(*) 54 | 55 | /*! 56 | * \brief handle exception throwed out 57 | * \param e the exception 58 | * \return the return value of API after exception is handled 59 | */ 60 | int TVMAPIHandleException(const std::runtime_error &e); 61 | 62 | #endif // TVM_RUNTIME_RUNTIME_BASE_H_ 63 | -------------------------------------------------------------------------------- /src/main/scala/accel/coredf/TestCacheDF.scala: -------------------------------------------------------------------------------- 1 | package accel.coredf 2 | 3 | /** 4 | * Created by nvedula on 28/6/17. 5 | */ 6 | 7 | import accel._ 8 | import chisel3._ 9 | import chisel3.util._ 10 | import config._ 11 | import dataflow._ 12 | 13 | /** 14 | * The Core class creates contains the dataflow logic for the accelerator. 15 | * This particular core file implements a simple memory test routine to 16 | * validate the register interface and the Nasti bus operation on an SoC FPGA. 17 | * 18 | * @param p Project parameters. Only xlen is used to specify register and 19 | * data bus width. 
20 | * 21 | * @note io.ctrl A control register (from SimpleReg block) to start test 22 | * @note io.addr A control register containing the physical address for 23 | * the test 24 | * @note io.len A control register containing the length of the memory 25 | * test (number of words) 26 | * @note io.stat A status register containing the current state of the test 27 | * @note io.cache A Read/Write request interface to a memory cache block 28 | */ 29 | 30 | 31 | class TestCacheDF(cNum : Int, sNum: Int)(implicit p: Parameters) extends CoreT(cNum,sNum)(p) { 32 | 33 | 34 | val (s_idle :: s_busy :: s_done :: Nil) = Enum(3) 35 | val state = RegInit(init = s_idle) 36 | // val err_latch = Reg(Bool()) 37 | val add_result_reg = Reg(UInt(xlen.W)) 38 | val start_reg = RegInit(false.B) 39 | 40 | val MemDF = Module(new TestCacheDataFlow()) 41 | MemDF.io.start := io.start 42 | override val printfSigil = "TestMemDF: add_result_reg: " + add_result_reg.asUInt() + " state: " + state + " " 43 | 44 | //IO Connections 45 | //result is Decoupled 46 | io.stat(0) <> add_result_reg 47 | io.stat(0).valid := true.B 48 | 49 | io.cache.req <> MemDF.io.MemReq 50 | MemDF.io.MemResp <> io.cache.resp 51 | 52 | switch (state) { 53 | // Idle 54 | is(s_idle) { 55 | 56 | when(io.start) { 57 | start_reg := true.B 58 | state := s_busy 59 | } 60 | } 61 | 62 | is(s_busy) { 63 | 64 | when(MemDF.io.result.valid) { 65 | state := s_done 66 | add_result_reg := MemDF.io.result.bits.data(xlen-1,0) 67 | } 68 | 69 | } 70 | 71 | // Done 72 | is (s_done) { 73 | 74 | start_reg := false.B 75 | when(io.init) { 76 | add_result_reg := 0.U 77 | state := s_idle 78 | } 79 | } 80 | } 81 | 82 | // Reflect state machine status to processor 83 | io.done := (state === s_done) 84 | io.ready := (state === s_idle) 85 | // Intermediate 86 | MemDF.io.result.ready := (state === s_busy) 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/config/configurations.scala: 
-------------------------------------------------------------------------------- 1 | // See LICENSE.SiFive for license details. 2 | 3 | package config 4 | 5 | class Field[T] 6 | 7 | abstract class View { 8 | final def apply[T](pname: Field[T]): T = apply(pname, this) 9 | 10 | final def apply[T](pname: Field[T], site: View): T = find(pname, site).asInstanceOf[T] 11 | 12 | protected[config] def find(pname: Any, site: View): Any 13 | } 14 | 15 | abstract class Parameters extends View { 16 | final def ++(x: Parameters): Parameters = new ChainParameters(this, x) 17 | 18 | final def alter(f: (View, View, View) => PartialFunction[Any, Any]): Parameters = Parameters(f) ++ this 19 | 20 | final def alterPartial(f: PartialFunction[Any, Any]): Parameters = Parameters((_, _, _) => f) ++ this 21 | 22 | protected[config] def chain(site: View, tail: View, pname: Any): Any 23 | 24 | protected[config] def find(pname: Any, site: View) = chain(site, new TerminalView, pname) 25 | } 26 | 27 | object Parameters { 28 | def empty: Parameters = new EmptyParameters 29 | 30 | def apply(f: (View, View, View) => PartialFunction[Any, Any]): Parameters = new PartialParameters(f) 31 | 32 | def root(p: Parameters) = p 33 | } 34 | 35 | class Config(p: Parameters) extends Parameters { 36 | def this(f: (View, View, View) => PartialFunction[Any, Any]) = this(Parameters(f)) 37 | 38 | protected[config] def chain(site: View, tail: View, pname: Any) = p.chain(site, tail, pname) 39 | 40 | override def toString = this.getClass.getSimpleName 41 | 42 | def toInstance = this 43 | } 44 | 45 | // Internal implementation: 46 | 47 | private class TerminalView extends View { 48 | 49 | private class Unusable 50 | 51 | def find(pname: Any, site: View): Any = 52 | pname match { 53 | case x: Unusable => () 54 | case _ => println("x case not matching" + pname ) 55 | } 56 | } 57 | 58 | private class ChainView(head: Parameters, tail: View) extends View { 59 | def find(pname: Any, site: View) = head.chain(site, tail, pname) 
60 | } 61 | 62 | private class ChainParameters(x: Parameters, y: Parameters) extends Parameters { 63 | def chain(site: View, tail: View, pname: Any) = x.chain(site, new ChainView(y, tail), pname) 64 | } 65 | 66 | private class EmptyParameters extends Parameters { 67 | def chain(site: View, tail: View, pname: Any) = tail.find(pname, site) 68 | } 69 | 70 | private class PartialParameters(f: (View, View, View) => PartialFunction[Any, Any]) extends Parameters { 71 | protected[config] def chain(site: View, tail: View, pname: Any) = { 72 | val g = f(site, this, tail) 73 | if (g.isDefinedAt(pname)) g.apply(pname) else tail.find(pname, site) 74 | } 75 | } 76 | 77 | -------------------------------------------------------------------------------- /src/main/resources/verilog/UltraRAM.v: -------------------------------------------------------------------------------- 1 | // Xilinx UltraRAM Single Port No Change Mode. This code implements 2 | // a parameterizable UltraRAM block in No Change mode. The behavior of this RAM is 3 | // when data is written, the output of RAM is unchanged. Only when write is 4 | // inactive data corresponding to the address is presented on the output port. 
5 | 6 | module UltraRAM #( 7 | parameter AWIDTH = 12, // Address Width 8 | parameter DWIDTH = 72, // Data Width 9 | parameter NBPIPE = 3 // Number of pipeline Registers 10 | ) ( 11 | input clk, // Clock 12 | input rst, // Reset 13 | input we, // Write Enable 14 | input regce, // Output Register Enable 15 | input mem_en, // Memory Enable 16 | input [DWIDTH-1:0] din, // Data Input 17 | input [AWIDTH-1:0] raddr, // Address Input 18 | input [AWIDTH-1:0] waddr, // Address Input 19 | output reg [DWIDTH-1:0] dout // Data Output 20 | ); 21 | 22 | (* ram_style = "ultra" *) 23 | reg [DWIDTH-1:0] mem[(1< new ReduceNode(NumOuts = 1, ID = 0, false, "Mul")(new matNxN(2, issign = true))) { 57 | // c => new ReduceCompTests(c) 58 | // } should be(true) 59 | 60 | chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"), 61 | () => new ReduceNode(NumOuts = 1, ID = 0, false, "Add")(new FXmatNxN(2,4))) { 62 | c => new FXReduceCompTests(c) 63 | } should be(true) 64 | // chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"), 65 | // () => new ReduceNode(NumOuts = 1, ID = 0, 4, "Mul")(new FPmatNxN(2, t = FType.M))) { 66 | // c => new FPReduceCompTests(c) 67 | // } should be(true) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /include/runtime/serializer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /*! 21 | * \file tvm/runtime/serializer.h 22 | * \brief Serializer extension to support TVM data types 23 | * Include this file to enable serialization of DLDataType, DLContext 24 | */ 25 | #ifndef TVM_RUNTIME_SERIALIZER_H_ 26 | #define TVM_RUNTIME_SERIALIZER_H_ 27 | 28 | #include 29 | #include 30 | #include "c_runtime_api.h" 31 | #include "ndarray.h" 32 | 33 | namespace dmlc { 34 | namespace serializer { 35 | 36 | template<> 37 | struct Handler { 38 | inline static void Write(Stream *strm, const DLDataType& dtype) { 39 | Handler::Write(strm, dtype.code); 40 | Handler::Write(strm, dtype.bits); 41 | Handler::Write(strm, dtype.lanes); 42 | } 43 | inline static bool Read(Stream *strm, DLDataType* dtype) { 44 | if (!Handler::Read(strm, &(dtype->code))) return false; 45 | if (!Handler::Read(strm, &(dtype->bits))) return false; 46 | if (!Handler::Read(strm, &(dtype->lanes))) return false; 47 | return true; 48 | } 49 | }; 50 | 51 | template<> 52 | struct Handler { 53 | inline static void Write(Stream *strm, const DLContext& ctx) { 54 | int32_t device_type = static_cast(ctx.device_type); 55 | Handler::Write(strm, device_type); 56 | Handler::Write(strm, ctx.device_id); 57 | } 58 | inline static bool Read(Stream *strm, DLContext* ctx) { 59 | int32_t device_type = 0; 60 | if (!Handler::Read(strm, &(device_type))) return false; 61 | ctx->device_type = static_cast(device_type); 62 | if (!Handler::Read(strm, &(ctx->device_id))) return false; 63 | return true; 64 | } 65 | }; 66 | 67 | } // namespace serializer 68 | } // 
namespace dmlc 69 | #endif // TVM_RUNTIME_SERIALIZER_H_ 70 | -------------------------------------------------------------------------------- /include/vta/module.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #ifndef VTA_DPI_MODULE_H_ 21 | #define VTA_DPI_MODULE_H_ 22 | 23 | #include "runtime/module.h" 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | namespace vta 30 | { 31 | namespace dpi 32 | { 33 | 34 | /*! 35 | * \brief DPI driver module for managing the accelerator 36 | */ 37 | class DPIModuleNode : public tvm::runtime::ModuleNode 38 | { 39 | public: 40 | /*! \brief Launch hardware simulation */ 41 | virtual void SimLaunch() = 0; 42 | 43 | /*! \brief Halt hardware simulation */ 44 | virtual void SimWait() = 0; 45 | 46 | /*! \brief Resume hardware simulation */ 47 | virtual void SimResume() = 0; 48 | 49 | /*! \brief Finish hardware simulation */ 50 | virtual void SimFinish() = 0; 51 | 52 | /*! 
53 | * \brief Write an accelerator register 54 | * \param addr The register address 55 | * \param value The register value 56 | */ 57 | virtual void WriteReg(int addr, uint32_t value) = 0; 58 | 59 | /*! 60 | * \brief Read an accelerator register 61 | * \param addr The register address 62 | */ 63 | virtual uint32_t ReadReg(int addr) = 0; 64 | 65 | static tvm::runtime::Module Load(std::string dll_name); 66 | 67 | /* void SaveToFile(const std::string &file_name, const std::string &format) 68 | { 69 | LOG(FATAL) << "Module[" << type_key() << "] does not support SaveToFile"; 70 | return; 71 | }; 72 | void SaveToBinary(dmlc::Stream *stream) 73 | { 74 | LOG(FATAL) << "Module[" << type_key() << "] does not support SaveToBinary"; 75 | return; 76 | }; 77 | 78 | std::string GetSource(const std::string &format) 79 | { 80 | LOG(FATAL) << "Module[" << type_key() << "] does not support GetSource"; 81 | return ""; 82 | }; */ 83 | }; 84 | 85 | } // namespace dpi 86 | } // namespace vta 87 | #endif // VTA_DPI_MODULE_H_ 88 | -------------------------------------------------------------------------------- /src/main/scala/loop/LoopHeader.scala: -------------------------------------------------------------------------------- 1 | package loop 2 | 3 | import chisel3._ 4 | import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester} 5 | import chisel3.Module 6 | import chisel3.testers._ 7 | import chisel3.util._ 8 | import org.scalatest.{FlatSpec, Matchers} 9 | import config._ 10 | import interfaces._ 11 | import muxes._ 12 | import util._ 13 | import node._ 14 | import utility.UniformPrintfs 15 | 16 | 17 | /** 18 | * Defining LoopOutputBundle 19 | * @param gen Datatype 20 | * @tparam T 21 | */ 22 | class LoopOutputBundleIO[+T <: Data](gen: T) extends Bundle(){ 23 | val bits = Output(gen.cloneType) 24 | val valid = Output(Bool()) 25 | override def cloneType: this.type = new LoopOutputBundleIO(gen).asInstanceOf[this.type] 26 | } 27 | 28 | object 
LoopOutputBundle {
  def apply[T <: Data](gen: T): LoopOutputBundleIO[T] = new LoopOutputBundleIO(gen)
}

/**
  * IO bundle for the loop header.
  *
  * @param NumInputs number of live-in arguments latched by the header
  * @param NumOuts   number of outputs fanned out to the loop body
  */
class LoopHeaderIO[T <: Data](val NumInputs: Int, val NumOuts: Int)
                             (gen: T)(implicit p: Parameters) extends CoreBundle()(p) {

  val inputArg  = Vec(NumInputs, Flipped(Decoupled(gen)))
  val outputArg = Vec(NumOuts, Decoupled(gen))

  /**
    * Example of a custom output IO, kept for reference:
    * val outputArg = Vec(NumOuts, LoopOutputBundle(gen))
    */

  // Finish is driven by the Ret instruction and releases the latched arguments.
  val Finish = Input(Bool())

  // TODO: connect Start to the entry basic block.
  val Start = Output(Bool())

  override def cloneType =
    new LoopHeaderIO(NumInputs, NumOuts)(gen).asInstanceOf[this.type]
}


/**
  * Latches NumInputs live-in values into LoopElement registers and exposes the
  * first NumOuts of them on outputArg; Start asserts once every input port has
  * presented valid data.
  */
class LoopHeader(val NumInputs: Int, val NumOuts: Int, val ID: Int)
                (implicit val p: Parameters) extends Module with CoreParams with UniformPrintfs {

  lazy val io = IO(new LoopHeaderIO(NumInputs, NumOuts)(new DataBundle()))

  val valids = WireInit(VecInit(Seq.fill(NumInputs)(false.B)))

  // One latch element per live-in argument.
  val Args = Seq.tabulate(NumInputs) { i => Module(new LoopElement(ID = i)) }

  // Feed every element from its argument port; all elements share Finish.
  Args.zip(io.inputArg).foreach { case (elem, arg) =>
    elem.io.inData <> arg
    elem.io.Finish <> io.Finish
  }

  // Expose the latched values on the output ports.
  io.outputArg.zipWithIndex.foreach { case (out, i) =>
    out.bits <> Args(i).io.outData.data
    out.valid <> Args(i).io.outData.valid
  }

  // Mirror the input valid flags; Start fires when all of them are high.
  valids.zip(io.inputArg).foreach { case (v, arg) => v <> arg.valid }

  io.Start := valids.asUInt().andR()
}
--------------------------------------------------------------------------------
/src/main/scala/tensorKernels/MergeSort.scala: -------------------------------------------------------------------------------- 1 | 2 | package tensorKernels 3 | 4 | import chisel3.util.{Arbiter, Counter, Decoupled, Queue, RRArbiter, isPow2, log2Ceil} 5 | import chisel3.{Flipped, Module, UInt, _} 6 | import config.{Parameters, XLEN} 7 | import interfaces.{BoolBundle, CooDataBundle} 8 | import muxes.{Demux, Mux} 9 | import shell.VMECmd 10 | 11 | class MergeSortIO(maxStreamLen: Int)(implicit val p: Parameters) extends Module { 12 | val io = IO(new Bundle { 13 | val lastIn = Input(Bool( )) 14 | val eopIn = Input(Bool( )) 15 | val in = Flipped(Decoupled(new CooDataBundle(UInt(p(XLEN).W)))) 16 | val out = Decoupled(new CooDataBundle(UInt(p(XLEN).W))) 17 | val eopOut = Output(Bool( )) 18 | val lastOut = Output(Bool( )) 19 | }) 20 | } 21 | 22 | class MergeSort(maxStreamLen: Int, ID: Int, rowBased: Boolean)(implicit p: Parameters) 23 | extends MergeSortIO(maxStreamLen)(p) { 24 | require(maxStreamLen > 0, "Level must be greater than zero") 25 | 26 | val num_Merger = log2Ceil(maxStreamLen) 27 | 28 | val merger = for (i <-0 until num_Merger) yield { 29 | val Merger = Module(new MergeNode(level = math.pow(2,i).toInt, ID = 1, rowBased = rowBased, lastLevel = math.pow(2, num_Merger-1).toInt)) 30 | Merger 31 | } 32 | 33 | /*===============================================* 34 | * Connections * 35 | *===============================================*/ 36 | merger(0).io.eopIn := io.eopIn 37 | merger(0).io.lastIn := io.lastIn 38 | 39 | val sel = RegInit(false.B) 40 | // val sel = 41 | when(io.in.fire()) {sel := !sel} 42 | 43 | when(merger(0).io.in1.ready){ 44 | sel := false.B 45 | }.otherwise { 46 | sel := true.B 47 | } 48 | 49 | 50 | 51 | val demux = Module(new Demux(new CooDataBundle(UInt(p(XLEN).W)), Nops = 2)) 52 | 53 | demux.io.en := io.in.valid 54 | demux.io.input <> io.in.bits 55 | demux.io.sel := merger(0).io.in2.ready 56 | 57 | merger(0).io.in1.bits <> demux.io.outputs(0) 58 | 
merger(0).io.in2.bits <> demux.io.outputs(1) 59 | 60 | merger(0).io.in1.valid := demux.io.outputs(0).valid 61 | merger(0).io.in2.valid := demux.io.outputs(1).valid 62 | 63 | // io.in.ready := Mux(sel, merger(0).io.in2.ready, merger(0).io.in1.ready) 64 | io.in.ready := merger(0).io.in2.ready || merger(0).io.in1.ready 65 | 66 | for (i <-1 until num_Merger) { 67 | merger(i).io.in1 <> merger(i-1).io.out1 68 | merger(i).io.in2 <> merger(i-1).io.out2 69 | merger(i).io.eopIn := merger(i-1).io.eopOut 70 | merger(i).io.lastIn := merger(i-1).io.lastOut 71 | } 72 | 73 | io.out <> merger(num_Merger - 1).io.out1 74 | merger(num_Merger - 1).io.out2.ready := false.B 75 | 76 | io.eopOut := merger(num_Merger - 1).io.eopOut 77 | io.lastOut := merger(num_Merger - 1).io.lastOut 78 | } -------------------------------------------------------------------------------- /src/main/scala/tensorKernels/VirtualChannel.scala: -------------------------------------------------------------------------------- 1 | package tensorKernels 2 | 3 | import chisel3.util.{Counter, Decoupled, Queue, RRArbiter} 4 | import chisel3.{Flipped, Module, UInt, _} 5 | import config.{Parameters, ROWLEN, XLEN} 6 | import interfaces.{BoolBundle, CooDataBundle} 7 | import muxes.{Demux, Mux} 8 | import shell.VMECmd 9 | 10 | 11 | class VirtualChannelIO(N: Int)(implicit val p: Parameters) extends Module { 12 | val io = IO(new Bundle { 13 | 14 | val in = Flipped(Decoupled(new CooDataBundle(UInt(p(XLEN).W)))) 15 | val out = Vec(N, Decoupled(new CooDataBundle(UInt(p(XLEN).W)))) 16 | 17 | val eopIn = Input(Bool( )) 18 | val eopOut = Vec(N, Output(Bool( ))) 19 | }) 20 | } 21 | 22 | class VirtualChannel(N: Int, VCDepth: Int)(implicit p: Parameters) 23 | extends VirtualChannelIO(N)(p) { 24 | require(N > 0, "Number of VCs should be at least 1") 25 | 26 | val queue = for (i <- 0 until N) yield { 27 | val queue1 = Module(new Queue(new CooDataBundle(UInt(p(XLEN).W)), entries = VCDepth, pipe = true)) 28 | queue1 29 | } 30 | 31 | val 
data = RegInit(CooDataBundle.default(0.U(p(XLEN).W)))  // latched copy of the current input beat
  // Valid flag for the latched beat; cleared once the targeted queue accepts it.
  val valid = RegInit(false.B)

  // Steers the latched beat to one of the N virtual-channel queues.
  val demux = Module(new Demux(new CooDataBundle(UInt(p(XLEN).W)), Nops = N))

  // Selects the enq-ready flag of the currently targeted queue.
  val readyMux = Module(new Mux(new BoolBundle(Bool()), Nops = N))

  /* ================================================================== *
   *                        isFinished signals                          *
   * ================================================================== */

  val isFinished = RegInit(init = false.B)

  // Channel selector: advances when the incoming row index differs from the
  // latched one, i.e. a new row is routed to the next virtual channel.
  val sel = Counter(N)

  when (data.row =/= io.in.bits.row && io.in.valid) {
    sel.inc()
  }

  demux.io.sel := sel.value
  readyMux.io.sel := sel.value

  demux.io.en := valid
  readyMux.io.en := valid

  // Drop the latched beat once the targeted queue can accept it. The
  // io.in.fire() block below is connected later, so by Chisel last-connect
  // semantics a new arrival in the same cycle re-asserts valid.
  when(valid && readyMux.io.output.data) {
    valid := false.B
  }

  when(io.in.fire()) {
    data <> io.in.bits
    valid := io.in.valid
  }

  dontTouch(data)

  demux.io.input := data

  for (i <- 0 until N) {
    // readyMux input i reflects whether queue i can take another element.
    readyMux.io.inputs(i).valid := true.B
    readyMux.io.inputs(i).data := queue(i).io.enq.ready

    queue(i).io.enq.bits := demux.io.outputs(i)
    queue(i).io.enq.valid := valid && demux.io.outputs(i).valid

    io.out(i) <> queue(i).io.deq
  }

  // End-of-packet handling: remember eopIn, then pulse eopOut on every channel
  // once all queues have fully drained.
  when (io.eopIn) {isFinished := true.B}
  io.eopOut.foreach(a => a := false.B)
  when(isFinished && !queue.map(_.io.deq.valid).reduceLeft(_||_)) {
    isFinished := false.B
    io.eopOut.foreach(a => a := true.B)
  }

  // Accept new input only while not draining and the target queue has room.
  io.in.ready := !isFinished && readyMux.io.output.data
}
--------------------------------------------------------------------------------
/src/main/scala/shell/IntelShell.scala:
--------------------------------------------------------------------------------

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package shell 21 | 22 | 23 | import chisel3._ 24 | import chisel3.RawModule 25 | import chisel3.withClockAndReset 26 | import config._ 27 | import shell.De10Config 28 | 29 | 30 | class IntelShell(implicit p: Parameters) extends Module { 31 | val io = IO(new Bundle { 32 | val host = new AXIClient(p(ShellKey).hostParams) 33 | val mem = new AXIMaster(p(ShellKey).memParams) 34 | }) 35 | 36 | val vcr = Module(new VCR) 37 | val vme = Module(new VME) 38 | // Connect the DNN core and its VME modules. 39 | // val core = Module(new DNNCore()) 40 | 41 | // core.io.vcr <> vcr.io.vcr 42 | // vme.io.vme <> core.io.vme 43 | 44 | 45 | 46 | 47 | // This host wiring is hoisted into the shell here, whereas the Xilinx shell includes a VTA shell separately. For the time being we retain it here.
48 | io.host.aw.ready := vcr.io.host.aw.ready 49 | vcr.io.host.aw.valid := io.host.aw.valid 50 | vcr.io.host.aw.bits.addr := io.host.aw.bits.addr 51 | io.host.w.ready := vcr.io.host.w.ready 52 | vcr.io.host.w.valid := io.host.w.valid 53 | vcr.io.host.w.bits.data := io.host.w.bits.data 54 | vcr.io.host.w.bits.strb := io.host.w.bits.strb 55 | vcr.io.host.b.ready := io.host.b.ready 56 | io.host.b.valid := vcr.io.host.b.valid 57 | io.host.b.bits.resp := vcr.io.host.b.bits.resp 58 | io.host.b.bits.id := io.host.w.bits.id 59 | 60 | io.host.ar.ready := vcr.io.host.ar.ready 61 | vcr.io.host.ar.valid := io.host.ar.valid 62 | vcr.io.host.ar.bits.addr := io.host.ar.bits.addr 63 | vcr.io.host.r.ready := io.host.r.ready 64 | io.host.r.valid := vcr.io.host.r.valid 65 | io.host.r.bits.data := vcr.io.host.r.bits.data 66 | io.host.r.bits.resp := vcr.io.host.r.bits.resp 67 | io.host.r.bits.id := io.host.ar.bits.id 68 | 69 | io.host.b.bits.user <> DontCare 70 | io.host.r.bits.user <> DontCare 71 | io.host.r.bits.last := 1.U 72 | 73 | // io.mem <> vme.io.mem 74 | io.mem <> DontCare 75 | } 76 | 77 | -------------------------------------------------------------------------------- /src/main/scala/dataflow/fuse/computeFuse04.scala: -------------------------------------------------------------------------------- 1 | package dataflow.fuse 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | import node._ 7 | import config._ 8 | import interfaces._ 9 | import arbiters._ 10 | import memory._ 11 | 12 | class ComputeFuse04SDF(implicit val p: Parameters) extends Module with CoreParams { 13 | 14 | val io = IO(new Bundle { 15 | 16 | val data0 = Flipped(Decoupled(new DataBundle())) 17 | val data1 = Flipped(Decoupled(new DataBundle())) 18 | val data2 = Flipped(Decoupled(new DataBundle())) 19 | val data3 = Flipped(Decoupled(new DataBundle())) 20 | val data4 = Flipped(Decoupled(new DataBundle())) 21 | val data5 = Flipped(Decoupled(new DataBundle())) 22 | val enable = Flipped(Decoupled(Bool())) 
23 | 24 | val dataOut = Decoupled(new DataBundle()) 25 | 26 | }) 27 | 28 | val m0 = Module(new Chain(NumOps = 3, ID = 0, OpCodes = Array("And","ShiftLeft","Xor"))(sign = false)) 29 | val m1 = Module(new ComputeNode(NumOuts = 1, ID = 0, opCode = "And")(sign = false)) 30 | val m2 = Module(new ComputeNode(NumOuts = 1, ID = 0, opCode = "Or")(sign = false)) 31 | 32 | m0.io.In(0) <> io.data0 33 | m0.io.In(1) <> io.data1 34 | m0.io.In(2) <> io.data2 35 | m0.io.In(3) <> io.data3 36 | 37 | m1.io.LeftIO <> io.data4 38 | m1.io.RightIO <> io.data5 39 | 40 | m2.io.LeftIO <> m0.io.Out(2) 41 | m2.io.RightIO <> m1.io.Out(0) 42 | 43 | m0.io.enable <> io.enable 44 | m1.io.enable <> io.enable 45 | m2.io.enable <> io.enable 46 | 47 | 48 | for(i <- 0 until 4) 49 | m0.io.Out(i).ready := m2.io.LeftIO.ready 50 | 51 | 52 | io.dataOut <> m1.io.Out(0) 53 | } 54 | 55 | 56 | 57 | class ComputeFuse04PDF(implicit val p: Parameters) extends Module with CoreParams { 58 | 59 | val io = IO(new Bundle { 60 | val data0 = Flipped(Decoupled(new DataBundle())) 61 | val data1 = Flipped(Decoupled(new DataBundle())) 62 | val data2 = Flipped(Decoupled(new DataBundle())) 63 | val data3 = Flipped(Decoupled(new DataBundle())) 64 | val data4 = Flipped(Decoupled(new DataBundle())) 65 | val data5 = Flipped(Decoupled(new DataBundle())) 66 | val enable = Flipped(Decoupled(new ControlBundle)) 67 | 68 | val dataOut = Decoupled(new DataBundle()) 69 | 70 | }) 71 | 72 | val m0 = Module(new Chain(NumOps = 4, ID = 0, OpCodes = Array("And","ShiftLeft","Xor","Or"))(sign = false)) 73 | val m1 = Module(new ComputeNode(NumOuts = 1, ID = 0, opCode = "And")(sign = false)) 74 | 75 | m0.io.In(0) <> io.data0 76 | m0.io.In(1) <> io.data1 77 | m0.io.In(2) <> io.data2 78 | m0.io.In(3) <> io.data3 79 | m0.io.In(4) <> m1.io.Out(0) 80 | 81 | m1.io.LeftIO <> io.data4 82 | m1.io.RightIO <> io.data5 83 | 84 | m0.io.enable <> io.enable 85 | m1.io.enable <> io.enable 86 | 87 | for(i <- 0 until 5) 88 | m0.io.Out(i).ready := io.dataOut.ready 89 
| 90 | io.dataOut <> m0.io.Out(3) 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/dnn/types/GEMM.scala: -------------------------------------------------------------------------------- 1 | package dnn.types 2 | 3 | import FPU.{FPMAC, FType, FloatingPoint} 4 | import chisel3._ 5 | import chisel3.iotesters.{ChiselFlatSpec, Driver, OrderedDecoupledHWIOTester, PeekPokeTester} 6 | import chisel3.Module 7 | import chisel3.experimental.FixedPoint 8 | import chisel3.testers._ 9 | import chisel3.util._ 10 | import org.scalatest.{FlatSpec, Matchers} 11 | import config._ 12 | import interfaces._ 13 | import muxes._ 14 | import util._ 15 | import node._ 16 | import dnn._ 17 | 18 | 19 | object GEMM { 20 | 21 | // Declare trait to encapsulate implicit functions 22 | trait OperatorGEMM[T] { 23 | def multiplication(l: T, r: T, start: Bool)(implicit p: Parameters): (T, Int) 24 | } 25 | 26 | // Implementation of actual functions 27 | object OperatorGEMM { 28 | 29 | // FX Operations 30 | implicit object FXmatNxN extends OperatorGEMM[FXmatNxN] { 31 | def multiplication(l: FXmatNxN, r: FXmatNxN, start: Bool)(implicit p: Parameters): (FXmatNxN, Int) = { 32 | val x = Wire(new FXmatNxN(l.N, l.fraction)) 33 | val GEMM = Module(new SystolicSquare(l.data(0)(0).cloneType, l.N)) 34 | GEMM.io.activate := start 35 | l.toVecUInt( ) zip GEMM.io.left foreach { case (a, b) => b := a } 36 | r.toVecUInt( ) zip GEMM.io.right foreach { case (a, b) => b := a } 37 | x.fromVecUInt(GEMM.io.output) 38 | (x, GEMM.latency( )) 39 | } 40 | } 41 | 42 | implicit object matNxN extends OperatorGEMM[matNxN] { 43 | def multiplication(l: matNxN, r: matNxN, start: Bool)(implicit p: Parameters): (matNxN, Int) = { 44 | val x = Wire(new matNxN(l.N)) 45 | val GEMM = Module(new SystolicSquare(l.data(0)(0).cloneType, l.N)) 46 | GEMM.io.activate := start 47 | GEMM.io.async_reset := false.B 48 | l.toVecUInt( ) zip GEMM.io.left foreach { case (a, b) => b := a } 49 | 
r.toVecUInt( ) zip GEMM.io.right foreach { case (a, b) => b := a } 50 | x.fromVecUInt(GEMM.io.output) 51 | (x, GEMM.latency) 52 | } 53 | } 54 | 55 | implicit object FPmatNxN extends OperatorGEMM[FPmatNxN] { 56 | def multiplication(l: FPmatNxN, r: FPmatNxN, start: Bool)(implicit p: Parameters): (FPmatNxN, Int) = { 57 | val x = Wire(new FPmatNxN(l.N, l.Ftyp)) 58 | val GEMM = Module(new SystolicSquare(new FloatingPoint(l.Ftyp), l.N)) 59 | GEMM.io.activate := start 60 | GEMM.io.async_reset := false.B 61 | l.toVecUInt( ) zip GEMM.io.left foreach { case (a, b) => b := a } 62 | r.toVecUInt( ) zip GEMM.io.right foreach { case (a, b) => b := a } 63 | x.fromVecUInt(GEMM.io.output) 64 | (x, GEMM.latency) 65 | } 66 | } 67 | 68 | 69 | } 70 | 71 | // Implicit functions to invoke. 72 | def GEMM[T](l: T, r: T, start: Bool)(implicit op: OperatorGEMM[T], p: Parameters): (T, Int) = op.multiplication(l, r, start) 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/dnn/memory/inDMA_wgt.scala: -------------------------------------------------------------------------------- 1 | 2 | package dnn.memory 3 | 4 | import chisel3._ 5 | import chisel3.util._ 6 | import config._ 7 | import dnnnode.WeightShapeTransformer 8 | import node.{Shapes, vecN} 9 | import shell._ 10 | //import vta.util.config._ 11 | import dnn.memory.ISA._ 12 | 13 | 14 | /** TensorLoad. 15 | * 16 | * Load 1D and 2D tensors from main memory (DRAM) to input/weight 17 | * scratchpads (SRAM). Also, there is support for zero padding, while 18 | * doing the load. Zero-padding works on the y and x axis, and it is 19 | * managed by TensorPadCtrl. The TensorDataCtrl is in charge of 20 | * handling the way tensors are stored on the scratchpads. 
21 | */ 22 | class inDMA_wgtIO[gen <: Shapes](wgtTensorType: String = "none")(wgtShape: => gen)(implicit val p: Parameters) 23 | extends Module { 24 | val tpWgt = new TensorParams(wgtTensorType) 25 | val mp = p(ShellKey).memParams 26 | val io = IO(new Bundle { 27 | val start = Input(Bool()) 28 | val done = Output(Bool()) 29 | val numWeight = Input(UInt(mp.addrBits.W)) 30 | val baddr = Input(UInt(mp.addrBits.W)) 31 | val vme_rd = new VMEReadMaster 32 | val tensor = new TensorClient(wgtTensorType) 33 | }) 34 | } 35 | 36 | 37 | class inDMA_wgt[L <: Shapes](wgtTFDepth: Int, bufSize: Int, intWgtTensorType: String = "none", extWgtTensorType: String = "none")(wgtShape: => L) 38 | (implicit p: Parameters) 39 | extends inDMA_wgtIO(intWgtTensorType)(wgtShape)(p) { 40 | 41 | val tpMem = new TensorParams(extWgtTensorType) 42 | val wgtTransformer = Module(new WeightShapeTransformer(wgtTFDepth, bufSize, intWgtTensorType, extWgtTensorType)(wgtShape)) 43 | val tensorLoad = Module(new TensorLoad(extWgtTensorType)) 44 | 45 | val tl_Inst = Wire(new MemDecode) 46 | val memTensorRows = Mux(io.numWeight * wgtShape.getLength().U % tpMem.tensorWidth.U === 0.U, 47 | io.numWeight * wgtShape.getLength().U / tpMem.tensorWidth.U, (io.numWeight * wgtShape.getLength().U / tpMem.tensorWidth.U) + 1.U) 48 | 49 | 50 | 51 | tensorLoad.io.start := io.start 52 | tensorLoad.io.inst := tl_Inst.asTypeOf(UInt(INST_BITS.W)) 53 | tensorLoad.io.baddr := io.baddr 54 | io.vme_rd <> tensorLoad.io.vme_rd 55 | 56 | wgtTransformer.io.start := tensorLoad.io.done 57 | wgtTransformer.io.numWeight := io.numWeight 58 | io.done := wgtTransformer.io.done 59 | 60 | tensorLoad.io.tensor <> wgtTransformer.io.tensorMaster 61 | wgtTransformer.io.tensor <> io.tensor 62 | 63 | tl_Inst.xpad_0 := 0.U 64 | tl_Inst.xpad_1 := 0.U 65 | tl_Inst.ypad_0 := 0.U 66 | tl_Inst.ypad_1 := 0.U 67 | tl_Inst.xstride := memTensorRows 68 | tl_Inst.xsize := memTensorRows 69 | tl_Inst.ysize := 1.U 70 | tl_Inst.empty_0 := 0.U 71 | tl_Inst.dram_offset 
:= 0.U 72 | tl_Inst.sram_offset := 0.U 73 | tl_Inst.id := 3.U 74 | tl_Inst.push_next := 0.U 75 | tl_Inst.push_prev := 0.U 76 | tl_Inst.pop_next := 0.U 77 | tl_Inst.pop_prev := 0.U 78 | tl_Inst.op := 0.U 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/node/Comparision.scala: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | /** 7 | * List of comparision operations 8 | */ 9 | object CmpOpCode { 10 | val EQ = 1 11 | val NE = 2 12 | val UGT = 3 13 | val UGE = 4 14 | val ULT = 5 15 | val ULE = 6 16 | val SGT = 7 17 | val SGE = 8 18 | val SLT = 9 19 | val SLE = 10 20 | val length = 11 21 | 22 | val CompMap = Map( 23 | "EQ" -> EQ, 24 | "eq" -> EQ, 25 | "NE" -> NE, 26 | "ne" -> NE, 27 | "UGT" -> UGT, 28 | "ugt" -> UGT, 29 | "UGE" -> UGE, 30 | "uge" -> UGE, 31 | "ULT" -> ULT, 32 | "ult" -> ULT, 33 | "ULE" -> ULE, 34 | "ule" -> ULE, 35 | "SGT" -> SGT, 36 | "sgt" -> SGT, 37 | "SGE" -> SGE, 38 | "sge" -> SGE, 39 | "SLT" -> SLT, 40 | "slt" -> SLT, 41 | "SLE" -> SLE, 42 | "sle" -> SLE 43 | ) 44 | } 45 | 46 | 47 | /** 48 | * This way you can not pick our operation correctly!! 
49 | * @param key a key to search for 50 | * @param default a default value if nothing is found 51 | * @param mapping a sequence to search of keys and values 52 | * @return the value found or the default if not 53 | * 54 | */ 55 | object CMPGenerator{ 56 | def apply[S <: Int, T <: Data] (key : S, mapping: Seq[(S, T)]): T = { 57 | 58 | //Default value for mapping 59 | var res = mapping(0)._2 60 | 61 | for((k,v) <- mapping){ 62 | if(k == key) 63 | res = v 64 | } 65 | 66 | res 67 | } 68 | } 69 | 70 | 71 | class UCMP(val xlen: Int, val opCode: String) extends Module { 72 | val io = IO(new Bundle { 73 | val in1 = Input(UInt(xlen.W)) 74 | val in2 = Input(UInt(xlen.W)) 75 | val out = Output(UInt(xlen.W)) 76 | }) 77 | 78 | //printf(p"OPCODE: ${opCode}\n") 79 | 80 | val cmpOp = Array( 81 | CmpOpCode.EQ -> (io.in1 === io.in2), 82 | CmpOpCode.NE -> (io.in1 =/= io.in2), 83 | CmpOpCode.UGT -> (io.in1 > io.in2), 84 | CmpOpCode.UGE -> (io.in1 >= io.in2), 85 | CmpOpCode.ULT -> (io.in1 < io.in2), 86 | CmpOpCode.ULE -> (io.in1 <= io.in2) 87 | ) 88 | 89 | assert(!CmpOpCode.CompMap.get(opCode).isEmpty, "Wrong CMP OP!") 90 | io.out := CMPGenerator(CmpOpCode.CompMap(opCode), cmpOp) 91 | 92 | } 93 | 94 | 95 | class SCMP(val xlen: Int, val opCode: String) extends Module { 96 | val io = IO(new Bundle { 97 | val in1 = Input(SInt(xlen.W)) 98 | val in2 = Input(SInt(xlen.W)) 99 | val out = Output(SInt(xlen.W)) 100 | }) 101 | 102 | 103 | val aluOp = Array( 104 | CmpOpCode.SGT -> (io.in1 > io.in2), 105 | CmpOpCode.SGE -> (io.in1 >= io.in2), 106 | CmpOpCode.SLT -> (io.in1 < io.in2), 107 | CmpOpCode.SLE -> (io.in1 <= io.in2) 108 | 109 | ) 110 | 111 | 112 | assert(!CmpOpCode.CompMap.get(opCode).isEmpty, "Wrong CMP OP!") 113 | 114 | io.out := CMPGenerator(CmpOpCode.CompMap(opCode), aluOp) 115 | 116 | } 117 | 118 | 119 | -------------------------------------------------------------------------------- /src/test/scala/dnn/Dot.scala: 
-------------------------------------------------------------------------------- 1 | package dnn 2 | 3 | import chisel3._ 4 | import chisel3.iotesters.PeekPokeTester 5 | import config._ 6 | import org.scalatest.{FlatSpec, Matchers} 7 | import FPU._ 8 | import node._ 9 | 10 | // Tester. 11 | class DotCompTests(df: DotNode[matNxN]) 12 | (implicit p: config.Parameters) extends PeekPokeTester(df) { 13 | poke(df.io.enable.valid, true) 14 | poke(df.io.enable.bits.control, true) 15 | 16 | poke(df.io.LeftIO.bits.data, 0x0401010101010102L) 17 | poke(df.io.LeftIO.valid, true) 18 | poke(df.io.LeftIO.bits.predicate, true) 19 | 20 | 21 | poke(df.io.RightIO.bits.data, 0x0501010101010103L) 22 | poke(df.io.RightIO.valid, true) 23 | poke(df.io.RightIO.bits.predicate, true) 24 | 25 | poke(df.io.Out(0).ready, true.B) 26 | step(20) 27 | } 28 | 29 | 30 | class FXDotCompTests(df: DotNode[FXmatNxN]) 31 | (implicit p: config.Parameters) extends PeekPokeTester(df) { 32 | poke(df.io.enable.valid, true) 33 | poke(df.io.enable.bits.control, true) 34 | // 0x32 0011.0010 . Fixed point 3.125 in fixed point 4 BP. 35 | poke(df.io.LeftIO.bits.data, 0x49494949L) 36 | poke(df.io.LeftIO.valid, true) 37 | poke(df.io.LeftIO.bits.predicate, true) 38 | 39 | // 0x32 (3.125) * 0x20 (2.0) = 6.25 (0x64 or 100) 40 | poke(df.io.RightIO.bits.data, 0x40404040L) 41 | poke(df.io.RightIO.valid, true) 42 | poke(df.io.RightIO.bits.predicate, true) 43 | 44 | poke(df.io.Out(0).ready, true.B) 45 | step(20) 46 | } 47 | 48 | class FPDotCompTests(df: DotNode[FPmatNxN]) 49 | (implicit p: config.Parameters) extends PeekPokeTester(df) { 50 | poke(df.io.enable.valid, true) 51 | poke(df.io.enable.bits.control, true) 52 | // 0x49 = 3.125 (Mini 8 bit format. 3 bit exp, 5 bit mantissa 53 | poke(df.io.LeftIO.bits.data, 0x49494949L) 54 | poke(df.io.LeftIO.valid, true) 55 | poke(df.io.LeftIO.bits.predicate, true) 56 | 57 | // 0x4e - 3.7 . Result : 103. 
58 | poke(df.io.RightIO.bits.data, 0x40404040L) 59 | poke(df.io.RightIO.valid, true) 60 | poke(df.io.RightIO.bits.predicate, true) 61 | 62 | poke(df.io.Out(0).ready, true.B) 63 | step(20) 64 | } 65 | 66 | 67 | class DotCompTester extends FlatSpec with Matchers { 68 | implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance) 69 | it should "Typ Compute Tester" in { 70 | // chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"), 71 | // () => new DotNode(NumOuts = 1, ID = 0, 4, "Add")(new matNxN(2, true))) { 72 | // c => new DotCompTests(c) 73 | // } should be(true) 74 | 75 | chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"), 76 | () => new DotNode(NumOuts = 1, ID = 0, 4, "Mul")(new matNxN(4,false))) { 77 | c => new DotCompTests(c) 78 | } should be(true) 79 | // chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"), 80 | // () => new DotNode(NumOuts = 1, ID = 0, 4, "Mul")(new FPmatNxN(2, t = FType.M))) { 81 | // c => new FPDotCompTests(c) 82 | // } should be(true) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/scala/dnn/CooSCALNode.scala: -------------------------------------------------------------------------------- 1 | package dnn 2 | 3 | import chisel3.util.{Decoupled, Enum, Valid} 4 | import chisel3.{Bundle, Flipped, Module, Output, RegInit, UInt, printf, when, _} 5 | import config.{COLLEN, Parameters, ROWLEN, XLEN} 6 | import dnn.types.{OperatorCooSCAL, OperatorSCAL} 7 | import interfaces.{CooDataBundle, CustomDataBundle} 8 | import node.Shapes 9 | 10 | class CooSCALFU[L <: Shapes : OperatorCooSCAL](left: => L, lanes: Int, opCode: String)(implicit val p: Parameters) extends Module { 11 | val io = IO(new Bundle { 12 | val a = Flipped(Valid(left)) 13 | val b = Flipped(Valid(UInt(p(XLEN).W))) 14 | val o = Decoupled(left) 15 | }) 16 | 17 | 18 | 
val start = io.o.ready && io.a.valid && io.b.valid 19 | val FU = OperatorCooSCAL.magic(io.a.bits, io.b.bits, start, lanes, opCode) 20 | io.o.bits := FU._1 21 | val latency = FU._2 22 | val valid = RegInit(false.B) 23 | when(io.o.fire()){ 24 | valid := false.B 25 | } 26 | 27 | when(start){ 28 | valid := true.B 29 | } 30 | 31 | io.o.valid := valid 32 | } 33 | 34 | class CooSCALIO[L <: Shapes](left: => L)(implicit val p: Parameters) extends Module{ 35 | val io = IO(new Bundle() { 36 | val vec = Vec(left.getLength(), Flipped(Decoupled(new CooDataBundle(UInt(p(XLEN).W))))) 37 | 38 | val scal = Flipped(Decoupled(new CooDataBundle(UInt(p(XLEN).W)))) 39 | 40 | val out = Vec(left.getLength(), Decoupled(new CooDataBundle(UInt(p(XLEN).W)))) 41 | }) 42 | } 43 | 44 | class CooSCALNode[L <: Shapes : OperatorCooSCAL](N: Int, ID: Int, opCode: String)(shape: => L)(implicit p: Parameters) 45 | extends CooSCALIO(shape)(p) { 46 | 47 | require(shape.getLength() == N, "shape does not match with number of multipliers") 48 | 49 | /*===============================================* 50 | * Latch inputs. 
Wire up left * 51 | *===============================================*/ 52 | val FU = Module(new CooSCALFU(shape, lanes = shape.getLength(), opCode)) 53 | FU.io.a.bits := VecInit(io.vec.map(_.bits.data.asUInt())).asTypeOf(shape) 54 | FU.io.b.bits := io.scal.bits.data 55 | 56 | FU.io.a.valid := io.vec.map(_.valid).reduceLeft(_&&_) 57 | FU.io.b.valid := io.scal.valid 58 | 59 | FU.io.o.ready := io.out.map(_.ready).reduceLeft(_&&_) 60 | io.scal.ready := io.out.map(_.ready).reduceLeft(_&&_) && io.vec.map(_.valid).reduceLeft(_&&_) 61 | io.vec.map(_.ready).foreach(a => a := io.out.map(_.ready).reduceLeft(_&&_) && io.scal.valid) 62 | 63 | 64 | val row = for (i <- 0 until shape.getLength()) yield { 65 | val r = Reg(UInt(p(ROWLEN).W)) 66 | r 67 | } 68 | val col = for (i <- 0 until shape.getLength()) yield { 69 | val c = Reg(UInt(p(COLLEN).W)) 70 | c 71 | } 72 | 73 | for (i <- 0 until shape.getLength()) { 74 | io.out(i).bits.data := FU.io.o.bits.asUInt()(p(XLEN) * (i + 1) - 1, p(XLEN) * i) 75 | io.out(i).valid := FU.io.o.valid 76 | 77 | when(io.out.map(_.ready).reduceLeft(_&&_)) { 78 | row(i) := io.vec(i).bits.row 79 | col(i) := io.scal.bits.col 80 | } 81 | io.out(i).bits.row := row(i) 82 | io.out(i).bits.col := col(i) 83 | 84 | io.out(i).bits.valid := true.B 85 | } 86 | } 87 | 88 | 89 | -------------------------------------------------------------------------------- /src/main/scala/dnn/memory/inDMA_act_HWC.scala: -------------------------------------------------------------------------------- 1 | 2 | package dnn.memory 3 | 4 | import chisel3._ 5 | import chisel3.util.Decoupled 6 | import config._ 7 | import interfaces.{CustomDataBundle, TensorReadReq, TensorReadResp} 8 | import node.Shapes 9 | import shell._ 10 | //import vta.util.config._ 11 | import dnn.memory.ISA._ 12 | 13 | 14 | /** TensorLoad. 15 | * 16 | * Load 1D and 2D tensors from main memory (DRAM) to input/weight 17 | * scratchpads (SRAM). Also, there is support for zero padding, while 18 | * doing the load. 
Zero-padding works on the y and x axis, and it is
  * managed by TensorPadCtrl. The TensorDataCtrl is in charge of
  * handling the way tensors are stored on the scratchpads.
  */
class inDMA_act_HWCIO(NumRows: Int, NumOuts: Int, memTensorType: String = "none")(implicit val p: Parameters)
  extends Module {
  val tp = new TensorParams(memTensorType)
  val mp = p(ShellKey).memParams
  val io = IO(new Bundle {
    val start = Input(Bool())
    val done = Output(Bool())
    val baddr = Input(UInt(mp.addrBits.W))
    val rowWidth = Input(UInt(mp.addrBits.W))
    val depth = Input(UInt(mp.addrBits.W))
    val vme_rd = Vec(NumRows, new VMEReadMaster)
    val tensor = Vec(NumRows, new TensorClient(memTensorType))
  })
}

/** Row-parallel activation DMA: one TensorLoad engine per row, each reading
  * `rowWidth * depth` elements starting at a per-row offset from `baddr`.
  * `done` pulses when every per-row load has completed. */
class inDMA_act_HWC(NumRows: Int, NumOuts: Int, memTensorType: String = "none")(implicit p: Parameters)
  extends inDMA_act_HWCIO(NumRows, NumOuts, memTensorType)(p) {

  val tensorLoad = for (i <- 0 until NumRows) yield {
    val tensorL = Module(new TensorLoad(memTensorType))
    tensorL
  }
  val doneR = for (i <- 0 until NumRows) yield {
    val doneReg = RegInit(init = false.B)
    doneReg
  }

  io.done := doneR.reduceLeft(_ && _)

  // Self-clear all sticky done flags once every row has finished.
  when(doneR.reduceLeft(_ && _)) {
    doneR.foreach(a => a := false.B)
  }

  // Fix: this loop is executed only for its side effects, so a plain `for`
  // is used instead of `for ... yield` whose result was discarded.
  for (i <- 0 until NumRows) {
    when(tensorLoad(i).io.done) {
      doneR(i) := true.B
    }
  }

  val tl_Inst = Wire(new MemDecode)
  // Ceiling division of the element count by the tensor width.
  val memTensorRows = Mux(io.rowWidth * io.depth % tp.tensorWidth.U === 0.U,
    io.rowWidth * io.depth / tp.tensorWidth.U,
    (io.rowWidth * io.depth / tp.tensorWidth.U) + 1.U)

  // Single flat transfer: no padding, one y-row of `memTensorRows` tensors.
  tl_Inst.xpad_0 := 0.U
  tl_Inst.xpad_1 := 0.U
  tl_Inst.ypad_0 := 0.U
  tl_Inst.ypad_1 := 0.U
  tl_Inst.xstride := memTensorRows
  tl_Inst.xsize := memTensorRows
  tl_Inst.ysize := 1.U
  tl_Inst.empty_0 := 0.U
  tl_Inst.dram_offset := 0.U
  tl_Inst.sram_offset := 0.U
  tl_Inst.id := 3.U
  tl_Inst.push_next := 0.U
  tl_Inst.push_prev := 0.U
  tl_Inst.pop_next := 0.U
  tl_Inst.pop_prev := 0.U
  tl_Inst.op := 0.U

  for (i <- 0 until NumRows) {
    tensorLoad(i).io.start := io.start
    tensorLoad(i).io.inst := tl_Inst.asTypeOf(UInt(INST_BITS.W))
    // Per-row byte offset: i rows of rowWidth*depth elements.
    tensorLoad(i).io.baddr := io.baddr + (i.U * io.rowWidth * io.depth * (tp.tensorElemBits.U / 8.U))
    tensorLoad(i).io.tensor <> io.tensor(i)
    io.vme_rd(i) <> tensorLoad(i).io.vme_rd
  }

}
--------------------------------------------------------------------------------
/src/test/scala/dnn/SCAL.scala:
--------------------------------------------------------------------------------
package dnn

import chisel3._
import chisel3.iotesters.PeekPokeTester
import config._
import org.scalatest.{FlatSpec, Matchers}
import FPU._
import node._

// Tester.
class SCALCompTests(df: SCALNode[matNxN])
                   (implicit p: config.Parameters) extends PeekPokeTester(df) {
  poke(df.io.enable.valid, true)
  poke(df.io.enable.bits.control, true)

  poke(df.io.LeftIO.bits.data, 0xFEFEFEFEL)
  poke(df.io.LeftIO.valid, true)
  poke(df.io.LeftIO.bits.predicate, true)


  poke(df.io.RightIO.bits.data, 0x04L)
  poke(df.io.RightIO.valid, true)
  poke(df.io.RightIO.bits.predicate, true)

  poke(df.io.Out(0).ready, true.B)
  step(20)
}


class FXSCALCompTests(df: SCALNode[FXmatNxN])
                     (implicit p: config.Parameters) extends PeekPokeTester(df) {
  poke(df.io.enable.valid, true)
  poke(df.io.enable.bits.control, true)
  // 0x32 0011.0010 . Fixed point 3.125 in fixed point 4 BP.
poke(df.io.LeftIO.bits.data, 0x32323232L)
  poke(df.io.LeftIO.valid, true)
  poke(df.io.LeftIO.bits.predicate, true)

  // 0x32 (3.125) * 0x20 (2.0) = 6.25 (0x64 or 100)
  poke(df.io.RightIO.bits.data, 0x20L)
  poke(df.io.RightIO.valid, true)
  poke(df.io.RightIO.bits.predicate, true)

  poke(df.io.Out(0).ready, true.B)
  step(20)
}

class FPSCALCompTests(df: SCALNode[FPmatNxN])
                     (implicit p: config.Parameters) extends PeekPokeTester(df) {
  poke(df.io.enable.valid, true)
  poke(df.io.enable.bits.control, true)
  // 0x49 = 3.125 (Mini 8 bit format. 3 bit exp, 5 bit mantissa
  poke(df.io.LeftIO.bits.data, 0x49494949L)
  poke(df.io.LeftIO.valid, true)
  poke(df.io.LeftIO.bits.predicate, true)

  // 0x4e - 3.7 . Result : 103.
  poke(df.io.RightIO.bits.data, 0x4eL)
  poke(df.io.RightIO.valid, true)
  poke(df.io.RightIO.bits.predicate, true)

  poke(df.io.Out(0).ready, true.B)

  for (i <- 0 until 20)
    step(1)
}

/** Elaborates an FX SCAL node under Verilator and drives it with
  * [[FXSCALCompTests]]; the matNxN / FPmatNxN variants are kept below,
  * commented out, for manual runs. */
class SCALCompTester extends FlatSpec with Matchers {
  implicit val p = config.Parameters.root((new Mat_VecConfig).toInstance)
  it should "Typ Compute Tester" in {
    // chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
    //   () => new SCALNode(NumOuts = 1, ID = 0, 1, "Add")(new matNxN(2, true))) {
    //   c => new SCALCompTests(c)
    // } should be(true)

    chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
      () => new SCALNode(NumOuts = 1, ID = 0, 1, "sqrt")(new FXmatNxN(2, 4))) {
      c => new FXSCALCompTests(c)
    } should be(true)
    // chisel3.iotesters.Driver.execute(Array("--backend-name", "verilator", "--target-dir", "test_run_dir"),
    //   () => new SCALNode(NumOuts = 1, ID = 0, 4, "Mul")(new FPmatNxN(2, t = FType.M))) {
    //   c => new FPSCALCompTests(c)
    // } should be(true)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/dataflow/filter/BasicFilter.scala:
--------------------------------------------------------------------------------
package dataflow.filter

import chisel3._
import chisel3.util._

import node._
import config._
import interfaces._
import arbiters._
import memory._

/** 3x3 convolution filter: nine multipliers feeding a tree of eight adders
  * that reduce the products into a single sum. */
class BasicFilter(implicit val p: Parameters) extends Module with CoreParams {

  val FilterSize = 3 * 3

  val io = IO(new Bundle {
    val enable = Flipped(Decoupled(new ControlBundle()))
    val data = Vec(FilterSize, Flipped(Decoupled(new DataBundle())))
    val kern = Vec(FilterSize, Flipped(Decoupled(new DataBundle())))
    val sum = Decoupled(new DataBundle())
  })

  // One multiplier per filter tap.
  val Multiplier = for (i <- 0 until FilterSize) yield {
    val mul = Module(new ComputeNode(NumOuts = 1, ID = 0, opCode = "mul")(sign = false))
    mul
  }

  for (i <- 0 until FilterSize) {
    Multiplier(i).io.LeftIO <> io.data(i)
    Multiplier(i).io.RightIO <> io.kern(i)
    Multiplier(i).io.enable <> io.enable
  }

  // Adder reduction tree (8 adders for 9 products).
  val Adder = for (i <- 0 until FilterSize - 1) yield {
    val add = Module(new ComputeNode(NumOuts = 1, ID = 0, opCode = "Add")(sign = false))
    add
  }
  // First row
  Adder(0).io.LeftIO <> Multiplier(0).io.Out(0)
  Adder(0).io.RightIO <> Multiplier(1).io.Out(0)
  Adder(1).io.LeftIO <> Multiplier(2).io.Out(0)
  Adder(1).io.RightIO <> Multiplier(3).io.Out(0)
  Adder(2).io.LeftIO <> Multiplier(4).io.Out(0)
  Adder(2).io.RightIO <> Multiplier(5).io.Out(0)
  Adder(3).io.LeftIO <> Multiplier(6).io.Out(0)
  Adder(3).io.RightIO <> Multiplier(7).io.Out(0)
  // Second row
  Adder(4).io.LeftIO <> Adder(0).io.Out(0)
  Adder(4).io.RightIO <> Adder(1).io.Out(0)
  Adder(5).io.LeftIO <> Adder(2).io.Out(0)
  Adder(5).io.RightIO <> Adder(3).io.Out(0)
  // Third Row
Adder(6).io.LeftIO <> Adder(4).io.Out(0)
  Adder(6).io.RightIO <> Adder(5).io.Out(0)
  // Last Row
  Adder(7).io.LeftIO <> Adder(6).io.Out(0)
  Adder(7).io.RightIO <> Multiplier(8).io.Out(0)

  for (i <- 0 until FilterSize - 1) {
    Adder(i).io.enable <> io.enable
  }

  io.sum <> Adder(7).io.Out(0)

  // Info
  val countOn = true.B // increment counter every clock cycle
  val (counterValue, counterWrap) = Counter(countOn, 64 * 1024)

  // Edge-detect activity across every node to log compute start/end cycles.
  val active = RegInit(init = false.B)
  val active_r = RegInit(init = false.B)
  active := Multiplier(0).io.Out(0).valid || Multiplier(1).io.Out(0).valid || Multiplier(2).io.Out(0).valid ||
    Multiplier(3).io.Out(0).valid || Multiplier(4).io.Out(0).valid || Multiplier(5).io.Out(0).valid ||
    Multiplier(6).io.Out(0).valid || Multiplier(7).io.Out(0).valid || Multiplier(8).io.Out(0).valid ||
    Adder(0).io.Out(0).valid || Adder(1).io.Out(0).valid || Adder(2).io.Out(0).valid ||
    Adder(3).io.Out(0).valid || Adder(4).io.Out(0).valid || Adder(5).io.Out(0).valid ||
    Adder(6).io.Out(0).valid || Adder(7).io.Out(0).valid

  active_r := active
  when(active && !active_r) {
    printf("\nCOMPUTE START: %d\n", counterValue)
  }
  when(!active && active_r) {
    printf("\nCOMPUTE END: %d\n", counterValue)
  }

}

--------------------------------------------------------------------------------
/src/main/scala/junctions/CombineDecoupled.scala:
--------------------------------------------------------------------------------
package junctions

import chisel3._
import chisel3.util._
import interfaces._
import config._

class CombineCustomIO(argTypes: Seq[Bits])(implicit p: Parameters) extends Bundle {
  val In = Flipped(new VariableDecoupledCustom(argTypes))
  val Out = Decoupled(new VariableCustom(argTypes))
}

/** Latches each independently-arriving input field and asserts `Out.valid`
  * once every field has been captured; fields re-arm on the output handshake. */
class CombineCustom(val argTypes: Seq[Bits])(implicit p: Parameters) extends Module {
  val io = IO(new CombineCustomIO(argTypes))
  val inputReady = RegInit(VecInit(Seq.fill(argTypes.length){true.B}))
  val outputReg = RegInit(0.U.asTypeOf(io.Out))

  for (i <- argTypes.indices) {
    // Consistency fix: use fire() like CombineData/CombineCall below
    // (identical to the previous `io.Out.valid && io.Out.ready`).
    when(io.Out.fire()) {
      inputReady(i) := true.B
    }.elsewhen(io.In(s"field$i").valid) {
      outputReg.bits(s"field$i") := io.In(s"field$i").bits
      inputReady(i) := false.B
    }
    io.In(s"field$i").ready := inputReady(i)
  }
  // Valid only when no field is still waiting (all inputReady bits are low).
  io.Out.valid := ~inputReady.asUInt.orR
  io.Out.bits := outputReg.bits
}

class CombineDataIO(val argTypes: Seq[Int])(implicit p: Parameters) extends CoreBundle {
  val In = Flipped(new VariableDecoupledData(argTypes))
  val Out = Decoupled(new VariableData(argTypes))

  override def cloneType = new CombineDataIO(argTypes).asInstanceOf[this.type]
}

/** Same combining behavior as [[CombineCustom]] but over VariableData fields. */
class CombineData(val argTypes: Seq[Int])(implicit p: Parameters) extends Module {
  val io = IO(new CombineDataIO(argTypes))
  val inputReady = RegInit(VecInit(Seq.fill(argTypes.length){true.B}))
  val outputReg = RegInit(0.U.asTypeOf(io.Out))

  for (i <- argTypes.indices) {
    when(io.Out.fire()) {
      inputReady(i) := true.B
    }.elsewhen(io.In(s"field$i").valid) {
      outputReg.bits(s"field$i") := io.In(s"field$i").bits
      inputReady(i) := false.B
    }
    io.In(s"field$i").ready := inputReady(i)
  }
  io.Out.valid := ~(inputReady.asUInt.orR)
  io.Out.bits := outputReg.bits

}

class CombineCallIO(val argTypes: Seq[Int])(implicit p: Parameters) extends CoreBundle {
  val In = Flipped(new CallDecoupled(argTypes))
  val Out = Decoupled(new Call(argTypes))
  override def cloneType = new CombineCallIO(argTypes).asInstanceOf[this.type]
}

/** Combines data fields plus the enable channel (tracked as an extra
  * slot at index argTypes.length) into a single Call output. */
class CombineCall(val argTypes: Seq[Int])(implicit p: Parameters) extends Module {
  val io = IO(new CombineCallIO(argTypes))
  val inputReady =
RegInit(VecInit(Seq.fill(argTypes.length+1){true.B})) 66 | val outputReg = RegInit(0.U.asTypeOf(io.Out)) 67 | 68 | for (i <- argTypes.indices) { 69 | when(io.Out.fire()){ 70 | inputReady(i) := true.B 71 | }.elsewhen(io.In.data(s"field$i").fire()) { 72 | outputReg.bits.data(s"field$i") := io.In.data(s"field$i").bits 73 | inputReady(i) := false.B 74 | } 75 | io.In.data(s"field$i").ready := inputReady(i) 76 | } 77 | 78 | when(io.Out.fire()){ 79 | inputReady(argTypes.length) := true.B 80 | }.elsewhen(io.In.enable.fire()) { 81 | outputReg.bits.enable <> io.In.enable.bits 82 | inputReady (argTypes.length) := false.B 83 | } 84 | io.In.enable.ready := inputReady(argTypes.length) 85 | 86 | io.Out.valid := ~(inputReady.asUInt.orR) 87 | io.Out.bits := outputReg.bits 88 | 89 | } 90 | -------------------------------------------------------------------------------- /include/vta/tsim.h: -------------------------------------------------------------------------------- 1 | #ifndef VTA_DPI_TSIM_H_ 2 | #define VTA_DPI_TSIM_H_ 3 | 4 | #include 5 | 6 | #include "runtime/c_runtime_api.h" 7 | #include "svdpi.h" 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | typedef unsigned char dpi8_t; 14 | 15 | typedef unsigned short dpi16_t; 16 | 17 | typedef unsigned int dpi32_t; 18 | 19 | typedef unsigned long long dpi64_t; 20 | 21 | /*! \brief the context handle */ 22 | typedef void *VTAContextHandle; 23 | 24 | typedef void (*VTASimDPIFunc)(VTAContextHandle self, dpi8_t *wait, 25 | dpi8_t *exit); 26 | 27 | /*! 
28 | * \brief Host DPI callback function that is invoked in VTAHostDPI.v every clock 29 | * cycle \param req_valid Host has a valid request for read or write a register 30 | * in Accel \param req_opcode Host request type, opcode=0 for read and opcode=1 31 | * for write \param req_addr Host request register address \param req_value Host 32 | * request value to be written to a register \param req_deq Accel is ready to 33 | * dequeue Host request \param resp_valid Accel has a valid response for Host 34 | * \param resp_value Accel response value for Host 35 | * \return 0 if success, 36 | */ 37 | typedef void (*VTAHostDPIFunc)(VTAContextHandle self, dpi8_t *req_valid, 38 | dpi8_t *req_opcode, dpi16_t *req_addr, 39 | dpi32_t *req_value, dpi8_t req_deq, 40 | dpi8_t resp_valid, dpi32_t resp_value); 41 | 42 | /*! 43 | * \brief Memory DPI callback function that is invoked in VTAMemDPI.v every 44 | * clock cycle \param req_valid Accel has a valid request for Host \param 45 | * req_opcode Accel request type, opcode=0 (read) and opcode=1 (write) \param 46 | * req_len Accel request length of size 8-byte and starts at 0 \param req_addr 47 | * Accel request base address \param wr_valid Accel has a valid value for Host 48 | * \param wr_value Accel has a value to be written Host 49 | * \param rd_valid Host has a valid value for Accel 50 | * \param rd_value Host has a value to be read by Accel 51 | */ 52 | typedef void (*VTAMemDPIFunc)(VTAContextHandle self, dpi8_t req_valid, 53 | dpi8_t req_opcode, dpi8_t req_len, 54 | dpi64_t req_addr, dpi8_t wr_valid, 55 | const svLogicVecVal* wr_value, dpi8_t *rd_valid, 56 | svLogicVecVal *rd_value, dpi8_t rd_ready); 57 | 58 | /*! \brief The type of VTADPIInit function pointer */ 59 | typedef void (*VTADPIInitFunc)(VTAContextHandle handle, VTASimDPIFunc sim_dpi, 60 | VTAHostDPIFunc host_dpi, VTAMemDPIFunc mem_dpi); 61 | 62 | /*! \brief The type of VTADPISim function pointer */ 63 | typedef int (*VTADPISimFunc)(); 64 | 65 | /*! 
* \brief Set Host and Memory DPI functions
 * \param handle DPI Context handle
 * \param sim_dpi Sim DPI function
 * \param host_dpi Host DPI function
 * \param mem_dpi Memory DPI function
 */
TVM_DLL void VTADPIInit(VTAContextHandle handle, VTASimDPIFunc sim_dpi,
                        VTAHostDPIFunc host_dpi, VTAMemDPIFunc mem_dpi);

/*! \brief VTA hardware simulation thread */
TVM_DLL int VTADPISim();

#ifdef __cplusplus
}
#endif
#endif  // VTA_DPI_TSIM_H_
--------------------------------------------------------------------------------
/src/main/scala/dnnnode/WeightShapeTransformer.scala:
--------------------------------------------------------------------------------
package dnnnode

import Chisel.Enum
import chisel3._
import chisel3.util._
import chisel3.{Module, UInt}
import config.{Parameters, XLEN}
import dnn.memory.{TensorClient, TensorMaster, TensorParams}
import interfaces.CustomDataBundle
import node.{Shapes, vecN}
import shell.ShellKey
import dnn.memory.ISA._

/** IO for [[WeightShapeTransformer]]: streams `numWeight` weights from a
  * memory-shaped tensor master into a weight-shaped tensor client. */
class WeightShapeTransformerIO[gen <: Shapes](wgtTensorType: String = "none", memTensorType: String = "none")(wgtShape: => gen)(implicit val p: Parameters)
  extends Module {
  val tpMem = new TensorParams(memTensorType)
  val tpWgt = new TensorParams(wgtTensorType)
  val io = IO(new Bundle {
    val start = Input(Bool())
    val done = Output(Bool())
    val numWeight = Input(UInt(tpWgt.memAddrBits.W))
    val tensorMaster = new TensorMaster(memTensorType)
    val tensor = new TensorClient(wgtTensorType)
  })
}

/** Repacks memory-width tensor reads into `wgtShape`-sized weights through a
  * MIMO queue, writing them into a local SyncReadMem served over `io.tensor`. */
class WeightShapeTransformer[L <: Shapes](wgtTFDepth: Int, bufSize: Int, wgtTensorType: String = "none", memTensorType: String = "none")(wgtShape: => L)
  (implicit p: Parameters)
  extends WeightShapeTransformerIO(wgtTensorType, memTensorType)(wgtShape)(p) {

  // Width-converting queue: enqueues memory-tensor words, dequeues weight shapes.
  val buffer = Module(new MIMOQueue(UInt(p(XLEN).W), bufSize, tpMem.tensorWidth, wgtShape.getLength()))
  require(bufSize >= tpMem.tensorWidth, "bufSize should be greater than memTensorWidth")

  // Ceiling of (numWeight * shapeLen) / memTensorWidth.
  val wgtTensorDepth = Mux(io.numWeight * wgtShape.getLength().U % tpMem.tensorWidth.U === 0.U,
    io.numWeight * wgtShape.getLength().U / tpMem.tensorWidth.U,
    (io.numWeight * wgtShape.getLength().U / tpMem.tensorWidth.U) + 1.U)

  val writeBufCntOn = RegInit(init = false.B)
  val (writeBufCnt, writeWrap) = Counter(writeBufCntOn, wgtTFDepth)

  val readWgtCnt = Counter(wgtTFDepth + 1)

  // NOTE(review): FSM states declared but never dispatched on below --
  // kept as-is to avoid changing elaborated hardware.
  val s_idle :: s_BufferWrite :: s_Transfer :: s_Finish :: Nil = Enum(4)
  val state = RegInit(s_idle)

  val tensorFile = SyncReadMem(wgtTFDepth, wgtShape)

  // Stream tensorMaster reads into the repacking buffer.
  buffer.io.enq.valid := io.tensorMaster.rd.data.valid
  buffer.io.enq.bits := io.tensorMaster.rd.data.bits(0)
  io.tensorMaster.rd.idx.bits := writeBufCnt
  io.tensorMaster.rd.idx.valid := buffer.io.enq.ready & writeBufCntOn
  io.tensorMaster.wr <> DontCare

  when(writeBufCnt === wgtTensorDepth - 1.U) {
    writeBufCntOn := false.B
    writeBufCnt := 0.U
  }
  when(io.start) { writeBufCntOn := true.B }

  // Drain the buffer into the weight tensor file, one shape per cycle.
  when(buffer.io.deq.valid & readWgtCnt.value < io.numWeight) {
    tensorFile.write(readWgtCnt.value, buffer.io.deq.bits.asTypeOf(wgtShape))
    buffer.io.deq.ready := true.B
    readWgtCnt.inc()
  }.otherwise {
    buffer.io.deq.ready := false.B
  }

  when(readWgtCnt.value === io.numWeight) {
    io.done := true.B
    buffer.io.clear := true.B
    // readWgtCnt.inc()
    readWgtCnt.value := 0.U
  }.otherwise {
    buffer.io.clear := false.B
    io.done := false.B
  }

  // One-cycle read latency to match the SyncReadMem below.
  val rvalid = RegNext(io.tensor.rd.idx.valid)
  io.tensor.rd.data.valid := rvalid

  val rdata = tensorFile.read(io.tensor.rd.idx.bits, io.tensor.rd.idx.valid)
  io.tensor.rd.data.bits := rdata.asUInt.asTypeOf(io.tensor.rd.data.bits)

}

--------------------------------------------------------------------------------
/src/main/resources/verilog/VTAHostDPI.v:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

`define DATA_LEN_HOST 32

// Bridges the simulated host register interface to the C-side DPI callback.
module VTAHostDPI #
( parameter ADDR_BITS = 16,
  parameter DATA_BITS = `DATA_LEN_HOST
)
(
  input                        clock,
  input                        reset,
  output logic                 dpi_req_valid,
  output logic                 dpi_req_opcode,
  output logic [ADDR_BITS-1:0] dpi_req_addr,
  output logic [DATA_BITS-1:0] dpi_req_value,
  input                        dpi_req_deq,
  input                        dpi_resp_valid,
  input       [DATA_BITS-1:0]  dpi_resp_bits
);

  import "DPI-C" function void VTAHostDPI
  (
    output byte unsigned req_valid,
    output byte unsigned req_opcode,
    output shortint unsigned req_addr,
    output int unsigned req_value,
    input byte unsigned req_deq,
    input byte unsigned resp_valid,
    input int unsigned resp_value
  );

  typedef logic        dpi1_t;
  typedef logic [7:0]  dpi8_t;
  typedef logic [15:0] dpi16_t;
  typedef logic [31:0] dpi32_t;

  dpi1_t  __reset;
  dpi8_t  __req_valid;
  dpi8_t  __req_opcode;
  dpi16_t __req_addr;
  dpi32_t __req_value;
  dpi8_t  __req_deq;
  dpi8_t  __resp_valid;
  dpi32_t __resp_bits;

  // reset
  always_ff @(posedge clock) begin
    __reset <= reset;
  end

  // delaying outputs by one-cycle
  // since verilator does not support delays
  always_ff @(posedge clock) begin
    dpi_req_valid  <= dpi1_t ' (__req_valid);
    dpi_req_opcode <= dpi1_t ' (__req_opcode);
    dpi_req_addr   <= __req_addr;
    dpi_req_value  <= __req_value;
  end

  assign __req_deq    = dpi8_t ' (dpi_req_deq);
  assign __resp_valid = dpi8_t ' (dpi_resp_valid);
  assign __resp_bits  = dpi_resp_bits;

  // evaluate DPI function
  always_ff @(posedge clock) begin
    if (reset | __reset) begin
      __req_valid  = 0;
      __req_opcode = 0;
      __req_addr   = 0;
      __req_value  = 0;
    end
    else begin
      VTAHostDPI(
        __req_valid,
        __req_opcode,
        __req_addr,
        __req_value,
        __req_deq,
__resp_valid,
        __resp_bits);
    end
  end

endmodule
--------------------------------------------------------------------------------
/src/main/scala/dnn_layers/ConvLayer.scala:
--------------------------------------------------------------------------------
package dnn_layers

import arbiters._
import chisel3._
import chisel3.util.{Decoupled, Valid}
import chisel3.{Module, UInt, printf, _}
import config.{CoreBundle, CoreParams, Parameters}
import control.BasicBlockNoMaskNode
import dnn.types.OperatorDot
import dnn.{DotIO, DotNode, ReduceNode}
import interfaces.{Call, CustomDataBundle, MemReq, MemResp}
import junctions.SplitCallNew
import memory.{ReadTypMemoryController, WriteTypMemoryController}
import node.{FXmatNxN, TypCompute, TypLoad, TypStore, matNxN}
//import javafx.scene.chart.PieChart.Data
import node.{HandShakingNPS, Shapes}

/* ================================================================== *
 *                 PRINTING PORTS DEFINITION                          *
 * ================================================================== */

class convLayerIO(implicit p: Parameters) extends CoreBundle {
  val in = Flipped(Decoupled(new Call(List(32, 32, 32))))
  val MemResp = Flipped(Valid(new MemResp))
  val MemReq = Decoupled(new MemReq)
  val out = Decoupled(new Call(List()))
}

/** Convolution layer dataflow: two typed loads feed a dot node, which feeds
  * a reduce node; a stack file services the load/store memory traffic. */
class convLayer(implicit val p: Parameters) extends Module with CoreParams {
  val io = IO(new convLayerIO())
  val shape = new matNxN(2, false)

  val StackFile = Module(new TypeStackFile(ID = 0, Size = 32, NReads = 2, NWrites = 1)
    (WControl = new WriteTypMemoryController(NumOps = 1, BaseSize = 2, NumEntries = 1))
    (RControl = new ReadTypMemoryController(NumOps = 2, BaseSize = 2, NumEntries = 2)))

  // External memory port unused in this layer.
  io.MemReq <> DontCare
  io.MemResp <> DontCare

  val InputSplitter = Module(new SplitCallNew(List(1, 1, 1)))
  InputSplitter.io.In <> io.in


  val conv_bb = Module(new BasicBlockNoMaskNode(NumInputs = 1, NumOuts = 5, BID = 0))

  val LoadA = Module(new TypLoad(NumPredOps = 0, NumSuccOps = 1, NumOuts = 1, ID = 0, RouteID = 0))
  val LoadB = Module(new TypLoad(NumPredOps = 0, NumSuccOps = 1, NumOuts = 1, ID = 0, RouteID = 1))
  val StoreType = Module(new TypStore(NumPredOps = 2, NumSuccOps = 0, NumOuts = 1, ID = 0, RouteID = 0))

  val dotNode = Module(new DotNode(NumOuts = 1, ID = 0, 4, "Mul")(shape))
  val reduceNode = Module(new ReduceNode(NumOuts = 1, ID = 1, false, "Add")(shape))

  conv_bb.io.predicateIn <> InputSplitter.io.Out.enable

  /* ================================================================== *
   *                       Enable signals                               *
   * ================================================================== */

  LoadA.io.enable <> conv_bb.io.Out(0)
  LoadB.io.enable <> conv_bb.io.Out(1)
  StoreType.io.enable <> conv_bb.io.Out(2)
  dotNode.io.enable <> conv_bb.io.Out(3)
  reduceNode.io.enable <> conv_bb.io.Out(4)

  // Memory wiring: two read ports for the loads, one write port for the store.
  StackFile.io.ReadIn(0) <> LoadA.io.memReq
  LoadA.io.memResp <> StackFile.io.ReadOut(0)

  StackFile.io.ReadIn(1) <> LoadB.io.memReq
  LoadB.io.memResp <> StackFile.io.ReadOut(1)

  StackFile.io.WriteIn(0) <> StoreType.io.memReq
  StoreType.io.memResp <> StackFile.io.WriteOut(0)

  dotNode.io.LeftIO <> LoadA.io.Out(0)
  dotNode.io.RightIO <> LoadB.io.Out(0)

  reduceNode.io.LeftIO <> dotNode.io.Out(0)

}
--------------------------------------------------------------------------------