├── .gitignore ├── .gitmodules ├── README.md ├── build.sbt └── src └── main └── scala ├── Generator.scala ├── TopLevelConfigs.scala ├── bank-rf.scala ├── bank.scala ├── configs.scala ├── consts.scala ├── dcc-fu.scala ├── dcc-mem.scala ├── dcc.scala ├── expander.scala ├── frontend.scala ├── hwacha.scala ├── instructions.scala ├── irq.scala ├── lane-ctrl.scala ├── lane.scala ├── mou.scala ├── mrt.scala ├── package.scala ├── rocc-unit.scala ├── scalar-decode.scala ├── scalar-fpu-interface.scala ├── scalar-fpu.scala ├── scalar-unit.scala ├── sequencer-lane.scala ├── sequencer-master.scala ├── smu.scala ├── types-vmu.scala ├── types-vxu.scala ├── util-confprec.scala ├── util.scala ├── vector-unit.scala ├── vfu-alu.scala ├── vfu-fcmp.scala ├── vfu-fconv.scala ├── vfu-fdiv.scala ├── vfu-fma.scala ├── vfu-idiv.scala ├── vfu-imul.scala ├── vfu-plu.scala ├── vfu-rfirst.scala ├── vfu-rpred.scala ├── vmu-addr.scala ├── vmu-memif.scala ├── vmu-pred.scala ├── vmu-sdata.scala ├── vmu-table.scala ├── vmu-tlb.scala ├── vmu-util.scala ├── vmu.scala ├── vru.scala ├── vxu.scala └── xcpt.scala /.gitignore: -------------------------------------------------------------------------------- 1 | verilog/ 2 | project/ 3 | target/ 4 | .*.swp 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucb-bar/hwacha/bf799dc48293cb5017ed2ec22c5023de8d461184/.gitmodules -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hwacha Vector-Thread Co-Processor Sources 2 | 3 | To use this co-processor, include this repo as a git submodule and add it 4 | to your chip's ``build.sbt`` as a Project, e.g. 5 | 6 | ``` 7 | lazy val hwacha = Project(file("hwacha"), "hwacha") 8 | .settings(buildSettings) 9 | .dependsOn(rocketchip) 10 | ``` 11 | 12 | Hwacha depends on the Rocket Chip project. Make sure the corresponding JARs are installed. 13 | For more information on how to use this co-processor, refer to Chipyard (https://github.com/ucb-bar/chipyard). 
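For a quick orientation, the top-level configs in ``src/main/scala/TopLevelConfigs.scala`` show how Hwacha's parameters are mixed into a Rocket Chip config. A minimal sketch (the class name ``MyHwachaConfig`` is hypothetical; it assumes Rocket Chip's ``DefaultConfig`` is on the classpath and mirrors the ``HwachaConfig`` definition in this repo):

```scala
// Minimal sketch: mix the default Hwacha parameters into Rocket Chip's DefaultConfig.
// DefaultHwachaConfig is defined in configs.scala; HwachaConfig in TopLevelConfigs.scala does the same thing.
import org.chipsalliance.cde.config.Config

class MyHwachaConfig extends Config(
  new hwacha.DefaultHwachaConfig ++
  new freechips.rocketchip.system.DefaultConfig)
```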
14 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | organization := "edu.berkeley.cs" 2 | 3 | version := "1.2" 4 | 5 | name := "hwacha" 6 | 7 | scalaVersion := "2.13.10" 8 | -------------------------------------------------------------------------------- /src/main/scala/Generator.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import scala.collection.mutable.LinkedHashSet 4 | import freechips.rocketchip.system._ 5 | import org.chipsalliance.cde.config._ 6 | 7 | class VectorAssemblyTestSuite(prefix: String, names: LinkedHashSet[String])(env: String) extends AssemblyTestSuite(prefix, names)(env + "-vec") 8 | class ScalarVectorAssemblyTestSuite(prefix: String, names: LinkedHashSet[String])(env: String) extends AssemblyTestSuite(prefix, names)(env + "-svec") 9 | 10 | object HwachaTestSuites { 11 | import freechips.rocketchip.system.DefaultTestSuites._ 12 | val rv64uvNames = LinkedHashSet( 13 | "wakeup", "fence", "keepcfg", 14 | "vmca", "vmcs", "vssd", "vssw", "vssh", "vssb", 15 | "vlsd", "vlsw", "vlswu", "vlsh", "vlshu", "vlsb", "vlsbu", 16 | "vsad", "vsaw", "vsah", "vsab", "vlad", "vlaw", "vlawu", "vlah", "vlahu", "vlab", "vlabu", 17 | "vld", "vlw", "vlwu", "vlh", "vlhu", "vlb", "vlbu", "vlxd", "vlxw", "vlxwu", "vlxh", "vlxhu", "vlxb", "vlxbu", 18 | "vsd", "vsw", "vsh", "vsb", "vsxd", "vsxw", "vsxh", "vsxb", 19 | "eidx", "imul", "fcvt", "fcvt_hs", "cmp", "fcmp", "vvadd_d", "vvadd_w", "vvadd_fd", "vvadd_fw", "vvmul_d", 20 | "overlap", "sched_sreg_xbar", "sched_fadd", "sched_waw", "sched_war", "pointer", "vcjal", "vfirst", "vfence", 21 | "vl_empty", "vs_empty", "vlx_empty", "vsx_empty", "vamo_empty", "eidx_empty") ++ 22 | (rv32uaNames -- Set("lrsc")) ++ (rv64uaNames -- Set("lrsc")) 23 | val rv64uvBasic = new AssemblyTestSuite("rv64uv", rv64uvNames)(_) 24 | 25 | val rv64uiVecNames = rv32uiNames ++ rv64uiNames -- Set("simple", "auipc", "lui", "fence_i", 26 | "beq", "bge", "bgeu", "blt", "bltu", "bne", "jal", "jalr", 27 | "lb", "lbu", "lh", "lhu", "lw", "lwu", "ld", "sb", "sh", "sw", "sd") 28 | val rv64uiVec = new VectorAssemblyTestSuite("rv64ui", rv64uiVecNames)(_) 29 | val rv64uiScalarVec = new ScalarVectorAssemblyTestSuite("rv64ui", rv64uiVecNames)(_) 30 | 31 | val rv64umVec = new VectorAssemblyTestSuite("rv64um", rv64umNames)(_) 32 | val rv64umScalarVec = new ScalarVectorAssemblyTestSuite("rv64um", rv64umNames)(_) 33 | 34 | val rv64ufVecNames = rv64ufNames -- Set("ldst", "move") 35 | val rv64ufVec = new VectorAssemblyTestSuite("rv64uf", rv64ufVecNames)(_) 36 | val rv64udVec = new VectorAssemblyTestSuite("rv64ud", rv64ufVecNames)(_) 37 | 38 | val rv64ufScalarVecNames = rv64ufVecNames -- Set("fdiv", "fcmp") // unsupported by current scalar unit 39 | val rv64ufScalarVec = new ScalarVectorAssemblyTestSuite("rv64uf", rv64ufScalarVecNames)(_) 40 | val rv64udScalarVec = new ScalarVectorAssemblyTestSuite("rv64ud", rv64ufScalarVecNames)(_) 41 | 42 | val rv64uv = List(rv64ufScalarVec, rv64ufVec, rv64udScalarVec, rv64udVec, rv64uiScalarVec, rv64uiVec, rv64umScalarVec, rv64umVec, rv64uvBasic) 43 | 44 | val rv64svNames = LinkedHashSet( 45 | "illegal_inst", "illegal_vt_inst", "illegal_vt_regid", "ma_utld", "ma_utsd", "ma_vld", "ma_vsd", "ma_vt_inst", "privileged_inst") 46 | val rv64svNamesV4 = rv64svNames -- Set( 47 | "illegal_inst", "illegal_vt_inst", "illegal_vt_regid", "ma_utld", "ma_utsd", "ma_vld", "ma_vsd", 
"ma_vt_inst", "privileged_inst") 48 | val rv64sv = new AssemblyTestSuite("rv64sv", rv64svNamesV4)(_) 49 | 50 | val hwachaBmarks = new BenchmarkTestSuite("hwacha", "$(RISCV)/riscv64-unknown-elf/share/riscv-tests/benchmarks", LinkedHashSet( 51 | "pb-spmv", "vec-daxpy", "vec-dgemm-opt", "vec-hsaxpy", "vec-hgemm-opt", "vec-hsgemm-opt", "vec-saxpy", "vec-sdaxpy", "vec-sdgemm-opt", "vec-sgemm-naive", "vec-sgemm-opt", "vec-vvadd")) 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/TopLevelConfigs.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE for license details. 2 | 3 | package hwacha 4 | 5 | import freechips.rocketchip._ 6 | import freechips.rocketchip.system._ 7 | import freechips.rocketchip.subsystem._ 8 | import freechips.rocketchip.rocket._ 9 | import hwacha._ 10 | import org.chipsalliance.cde.config._ 11 | 12 | class HwachaConfig extends Config(new DefaultHwachaConfig ++ new DefaultConfig) 13 | 14 | class EOS24Config extends Config(new WithNBanks(4) ++ new HwachaConfig) 15 | 16 | class WithNLanes(n: Int) extends Config((site, here, up) => { 17 | case HwachaNLanes => n 18 | }) 19 | 20 | class With32BtbEntires extends Config((site, here, up) => { 21 | case TilesLocated(InSubsystem) => up(TilesLocated(InSubsystem), site) map { 22 | case tp: RocketTileAttachParams => tp.copy(tileParams = tp.tileParams.copy( 23 | btb = tp.tileParams.btb.map(_.copy(nEntries = 32)))) 24 | } 25 | }) 26 | 27 | class Process28nmConfig extends Config((site, here, up) => { 28 | case TilesLocated(InSubsystem) => up(TilesLocated(InSubsystem), site) map { 29 | case tp: RocketTileAttachParams => tp.copy(tileParams = tp.tileParams.copy( 30 | core = tp.tileParams.core.copy( 31 | fpu = tp.tileParams.core.fpu.map(_.copy(sfmaLatency = 3, dfmaLatency = 4))))) 32 | } 33 | }) 34 | 35 | class WithoutConfPrec extends Config((site, here, up) => { 36 | case HwachaConfPrec => false 37 | }) 38 | 39 | class WithSmallPredRF extends Config((site, here, up) => { 40 | case HwachaNPredRFEntries => 128 41 | }) 42 | 43 | class ISCA2016Config extends Config( 44 | new Process28nmConfig ++ 45 | new WithNBanks(4) ++ 46 | new With32BtbEntires ++ new HwachaConfig) 47 | class FastISCA2016Config extends Config(new WithoutTLMonitors ++ new ISCA2016Config) 48 | 49 | class ISCA2016L2Config extends Config(new WithNLanes(2) ++ new ISCA2016Config) 50 | class ISCA2016L4Config extends Config(new WithNLanes(4) ++ new ISCA2016Config) 51 | 52 | class ISCA2016HOVB4Config extends Config(new WithNBanks(2) ++ new ISCA2016Config) 53 | class ISCA2016HOVB8Config extends Config(new ISCA2016Config) 54 | class ISCA2016LOVB4Config extends Config(new WithoutConfPrec ++ new ISCA2016HOVB4Config) 55 | class ISCA2016LOVB8Config extends Config(new WithoutConfPrec ++ new ISCA2016HOVB8Config) 56 | 57 | class ISCA2016HOVL2B4Config extends Config(new WithNLanes(2) ++ new ISCA2016HOVB4Config) 58 | class ISCA2016HOVL2B8Config extends Config(new WithNLanes(2) ++ new ISCA2016HOVB8Config) 59 | class ISCA2016LOVL2B4Config extends Config(new WithNLanes(2) ++ new ISCA2016LOVB4Config) 60 | class ISCA2016LOVL2B8Config extends Config(new WithNLanes(2) ++ new ISCA2016LOVB8Config) 61 | 62 | class ISCA2016HOVL4B4Config extends Config(new WithNLanes(4) ++ new ISCA2016HOVB4Config) 63 | class ISCA2016HOVL4B8Config extends Config(new WithNLanes(4) ++ new ISCA2016HOVB8Config) 64 | class ISCA2016LOVL4B4Config extends Config(new WithNLanes(4) ++ new ISCA2016LOVB4Config) 65 | class 
ISCA2016LOVL4B8Config extends Config(new WithNLanes(4) ++ new ISCA2016LOVB8Config) 66 | 67 | class DualCoreISCA2016L2Config extends Config(new WithNBigCores(2) ++ new WithNLanes(2) ++ new ISCA2016Config) 68 | 69 | class HurricaneSimilarConfig extends Config(new WithNLanes(2) ++ new WithNMemoryChannels(8) ++ new WithNBanks(1) ++ new ISCA2016Config) 70 | -------------------------------------------------------------------------------- /src/main/scala/bank-rf.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import DataGating._ 6 | 7 | class RFWritePort(implicit p: Parameters) extends VXUBundle()(p) with BankData with BankMask { 8 | val addr = UInt(width = log2Up(nSRAM)) 9 | } 10 | 11 | class BankRegfile(bid: Int)(implicit p: Parameters) extends VXUModule()(p) with PackLogic { 12 | val io = new Bundle { 13 | val lid = UInt(INPUT) 14 | val op = new BankOpIO().flip 15 | val global = new BankRWIO 16 | val local = new Bundle { 17 | val pdl = Vec(nLPDL, new BankPredEntry()).asOutput 18 | val opl = Vec(nLOPL, new BankDataEntry()).asOutput 19 | val ppred = new BankPredEntry().asOutput 20 | val rpred = Vec(nPredRPorts, new BankPredEntry()).asOutput 21 | val wpred = Vec(2, new BankPredMaskEntry()).asInput 22 | val wdata = new BankDataPredEntry().asInput 23 | } 24 | } 25 | val sram_rf = SeqMem(nSRAM, Vec(wBank/8, Bits(width = 8))) 26 | sram_rf.suggestName("HwSRAMRF") 27 | val ff_rf = Mem(nFF, Vec(wBank/8, Bits(width = 8))) 28 | val pred_rf = Mem(nPred, Vec(wPred, Bool())) 29 | 30 | val gopl = Mem(nGOPL, Vec(nSlices, Bits(width = regLen))) 31 | val lopl = Mem(nLOPL, Vec(nSlices, Bits(width = regLen))) 32 | val gpdl = Mem(nGPDL, Bits(width = wPred)) 33 | val lpdl = Mem(nLPDL, Bits(width = wPred)) 34 | 35 | def read_gpdl(addr: UInt) = new BankPredEntry().fromBits(gpdl(addr)) 36 | 37 | def toBytes(bits: UInt) = Vec.tabulate(wBank/8)(i => bits(8*(i+1)-1, 8*i)) 38 | def toDWords(bits: UInt) = Vec.tabulate(nSlices)(i => bits(regLen*(i+1)-1, regLen*i)) 39 | 40 | def pred_shift(idx: UInt) = if (confprec) Cat(idx, UInt(0, bSlices)) else UInt(0) 41 | def pred_read(op: ValidIO[PredRFReadOp]) = { 42 | val addr = dgate(op.valid, op.bits.addr) 43 | val idx = dgate(op.valid, op.bits.pack.idx) 44 | pred_rf(addr).asUInt >> pred_shift(idx) 45 | } 46 | 47 | // Predicate RF gated read port 48 | val pred_gated_op = IndexedSeq(io.op.pred.gread, io.op.pred.pread) 49 | val pred_gated_rdata_raw = pred_gated_op.map(op => pred_read(op)) 50 | val pred_gated_rdata = (pred_gated_op zip pred_gated_rdata_raw) map { case (op, rdata) => 51 | new BankPredEntry().fromBits( 52 | Mux(op.bits.off, op.bits.pred, op.bits.pred & Mux(op.bits.neg, ~rdata, rdata))) } 53 | val gpred = pred_gated_rdata(0) 54 | val ppred = pred_gated_rdata(1) 55 | val s1_gpred = RegEnable(gpred, io.op.pred.gread.valid) 56 | io.local.ppred := RegEnable(ppred, io.op.pred.pread.valid) 57 | 58 | // Predicate RF read port 59 | val pred_rdata = io.op.pred.read.map(op => pred_read(op)) 60 | (io.op.pred.read zip io.local.rpred zip pred_rdata) map { case ((op, rpred), rdata) => 61 | rpred := RegEnable(new BankPredEntry().fromBits(op.bits.pred & rdata), op.valid) } 62 | 63 | // Predicate RF write port 64 | when (io.op.pred.write.valid) { 65 | val waddr = io.op.pred.write.bits.addr 66 | val shift = pred_shift(io.op.pred.write.bits.pack.idx) 67 | val wpred = 68 | Mux(io.op.pred.write.bits.selg, io.global.wpred, 69 | Mux(io.op.pred.write.bits.plu, 
io.local.wpred(1), io.local.wpred(0))) 70 | val wdata_base = repack_pred(wpred.pred, io.op.pred.write.bits.rate) 71 | val wdata = (wdata_base << shift)(wPred-1, 0) 72 | val wmask_base = io.op.pred.write.bits.pred & wpred.mask 73 | val wmask = (wmask_base << shift)(wPred-1, 0) 74 | 75 | pred_rf.write(waddr, Vec(((wdata & wmask) | (pred_rf(waddr).asUInt & ~wmask)).asBools)) 76 | 77 | if (commit_log) { 78 | (0 until wPred) foreach { case i => 79 | when (wmask(i)) { 80 | printf("H: write_prf %d %d %d %d %d\n", io.lid, UInt(bid), waddr, UInt(i), wdata(i)) 81 | } 82 | } 83 | } 84 | } 85 | 86 | // SRAM RF read port 87 | val sram_raddr = io.op.sram.read.bits.addr 88 | val sram_rdata = sram_rf.read(sram_raddr, io.op.sram.read.valid && gpred.active()).asUInt 89 | val sram_rpack = unpack_bank(Reg(next = io.op.sram.read.bits), sram_rdata) 90 | 91 | // SRAM RF write port 92 | val sram_warb = Module(new Arbiter(new RFWritePort, 3)) 93 | sram_warb.suggestName("sram_warbInst") 94 | 95 | val sram_wdata = new BankDataPredEntry().fromBits( 96 | Mux(io.op.sram.write.bits.selg, 97 | MuxLookup(io.op.sram.write.bits.wsel, Bits(0), (0 until nWSel) map { 98 | i => UInt(i) -> io.global.wdata(i).asUInt }), 99 | io.local.wdata.asUInt)) 100 | val sram_wpack = repack_bank(io.op.sram.write.bits, sram_wdata) 101 | 102 | sram_warb.io.in(0).valid := io.op.sram.write.valid && sram_wdata.active() 103 | sram_warb.io.in(0).bits.addr := io.op.sram.write.bits.addr 104 | sram_warb.io.in(0).bits.data := sram_wpack.data 105 | sram_warb.io.in(0).bits.mask := sram_wpack.mask 106 | assert(!io.op.sram.write.valid || sram_warb.io.in(0).ready, "this sram write port should always be ready") 107 | 108 | sram_warb.io.in(1).valid := io.global.bwq.mem.valid && !io.global.bwq.mem.bits.selff 109 | sram_warb.io.in(1).bits.addr := io.global.bwq.mem.bits.saddr() 110 | sram_warb.io.in(1).bits.data := io.global.bwq.mem.bits.data 111 | sram_warb.io.in(1).bits.mask := io.global.bwq.mem.bits.mask 112 | 113 | sram_warb.io.in(2).valid := io.global.bwq.fu.valid && !io.global.bwq.fu.bits.selff 114 | sram_warb.io.in(2).bits.addr := io.global.bwq.fu.bits.saddr() 115 | sram_warb.io.in(2).bits.data := io.global.bwq.fu.bits.data 116 | sram_warb.io.in(2).bits.mask := io.global.bwq.fu.bits.mask 117 | 118 | sram_warb.io.out.ready := Bool(true) // can always write the register file 119 | when (sram_warb.io.out.valid) { 120 | val waddr = sram_warb.io.out.bits.addr 121 | val wdata = toBytes(sram_warb.io.out.bits.data) 122 | val wmask = sram_warb.io.out.bits.mask.asBools 123 | 124 | sram_rf.write(waddr, wdata, wmask) 125 | 126 | if (commit_log) { 127 | val wdata = toDWords(sram_warb.io.out.bits.data) // FIXME 128 | (0 until nSlices) foreach { case i => 129 | when (wmask(8*i)) { 130 | printf("H: write_vrf %d %d %d %d %x\n", io.lid, UInt(bid), waddr, UInt(i), wdata(i)) 131 | } 132 | } 133 | } 134 | } 135 | 136 | // FF RF read port 137 | val ff_raddr = io.op.ff.read map { op => dgate(op.valid && gpred.active(), op.bits.addr) } 138 | val ff_rdata = ff_raddr map { addr => ff_rf(addr).asUInt } 139 | 140 | // FF RF write port 141 | val ff_warb = Module(new Arbiter(new RFWritePort, 3)) 142 | ff_warb.suggestName("ff_warbInst") 143 | 144 | val ff_wdata = new BankDataPredEntry().fromBits( 145 | Mux(io.op.ff.write.bits.selg, 146 | MuxLookup(io.op.ff.write.bits.wsel, Bits(0), (0 until nWSel) map { 147 | i => UInt(i) -> io.global.wdata(i).asUInt }), 148 | io.local.wdata.asUInt)) 149 | 150 | ff_warb.io.in(0).valid := io.op.ff.write.valid && ff_wdata.active() 151 | 
ff_warb.io.in(0).bits.addr := io.op.ff.write.bits.addr 152 | ff_warb.io.in(0).bits.data := ff_wdata.data 153 | ff_warb.io.in(0).bits.mask := FillInterleaved(regLen/8, io.op.ff.write.bits.pred & ff_wdata.pred) 154 | assert(!io.op.ff.write.valid || ff_warb.io.in(0).ready, "this ff write port should always be ready") 155 | 156 | ff_warb.io.in(1).valid := io.global.bwq.mem.valid && io.global.bwq.mem.bits.selff 157 | ff_warb.io.in(1).bits.addr := io.global.bwq.mem.bits.faddr() 158 | ff_warb.io.in(1).bits.data := io.global.bwq.mem.bits.data 159 | ff_warb.io.in(1).bits.mask := io.global.bwq.mem.bits.mask 160 | 161 | ff_warb.io.in(2).valid := io.global.bwq.fu.valid && io.global.bwq.fu.bits.selff 162 | ff_warb.io.in(2).bits.addr := io.global.bwq.fu.bits.faddr() 163 | ff_warb.io.in(2).bits.data := io.global.bwq.fu.bits.data 164 | ff_warb.io.in(2).bits.mask := io.global.bwq.fu.bits.mask 165 | 166 | ff_warb.io.out.ready := Bool(true) // can always write the register file 167 | when (ff_warb.io.out.valid) { 168 | ff_rf.write( 169 | ff_warb.io.out.bits.addr, 170 | toBytes(ff_warb.io.out.bits.data), 171 | ff_warb.io.out.bits.mask.asBools) 172 | } 173 | 174 | // BWQ 175 | io.global.bwq.mem.ready := 176 | !io.global.bwq.mem.bits.selff && sram_warb.io.in(1).ready || 177 | io.global.bwq.mem.bits.selff && ff_warb.io.in(1).ready 178 | 179 | io.global.bwq.fu.ready := 180 | !io.global.bwq.fu.bits.selff && sram_warb.io.in(2).ready || 181 | io.global.bwq.fu.bits.selff && ff_warb.io.in(2).ready 182 | 183 | // Operand Latches (OPL) 184 | (0 until nGOPL) foreach { i => 185 | when (io.op.opl.global(i).valid && s1_gpred.active()) { 186 | gopl.write( 187 | UInt(i), 188 | toDWords(Mux(io.op.opl.global(i).bits.selff, ff_rdata(i % nFFRPorts), sram_rpack.data)), 189 | (io.op.opl.global(i).bits.pred & s1_gpred.pred)(1,0).asBools) 190 | } 191 | io.global.opl(i).data := 192 | dgate(io.op.xbar(i).valid && read_gpdl(io.op.xbar(i).bits.pdladdr).active(), gopl(i).asUInt) 193 | } 194 | (0 until nLOPL) foreach { i => 195 | when (io.op.opl.local(i).valid && s1_gpred.active()) { 196 | lopl.write( 197 | UInt(i), 198 | toDWords(Mux(io.op.opl.local(i).bits.selff, ff_rdata(i % nFFRPorts), sram_rpack.data)), 199 | (io.op.opl.local(i).bits.pred & s1_gpred.pred)(1,0).asBools) 200 | } 201 | io.local.opl(i).data := lopl(i).asUInt 202 | } 203 | 204 | // Predicate Latches (PDL) 205 | (0 until nGPDL) foreach { i => 206 | when (io.op.pdl.global(i).valid) { 207 | gpdl.write(UInt(i), s1_gpred.pred) 208 | } 209 | io.global.pdl(i).pred := dgate(io.op.pxbar(i).valid, gpdl(i)) 210 | } 211 | (0 until nLPDL) foreach { i => 212 | when (io.op.pdl.local(i).valid) { 213 | lpdl.write(UInt(i), s1_gpred.pred) 214 | } 215 | io.local.pdl(i).pred := lpdl(i) 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /src/main/scala/bank.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class BankOpIO(implicit p: Parameters) extends VXUBundle()(p) { 7 | val sram = new Bundle { 8 | val read = Valid(new SRAMRFReadMicroOp) 9 | val write = Valid(new SRAMRFWriteMicroOp) 10 | } 11 | val ff = new Bundle { 12 | val read = Vec(nFFRPorts, Valid(new FFRFReadMicroOp)) 13 | val write = Valid(new FFRFWriteMicroOp) 14 | } 15 | val pred = new Bundle { 16 | val gread = Valid(new PredRFGatedReadMicroOp) 17 | val pread = Valid(new PredRFGatedReadMicroOp) 18 | val read = Vec(nPredRPorts, Valid(new 
PredRFReadMicroOp)) 19 | val write = Valid(new PredRFWriteMicroOp) 20 | } 21 | val opl = new Bundle { 22 | val global = Vec(nGOPL, Valid(new OPLMicroOp)) 23 | val local = Vec(nLOPL, Valid(new OPLMicroOp)) 24 | } 25 | val pdl = new Bundle { 26 | val global = Vec(nGPDL, Valid(new PDLMicroOp)) 27 | val local = Vec(nLPDL, Valid(new PDLMicroOp)) 28 | } 29 | val sreg = Vec(nLOPL, Valid(new SRegMicroOp)) 30 | val xbar = Vec(nGOPL, Valid(new XBarMicroOp)) 31 | val pxbar = Vec(nGPDL, Valid(new PXBarMicroOp)) 32 | val viu = Valid(new VIUMicroOp) 33 | val vipu = Valid(new VIPUMicroOp) 34 | val vpu = Valid(new VPUMicroOp) 35 | val vsu = Valid(new VSUMicroOp) 36 | } 37 | 38 | class BPQIO(implicit p: Parameters) extends DecoupledIO(new BPQEntry()(p)) { 39 | } 40 | class BRQIO(implicit p: Parameters) extends DecoupledIO(new BRQEntry()(p)) { 41 | } 42 | class BWQIO(implicit p: Parameters) extends DecoupledIO(new BWQEntry()(p)) { 43 | } 44 | 45 | class BankRWIO(implicit p: Parameters) extends VXUBundle()(p) { 46 | val pdl = Vec(nGPDL, new BankPredEntry()).asOutput 47 | val opl = Vec(nGOPL, new BankDataEntry()).asOutput 48 | val wpred = new BankPredMaskEntry().asInput 49 | val wdata = Vec(nWSel, new BankDataPredEntry()).asInput 50 | 51 | val bpq = new BPQIO 52 | val brq = new BRQIO 53 | val bwq = new Bundle { 54 | val mem = new BWQIO().flip 55 | val fu = new BWQIO().flip 56 | } 57 | } 58 | 59 | class Bank(bid: Int)(implicit p: Parameters) extends VXUModule()(p) with Packing with RateLogic { 60 | val io = new Bundle { 61 | val lid = UInt(INPUT) 62 | val cfg = new HwachaConfigIO().flip 63 | val op = new BankOpIO().flip 64 | val ack = new Bundle { 65 | val viu = Valid(new VIUAck) 66 | val vipu = Valid(new VIPUAck) 67 | } 68 | val rw = new BankRWIO 69 | } 70 | 71 | val rf = Module(new BankRegfile(bid)) 72 | rf.suggestName("rfInst") 73 | 74 | rf.io.lid := io.lid 75 | rf.io.op <> io.op 76 | io.rw <> rf.io.global 77 | 78 | def valids(valid: Bool, pred: UInt, latency: Int) = 79 | ShiftRegister(Mux(valid, pred, Bits(0)), latency) 80 | 81 | // ALU 82 | val alu_pred = io.op.viu.bits.pred & rf.io.local.pdl(0).pred 83 | val alus = (0 until nSlices) map { i => 84 | val alu = Module(new ALUSlice(bid*nSlices+i)) 85 | alu.suggestName("aluInst") 86 | alu.io.cfg <> io.cfg 87 | alu.io.req.valid := io.op.viu.valid 88 | alu.io.req.bits.fn := io.op.viu.bits.fn 89 | alu.io.req.bits.eidx := io.op.viu.bits.eidx 90 | alu.io.req.bits.in0 := 91 | Mux(io.op.sreg(0).valid, splat_scalar(io.op.sreg(0).bits), 92 | unpack_slice(rf.io.local.opl(0).data, i)) 93 | alu.io.req.bits.in1 := 94 | Mux(io.op.sreg(1).valid, splat_scalar(io.op.sreg(1).bits), 95 | unpack_slice(rf.io.local.opl(1).data, i)) 96 | alu.io.req.bits.rate := io.op.viu.bits.rate 97 | alu.io.req.bits.pred := unpack_pred(alu_pred, i, alu.io.req.bits.rate) 98 | alu.io.resp 99 | } 100 | val alus_wpred = valids(io.op.viu.valid, alu_pred, stagesALU) 101 | 102 | // PLU: Predicate Logic Unit 103 | val plus = (0 until wPred) map { i => 104 | val plu = Module(new PLUSlice) 105 | plu.suggestName("pluInst") 106 | plu.io.req.valid := io.op.vipu.valid && io.op.vipu.bits.pred(i) 107 | plu.io.req.bits.fn := io.op.vipu.bits.fn 108 | plu.io.req.bits.in0 := rf.io.local.rpred(0).pred(i) 109 | plu.io.req.bits.in1 := rf.io.local.rpred(1).pred(i) 110 | plu.io.req.bits.in2 := rf.io.local.rpred(2).pred(i) 111 | plu.io.resp 112 | } 113 | val plus_wpred = valids(io.op.vipu.valid, io.op.vipu.bits.pred, stagesPLU) 114 | 115 | rf.io.local.wpred(0).pred := Cat(alus.map(_.bits.cmp).reverse) 116 | 
rf.io.local.wpred(0).mask := alus_wpred 117 | rf.io.local.wpred(1).pred := Cat(plus.map(_.bits.out).reverse) 118 | rf.io.local.wpred(1).mask := plus_wpred 119 | rf.io.local.wdata.data := repack_slice(alus.map(_.bits.out)) 120 | rf.io.local.wdata.pred := alus_wpred 121 | 122 | // BPQ 123 | io.rw.bpq.valid := io.op.vpu.valid 124 | io.rw.bpq.bits.pred := io.op.vpu.bits.pred & rf.io.local.ppred.pred 125 | 126 | assert(!io.op.vpu.valid || io.rw.bpq.ready, "bpq enabled when not ready; check bpq counters") 127 | 128 | // BRQ 129 | io.rw.brq.valid := io.op.vsu.valid && rf.io.local.pdl(1).active() 130 | io.rw.brq.bits.data := 131 | Mux(io.op.sreg(2).valid, splat_slice(io.op.sreg(2).bits.operand), 132 | rf.io.local.opl(2).data) 133 | 134 | assert(!io.op.vsu.valid || io.rw.brq.ready, "brq enabled when not ready; check brq counters") 135 | 136 | // ACK 137 | io.ack.viu.valid := alus.map(_.valid).reduce(_||_) 138 | io.ack.viu.bits.pred := alus_wpred 139 | } 140 | -------------------------------------------------------------------------------- /src/main/scala/configs.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import freechips.rocketchip._ 5 | import freechips.rocketchip.system._ 6 | import org.chipsalliance.cde.config._ 7 | import freechips.rocketchip.subsystem._ 8 | import freechips.rocketchip.diplomacy._ 9 | import freechips.rocketchip.rocket._ 10 | import freechips.rocketchip.tile._ 11 | import freechips.rocketchip.util._ 12 | 13 | 14 | class DefaultHwachaConfig extends Config((site, here, up) => { 15 | case HwachaIcacheKey => ICacheParams( 16 | nSets = 64, 17 | nWays = 1, 18 | rowBits = 1 * 64, 19 | nTLBWays = 8, 20 | fetchBytes = 8, // Fetch one 8 byte instruction 21 | latency = 1 22 | ) 23 | // Same as core's icache: NITLBEntries, NRAS, ECCCode, WordBits, Replacer 24 | 25 | case HwachaCommitLog => true 26 | 27 | // hwacha constants 28 | case HwachaNAddressRegs => 32 29 | case HwachaNScalarRegs => 64 30 | case HwachaNVectorRegs => 256 31 | case HwachaNPredRegs => 16 32 | case HwachaRegBits => math.max(log2Up(site(HwachaNVectorRegs)), log2Up(site(HwachaNScalarRegs))) 33 | case HwachaPredRegBits => log2Up(site(HwachaNPredRegs)) 34 | case HwachaRegLen => 64 35 | case HwachaMaxVLen => 36 | site(HwachaNBanks) * site(HwachaNSRAMRFEntries) * 37 | site(HwachaBankWidth) / site(HwachaRegLen) 38 | 39 | case HwachaNDTLB => 8 40 | case HwachaNPTLB => 4 41 | case HwachaLocalScalarFPU => false 42 | 43 | // Multi-lane constants 44 | case HwachaNLanes => 1 45 | 46 | // lane constants 47 | case HwachaBankWidth => 128 48 | case HwachaNBanks => 4 49 | case HwachaNSRAMRFEntries => 256 50 | case HwachaNFFRFEntries => 16 51 | case HwachaNFFRFReadPorts => 3 52 | case HwachaNPredRFEntries => 256 53 | case HwachaNPredRFReadPorts => 3 54 | case HwachaNOperandLatches => 6 55 | case HwachaNPredLatches => 4 56 | case HwachaWriteSelects => 2 57 | case HwachaRFAddrBits => math.max(log2Up(site(HwachaNSRAMRFEntries)), log2Up(site(HwachaNFFRFEntries))) 58 | case HwachaPRFAddrBits => log2Up(site(HwachaNPredRFEntries)) 59 | 60 | case HwachaStagesALU => 1 61 | case HwachaStagesPLU => 0 62 | case HwachaStagesIMul => 3 63 | case HwachaStagesDFMA => 4 64 | case HwachaStagesSFMA => 3 65 | case HwachaStagesHFMA => 3 66 | case HwachaStagesFConv => 2 67 | case HwachaStagesFCmp => 1 68 | 69 | case HwachaNSeqEntries => 8 70 | 71 | case HwachaNVVAQEntries => 4 72 | case HwachaNVPAQEntries => 24 73 | case HwachaNVSDQEntries => 4 74 | case 
HwachaNVLDQEntries => 4 75 | case HwachaNVMTEntries => 64 76 | 77 | case HwachaNSMUEntries => 16 78 | case HwachaBuildVRU => true 79 | 80 | case BuildRoCC => up(BuildRoCC) ++ Seq( 81 | (p: Parameters) => { 82 | val hwacha = LazyModule.apply(new Hwacha()(p)) 83 | hwacha 84 | } 85 | ) 86 | 87 | case HwachaConfPrec => false 88 | case HwachaVRUMaxOutstandingPrefetches => 20 89 | case HwachaVRUEarlyIgnore => 1 90 | case HwachaVRUMaxRunaheadBytes => 16777216 91 | case HwachaCMDQLen => 32 92 | case HwachaVSETVLCompress => true 93 | } 94 | ) 95 | -------------------------------------------------------------------------------- /src/main/scala/consts.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | 5 | object HwachaConstants extends HwachaConstants 6 | trait HwachaConstants 7 | extends MachineConstants 8 | with PrecisionConstants 9 | with HwachaDecodeConstants 10 | with DecodeConstants 11 | with VIUConstants 12 | with VIPUConstants 13 | with VIMUConstants 14 | with VIDUConstants 15 | with VFMUConstants 16 | with VFDUConstants 17 | with VFCUConstants 18 | with VFVUConstants 19 | with VRPUConstants 20 | with VMUConstants 21 | with SMUConstants 22 | 23 | trait MachineConstants { 24 | val SZ_D = 64 25 | val SZ_W = 32 26 | val SZ_H = 16 27 | val SZ_B = 8 28 | 29 | val HwachaElementInstBytes = HwachaElementInstructions.VSTOP.getWidth/8 30 | } 31 | 32 | trait PrecisionConstants { 33 | val SZ_PREC = 2 34 | 35 | val PREC_X = BitPat("b??") 36 | val PREC_D = UInt(0, SZ_PREC) 37 | val PREC_W = UInt(1, SZ_PREC) 38 | val PREC_H = UInt(2, SZ_PREC) 39 | } 40 | 41 | trait HwachaDecodeConstants { 42 | val VRT_X = BitPat("b?") 43 | val VRT_S = UInt(0, 1) 44 | val VRT_A = UInt(1, 1) 45 | 46 | val VR_X = BitPat("b?") 47 | val VR_RS1 = UInt(0, 1) 48 | val VR_RD = UInt(1, 1) 49 | 50 | val RIMM_X = BitPat("b???") 51 | val RIMM_VLEN = UInt(0,3) 52 | val RIMM_RS1 = UInt(1,3) 53 | val RIMM_RS2 = UInt(2,3) 54 | val RIMM_ADDR = UInt(3,3) 55 | 56 | val RESP_X = BitPat("b???") 57 | val RESP_NVL = UInt(0,3) 58 | val RESP_CAUSE = UInt(1,3) 59 | val RESP_AUX = UInt(2,3) 60 | val RESP_CFG = UInt(3,3) 61 | val RESP_VL = UInt(4,3) 62 | 63 | } 64 | 65 | trait DecodeConstants { 66 | val Y = BitPat("b1") 67 | val N = BitPat("b0") 68 | val X = BitPat("b?") 69 | 70 | val M0 = UInt(0, 2) 71 | val MR = UInt(1, 2) 72 | val ML = UInt(2, 2) 73 | val MI = UInt(3, 2) 74 | 75 | val RX = BitPat("b??") 76 | val RS = UInt(0, 2) 77 | val RA = UInt(1, 2) 78 | val RP = UInt(2, 2) 79 | val RV = UInt(3, 2) 80 | 81 | val REG_SHR = UInt(0,2) 82 | val REG_ADDR = UInt(1,2) 83 | val REG_PRED = UInt(2,2) 84 | val REG_VEC = UInt(3,2) 85 | 86 | def reg_type(t: Bits, d: Bool, i: Bool) = Mux(d, Mux(i, REG_VEC, REG_SHR), t) 87 | 88 | val SZ_I = 2 89 | val IMM_X = BitPat("b??") 90 | val IMM_I = UInt(0, SZ_I) 91 | val IMM_L = UInt(1, SZ_I) 92 | val IMM_U = UInt(2, SZ_I) 93 | 94 | val MT_SZ = 3 95 | val MT_X = BitPat("b???") 96 | val MT_B = UInt("b000") 97 | val MT_H = UInt("b001") 98 | val MT_W = UInt("b010") 99 | val MT_D = UInt("b011") 100 | val MT_BU = UInt("b100") 101 | val MT_HU = UInt("b101") 102 | val MT_WU = UInt("b110") 103 | 104 | val DW__ = BitPat("b?") 105 | val DW32 = UInt(0, 1) 106 | val DW64 = UInt(1, 1) 107 | 108 | val FP_ = BitPat("b??") 109 | val FPS = UInt(0, 2) 110 | val FPD = UInt(1, 2) 111 | val FPH = UInt(2, 2) 112 | 113 | val SZ_BMUXSEL = 2 114 | val SZ_DW = 1 115 | val SZ_FP = 2 116 | 117 | val A1_X = BitPat("b??") 118 | val A1_ZERO = UInt(0, 2) 119 | val 
A1_RS1 = UInt(1, 2) 120 | val A1_PC = UInt(2, 2) 121 | 122 | val A2_X = BitPat("b??") 123 | val A2_8 = UInt(0, 2) 124 | val A2_RS2 = UInt(1, 2) 125 | val A2_IMM = UInt(2, 2) 126 | 127 | //riscv-opcode fields 128 | val OPC_VD = UInt(63) 129 | val OPC_VS1 = UInt(62) 130 | val OPC_VS2 = UInt(61) 131 | val OPC_VS3 = UInt(60) 132 | val OPC_NEG = UInt(32) 133 | } 134 | 135 | trait VIUConstants { 136 | val SZ_VIU_OP = 5 137 | 138 | val I_X = BitPat("b?????") 139 | val I_ADD = UInt(0, SZ_VIU_OP) 140 | val I_ADDU = UInt(1, SZ_VIU_OP) 141 | val I_SLL = UInt(2, SZ_VIU_OP) 142 | val I_SLT = UInt(3, SZ_VIU_OP) 143 | val I_SLTU = UInt(4, SZ_VIU_OP) 144 | val I_XOR = UInt(5, SZ_VIU_OP) 145 | val I_SRL = UInt(6, SZ_VIU_OP) 146 | val I_SRA = UInt(7, SZ_VIU_OP) 147 | val I_OR = UInt(8, SZ_VIU_OP) 148 | val I_AND = UInt(9, SZ_VIU_OP) 149 | val I_SUB = UInt(10, SZ_VIU_OP) 150 | val I_IDX = UInt(11, SZ_VIU_OP) 151 | val I_MOV0 = UInt(12, SZ_VIU_OP) 152 | val I_FSJ = UInt(13, SZ_VIU_OP) 153 | val I_FSJN = UInt(14, SZ_VIU_OP) 154 | val I_FSJX = UInt(15, SZ_VIU_OP) 155 | val I_CEQ = UInt(16, SZ_VIU_OP) 156 | val I_CLT = UInt(17, SZ_VIU_OP) 157 | val I_CLTU = UInt(18, SZ_VIU_OP) 158 | } 159 | 160 | trait VIPUConstants { 161 | val SZ_VIPU_OP = 8 162 | } 163 | 164 | trait VIMUConstants { 165 | val SZ_VIMU_OP = 2 166 | 167 | val IM_X = BitPat("b??") 168 | val IM_M = UInt(0, SZ_VIMU_OP) 169 | val IM_MH = UInt(1, SZ_VIMU_OP) 170 | val IM_MHSU = UInt(2, SZ_VIMU_OP) 171 | val IM_MHU = UInt(3, SZ_VIMU_OP) 172 | } 173 | 174 | trait VIDUConstants { 175 | val SZ_VIDU_OP = 2 176 | 177 | val ID_X = BitPat("b??") 178 | val ID_DIV = UInt(0, SZ_VIDU_OP) 179 | val ID_DIVU = UInt(1, SZ_VIDU_OP) 180 | val ID_REM = UInt(2, SZ_VIDU_OP) 181 | val ID_REMU = UInt(3, SZ_VIDU_OP) 182 | } 183 | 184 | trait VFMUConstants { 185 | val SZ_VFMU_OP = 3 186 | 187 | val FM_X = BitPat("b???") 188 | val FM_ADD = UInt(0, SZ_VFMU_OP) 189 | val FM_SUB = UInt(1, SZ_VFMU_OP) 190 | val FM_MUL = UInt(2, SZ_VFMU_OP) 191 | val FM_MADD = UInt(4, SZ_VFMU_OP) 192 | val FM_MSUB = UInt(5, SZ_VFMU_OP) 193 | val FM_NMSUB = UInt(6, SZ_VFMU_OP) 194 | val FM_NMADD = UInt(7, SZ_VFMU_OP) 195 | 196 | val IS_FM_OP_MA = (x: Bits) => x(2) 197 | } 198 | 199 | trait VFDUConstants { 200 | val SZ_VFDU_OP = 1 201 | 202 | val FD_X = BitPat("b?") 203 | val FD_DIV = UInt(0, SZ_VFDU_OP) 204 | val FD_SQRT = UInt(1, SZ_VFDU_OP) 205 | } 206 | 207 | trait VFCUConstants { 208 | val SZ_VFCU_OP = 3 209 | 210 | val FC_X = BitPat("b???") 211 | val FC_CEQ = UInt(0, SZ_VFCU_OP) 212 | val FC_CLT = UInt(1, SZ_VFCU_OP) 213 | val FC_CLE = UInt(2, SZ_VFCU_OP) 214 | val FC_MIN = UInt(3, SZ_VFCU_OP) 215 | val FC_MAX = UInt(4, SZ_VFCU_OP) 216 | val FC_CLASS = UInt(5, SZ_VFCU_OP) 217 | } 218 | 219 | trait VFVUConstants { 220 | val SZ_VFVU_OP = 4 221 | 222 | val FV_X = BitPat("b????") 223 | val FV_CLTF = UInt(0, SZ_VFVU_OP) 224 | val FV_CLUTF = UInt(1, SZ_VFVU_OP) 225 | val FV_CWTF = UInt(2, SZ_VFVU_OP) 226 | val FV_CWUTF = UInt(3, SZ_VFVU_OP) 227 | val FV_CFTL = UInt(4, SZ_VFVU_OP) 228 | val FV_CFTLU = UInt(5, SZ_VFVU_OP) 229 | val FV_CFTW = UInt(6, SZ_VFVU_OP) 230 | val FV_CFTWU = UInt(7, SZ_VFVU_OP) 231 | val FV_CDTS = UInt(8, SZ_VFVU_OP) 232 | val FV_CDTH = UInt(9, SZ_VFVU_OP) 233 | val FV_CSTD = UInt(10, SZ_VFVU_OP) 234 | val FV_CSTH = UInt(11, SZ_VFVU_OP) 235 | val FV_CHTD = UInt(12, SZ_VFVU_OP) 236 | val FV_CHTS = UInt(13, SZ_VFVU_OP) 237 | } 238 | 239 | trait VRPUConstants { 240 | val SZ_VRPU_OP = 2 241 | 242 | val FR_X = BitPat("b??") 243 | val FR_ALL = UInt(0, SZ_VRPU_OP) 244 | val FR_ANY = 
UInt(1, SZ_VRPU_OP) 245 | } 246 | 247 | trait VMUConstants { 248 | val SZ_VMU_MODE = 2 249 | 250 | val VM_X = BitPat("b??") 251 | val VM_U = UInt(0, SZ_VMU_MODE) // unit-stride 252 | val VM_S = UInt(1, SZ_VMU_MODE) // constant-stride 253 | val VM_I = UInt(2, SZ_VMU_MODE) // indexed 254 | 255 | def vmu_unit(mode: UInt): Bool = (mode === VM_U).suggestName("vmu_unitWire") 256 | def vmu_indexed(mode: UInt): Bool = mode(1).suggestName("vmu_indexedWire") 257 | } 258 | 259 | trait SMUConstants { 260 | val SZ_SMU_CMD = 1 261 | 262 | val SM_X = BitPat("b?") 263 | val SM_L = UInt(0, SZ_SMU_CMD) 264 | val SM_S = UInt(1, SZ_SMU_CMD) 265 | } 266 | 267 | object Commands extends Commands 268 | trait Commands { 269 | // command bits for the vector command queue 270 | val CMD_X = BitPat("b???") 271 | 272 | val CMD_VSETCFG = UInt(0,3) 273 | val CMD_VSETVL = UInt(1,3) 274 | val CMD_VF = UInt(2,3) 275 | val CMD_VFT = UInt(3,3) 276 | val CMD_VMCA = UInt(4,3) 277 | val CMD_VMCS = UInt(5,3) 278 | } 279 | -------------------------------------------------------------------------------- /src/main/scala/dcc-fu.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import DataGating._ 6 | 7 | class VDUTag(implicit p: Parameters) extends VXUBundle()(p) 8 | with BankPred with BankPack { 9 | val bank = UInt(width = bBanks) 10 | val selff = Bool() // select ff if true 11 | val addr = UInt(width = math.max(log2Up(nSRAM), log2Up(nFF))) 12 | val fusel = Bits(width = 1) // because we have 2 units idiv/fdiv 13 | } 14 | 15 | class ReduceResultIO(implicit p: Parameters) extends VXUBundle()(p) { 16 | val pred = Decoupled(new RPredResult) 17 | val first = Decoupled(new RFirstResult) 18 | } 19 | 20 | class VDU(implicit p: Parameters) extends VXUModule()(p) { 21 | val io = new DCCIssueIO { 22 | val cfg = new HwachaConfigIO().flip 23 | val ack = new DCCAckIO 24 | val pla = new CounterLookAheadIO().flip // lpq entry 25 | val qla = Vec(nVDUOperands, new CounterLookAheadIO).flip // lrq entries 26 | val ila = new CounterLookAheadIO().flip // idiv output entries 27 | val fla = new CounterLookAheadIO().flip // fdiv output entries 28 | val lpq = new LPQIO().flip 29 | val lrqs = Vec(nVDUOperands, new LRQIO).flip 30 | val bwqs = Vec(nBanks, new BWQIO) 31 | val red = new ReduceResultIO 32 | } 33 | 34 | val ctrl = Module(new VDUCtrl) 35 | ctrl.suggestName("ctrlInst") 36 | 37 | ctrl.io.op <> io.op 38 | ctrl.io.ila <> io.ila 39 | ctrl.io.fla <> io.fla 40 | ctrl.io.cfg <> io.cfg 41 | 42 | val lpq = Module(new Queue(new LPQEntry, nBanks+2)) 43 | lpq.suggestName("lpqInst") 44 | lpq.io.enq <> io.lpq 45 | ctrl.io.lpq <> lpq.io.deq 46 | 47 | val pcntr = Module(new LookAheadCounter(nBanks+2, nBanks+2)) 48 | pcntr.suggestName("pcntrInst") 49 | pcntr.io.inc.cnt := UInt(1) 50 | pcntr.io.inc.update := lpq.io.deq.fire 51 | pcntr.io.dec <> io.pla 52 | 53 | for (i <- 0 until nVDUOperands) { 54 | val lrq = Module(new Queue(new LRQEntry, nBanks+2)) 55 | lrq.suggestName("lrqInst") 56 | lrq.io.enq <> io.lrqs(i) 57 | ctrl.io.lrqs.q(i) <> lrq.io.deq 58 | 59 | val cntr = Module(new LookAheadCounter(nBanks+2, nBanks+2)) 60 | cntr.suggestName("cntrInst") 61 | cntr.io.inc.cnt := UInt(1) 62 | cntr.io.inc.update := ctrl.io.lrqs.update(i) 63 | cntr.io.dec <> io.qla(i) 64 | } 65 | 66 | for (i <- 0 until nSlices) { 67 | val idiv = Module(new IDivSlice) 68 | idiv.suggestName("idivInst") 69 | val fdiv = Module(new FDivSlice) 70 | 
fdiv.suggestName("fdivInst") 71 | idiv.io <> ctrl.io.idiv.fus(i) 72 | fdiv.io <> ctrl.io.fdiv.fus(i) 73 | } 74 | 75 | val rpred = Module(new RPredLane) 76 | rpred.suggestName("rpredInst") 77 | rpred.io <> ctrl.io.rpred.fu 78 | io.red.pred.bits.cond := rpred.io.result.bits.cond 79 | io.red.pred.valid := ctrl.io.rpred.result.valid 80 | ctrl.io.rpred.result.ready := io.red.pred.ready 81 | 82 | val rfirst = Module(new RFirstLane) 83 | rfirst.suggestName("rfirstInst") 84 | rfirst.io <> ctrl.io.rfirst.fu 85 | io.red.first.bits <> rfirst.io.result.bits 86 | io.red.first.valid := ctrl.io.rfirst.result.valid 87 | ctrl.io.rfirst.result.ready := io.red.first.ready 88 | 89 | io.ack.vidu <> ctrl.io.idiv.ack 90 | io.ack.vfdu <> ctrl.io.fdiv.ack 91 | io.bwqs <> ctrl.io.bwqs 92 | } 93 | 94 | class VDUCtrl(implicit p: Parameters) extends VXUModule()(p) with PackLogic { 95 | val io = new DCCIssueIO { 96 | val cfg = new HwachaConfigIO().flip 97 | val ila = new CounterLookAheadIO().flip 98 | val fla = new CounterLookAheadIO().flip 99 | val lpq = new LPQIO().flip 100 | val lrqs = new Bundle { 101 | val q = Vec(nVDUOperands, new LRQIO).flip 102 | val update = Vec(nVDUOperands, Bool(OUTPUT)) 103 | } 104 | 105 | val idiv = new Bundle { 106 | val fus = Vec(nSlices, new IDivIO) 107 | val ack = Valid(new VIDUAck) 108 | } 109 | val fdiv = new Bundle { 110 | val fus = Vec(nSlices, new FDivIO) 111 | val ack = Valid(new VFDUAck) 112 | } 113 | val rpred = new Bundle { 114 | val fu = new RPredIO 115 | val result = Decoupled(new RPredResult) 116 | } 117 | val rfirst = new Bundle { 118 | val fu = new RFirstIO 119 | val result = Decoupled(new RFirstResult) 120 | } 121 | 122 | val bwqs = Vec(nBanks, new BWQIO) 123 | } 124 | 125 | val opq = Module(new Queue(new DCCOp, nDCCOpQ)) 126 | opq.suggestName("opqInst") 127 | opq.io.enq <> io.op 128 | 129 | val s_idle :: s_busy :: s_wait :: Nil = Enum(UInt(), 3) 130 | val state = Reg(init = s_idle) 131 | val op = Reg(new DCCOp) 132 | 133 | val slice_idx = Reg(UInt(width = bfLStrip - bSlices)) 134 | val strip_idx = Reg(UInt(width = bVLen - bStrip)) 135 | val slice_idx_next = slice_idx + UInt(1) 136 | val lstrip = io.cfg.lstrip >> UInt(bSlices) 137 | val bank = slice_idx(bBanks-1, 0) 138 | 139 | val pack = Wire(new PackInfo) 140 | val (vd_update, vd_stride) = if (confprec) { 141 | pack.prec := op.vd.prec 142 | pack.idx := slice_idx >> UInt(bBanks) 143 | confprec_step(pack.prec, slice_idx_next >> UInt(bBanks), io.cfg) 144 | } else { 145 | pack.prec := PREC_D 146 | pack.idx := UInt(0) 147 | (Bool(true), io.cfg.vstride.d) 148 | } 149 | 150 | val fire = Wire(Bool()) 151 | val fire_div = Wire(Bool()) 152 | val fire_first = Wire(Bool()) 153 | val fire_reduce = Wire(Bool()) 154 | val ecnt = Mux(op.vlen > UInt(nSlices), UInt(nSlices), op.vlen(bSlices, 0)) 155 | val vlen_next = op.vlen - ecnt 156 | val pred = Vec((0 until nSlices).map(UInt(_) < ecnt)).asUInt 157 | val idiv_active = op.active.vidiv 158 | 159 | opq.io.deq.ready := Bool(false) 160 | io.rpred.result.valid := Bool(false) 161 | io.rfirst.result.valid := Bool(false) 162 | 163 | switch (state) { 164 | is (s_idle) { 165 | opq.io.deq.ready := Bool(true) 166 | when (opq.io.deq.valid) { 167 | state := s_busy 168 | op := opq.io.deq.bits 169 | slice_idx := UInt(0) 170 | strip_idx := UInt(0) 171 | } 172 | } 173 | is (s_busy) { 174 | when (fire) { 175 | op.vlen := vlen_next 176 | when (vlen_next === UInt(0)) { 177 | state := Mux(fire_reduce, s_wait, s_idle) 178 | } 179 | } 180 | when (fire_div) { 181 | slice_idx := slice_idx_next 182 | when 
(vd_update && (bank === UInt(nBanks-1))) { 183 | op.vd.id := op.vd.id + vd_stride 184 | } 185 | } 186 | when (fire_first) { 187 | slice_idx := slice_idx_next 188 | when (slice_idx_next === lstrip) { 189 | slice_idx := UInt(0) 190 | strip_idx := strip_idx + UInt(1) 191 | } 192 | } 193 | } 194 | is (s_wait) { 195 | io.rpred.result.valid := op.active.vrpred 196 | io.rfirst.result.valid := op.active.vrfirst 197 | when (io.rpred.result.fire || io.rfirst.result.fire) { 198 | state := s_idle 199 | } 200 | } 201 | } 202 | 203 | val tagq = Module(new Queue(new VDUTag, nDecoupledUnitWBQueue)) 204 | tagq.suggestName("tagqInst") 205 | 206 | val active_entry = io.lpq.bits.active() 207 | val mask_lrq0_valid = !active_entry || io.lrqs.q(0).valid 208 | val mask_lrq1_valid = !active_entry || io.lrqs.q(1).valid 209 | val deq_fdiv_lrq1 = op.fn.vfdu().op_is(FD_DIV) 210 | val mask_fdiv_lrq1_valid = !deq_fdiv_lrq1 || mask_lrq1_valid 211 | val enq_idivs_req = (0 until nSlices).map { i => pred(i) && io.lpq.bits.pred(i) } 212 | val enq_fdivs_req = (0 until nSlices).map { i => pred(i) && io.lpq.bits.pred(i) } 213 | val mask_idivs_req_ready = io.idiv.fus.zipWithIndex.map { case (idiv, i) => 214 | !enq_idivs_req(i) || idiv.req.ready } 215 | val mask_fdivs_req_ready = io.fdiv.fus.zipWithIndex.map { case (fdiv, i) => 216 | !enq_fdivs_req(i) || fdiv.req.ready } 217 | 218 | def fire_idiv(exclude: Bool, include: Bool*) = { 219 | val rvs = Seq( 220 | state === s_busy, op.active.vidiv, 221 | io.lpq.valid, mask_lrq0_valid, mask_lrq1_valid, 222 | tagq.io.enq.ready) ++ mask_idivs_req_ready 223 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 224 | } 225 | 226 | def fire_fdiv(exclude: Bool, include: Bool*) = { 227 | val rvs = Seq( 228 | state === s_busy, op.active.vfdiv, 229 | io.lpq.valid, mask_lrq0_valid, mask_fdiv_lrq1_valid, 230 | tagq.io.enq.ready) ++ mask_fdivs_req_ready 231 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 232 | } 233 | 234 | def fire_rpred(exclude: Bool, include: Bool*) = { 235 | val rvs = Seq( 236 | state === s_busy, op.active.vrpred, 237 | io.lpq.valid, io.rpred.fu.req.ready) 238 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 239 | } 240 | 241 | def fire_rfirst(exclude: Bool, include: Bool*) = { 242 | val rvs = Seq( 243 | state === s_busy, op.active.vrfirst, 244 | io.lpq.valid, mask_lrq0_valid, io.rfirst.fu.req.ready) 245 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 246 | } 247 | 248 | fire_div := fire_idiv(null) || fire_fdiv(null) 249 | fire_first := fire_rfirst(null) 250 | fire_reduce := fire_rpred(null) || fire_first 251 | fire := fire_div || fire_reduce 252 | 253 | io.lpq.ready := 254 | fire_idiv(io.lpq.valid) || fire_fdiv(io.lpq.valid) || 255 | fire_rpred(io.lpq.valid) || fire_rfirst(io.lpq.valid) 256 | io.lrqs.q(0).ready := 257 | fire_idiv(mask_lrq0_valid, active_entry) || fire_fdiv(mask_lrq0_valid, active_entry) || 258 | fire_rfirst(mask_lrq0_valid, active_entry) 259 | io.lrqs.q(1).ready := 260 | fire_idiv(mask_lrq1_valid, active_entry) || 261 | fire_fdiv(mask_fdiv_lrq1_valid, deq_fdiv_lrq1, active_entry) 262 | io.lrqs.update(0) := fire_idiv(null) || fire_fdiv(null) || fire_rfirst(null) 263 | io.lrqs.update(1) := fire_idiv(null) || fire_fdiv(null, deq_fdiv_lrq1) 264 | 265 | tagq.io.enq.valid := fire_idiv(tagq.io.enq.ready) || fire_fdiv(tagq.io.enq.ready) 266 | tagq.io.enq.bits.pred := pred & io.lpq.bits.pred 267 | tagq.io.enq.bits.bank := bank 268 | tagq.io.enq.bits.selff := Bool(false) // FIXME 269 | tagq.io.enq.bits.addr := op.vd.id 270 | tagq.io.enq.bits.fusel := 
op.active.vidiv 271 | tagq.io.enq.bits.pack := pack 272 | 273 | io.idiv.fus.zipWithIndex.map { case (idiv, i) => 274 | idiv.req.valid := fire_idiv(mask_idivs_req_ready(i), enq_idivs_req(i)) } 275 | io.idiv.fus.zipWithIndex.map { case (idiv, i) => 276 | idiv.req.bits.fn := op.fn.vidu() 277 | idiv.req.bits.in0 := unpack_slice(io.lrqs.q(0).bits.data, i) 278 | idiv.req.bits.in1 := unpack_slice(io.lrqs.q(1).bits.data, i) 279 | } 280 | 281 | io.fdiv.fus.zipWithIndex.map { case (fdiv, i) => 282 | fdiv.req.valid := fire_fdiv(mask_fdivs_req_ready(i), enq_fdivs_req(i)) } 283 | io.fdiv.fus.zipWithIndex.map { case (fdiv, i) => 284 | fdiv.req.bits.fn := op.fn.vfdu() 285 | fdiv.req.bits.in0 := unpack_slice(io.lrqs.q(0).bits.data, i) 286 | fdiv.req.bits.in1 := unpack_slice(io.lrqs.q(1).bits.data, i) 287 | } 288 | 289 | io.rpred.fu.op.valid := opq.io.deq.fire && opq.io.deq.bits.active.vrpred 290 | io.rpred.fu.op.bits := opq.io.deq.bits.fn.vrpu() 291 | io.rpred.fu.req.valid := fire_rpred(io.rpred.fu.req.ready) 292 | io.rpred.fu.req.bits.active := pred 293 | io.rpred.fu.req.bits.pred := io.lpq.bits.pred 294 | 295 | io.rfirst.fu.op.valid := opq.io.deq.fire && opq.io.deq.bits.active.vrfirst 296 | io.rfirst.fu.op.bits := opq.io.deq.bits.fn.vrfu() 297 | io.rfirst.fu.req.valid := fire_rfirst(io.rfirst.fu.req.ready) 298 | io.rfirst.fu.req.bits.active := pred 299 | io.rfirst.fu.req.bits.pred := io.lpq.bits.pred 300 | io.rfirst.fu.req.bits.lsidx := strip_idx 301 | io.rfirst.fu.req.bits.in := Vec((0 until nSlices) map { unpack_slice(io.lrqs.q(0).bits.data, _) }) 302 | 303 | val deq_idivs_resp = (0 until nSlices).map { tagq.io.deq.bits.fusel.asBool && tagq.io.deq.bits.pred(_) } 304 | val deq_fdivs_resp = (0 until nSlices).map { !tagq.io.deq.bits.fusel.asBool && tagq.io.deq.bits.pred(_) } 305 | val mask_idivs_resp_valid = io.idiv.fus.zipWithIndex.map { case (idiv, i) => 306 | !deq_idivs_resp(i) || idiv.resp.valid } 307 | val mask_fdivs_resp_valid = io.fdiv.fus.zipWithIndex.map { case (fdiv, i) => 308 | !deq_fdivs_resp(i) || fdiv.resp.valid } 309 | 310 | val enq_bwqs = (0 until nBanks).map { tagq.io.deq.bits.active() && tagq.io.deq.bits.bank === UInt(_) } 311 | val mask_bwqs_ready = io.bwqs.zipWithIndex.map { case (bwq, i) => !enq_bwqs(i) || bwq.ready } 312 | 313 | def fire_bwq(exclude: Bool, include: Bool*) = { 314 | val rvs = tagq.io.deq.valid +: (mask_idivs_resp_valid ++ mask_fdivs_resp_valid ++ mask_bwqs_ready) 315 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 316 | } 317 | 318 | tagq.io.deq.ready := fire_bwq(tagq.io.deq.valid) 319 | io.idiv.fus.zipWithIndex.map { case (idiv, i) => 320 | idiv.resp.ready := fire_bwq(mask_idivs_resp_valid(i), deq_idivs_resp(i)) } 321 | io.fdiv.fus.zipWithIndex.map { case (fdiv, i) => 322 | fdiv.resp.ready := fire_bwq(mask_fdivs_resp_valid(i), deq_fdivs_resp(i)) } 323 | io.bwqs.zipWithIndex.map { case (bwq, i) => 324 | bwq.valid := fire_bwq(mask_bwqs_ready(i), enq_bwqs(i)) } 325 | 326 | val wraw = Wire(new BankDataPredEntry) 327 | wraw.pred := tagq.io.deq.bits.pred 328 | wraw.data := Mux(tagq.io.deq.bits.fusel.asBool, 329 | repack_slice(io.idiv.fus.map(_.resp.bits.out)), 330 | repack_slice(io.fdiv.fus.map(_.resp.bits.out))) 331 | val wpack = repack_bank(tagq.io.deq.bits.pack, UInt(0), wraw) 332 | 333 | io.bwqs.map { bwq => 334 | bwq.bits.selff := tagq.io.deq.bits.selff 335 | bwq.bits.addr := tagq.io.deq.bits.addr 336 | bwq.bits.data := wpack.data 337 | bwq.bits.mask := wpack.mask 338 | } 339 | 340 | io.idiv.ack.valid := fire_bwq(null, tagq.io.deq.bits.fusel.asBool) 341 | 
io.idiv.ack.bits.pred := tagq.io.deq.bits.pred 342 | io.fdiv.ack.valid := fire_bwq(null, !tagq.io.deq.bits.fusel.asBool) 343 | io.fdiv.ack.bits.pred := tagq.io.deq.bits.pred 344 | io.fdiv.ack.bits.exc := io.fdiv.fus.zipWithIndex.map { case (fdiv, i) => 345 | dgate(tagq.io.deq.bits.pred(i), fdiv.resp.bits.exc) } reduce(_|_) 346 | 347 | val icntr = Module(new LookAheadCounter(0, maxLookAhead)) 348 | icntr.suggestName("icntrInst") 349 | icntr.io.inc.cnt := UInt(1) 350 | icntr.io.inc.update := fire_bwq(null, tagq.io.deq.bits.fusel.asBool) 351 | icntr.io.dec <> io.ila 352 | 353 | val fcntr = Module(new LookAheadCounter(0, maxLookAhead)) 354 | fcntr.suggestName("fcntrInst") 355 | fcntr.io.inc.cnt := UInt(1) 356 | fcntr.io.inc.update := fire_bwq(null, !tagq.io.deq.bits.fusel.asBool) 357 | fcntr.io.dec <> io.fla 358 | } 359 | -------------------------------------------------------------------------------- /src/main/scala/dcc.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | abstract trait DCCParameters extends UsesHwachaParameters { 7 | val nDCCOpQ = 2 8 | val nDCCPredQ = 4 9 | val nVDUOperands = 2 10 | 11 | val nBPQ = 2*nBanks 12 | val nBRQ = 4 13 | val nBWQ = 2 14 | 15 | val maxSLA = 7 /* Ideally (2^i - 1) where (i > 1) */ 16 | 17 | val nVLU = 2 18 | val bVLU = log2Ceil(nVLU) 19 | } 20 | 21 | class DCCAckIO(implicit p: Parameters) extends HwachaBundle()(p) { 22 | val vidu = Valid(new VIDUAck) 23 | val vfdu = Valid(new VFDUAck) 24 | } 25 | 26 | class DCCIssueIO(implicit p: Parameters) extends Bundle { 27 | val op = Decoupled(new DCCOp).flip 28 | } 29 | 30 | class DecoupledCluster(implicit p: Parameters) extends VXUModule()(p) { 31 | val io = new DCCIssueIO { 32 | val cfg = new HwachaConfigIO().flip 33 | val ack = new DCCAckIO 34 | val lpqs = Vec(nLPQ, new LPQIO).flip 35 | val lrqs = Vec(nLRQ, new LRQIO).flip 36 | val bpqs = Vec(nBanks, new BPQIO).flip 37 | val brqs = Vec(nBanks, new BRQIO).flip 38 | val bwqs = new Bundle { 39 | val mem = Vec(nBanks, new BWQIO) 40 | val fu = Vec(nBanks, new BWQIO) 41 | } 42 | val dpla = new CounterLookAheadIO().flip // DCC LPQ counter 43 | val dqla = Vec(nVDUOperands, new CounterLookAheadIO).flip // DCC LRQ counter 44 | val dila = new CounterLookAheadIO().flip // idiv counter 45 | val dfla = new CounterLookAheadIO().flip // fdiv counter 46 | val gpla = new CounterLookAheadIO().flip // VGU LPQ counter 47 | val gqla = new CounterLookAheadIO().flip // VGU LRQ counter 48 | val pla = new BPQLookAheadIO().flip // VPU BPQ counter 49 | val lla = new CounterLookAheadIO().flip // VLU BWQ counter 50 | val sla = new BRQLookAheadIO().flip // VSU BRQ counter 51 | val red = new ReduceResultIO 52 | val vmu = new VMUIO 53 | } 54 | 55 | val vdu = Module(new VDU) 56 | vdu.suggestName("vduInst") 57 | val vgu = Module(new VGU) 58 | vgu.suggestName("vguInst") 59 | val vpu = Module(new VPU) 60 | vpu.suggestName("vpuInst") 61 | val vlu = Module(new VLU) 62 | vlu.suggestName("vluInst") 63 | val vsu = Module(new VSU) 64 | vsu.suggestName("vsuInst") 65 | 66 | val mask_vdu_ready = !io.op.bits.active.enq_vdu() || vdu.io.op.ready 67 | val mask_vgu_ready = !io.op.bits.active.enq_vgu() || vgu.io.op.ready 68 | val mask_vpu_ready = !io.op.bits.active.enq_vpu() || vpu.io.op.ready 69 | val mask_vlu_ready = !io.op.bits.active.enq_vlu() || vlu.io.op.ready 70 | val mask_vsu_ready = !io.op.bits.active.enq_vsu() || vsu.io.op.ready 71 | 72 | def fire(exclude: Bool, include: 
Bool*) = { 73 | val rvs = Seq( 74 | io.op.valid, 75 | mask_vdu_ready, mask_vgu_ready, mask_vpu_ready, mask_vlu_ready, mask_vsu_ready) 76 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 77 | } 78 | 79 | io.op.ready := fire(io.op.valid) 80 | 81 | vdu.io.cfg <> io.cfg 82 | vdu.io.op.valid := fire(mask_vdu_ready, io.op.bits.active.enq_vdu()) 83 | vdu.io.op.bits := io.op.bits 84 | vdu.io.pla <> io.dpla 85 | vdu.io.qla <> io.dqla 86 | vdu.io.ila <> io.dila 87 | vdu.io.fla <> io.dfla 88 | vdu.io.lpq <> io.lpqs(0) 89 | vdu.io.lrqs(0) <> io.lrqs(0) 90 | vdu.io.lrqs(1) <> io.lrqs(1) 91 | io.ack <> vdu.io.ack 92 | io.bwqs.fu <> vdu.io.bwqs 93 | io.red <> vdu.io.red 94 | 95 | vgu.io.op.valid := fire(mask_vgu_ready, io.op.bits.active.enq_vgu()) 96 | vgu.io.op.bits := io.op.bits 97 | vgu.io.pla <> io.gpla 98 | vgu.io.qla <> io.gqla 99 | vgu.io.lpq <> io.lpqs(1) 100 | vgu.io.lrq <> io.lrqs(2) 101 | io.vmu.vaq <> vgu.io.vaq 102 | 103 | vpu.io.op.valid := fire(mask_vpu_ready, io.op.bits.active.enq_vpu()) 104 | vpu.io.op.bits := io.op.bits 105 | vpu.io.la <> io.pla 106 | vpu.io.bpqs <> io.bpqs 107 | io.vmu.pred <> vpu.io.pred 108 | vlu.io.pred <> vpu.io.lpred 109 | vsu.io.pred <> vpu.io.spred 110 | 111 | vlu.io.cfg <> io.cfg 112 | vlu.io.op.valid := fire(mask_vlu_ready, io.op.bits.active.enq_vlu()) 113 | vlu.io.op.bits := io.op.bits 114 | vlu.io.la <> io.lla 115 | vlu.io.vldq <> io.vmu.vldq 116 | io.bwqs.mem <> vlu.io.bwqs 117 | io.vmu.vlu <> vlu.io.map 118 | 119 | vsu.io.op.valid := fire(mask_vsu_ready, io.op.bits.active.enq_vsu()) 120 | vsu.io.op.bits := io.op.bits 121 | vsu.io.la <> io.sla 122 | vsu.io.brqs <> io.brqs 123 | io.vmu.vsdq <> vsu.io.vsdq 124 | } 125 | -------------------------------------------------------------------------------- /src/main/scala/frontend.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import freechips.rocketchip.diplomacy._ 6 | import freechips.rocketchip.tilelink._ 7 | import freechips.rocketchip.rocket.{ICacheParams, MStatus, TLBConfig, TLBResp} 8 | 9 | class FrontendResp(icacheParams: ICacheParams)(implicit p: Parameters) extends HwachaBundle()(p) { 10 | val pc = UInt(width = vaddrBitsExtended) // ID stage PC 11 | val data = UInt(width = icacheParams.fetchBytes * 8) 12 | val pf = Bool() 13 | 14 | } 15 | 16 | class FrontendReq(implicit p: Parameters) extends freechips.rocketchip.tile.CoreBundle()(p) { 17 | val pc = UInt(width = vaddrBits+1) 18 | val status = new MStatus 19 | } 20 | 21 | class FrontendIO(cacheParams: ICacheParams)(implicit p: Parameters) extends HwachaBundle()(p) { 22 | val active = Bool(OUTPUT) 23 | val req = Valid(new FrontendReq) 24 | val resp = Decoupled(new FrontendResp(cacheParams)).flip 25 | val invalidate = Bool(OUTPUT) 26 | 27 | } 28 | 29 | class MiniFrontend(val cacheParams: ICacheParams)(implicit p: Parameters) extends HwachaModule()(p) with freechips.rocketchip.tile.HasL1CacheParameters { 30 | val io = new Bundle { 31 | val front = new FrontendIO(cacheParams).flip 32 | val back = new Bundle { 33 | val s0_req = Decoupled(new FrontendReq) 34 | val s1_kill = Bool(OUTPUT) 35 | //TODO: make sure we dont need double Valid nesting 36 | val s1_resp = Valid(UInt(width = cacheParams.fetchBytes * 8)).flip 37 | val s1_tlb = new TLBResp().flip 38 | } 39 | } 40 | 41 | val s1_valid = Reg(init=Bool(false)) 42 | val s1_pc_ = Reg(UInt()) 43 | val s1_status = Reg(new MStatus) 44 | val s1_pc = ~(~s1_pc_ | 
UInt(HwachaElementInstBytes-1)) // discard PC LSBS (this propagates down the pipeline) 45 | val s1_same_block = Reg(init=Bool(false)) 46 | val s1_req_valid = Reg(init=Bool(false)) 47 | 48 | val s2_valid = Reg(init=Bool(false)) 49 | val s2_pc = Reg(UInt()) 50 | val s2_status = Reg(new MStatus) 51 | val s2_xcpt_if = Reg(init=Bool(false)) 52 | val s2_line = Module(new Queue(UInt(width = cacheParams.fetchBytes * 8), 1, pipe=true)) 53 | 54 | s1_req_valid := io.back.s0_req.fire 55 | 56 | val icmiss = s2_valid && !s2_line.io.deq.valid 57 | val s2_replay = icmiss 58 | val s1_replay = s1_valid && !s1_same_block && !s1_req_valid 59 | 60 | val stall = !io.front.req.valid && (io.front.resp.valid && !io.front.resp.ready || !io.front.active) 61 | val s1_kill = io.front.req.valid || icmiss || s1_replay 62 | val s0_npc = s1_pc + UInt(HwachaElementInstBytes) 63 | val s0_same_block = 64 | !s1_kill && ((s0_npc & UInt(rowBytes)) === (s1_pc & UInt(rowBytes))) 65 | val s0_req_valid = !stall && !s0_same_block 66 | 67 | io.back.s0_req.valid := s0_req_valid 68 | io.back.s0_req.bits.status := 69 | Mux(io.front.req.valid, io.front.req.bits.status, 70 | Mux(s2_replay, s2_status, 71 | Mux(s1_replay, s1_status, 72 | s1_status)))// next status is same as last status w/o new req 73 | io.back.s0_req.bits.pc := 74 | Mux(io.front.req.valid, io.front.req.bits.pc, 75 | Mux(s2_replay, s2_pc, 76 | Mux(s1_replay, s1_pc, 77 | s0_npc))) 78 | io.back.s1_kill := s1_req_valid && s1_kill 79 | 80 | when (!stall) { 81 | s1_valid := s0_req_valid 82 | s1_pc_ := io.back.s0_req.bits.pc 83 | s1_status := io.back.s0_req.bits.status 84 | s1_same_block := s0_same_block && !(s1_req_valid && io.back.s1_tlb.miss) 85 | 86 | s2_valid := !s1_kill 87 | when (!s1_kill) { 88 | s2_pc := s1_pc 89 | s2_status := s1_status 90 | s2_xcpt_if := s1_req_valid && io.back.s1_tlb.pf.inst 91 | } 92 | } 93 | 94 | s2_line.io.enq.bits := io.back.s1_resp.bits 95 | s2_line.io.enq.valid := s1_req_valid && io.back.s1_resp.valid 96 | s2_line.io.deq.ready := !stall && !(s1_valid && s1_same_block) 97 | 98 | io.front.resp.valid := s2_valid && (s2_line.io.deq.valid || s2_xcpt_if) 99 | io.front.resp.bits.pc := s2_pc 100 | io.front.resp.bits.pf := s2_xcpt_if 101 | 102 | require(cacheParams.fetchBytes <= rowBytes) 103 | val fetch_data = 104 | if (cacheParams.fetchBytes == rowBytes) s2_line.io.deq.bits 105 | else s2_line.io.deq.bits >> (s2_pc(log2Up(rowBytes)-1,log2Up(cacheParams.fetchBytes)) << log2Up(cacheParams.fetchBytes*8)) 106 | 107 | io.front.resp.bits.data := fetch_data 108 | } 109 | 110 | class HwachaFrontend(implicit p : Parameters) extends LazyModule { 111 | lazy val module = new HwachaFrontendModule(this) 112 | val cacheParams = p(HwachaIcacheKey) 113 | 114 | val icache = LazyModule(new freechips.rocketchip.rocket.ICache(cacheParams, staticIdForMetadataUseOnly = 0)) 115 | 116 | val masterNode = icache.masterNode 117 | } 118 | 119 | class HwachaFrontendModule(outer: HwachaFrontend)(implicit p: Parameters) extends LazyModuleImp(outer) 120 | with freechips.rocketchip.tile.HasL1CacheParameters with UsesHwachaParameters { 121 | implicit val edge = outer.masterNode.edges.out.head 122 | val cacheParams = outer.cacheParams 123 | 124 | val io = IO(new Bundle { 125 | val vxu = new FrontendIO(cacheParams).flip 126 | val vru = new FrontendIO(cacheParams).flip 127 | val ptw = new freechips.rocketchip.rocket.TLBPTWIO() 128 | }) 129 | val icache = outer.icache.module 130 | val tlb = Module(new freechips.rocketchip.rocket.TLB(instruction = true, lgMaxSize = 
log2Ceil(cacheParams.fetchBytes), TLBConfig(nSets=nptlb, nWays=1, nSectors=1))(edge, p)) 131 | val vxu = Module(new MiniFrontend(cacheParams)) 132 | val vru = Module(new MiniFrontend(cacheParams)) 133 | val req_arb = Module(new Arbiter(new FrontendReq, 2)) 134 | 135 | vxu.io.front <> io.vxu 136 | vru.io.front <> io.vru 137 | 138 | req_arb.io.in(1) <> vxu.io.back.s0_req 139 | req_arb.io.in(0) <> vru.io.back.s0_req 140 | private val req = req_arb.io.out 141 | 142 | val s1_pc = RegEnable(req.bits.pc, req.valid) 143 | val s2_pc = Reg(s1_pc) 144 | 145 | icache.io.req.valid := req.valid 146 | icache.io.req.bits.addr := req.bits.pc 147 | icache.io.invalidate := Bool(false) 148 | icache.io.s1_paddr := tlb.io.resp.paddr 149 | icache.io.s2_vaddr := s2_pc 150 | icache.io.s1_kill := 151 | vxu.io.back.s1_kill || vru.io.back.s1_kill || 152 | tlb.io.resp.miss || tlb.io.resp.pf.inst 153 | //TODO: check ptw result 154 | icache.io.s2_kill := Bool(false) 155 | icache.io.s2_cacheable := false.B 156 | icache.io.s2_prefetch := false.B 157 | icache.io.clock_enabled := true.B 158 | 159 | tlb.io.req.valid := Reg(next=req.valid) 160 | tlb.io.req.bits.vaddr := s1_pc 161 | tlb.io.req.bits.passthrough := Bool(false) 162 | tlb.io.req.bits.size := UInt(log2Ceil(cacheParams.fetchBytes)) 163 | tlb.io.sfence.valid := false.B 164 | 165 | req.ready := Bool(true) 166 | 167 | vxu.io.back.s1_resp.valid <> icache.io.resp.valid 168 | vru.io.back.s1_resp.valid <> icache.io.resp.valid 169 | vxu.io.back.s1_resp.bits <> icache.io.resp.bits.data 170 | vru.io.back.s1_resp.bits <> icache.io.resp.bits.data 171 | vxu.io.back.s1_tlb <> tlb.io.resp 172 | vru.io.back.s1_tlb <> tlb.io.resp 173 | 174 | io.ptw <> tlb.io.ptw 175 | tlb.io.ptw.status := req.bits.status 176 | } 177 | -------------------------------------------------------------------------------- /src/main/scala/hwacha.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import chisel3.DontCare 5 | import org.chipsalliance.cde.config._ 6 | import freechips.rocketchip.util.ParameterizedBundle 7 | import freechips.rocketchip.tile._ 8 | import freechips.rocketchip.rocket.ICacheParams 9 | import freechips.rocketchip.diplomacy._ 10 | import freechips.rocketchip.tilelink._ 11 | 12 | case object HwachaCommitLog extends Field[Boolean] 13 | case object HwachaIcacheKey extends Field[ICacheParams] 14 | case object HwachaNLanes extends Field[Int] 15 | case object HwachaNBanks extends Field[Int] 16 | case object HwachaNAddressRegs extends Field[Int] 17 | case object HwachaNScalarRegs extends Field[Int] 18 | case object HwachaNVectorRegs extends Field[Int] 19 | case object HwachaNPredRegs extends Field[Int] 20 | case object HwachaRegBits extends Field[Int] 21 | case object HwachaPredRegBits extends Field[Int] 22 | case object HwachaMaxVLen extends Field[Int] 23 | case object HwachaBankWidth extends Field[Int] 24 | case object HwachaRegLen extends Field[Int] 25 | case object HwachaNDTLB extends Field[Int] 26 | case object HwachaNPTLB extends Field[Int] 27 | case object HwachaLocalScalarFPU extends Field[Boolean] 28 | case object HwachaBuildVRU extends Field[Boolean] 29 | case object HwachaConfPrec extends Field[Boolean] 30 | case object HwachaVRUMaxOutstandingPrefetches extends Field[Int] 31 | case object HwachaVRUEarlyIgnore extends Field[Int] 32 | case object HwachaVRUMaxRunaheadBytes extends Field[Int] 33 | case object HwachaCMDQLen extends Field[Int] 34 | case object HwachaVSETVLCompress extends Field[Boolean] 
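// Illustrative sketch of how the Field keys above are bound: Hwacha's
// parameters are supplied through CDE Config fragments (cf. configs.scala /
// TopLevelConfigs.scala in this repo). The fragment name and the numeric
// values below are assumptions chosen only to show the binding pattern, not
// the project's shipped defaults.
class WithHwachaExampleGeometry extends Config((site, here, up) => {
  case HwachaNLanes    => 2   // hypothetical: two vector lanes
  case HwachaNBanks    => 4   // hypothetical: four register-file banks per lane
  case HwachaBankWidth => 128 // hypothetical: 128-bit banks, so nSlices = bank width / regLen
})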
35 | 36 | abstract class HwachaModule(clock: Clock = null, _reset: Bool = null) 37 | (implicit val p: Parameters) extends Module(Option(clock), Option(_reset)) 38 | with UsesHwachaParameters 39 | 40 | abstract class HwachaBundle(implicit val p: Parameters) extends ParameterizedBundle()(p) 41 | with UsesHwachaParameters 42 | 43 | abstract trait UsesHwachaParameters extends freechips.rocketchip.tile.HasCoreParameters with UsesHwachaOnlyParameters { 44 | val aluFn = new freechips.rocketchip.rocket.ALUFN 45 | } 46 | 47 | abstract trait UsesHwachaOnlyParameters { 48 | implicit val p: Parameters 49 | 50 | val commit_log = p(HwachaCommitLog) 51 | 52 | val nARegs = p(HwachaNAddressRegs) 53 | val nSRegs = p(HwachaNScalarRegs) 54 | val nVRegs = p(HwachaNVectorRegs) 55 | val nPRegs = p(HwachaNPredRegs) 56 | 57 | val bARegs = log2Up(nARegs) 58 | val bSRegs = log2Up(nSRegs) 59 | val bVRegs = log2Up(nVRegs) 60 | val bPRegs = log2Up(nPRegs) 61 | val bfVRegs = log2Down(nVRegs) + 1 62 | val bfPRegs = log2Down(nPRegs) + 1 63 | 64 | val bRegs = List(bARegs, bSRegs, bVRegs).max 65 | val bSDest = List(bARegs, bSRegs).max 66 | 67 | val regLen = p(HwachaRegLen) 68 | val regBytes = regLen >> 3 69 | 70 | require(SZ_D == regLen) 71 | 72 | val nLanes = p(HwachaNLanes) 73 | val nBanks = p(HwachaNBanks) 74 | val wBank = p(HwachaBankWidth) 75 | val nSlices = wBank / regLen 76 | val nStrip = nBanks * nSlices 77 | 78 | require(isPow2(nLanes)) 79 | require(isPow2(nBanks)) 80 | require(isPow2(nSlices)) 81 | val bLanes = log2Ceil(nLanes) 82 | val bBanks = log2Ceil(nBanks) 83 | val bSlices = log2Ceil(nSlices) 84 | val bStrip = bBanks + bSlices 85 | 86 | val maxLStride = 2 87 | val bLStride = log2Floor(maxLStride) + 1 88 | val bfLStrip = maxLStride + bStrip + 1 89 | 90 | val maxVLen = p(HwachaMaxVLen) 91 | val bVLen = log2Down(maxVLen) + 1 92 | 93 | val maxMLVLen = nLanes * maxVLen 94 | val bMLVLen = log2Down(maxMLVLen) + 1 95 | 96 | val local_sfpu = false //p(HwachaLocalScalarFPU) //TODO: Fix local fpu for new encoding 97 | 98 | val ndtlb = p(HwachaNDTLB) 99 | val nptlb = p(HwachaNPTLB) 100 | val confvru = false//p(HwachaBuildVRU) //TODO: Fix prefetcher using TL2 Hints 101 | val confprec = p(HwachaConfPrec) 102 | 103 | val confvcmdq = new { 104 | val ncmd = p(HwachaCMDQLen) 105 | val nimm = p(HwachaCMDQLen) 106 | val nrd = p(HwachaCMDQLen) 107 | val ncnt = nBanks 108 | val nstatus = p(HwachaCMDQLen) 109 | } 110 | 111 | require(confvcmdq.ncmd >= nBanks) 112 | require(confvcmdq.nimm >= nBanks) 113 | require(confvcmdq.nrd >= nBanks) 114 | require(confvcmdq.nstatus >= nBanks) 115 | 116 | // TODO: Parameterize based on maxVLen and bandwidth-delay product 117 | val nvsreq = 512 118 | val nvlreq = 512 119 | } 120 | 121 | class HwachaCounterIO(implicit p: Parameters) extends HwachaBundle()(p) { 122 | val mseq = new MasterSequencerCounterIO 123 | val rocc = new RoCCCounterIO 124 | val vru = new VRUCounterIO 125 | } 126 | 127 | class Hwacha(implicit p: Parameters) extends LazyRoCC( 128 | opcodes = OpcodeSet.custom0 | OpcodeSet.custom1, 129 | nPTWPorts = 2 + p(HwachaNLanes), 130 | usesFPU = true) 131 | with UsesHwachaOnlyParameters { 132 | override lazy val module = new HwachaImp(this) 133 | 134 | val icache = LazyModule(new HwachaFrontend()) 135 | val smu = LazyModule(new SMU()) 136 | val vus = Seq.fill(nLanes) {LazyModule(new VectorUnit())} 137 | val atlBus = LazyModule(new TLXbar) 138 | 139 | atlNode := TLWidthWidget(16) := atlBus.node 140 | atlBus.node := icache.masterNode 141 | atlBus.node := TLWidthWidget(16) := smu.masterNode 
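  // TileLink plumbing above: the I$ attaches directly to the atlBus crossbar
  // and the SMU attaches to it through a 16-byte TLWidthWidget; the crossbar
  // output is then width-adapted to 16 bytes into atlNode. The optional VRU
  // below joins the same crossbar, while each VectorUnit master port is
  // width-adapted onto tlNode separately (see the foreach over
  // vus.map(_.masterNode) below).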
142 | val vru = if(confvru) { 143 | val vruM = LazyModule(new VRU) 144 | atlBus.node := TLWidthWidget(16) := vruM.masterNode 145 | Some(vruM) 146 | } else None 147 | vus.map(_.masterNode).foreach { tlNode := TLWidthWidget(16) := _ } 148 | } 149 | 150 | class HwachaImp(outer: Hwacha)(implicit p: Parameters) extends LazyRoCCModuleImp(outer) 151 | with UsesHwachaParameters 152 | { 153 | // TODO: Re-add counters 154 | /* 155 | override val io = new RoCCIO { 156 | val counters = new HwachaCounterIO 157 | } 158 | */ 159 | import HwachaDecodeTable._ 160 | import Commands._ 161 | 162 | val rocc = Module(new RoCCUnit) 163 | val icache = outer.icache.module 164 | val scalar = Module(new ScalarUnit) 165 | val mseq = Module(new MasterSequencer) 166 | val vus = outer.vus.map(_.module) 167 | val rpred = Module(new RPredMaster) 168 | val rfirst = Module(new RFirstMaster) 169 | val smu = outer.smu.module 170 | val mou = Module(new MemOrderingUnit) 171 | 172 | // Connect RoccUnit to top level IO 173 | rocc.io.rocc.cmd <> io.cmd 174 | io.resp <> rocc.io.rocc.resp 175 | io.busy <> rocc.io.rocc.busy 176 | io.interrupt <> rocc.io.rocc.interrupt 177 | rocc.io.rocc.exception <> io.exception 178 | //io.counters.rocc <> rocc.io.counters 179 | 180 | // Connect RoccUnit to ScalarUnit 181 | rocc.io.pending.mseq := mseq.io.pending.all 182 | rocc.io.pending.mrt := scalar.io.pending.mrt.su.all || vus.map(_.io.pending.all).reduce(_||_) 183 | rocc.io.vf_active := scalar.io.vf_active 184 | scalar.io.cmdq <> rocc.io.cmdqs.vu 185 | scalar.io.cfg <> rocc.io.cfg 186 | 187 | // Connect ScalarUnit to Rocket's FPU 188 | if (local_sfpu) { 189 | val sfpu = Module(new ScalarFPU) 190 | scalar.io.fpu <> sfpu.io 191 | io.fpu_req.valid := Bool(false) 192 | } else { 193 | val sfpu = Module(new ScalarFPUInterface) 194 | sfpu.io.hwacha.req <> scalar.io.fpu.req 195 | io.fpu_req <> sfpu.io.rocc.req 196 | sfpu.io.rocc.resp <> io.fpu_resp 197 | scalar.io.fpu.resp <> sfpu.io.hwacha.resp 198 | } 199 | 200 | // Connect Scalar to I$ 201 | icache.io.vxu <> scalar.io.imem 202 | io.ptw(0) <> icache.io.ptw 203 | if (confvru) { 204 | val vru = outer.vru.get.module 205 | icache.io.vru <> vru.io.imem 206 | vru.io.cmdq <> rocc.io.cmdqs.vru 207 | vru.io.vf_complete_ack := mseq.io.vf.last 208 | //io.counters.vru <> vru.io.counters 209 | } else { 210 | // vru plumbing in RoCCUnit should be automatically optimized out 211 | rocc.io.cmdqs.vru.cmd.ready := Bool(true) 212 | rocc.io.cmdqs.vru.imm.ready := Bool(true) 213 | rocc.io.cmdqs.vru.rd.ready := Bool(true) 214 | rocc.io.cmdqs.vru.cnt.ready := Bool(true) 215 | rocc.io.cmdqs.vru.status.ready := Bool(true) 216 | 217 | icache.io.vru := DontCare 218 | icache.io.vru.req.valid := Bool(false) 219 | icache.io.vru.active := Bool(false) 220 | 221 | //io.counters.vru <> (new VRUCounterIO).fromBits(UInt(0)) 222 | } 223 | 224 | // Connect supporting Hwacha memory modules to external ports 225 | io.mem.req.valid := Bool(false) 226 | 227 | smu.io.scalar <> scalar.io.smu 228 | io.ptw(1) <> smu.io.ptw 229 | 230 | val enq_vxus = scalar.io.vxu.bits.lane.map(_.active) 231 | val enq_rpred = scalar.io.vxu.bits.active.vrpred 232 | val enq_rfirst = scalar.io.vxu.bits.active.vrfirst 233 | val mask_vxus_ready = (0 until nLanes) map { i => !enq_vxus(i) || vus(i).io.issue.vxu.ready } 234 | val mask_rpred_ready = !enq_rpred || rpred.io.op.ready 235 | val mask_rfirst_ready = !enq_rfirst || rfirst.io.op.ready 236 | 237 | def fire_vxu(exclude: Bool, include: Bool*) = { 238 | val rvs = Seq( 239 | scalar.io.vxu.valid, mseq.io.op.ready, 240 | 
mask_rpred_ready, mask_rfirst_ready) ++ mask_vxus_ready 241 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 242 | } 243 | 244 | val enq_vmus = scalar.io.vmu.bits.lane.map(_.active) 245 | val mask_vmus_ready = (0 until nLanes) map { i => !enq_vmus(i) || vus(i).io.issue.vmu.ready } 246 | 247 | def fire_vmu(exclude: Bool, include: Bool*) = { 248 | val rvs = Seq(scalar.io.vmu.valid) ++ mask_vmus_ready 249 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 250 | } 251 | 252 | scalar.io.vxu.ready := fire_vxu(scalar.io.vxu.valid) 253 | scalar.io.vmu.ready := fire_vmu(scalar.io.vmu.valid) 254 | 255 | mseq.io.op.valid := fire_vxu(mseq.io.op.ready) 256 | mseq.io.op.bits <> scalar.io.vxu.bits 257 | (mseq.io.master.clear.zipWithIndex) map { case (c, r) => 258 | c := vus.map(_.io.mseq.clear(r)).reduce(_&&_) 259 | } 260 | scalar.io.pending.mseq <> mseq.io.pending 261 | mseq.io.vf.stop := scalar.io.vf_stop 262 | //io.counters.mseq <> mseq.io.counters 263 | 264 | rpred.io.op.valid := fire_vxu(mask_rpred_ready, enq_rpred) 265 | rpred.io.op.bits <> scalar.io.vxu.bits 266 | scalar.io.red.pred <> rpred.io.result 267 | 268 | rfirst.io.op.valid := fire_vxu(mask_rfirst_ready, enq_rfirst) 269 | rfirst.io.op.bits <> scalar.io.vxu.bits 270 | scalar.io.red.first <> rfirst.io.result 271 | 272 | mou.io.cfg <> rocc.io.cfg 273 | mou.io.mseq <> mseq.io.master.state 274 | mou.io.pending.su <> scalar.io.pending.mrt.su 275 | (mou.io.pending.vus zip vus) map { case (pending, vu) => pending <> vu.io.pending } 276 | scalar.io.mocheck <> mou.io.check.su 277 | (vus zip mou.io.check.vus) map { case (vu, mocheck) => vu.io.mocheck <> mocheck } 278 | (scalar.io.pending.mrt.vus zip vus) map { case (pending, vu) => pending <> vu.io.pending } 279 | 280 | (vus.zipWithIndex) map { case (vu, i) => 281 | vu.io.id := UInt(i) 282 | 283 | vu.io.cfg <> rocc.io.cfg 284 | vu.io.issue.vxu.valid := fire_vxu(mask_vxus_ready(i), enq_vxus(i)) 285 | vu.io.issue.vxu.bits.vlen := scalar.io.vxu.bits.lane(i).vlen 286 | vu.io.issue.vxu.bits <> scalar.io.vxu.bits 287 | vu.io.issue.vmu.valid := fire_vmu(mask_vmus_ready(i), enq_vmus(i)) 288 | vu.io.issue.vmu.bits.vlen := scalar.io.vmu.bits.lane(i).vlen 289 | vu.io.issue.vmu.bits <> scalar.io.vmu.bits 290 | vu.io.mseq.state <> mseq.io.master.state 291 | vu.io.mseq.update <> mseq.io.master.update 292 | rpred.io.lane(i) <> vu.io.red.pred 293 | rfirst.io.lane(i) <> vu.io.red.first 294 | io.ptw(2 + i) <> vu.io.ptw 295 | } 296 | } 297 | -------------------------------------------------------------------------------- /src/main/scala/irq.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | 5 | class IRQIO extends Bundle { 6 | val top = new Bundle { 7 | val illegal_cfg = Bool(OUTPUT) 8 | val illegal_inst = Bool(OUTPUT) 9 | val priv_inst = Bool(OUTPUT) 10 | val illegal_regid = Bool(OUTPUT) 11 | val aux = Bits(OUTPUT, 64) 12 | } 13 | val issue = new Bundle { 14 | val ma_inst = Bool(OUTPUT) 15 | val fault_inst = Bool(OUTPUT) 16 | val illegal = Bool(OUTPUT) 17 | val illegal_regid = Bool(OUTPUT) 18 | val aux = Bits(OUTPUT, 64) 19 | } 20 | val vmu = new Bundle { 21 | val ma_ld = Bool(OUTPUT) 22 | val ma_st = Bool(OUTPUT) 23 | val pf_ld = Bool(OUTPUT) 24 | val pf_st = Bool(OUTPUT) 25 | val ae_ld = Bool(OUTPUT) 26 | val ae_st = Bool(OUTPUT) 27 | val aux = Bits(OUTPUT, 64) 28 | } 29 | } 30 | 31 | class IRQ extends Module { 32 | val io = new Bundle { 33 | val vu = new IRQIO().flip 34 | val rocc = new Bundle { 35 | val request = 
Bool(OUTPUT) 36 | val cause = UInt(OUTPUT, 5) 37 | val aux = Bits(OUTPUT, 64) 38 | val clear = Bool(INPUT) 39 | } 40 | } 41 | 42 | val reg_irq = Reg(init=Bool(false)) 43 | val reg_cause = Reg(init=UInt(0, 5)) 44 | val reg_aux = Reg(init=Bits(0, 64)) 45 | 46 | val irqs = List( 47 | (io.vu.top.illegal_cfg, 0, io.vu.top.aux), 48 | (io.vu.top.illegal_inst, 1, io.vu.top.aux), 49 | (io.vu.top.priv_inst, 2, io.vu.top.aux), 50 | (io.vu.top.illegal_regid, 3, io.vu.top.aux), 51 | (io.vu.issue.ma_inst, 4, io.vu.issue.aux), 52 | (io.vu.issue.fault_inst, 5, io.vu.issue.aux), 53 | (io.vu.issue.illegal, 6, io.vu.issue.aux), 54 | (io.vu.issue.illegal_regid, 7, io.vu.issue.aux), 55 | (io.vu.vmu.ma_ld, 8, io.vu.vmu.aux), 56 | (io.vu.vmu.ma_st, 9, io.vu.vmu.aux), 57 | (io.vu.vmu.pf_ld, 10, io.vu.vmu.aux), 58 | (io.vu.vmu.pf_st, 11, io.vu.vmu.aux), 59 | (io.vu.vmu.ae_ld, 12, io.vu.vmu.aux), 60 | (io.vu.vmu.ae_st, 13, io.vu.vmu.aux), 61 | ) 62 | 63 | when (!reg_irq) { 64 | for ((cond, cause, aux) <- irqs.reverse) { 65 | when (cond) { 66 | reg_irq := Bool(true) 67 | reg_cause := UInt(cause) 68 | reg_aux := aux 69 | } 70 | } 71 | } 72 | 73 | when (io.rocc.clear) { 74 | reg_irq := Bool(false) 75 | } 76 | 77 | io.rocc.request := reg_irq 78 | io.rocc.cause := reg_cause 79 | io.rocc.aux := reg_aux 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/lane-ctrl.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class LaneCtrl(implicit p: Parameters) extends VXUModule()(p) { 7 | val io = new Bundle { 8 | val op = new LaneOpIO().flip 9 | val uop = new MicroOpIO 10 | } 11 | 12 | class Systolic[T <: LaneOp](in: ValidIO[T], multirate: Boolean) { 13 | val in_overflow = in.bits.strip > UInt(nSlices) 14 | val in_next_valid = in.valid && in_overflow 15 | val in_pred = Vec( 16 | for (i <- (0 until (if (multirate) nPack else 1)); j <- (0 until nSlices)) 17 | yield UInt((i * nStrip) + j) < in.bits.strip).asUInt 18 | val in_popcnt = Mux(in_overflow, UInt(nSlices), in.bits.strip(bSlices, 0)) 19 | 20 | if (confprec && !multirate) 21 | assert(!in.valid || (in.bits.strip <= UInt(nStrip)), 22 | "check strip count for single-rate systolic laneop: " + 23 | in.bits.getClass.getName) 24 | 25 | val out = Wire(Valid(in.bits.cloneType)) 26 | out.valid := Reg(next=in_next_valid, init=Bool(false)) 27 | out.bits := RegEnable(in.bits, in_next_valid) 28 | out.bits.strip := RegEnable(in.bits.strip - in_popcnt, in_next_valid) 29 | } 30 | 31 | def gen_systolic[T <: LaneOp, S <: MicroOp] 32 | (lop: ValidIO[T], uop: ValidIO[S], mr: Boolean = true) = { 33 | val sys = new Systolic(lop, mr) 34 | uop <> lop 35 | uop.bits.pred := sys.in_pred 36 | sys.out 37 | } 38 | 39 | def gen_vec_systolic[T <: LaneOp, S <: MicroOp] 40 | (lops: Seq[ValidIO[T]], uops: Seq[ValidIO[S]], mr: Boolean = true) = { 41 | Vec((lops zip uops) map { case (lop, uop) => gen_systolic(lop, uop, mr) }) 42 | } 43 | 44 | io.uop.bank.foldLeft(io.op.sram.read)((lop, bio) => gen_systolic(lop, bio.sram.read)) 45 | io.uop.bank.foldLeft(io.op.sram.write)((lop, bio) => gen_systolic(lop, bio.sram.write)) 46 | io.uop.bank.foldLeft(io.op.pred.gread)((lop, bio) => gen_systolic(lop, bio.pred.gread)) 47 | io.uop.bank.foldLeft(io.op.pred.pread)((lop, bio) => gen_systolic(lop, bio.pred.pread, false)) 48 | io.uop.bank.foldLeft(io.op.pred.read)((lops, bio) => gen_vec_systolic(lops, bio.pred.read)) 49 | 
io.uop.bank.foldLeft(io.op.pred.write)((lop, bio) => gen_systolic(lop, bio.pred.write)) 50 | io.uop.bank.foldLeft(io.op.opl.global)((lops, bio) => gen_vec_systolic(lops, bio.opl.global)) 51 | io.uop.bank.foldLeft(io.op.opl.local)((lops, bio) => gen_vec_systolic(lops, bio.opl.local)) 52 | io.uop.bank.foldLeft(io.op.pdl.global)((lops, bio) => gen_vec_systolic(lops, bio.pdl.global)) 53 | io.uop.bank.foldLeft(io.op.pdl.local)((lops, bio) => gen_vec_systolic(lops, bio.pdl.local)) 54 | io.uop.bank.foldLeft(io.op.sreg.local)((lops, bio) => gen_vec_systolic(lops, bio.sreg)) 55 | io.uop.bank.foldLeft(io.op.xbar)((lops, bio) => gen_vec_systolic(lops, bio.xbar)) 56 | io.uop.bank.foldLeft(io.op.pxbar)((lops, bio) => gen_vec_systolic(lops, bio.pxbar)) 57 | io.uop.bank.foldLeft(io.op.viu)((lop, bio) => gen_systolic(lop, bio.viu)) 58 | io.uop.bank.foldLeft(io.op.vipu)((lop, bio) => gen_systolic(lop, bio.vipu)) 59 | io.uop.bank.foldLeft(io.op.vpu)((lop, bio) => gen_systolic(lop, bio.vpu, false)) 60 | io.uop.bank.foldLeft(io.op.vsu)((lop, bio) => gen_systolic(lop, bio.vsu, false)) 61 | 62 | class Shared[T <: LaneOp](in: ValidIO[T], multirate: Boolean = false) { 63 | val reg_valid = Reg(Bool()) 64 | val reg_bits = Reg(in.bits.cloneType) 65 | 66 | val strip = Mux(in.valid, in.bits.strip, reg_bits.strip) 67 | val overflow = strip > UInt(nSlices) 68 | val in_next_valid = overflow 69 | val valid = in.valid || reg_valid 70 | val bits = Mux(in.valid, in.bits, reg_bits) 71 | val pred = Vec( 72 | for (i <- (0 until (if (multirate) nPack else 1)); j <- (0 until nSlices)) 73 | yield UInt((i * nStrip) + j) < strip).asUInt 74 | val popcnt = Mux(overflow, UInt(nSlices), strip(bSlices, 0)) 75 | 76 | if (confprec && !multirate) 77 | assert(!in.valid || (in.bits.strip <= UInt(nStrip)), 78 | "check strip count for single-rate shared laneop: " + 79 | in.bits.getClass.getName) 80 | 81 | reg_valid := in_next_valid 82 | when (in.valid && overflow) { 83 | reg_bits := in.bits 84 | } 85 | when (in_next_valid) { 86 | reg_bits.strip := strip - popcnt 87 | } 88 | 89 | when (reset) { 90 | reg_valid := Bool(false) 91 | } 92 | } 93 | 94 | val sreg = (0 until nGOPL).map { i => new Shared(io.op.sreg.global(i), true) } 95 | val vqu = new Shared(io.op.vqu) 96 | val vgu = new Shared(io.op.vgu) 97 | val vimu = new Shared(io.op.vimu) 98 | val vfmu = (0 until nVFMU) map { i => new Shared(io.op.vfmu(i), true) } 99 | val vfcu = new Shared(io.op.vfcu) 100 | val vfvu = new Shared(io.op.vfvu, true) 101 | 102 | (io.uop.sreg zip sreg) foreach { case (u, s) => 103 | u.valid := s.valid 104 | u.bits.operand := s.bits.operand 105 | u.bits.rate := s.bits.rate 106 | u.bits.pred := s.pred 107 | } 108 | 109 | def connect_vfu[T <: LaneOp, S <: MicroOp] 110 | (uop: ValidIO[S], s: Shared[T], fn: (S, Shared[T])=>Unit) = { 111 | uop.valid := s.valid 112 | uop.bits.pred := s.pred 113 | fn(uop.bits, s) 114 | } 115 | connect_vfu(io.uop.vqu, vqu, (u: VQUMicroOp, s: Shared[VQULaneOp]) => u <> s.bits) 116 | connect_vfu(io.uop.vgu, vgu, (u: VGUMicroOp, s: Shared[VGULaneOp]) => u <> s.bits) 117 | connect_vfu(io.uop.vimu, vimu, (u: VIMUMicroOp, s: Shared[VIMULaneOp]) => u <> s.bits) 118 | (io.uop.vfmu zip vfmu) foreach { case (uop, shared) => 119 | connect_vfu(uop, shared, (u: VFMUMicroOp, s: Shared[VFMULaneOp]) => u <> s.bits) 120 | } 121 | connect_vfu(io.uop.vfcu, vfcu, (u: VFCUMicroOp, s: Shared[VFCULaneOp]) => u <> s.bits) 122 | connect_vfu(io.uop.vfvu, vfvu, (u: VFVUMicroOp, s: Shared[VFVULaneOp]) => u <> s.bits) 123 | } 124 | 
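// Worked example of the strip hand-off implemented above (the numbers are
// illustrative; nSlices = wBank / regLen and depends on the configuration):
// with nSlices = 2 and a lane op arriving with strip = 5, bank 0 predicates
// both of its slices and, since 5 > nSlices, forwards the op with
// strip = 5 - 2 = 3; bank 1 likewise covers both slices and forwards
// strip = 1; bank 2 predicates only its first slice and the op stops there.
// Shared ops (the Shared class) follow the same strip arithmetic but hold the
// op in a local register and iterate over successive cycles instead of
// marching it from bank to bank.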
-------------------------------------------------------------------------------- /src/main/scala/lane.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | case object HwachaNSRAMRFEntries extends Field[Int] 7 | case object HwachaNFFRFEntries extends Field[Int] 8 | case object HwachaNFFRFReadPorts extends Field[Int] 9 | case object HwachaNPredRFEntries extends Field[Int] 10 | case object HwachaNPredRFReadPorts extends Field[Int] 11 | case object HwachaNOperandLatches extends Field[Int] 12 | case object HwachaNPredLatches extends Field[Int] 13 | case object HwachaWriteSelects extends Field[Int] 14 | case object HwachaRFAddrBits extends Field[Int] 15 | case object HwachaPRFAddrBits extends Field[Int] 16 | case object HwachaStagesALU extends Field[Int] 17 | case object HwachaStagesPLU extends Field[Int] 18 | case object HwachaStagesIMul extends Field[Int] 19 | case object HwachaStagesDFMA extends Field[Int] 20 | case object HwachaStagesSFMA extends Field[Int] 21 | case object HwachaStagesHFMA extends Field[Int] 22 | case object HwachaStagesFConv extends Field[Int] 23 | case object HwachaStagesFCmp extends Field[Int] 24 | 25 | abstract trait LaneParameters extends UsesHwachaParameters { 26 | val nSRAM = p(HwachaNSRAMRFEntries) 27 | val nFF = p(HwachaNFFRFEntries) 28 | val nFFRPorts = p(HwachaNFFRFReadPorts) 29 | val nPred = p(HwachaNPredRFEntries) 30 | val nPredRPorts = p(HwachaNPredRFReadPorts) 31 | val bRFAddr = p(HwachaRFAddrBits) 32 | val bPredAddr = p(HwachaPRFAddrBits) 33 | 34 | val nBankSRAMRegs = nSRAM * nSlices 35 | val nLaneSRAMRegs = nBanks * nBankSRAMRegs 36 | 37 | val nGOPL = p(HwachaNOperandLatches) 38 | val nLOPL = 3 39 | val nGPDL = p(HwachaNPredLatches) 40 | val nLPDL = 2 41 | val nWSel = p(HwachaWriteSelects) 42 | val nLPQ = 2 43 | val nLRQ = 3 44 | val nDecoupledUnitWBQueue = 4 45 | val nVFMU = 2 46 | 47 | val stagesALU = p(HwachaStagesALU) 48 | val stagesPLU = p(HwachaStagesPLU) 49 | val stagesIMul = p(HwachaStagesIMul) 50 | val stagesDFMA = p(HwachaStagesDFMA) 51 | val stagesSFMA = p(HwachaStagesSFMA) 52 | val stagesHFMA = p(HwachaStagesHFMA) 53 | val stagesFConv = p(HwachaStagesFConv) 54 | val stagesFCmp = p(HwachaStagesFCmp) 55 | 56 | require(nVRegs <= nSRAM) 57 | 58 | val bPack = if (confprec) log2Floor(regLen/SZ_H) else 0 59 | val nPack = 1 << bPack 60 | val bRate = log2Up(bPack + 1) 61 | 62 | val wPred = nSlices << bPack 63 | require(nPred % nPack == 0) 64 | } 65 | 66 | class LaneOpIO(implicit p: Parameters) extends VXUBundle()(p) { 67 | val sram = new Bundle { 68 | val read = Valid(new SRAMRFReadLaneOp) 69 | val write = Valid(new SRAMRFWriteLaneOp) 70 | } 71 | val ff = new Bundle { 72 | val read = Vec(nFFRPorts, Valid(new FFRFReadLaneOp)) 73 | val write = Valid(new FFRFWriteLaneOp) 74 | } 75 | val pred = new Bundle { 76 | val gread = Valid(new PredRFGatedReadLaneOp) // gated read 77 | val pread = Valid(new PredRFGatedReadLaneOp) // vpu read 78 | val read = Vec(nPredRPorts, Valid(new PredRFReadLaneOp)) // plu read 79 | val write = Valid(new PredRFWriteLaneOp) 80 | } 81 | val opl = new Bundle { 82 | val global = Vec(nGOPL, Valid(new OPLLaneOp)) 83 | val local = Vec(nLOPL, Valid(new OPLLaneOp)) 84 | } 85 | val pdl = new Bundle { 86 | val global = Vec(nGPDL, Valid(new PDLLaneOp)) 87 | val local = Vec(nLPDL, Valid(new PDLLaneOp)) 88 | } 89 | val sreg = new Bundle { 90 | val global = Vec(nGOPL, Valid(new SRegLaneOp)) 91 | val local = Vec(nLOPL, 
Valid(new SRegLaneOp)) 92 | } 93 | val xbar = Vec(nGOPL, Valid(new XBarLaneOp)) 94 | val pxbar = Vec(nGPDL, Valid(new PXBarLaneOp)) 95 | val viu = Valid(new VIULaneOp) 96 | val vipu = Valid(new VIPULaneOp) 97 | val vpu = Valid(new VPULaneOp) 98 | val vsu = Valid(new VSULaneOp) 99 | val vqu = Valid(new VQULaneOp) 100 | val vgu = Valid(new VGULaneOp) 101 | val vimu = Valid(new VIMULaneOp) 102 | val vfmu = Vec(nVFMU, Valid(new VFMULaneOp)) 103 | val vfcu = Valid(new VFCULaneOp) 104 | val vfvu = Valid(new VFVULaneOp) 105 | } 106 | 107 | class MicroOpIO(implicit p: Parameters) extends VXUBundle()(p) { 108 | val bank = Vec(nBanks, new BankOpIO) 109 | val sreg = Vec(nGOPL, Valid(new SRegMicroOp)) 110 | val vqu = Valid(new VQUMicroOp) 111 | val vgu = Valid(new VGUMicroOp) 112 | val vimu = Valid(new VIMUMicroOp) 113 | val vfmu = Vec(nVFMU, Valid(new VFMUMicroOp)) 114 | val vfcu = Valid(new VFCUMicroOp) 115 | val vfvu = Valid(new VFVUMicroOp) 116 | } 117 | 118 | class LaneAckIO(implicit p: Parameters) extends VXUBundle()(p) { 119 | val viu = Vec(nBanks, Valid(new VIUAck)) 120 | val vipu = Vec(nBanks, Valid(new VIPUAck)) 121 | val vqu = Valid(new VQUAck) 122 | val vgu = Valid(new VGUAck) 123 | val vimu = Valid(new VIMUAck) 124 | val vfmu = Vec(nVFMU, Valid(new VFMUAck)) 125 | val vfcu = Valid(new VFCUAck) 126 | val vfvu = Valid(new VFVUAck) 127 | } 128 | 129 | class LPQIO(implicit p: Parameters) extends DecoupledIO(new LPQEntry()(p)) { 130 | } 131 | class LRQIO(implicit p: Parameters) extends DecoupledIO(new LRQEntry()(p)) { 132 | } 133 | 134 | trait LanePred extends VXUBundle { 135 | val pred = Bits(width = nPack) 136 | def active(dummy: Int = 0) = pred.orR 137 | } 138 | 139 | class Lane(implicit p: Parameters) extends VXUModule()(p) with Packing with RateLogic { 140 | val io = new Bundle { 141 | val id = UInt(INPUT) 142 | val cfg = new HwachaConfigIO().flip 143 | val op = new LaneOpIO().flip 144 | val ack = new LaneAckIO 145 | val lpqs = Vec(nLPQ, new LPQIO) 146 | val lrqs = Vec(nLRQ, new LRQIO) 147 | val bpqs = Vec(nBanks, new BPQIO) 148 | val brqs = Vec(nBanks, new BRQIO) 149 | val bwqs = new Bundle { 150 | val mem = Vec(nBanks, new BWQIO).flip 151 | val fu = Vec(nBanks, new BWQIO).flip 152 | } 153 | } 154 | 155 | val ctrl = Module(new LaneCtrl) 156 | ctrl.suggestName("ctrlInst") 157 | ctrl.io.op <> io.op 158 | 159 | val banksrw = (0 until nBanks) map { i => 160 | val bank = Module(new Bank(i)) 161 | bank.suggestName("bankInst") 162 | 163 | bank.io.lid := io.id 164 | bank.io.cfg <> io.cfg 165 | bank.io.op <> ctrl.io.uop.bank(i) 166 | io.bpqs(i) <> bank.io.rw.bpq 167 | io.brqs(i) <> bank.io.rw.brq 168 | bank.io.rw.bwq.mem <> io.bwqs.mem(i) 169 | bank.io.rw.bwq.fu <> io.bwqs.fu(i) 170 | bank.io.rw 171 | } 172 | 173 | val pdls = (0 until nGPDL) map { o => banksrw.map(_.pdl(o).pred).reduce(_|_) } 174 | val opls = (0 until nGOPL) map { o => banksrw.map(_.opl(o).data).reduce(_|_) } 175 | 176 | def predicate(n: Int) = new BankPredEntry().fromBits(pdls(n)) 177 | 178 | def operands[T <: SharedLLOp](name: String, uop: ValidIO[T], n: Int, rbase: Int) = { 179 | require(n <= uop.bits.nOperands) 180 | (0 until n) map { i => 181 | val ri = rbase+i 182 | assert(!uop.valid || !uop.bits.sreg(i) || ctrl.io.uop.sreg(ri).valid, "check sreg sched logic "+name+"_"+i) 183 | Mux(uop.bits.sreg(i), splat_scalar(ctrl.io.uop.sreg(ri).bits), opls(ri)) 184 | } 185 | } 186 | 187 | def valids(valid: Bool, pred: Bits, latency: Int) = 188 | ShiftRegister(Mux(valid, pred, Bits(0)), latency) 189 | 190 | require(nLRQ == 3) 191 | 192 
| // VIMU: predicate(0), operands(0, 1) 193 | // VFMU0: predicate(0), operands(0, 1, 2) 194 | // VFVU: predicate(1), operands(2) 195 | // VFMU1: predicate(2), operands(3, 4, 5) 196 | // VQU: predicate(2), operands(3, 4) 197 | // VFCU: predicate(2), operands(3, 4) 198 | // VGU: predicate(3), operands(5) 199 | 200 | val vqu_pred = predicate(2) 201 | val vqu_operands = operands("vqu", ctrl.io.uop.vqu, 2, 3) 202 | io.lpqs(0).valid := ctrl.io.uop.vqu.valid 203 | io.lpqs(0).bits.pred := vqu_pred.pred 204 | io.lrqs(0).valid := ctrl.io.uop.vqu.valid && ctrl.io.uop.vqu.bits.fn.latch(0) && vqu_pred.active() 205 | io.lrqs(0).bits.data := vqu_operands(0) 206 | io.lrqs(1).valid := ctrl.io.uop.vqu.valid && ctrl.io.uop.vqu.bits.fn.latch(1) && vqu_pred.active() 207 | io.lrqs(1).bits.data := vqu_operands(1) 208 | 209 | assert(!io.lpqs(0).valid || io.lpqs(0).ready, "check lpqs(0) counter logic") 210 | assert(!io.lrqs(0).valid || io.lrqs(0).ready, "check lrqs(0) counter logic") 211 | assert(!io.lrqs(1).valid || io.lrqs(1).ready, "check lrqs(1) counter logic") 212 | 213 | val vgu_pred = predicate(3) 214 | val vgu_operands = operands("vgu", ctrl.io.uop.vgu, 1, 5) 215 | io.lpqs(1).valid := ctrl.io.uop.vgu.valid 216 | io.lpqs(1).bits.pred := vgu_pred.pred 217 | io.lrqs(2).valid := ctrl.io.uop.vgu.valid && vgu_pred.active() 218 | io.lrqs(2).bits.data := vgu_operands(0) 219 | 220 | assert(!io.lpqs(1).valid || io.lpqs(1).ready, "check lpqs(1) counter logic") 221 | assert(!io.lrqs(2).valid || io.lrqs(2).ready, "check lrqs(2) counter logic") 222 | 223 | val vimu_pred = predicate(0) 224 | val vimu_operands = operands("vimu", ctrl.io.uop.vimu, 2, 0) 225 | val vimus = (0 until nSlices) map { i => 226 | val vimu = Module(new IMulSlice) 227 | vimu.suggestName("vimuInst") 228 | vimu.io.req.valid := ctrl.io.uop.vimu.valid && ctrl.io.uop.vimu.bits.pred(i) && vimu_pred.pred(i) 229 | vimu.io.req.bits.fn := ctrl.io.uop.vimu.bits.fn 230 | vimu.io.req.bits.in0 := unpack_slice(vimu_operands(0), i) 231 | vimu.io.req.bits.in1 := unpack_slice(vimu_operands(1), i) 232 | vimu.io.resp 233 | } 234 | 235 | val vfmus = (0 until nVFMU) map { v => 236 | val vfmu_val = ctrl.io.uop.vfmu(v).valid 237 | val vfmu_pred = ctrl.io.uop.vfmu(v).bits.pred & predicate(2*v).pred 238 | val vfmu_operands = operands("vfmu"+v, ctrl.io.uop.vfmu(v), 3, 3*v) 239 | val vfmu_fn = ctrl.io.uop.vfmu(v).bits.fn 240 | ((0 until nSlices) map { i => 241 | val vfmu = Module(new FMASlice) 242 | vfmu.suggestName("vfmuInst") 243 | vfmu.io.req.valid := vfmu_val 244 | vfmu.io.req.bits.fn := vfmu_fn 245 | vfmu.io.req.bits.in0 := unpack_slice(vfmu_operands(0), i) 246 | vfmu.io.req.bits.in1 := unpack_slice(vfmu_operands(1), i) 247 | vfmu.io.req.bits.in2 := unpack_slice(vfmu_operands(2), i) 248 | vfmu.io.req.bits.rate := ctrl.io.uop.vfmu(v).bits.rate 249 | vfmu.io.req.bits.pred := unpack_pred(vfmu_pred, i, vfmu.io.req.bits.rate) 250 | vfmu.io.resp.bits 251 | }, { 252 | val stages = Seq(stagesDFMA, stagesSFMA, stagesHFMA) 253 | val pipe = (0 until stages.max).scanRight(Bits(0, wPred)){ 254 | case (_, in) => RegNext(next=in, init=Bits(0, wPred)) } 255 | for ((fp, i) <- Seq(FPD, FPS, FPH).zip(stages)) { 256 | when (vfmu_val && vfmu_fn.fp_is(fp)) { pipe(i-1) := vfmu_pred } 257 | } 258 | pipe.head 259 | }) 260 | } 261 | 262 | val vfcu_pred = predicate(2) 263 | val vfcu_operands = operands("vfcu", ctrl.io.uop.vfcu, 2, 3) 264 | val vfcus = (0 until nSlices) map { i => 265 | val vfcu = Module(new FCmpSlice) 266 | vfcu.suggestName("vfcuInst") 267 | vfcu.io.req.valid := 
ctrl.io.uop.vfcu.valid && ctrl.io.uop.vfcu.bits.pred(i) && vfcu_pred.pred(i) 268 | vfcu.io.req.bits.fn := ctrl.io.uop.vfcu.bits.fn 269 | vfcu.io.req.bits.in0 := unpack_slice(vfcu_operands(0), i) 270 | vfcu.io.req.bits.in1 := unpack_slice(vfcu_operands(1), i) 271 | vfcu.io.resp 272 | } 273 | 274 | val vfvu_pred = ctrl.io.uop.vfvu.bits.pred & predicate(1).pred 275 | val vfvu_operands = operands("vfvu", ctrl.io.uop.vfvu, 1, 2) 276 | val vfvus = ((0 until nSlices) map { i => 277 | val vfvu = Module(new FConvSlice) 278 | vfvu.suggestName("vfvuInst") 279 | vfvu.io.req.valid := ctrl.io.uop.vfvu.valid 280 | vfvu.io.req.bits.fn := ctrl.io.uop.vfvu.bits.fn 281 | vfvu.io.req.bits.in := unpack_slice(vfvu_operands(0), i) 282 | vfvu.io.req.bits.rate := ctrl.io.uop.vfvu.bits.rate 283 | vfvu.io.req.bits.pred := unpack_pred(vfvu_pred, i, vfvu.io.req.bits.rate) 284 | vfvu.io.resp.bits 285 | }, valids(ctrl.io.uop.vfvu.valid, vfvu_pred, stagesFConv)) 286 | 287 | require(nVFMU == 2) 288 | 289 | val vimu_vals = Vec(vimus.map(_.valid)).asUInt 290 | val vfmu_vals = vfmus.map(_._2) 291 | val vfcu_vals = Vec(vfcus.map(_.valid)).asUInt 292 | val vfvu_vals = vfvus._2 293 | 294 | val wdata = List( 295 | MuxCase(Bits(0), Array( 296 | vimu_vals.orR -> repack_slice(vimus.map(_.bits.out)), 297 | vfmu_vals(0).orR -> repack_slice(vfmus(0)._1.map(_.out)), 298 | vfvu_vals.asUInt.orR -> repack_slice(vfvus._1.map(_.out)))), 299 | MuxCase(Bits(0), Array( 300 | vfmu_vals(1).orR -> repack_slice(vfmus(1)._1.map(_.out)), 301 | vfcu_vals.orR -> repack_slice(vfcus.map(_.bits.out))))) 302 | 303 | val wdata_pred = List( 304 | vimu_vals | vfmu_vals(0) | vfvu_vals.asUInt, 305 | vfmu_vals(1) | vfcu_vals) 306 | 307 | banksrw.map { b => 308 | b.wpred.pred := Vec(vfcus.map(_.bits.cmp)).asUInt 309 | b.wpred.mask := vfcu_vals 310 | (b.wdata.zipWithIndex) map { case (bwdata, i) => 311 | bwdata.data := wdata(i) 312 | bwdata.pred := wdata_pred(i) 313 | } 314 | } 315 | 316 | (io.ack.vfmu zip vfmu_vals) foreach { case (ack, vfmu) => 317 | ack.valid := vfmu.orR 318 | ack.bits.pred := vfmu 319 | } 320 | 321 | io.ack.vqu.valid := ctrl.io.uop.vqu.valid 322 | io.ack.vgu.valid := ctrl.io.uop.vgu.valid 323 | io.ack.vimu.valid := vimu_vals.orR 324 | io.ack.vfcu.valid := vfcu_vals.orR 325 | io.ack.vfvu.valid := vfvu_vals.asUInt.orR 326 | 327 | io.ack.vqu.bits.pred := ctrl.io.uop.vqu.bits.pred 328 | io.ack.vgu.bits.pred := ctrl.io.uop.vgu.bits.pred 329 | io.ack.vimu.bits.pred := vimu_vals 330 | io.ack.vfcu.bits.pred := vfcu_vals 331 | io.ack.vfvu.bits.pred := vfvu_vals 332 | } 333 | -------------------------------------------------------------------------------- /src/main/scala/mou.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class MOCheck(implicit p: Parameters) extends HwachaBundle()(p) { 7 | val load = Bool() 8 | val store = Bool() 9 | } 10 | 11 | class MemOrderingUnit(implicit p: Parameters) extends HwachaModule()(p) with SeqLogic { 12 | val io = new Bundle { 13 | val cfg = new HwachaConfigIO().flip 14 | val mseq = new MasterSequencerState().asInput 15 | val pending = new Bundle { 16 | val su = new MRTPending().asInput 17 | val vus = Vec(nLanes, new MRTPending).asInput 18 | } 19 | val check = new Bundle { 20 | val su = new MOCheck().asOutput 21 | val vus = Vec(nLanes, Vec(nSeq, new MOCheck)).asOutput 22 | } 23 | } 24 | 25 | // Treat addr as limiter on head advancement for comparisons 26 | val latchedHead = Reg(init = 
0.U(log2Up(nSeq).W)) 27 | val headMatchAddr = io.pending.vus.map(_.addr).map{ case a => 28 | a.valid && a.bits === latchedHead }.foldLeft(Bool(false))(_ || _) 29 | when(!headMatchAddr) { latchedHead := io.mseq.head } 30 | // if the vmu is sending out acquires for a vlu/vsu between 31 | // the head and our mseq entry we need to wait 32 | def pending_addr(i: Int) = { 33 | val myIdx = i.U(log2Up(nSeq).W) 34 | // comparisons between head and aidx should be inclusive 35 | io.pending.vus.map(_.addr).map{ case a => a.valid && 36 | (latchedHead < myIdx && latchedHead <= a.bits && a.bits < myIdx) || 37 | (latchedHead > myIdx && // check for wrap-around 38 | ((a.bits >= latchedHead && a.bits > myIdx) || 39 | (a.bits < myIdx && a.bits <= latchedHead))) 40 | }.foldLeft(false.B)(_ || _) 41 | } 42 | 43 | def vus_pending_store(exclude: Bool) = 44 | io.pending.vus.map(_.store).filter(_ ne exclude).foldLeft(Bool(false))(_ || _) 45 | def vus_pending_all(exclude: Bool) = 46 | io.pending.vus.map(_.all).filter(_ ne exclude).foldLeft(Bool(false))(_ || _) 47 | 48 | // scalar loads can go through when memory ordering is relaxed or 49 | // when no pending vector stores 50 | // scalar stores can go through when memory ordering is relaxed or 51 | // when no pending vector loads & stores 52 | 53 | io.check.su.load := (io.cfg.morelax || !vus_pending_store(null)) 54 | io.check.su.store := (io.cfg.morelax || !vus_pending_all(null)) 55 | 56 | // vector loads can go through when memory ordering is relaxed or 57 | // when no pending scalar stores and when either of these conditions are met 58 | // 1) it's the first vector memory op 59 | // 2) no pending vector stores from other lanes than the one examined 60 | // vector stores can go through when memory ordering is relaxed or 61 | // when no pending scalar loads & stores and when either of these conditions are met 62 | // 1) it's the first vector memory op 63 | // 2) no pending vector loads & stores from other lanes than the one examined 64 | // First is defined as being the first vcu op after any previous memory ops have 65 | // finished sending out the acquires (vu_pending_addr) 66 | 67 | val vu_pending_addr = (0 until nSeq).map{ case i => pending_addr(i) } 68 | val first = find_first(io.mseq.valid, io.mseq.head, (i: Int) => io.mseq.e(i).active.vcu) 69 | .zip(vu_pending_addr).map{ case (f, a) => f && !a } 70 | 71 | (0 until nLanes) map { l => (0 until nSeq) map { s => 72 | io.check.vus(l)(s).load := 73 | io.cfg.morelax || !io.pending.su.store && 74 | (first(s) || !vus_pending_store(io.pending.vus(l).store)) 75 | io.check.vus(l)(s).store := 76 | io.cfg.morelax || !io.pending.su.all && 77 | (first(s) || !vus_pending_all(io.pending.vus(l).all)) 78 | } } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/mrt.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class MRTAddrIO(implicit p: Parameters) extends HwachaBundle()(p) with SeqParameters { 7 | val valid = Bool() 8 | val bits = UInt(width = log2Up(nSeq)) 9 | } 10 | 11 | class LaneMRTIO(implicit p: Parameters) extends HwachaBundle()(p) with SeqParameters { 12 | val lreq = new CounterLookAheadIO 13 | val sreq = new CounterLookAheadIO 14 | val lret = new CounterUpdateIO(bLookAhead) 15 | val areq = new MRTAddrIO 16 | } 17 | 18 | class MRTIO(implicit p: Parameters) extends LaneMRTIO()(p) with VMUParameters { 19 | val sret = new 
CounterUpdateIO(bSRet) 20 | val aret = Bool() 21 | val pending = new MRTPending().asInput 22 | } 23 | 24 | class MRTPending(implicit p: Parameters) extends HwachaBundle()(p) { 25 | val load = Bool() 26 | val store = Bool() 27 | val addr = new MRTAddrIO 28 | val all = Bool() 29 | } 30 | 31 | class MemTracker(nlreq: Int, nsreq: Int)(implicit p: Parameters) extends HwachaModule()(p) with SeqParameters with VMUParameters { 32 | val io = new MRTIO().flip 33 | 34 | val lcnt = Module(new LookAheadCounter(nlreq, nlreq)) 35 | lcnt.suggestName("lcntInst") 36 | val scnt = Module(new LookAheadCounter(nsreq, nsreq)) 37 | scnt.suggestName("scntInst") 38 | val addr = Module(new Queue(UInt(width = log2Up(nSeq)), nVMUQ + nVMUIQ)) 39 | 40 | lcnt.io.dec <> io.lreq 41 | lcnt.io.inc <> io.lret 42 | 43 | scnt.io.dec <> io.sreq 44 | scnt.io.inc <> io.sret 45 | 46 | io.pending.load := !lcnt.io.full 47 | io.pending.store := !scnt.io.full 48 | io.pending.all := io.pending.load || io.pending.store 49 | 50 | addr.io.enq := io.areq 51 | assert(!io.areq.valid || addr.io.enq.ready, "MRT addr queue full!") 52 | addr.io.deq.ready := io.aret 53 | io.pending.addr := addr.io.deq 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/package.scala: -------------------------------------------------------------------------------- 1 | package object hwacha extends HwachaConstants 2 | with freechips.rocketchip.rocket.constants.MemoryOpConstants 3 | -------------------------------------------------------------------------------- /src/main/scala/scalar-fpu-interface.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import freechips.rocketchip.tile.FPConstants._ 6 | import freechips.rocketchip.tile.FType 7 | import HardFloatHelper._ 8 | 9 | object ScalarFPUDecode { 10 | val FX: List[BitPat]= 11 | List(X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X) 12 | val FCVT_S_W = List(N,Y,N,N,N,X,X,Y,Y,Y,N,N,N,N,N,Y) 13 | val FCVT_S_WU= List(N,Y,N,N,N,X,X,Y,Y,Y,N,N,N,N,N,Y) 14 | val FCVT_S_L = List(N,Y,N,N,N,X,X,Y,Y,Y,N,N,N,N,N,Y) 15 | val FCVT_S_LU= List(N,Y,N,N,N,X,X,Y,Y,Y,N,N,N,N,N,Y) 16 | val FCLASS_S = List(N,N,Y,N,N,N,X,Y,Y,N,Y,N,N,N,N,N) 17 | val FCVT_W_S = List(N,N,Y,N,N,N,X,Y,Y,N,Y,N,N,N,N,Y) 18 | val FCVT_WU_S= List(N,N,Y,N,N,N,X,Y,Y,N,Y,N,N,N,N,Y) 19 | val FCVT_L_S = List(N,N,Y,N,N,N,X,Y,Y,N,Y,N,N,N,N,Y) 20 | val FCVT_LU_S= List(N,N,Y,N,N,N,X,Y,Y,N,Y,N,N,N,N,Y) 21 | val FEQ_S = List(N,N,Y,Y,N,N,N,Y,Y,N,Y,N,N,N,N,Y) 22 | val FLT_S = List(N,N,Y,Y,N,N,N,Y,Y,N,Y,N,N,N,N,Y) 23 | val FLE_S = List(N,N,Y,Y,N,N,N,Y,Y,N,Y,N,N,N,N,Y) 24 | val FSGNJ_S = List(N,Y,Y,Y,N,N,N,Y,Y,N,N,Y,N,N,N,N) 25 | val FSGNJN_S = List(N,Y,Y,Y,N,N,N,Y,Y,N,N,Y,N,N,N,N) 26 | val FSGNJX_S = List(N,Y,Y,Y,N,N,N,Y,Y,N,N,Y,N,N,N,N) 27 | val FMIN_S = List(N,Y,Y,Y,N,N,N,Y,Y,N,N,Y,N,N,N,Y) 28 | val FMAX_S = List(N,Y,Y,Y,N,N,N,Y,Y,N,N,Y,N,N,N,Y) 29 | val FADD_S = List(N,Y,Y,Y,N,N,Y,Y,Y,N,N,N,Y,N,N,Y) 30 | val FSUB_S = List(N,Y,Y,Y,N,N,Y,Y,Y,N,N,N,Y,N,N,Y) 31 | val FMUL_S = List(N,Y,Y,Y,N,N,N,Y,Y,N,N,N,Y,N,N,Y) 32 | val FMADD_S = List(N,Y,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y) 33 | val FMSUB_S = List(N,Y,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y) 34 | val FNMADD_S = List(N,Y,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y) 35 | val FNMSUB_S = List(N,Y,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y) 36 | val FDIV_S = List(N,Y,Y,Y,N,N,N,Y,Y,N,N,N,N,Y,N,Y) 37 | val FSQRT_S = List(N,Y,Y,N,N,Y,X,Y,Y,N,N,N,N,N,Y,Y) 38 | 39 | val FCVT_D_W = List(N,Y,N,N,N,X,X,N,N,Y,N,N,N,N,N,Y) 40 | 
val FCVT_D_WU= List(N,Y,N,N,N,X,X,N,N,Y,N,N,N,N,N,Y) 41 | val FCVT_D_L = List(N,Y,N,N,N,X,X,N,N,Y,N,N,N,N,N,Y) 42 | val FCVT_D_LU= List(N,Y,N,N,N,X,X,N,N,Y,N,N,N,N,N,Y) 43 | val FCLASS_D = List(N,N,Y,N,N,N,X,N,N,N,Y,N,N,N,N,N) 44 | val FCVT_W_D = List(N,N,Y,N,N,N,X,N,N,N,Y,N,N,N,N,Y) 45 | val FCVT_WU_D= List(N,N,Y,N,N,N,X,N,N,N,Y,N,N,N,N,Y) 46 | val FCVT_L_D = List(N,N,Y,N,N,N,X,N,N,N,Y,N,N,N,N,Y) 47 | val FCVT_LU_D= List(N,N,Y,N,N,N,X,N,N,N,Y,N,N,N,N,Y) 48 | val FCVT_S_D = List(N,Y,Y,N,N,N,X,N,Y,N,N,Y,N,N,N,Y) 49 | val FCVT_D_S = List(N,Y,Y,N,N,N,X,Y,N,N,N,Y,N,N,N,Y) 50 | val FEQ_D = List(N,N,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y) 51 | val FLT_D = List(N,N,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y) 52 | val FLE_D = List(N,N,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y) 53 | val FSGNJ_D = List(N,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,N,N) 54 | val FSGNJN_D = List(N,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,N,N) 55 | val FSGNJX_D = List(N,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,N,N) 56 | val FMIN_D = List(N,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,N,Y) 57 | val FMAX_D = List(N,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,N,Y) 58 | val FADD_D = List(N,Y,Y,Y,N,N,Y,N,N,N,N,N,Y,N,N,Y) 59 | val FSUB_D = List(N,Y,Y,Y,N,N,Y,N,N,N,N,N,Y,N,N,Y) 60 | val FMUL_D = List(N,Y,Y,Y,N,N,N,N,N,N,N,N,Y,N,N,Y) 61 | val FMADD_D = List(N,Y,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,Y) 62 | val FMSUB_D = List(N,Y,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,Y) 63 | val FNMADD_D = List(N,Y,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,Y) 64 | val FNMSUB_D = List(N,Y,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,Y) 65 | val FDIV_D = List(N,Y,Y,Y,N,N,N,N,N,N,N,N,N,Y,N,Y) 66 | val FSQRT_D = List(N,Y,Y,N,N,Y,X,N,N,N,N,N,N,N,Y,Y) 67 | 68 | val FCVT_S_S = List(N,N,Y,N,N,N,X,Y,Y,N,N,Y,N,N,N,Y) // special op for half conversions 69 | 70 | } 71 | 72 | class HwachaFPInput(implicit p: Parameters) extends freechips.rocketchip.tile.FPInput { 73 | val bSRegs = log2Up(p(HwachaNScalarRegs)) 74 | val in_fmt = UInt(width = 2) 75 | val tag = UInt(width = bSRegs) 76 | } 77 | 78 | class HwachaFPResult(implicit p: Parameters) extends freechips.rocketchip.tile.FPResult { 79 | val bSRegs = log2Up(p(HwachaNScalarRegs)) 80 | val tag = UInt(width = bSRegs) 81 | } 82 | 83 | class ScalarFPUInterface(implicit p: Parameters) extends HwachaModule()(p) with Packing with freechips.rocketchip.tile.HasFPUParameters { 84 | val io = new Bundle { 85 | val hwacha = new Bundle { 86 | val req = Decoupled(new HwachaFPInput).flip 87 | val resp = Decoupled(new HwachaFPResult) 88 | } 89 | val rocc = new Bundle { 90 | val req = Decoupled(new freechips.rocketchip.tile.FPInput) 91 | val resp = Decoupled(new freechips.rocketchip.tile.FPResult).flip 92 | } 93 | } 94 | 95 | val pending_fpu = Reg(init=Bool(false)) 96 | val pending_fpu_req = Reg(new HwachaFPInput) 97 | val pending_fpu_typ = Reg(Bits(width=2)) 98 | 99 | val reqq = Module(new Queue(new HwachaFPInput, 2)) 100 | reqq.suggestName("reqqInst") 101 | val respq = Module(new Queue(new freechips.rocketchip.tile.FPResult, 2)) 102 | respq.suggestName("respqInst") 103 | 104 | reqq.io.enq <> io.hwacha.req 105 | 106 | private val hreq = reqq.io.deq.bits 107 | 108 | private val hreq_ctrl = Wire(new freechips.rocketchip.tile.FPUCtrlSigs) 109 | hreq_ctrl <> hreq 110 | //We handle half conversions locally 111 | val enq_rocc = !(hreq_ctrl.getElements.reverse.zip(ScalarFPUDecode.FCVT_S_S).map{case(l,r) => r === l.asInstanceOf[UInt]}.reduce(_ && _)) 112 | val mask_rocc_req_ready = !enq_rocc || io.rocc.req.ready 113 | val mask_respq_enq_ready = enq_rocc || respq.io.enq.ready 114 | 115 | def fire(exclude: Bool, include: Bool*) = { 116 | val rvs = Seq(!pending_fpu, 117 | reqq.io.deq.valid, mask_rocc_req_ready, 
mask_respq_enq_ready) 118 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 119 | } 120 | 121 | reqq.io.deq.ready := fire(reqq.io.deq.valid) 122 | io.rocc.req.valid := fire(mask_rocc_req_ready, enq_rocc) 123 | 124 | when (fire(null)) { 125 | pending_fpu := Bool(true) 126 | pending_fpu_req := hreq 127 | pending_fpu_typ := Mux(hreq.fromint, hreq.in_fmt, hreq.typ) 128 | } 129 | val ins = List(hreq.in1, hreq.in2, hreq.in3) 130 | 131 | val h2s = 132 | ins map { case in => 133 | val h2s = Module(new hardfloat.RecFNToRecFN(5, 11, 8, 24)) 134 | h2s.suggestName("h2sInst") 135 | h2s.io.in := recode_hp(in) 136 | h2s.io.roundingMode := hreq.rm 137 | // XXX: use h2s.io.exceptionFlags 138 | h2s.io.out 139 | } 140 | 141 | io.rocc.req.bits <> hreq 142 | 143 | def unboxRecode(in: UInt, minT: Option[FType]) = { 144 | unbox(recode(in, hreq.in_fmt), hreq.typeTagIn =/= S, minT) 145 | } 146 | val fuIn = ins.map(in => unboxRecode(in, None)) 147 | val sfmaIn = ins.map(in => unboxRecode(in, Some(FType.S))) 148 | val dfmaIn = ins.map(in => unboxRecode(in, Some(FType.D))) 149 | val hIn = h2s.map(in => unboxRecode(in, Some(FType.S))) 150 | 151 | val rec_s_in1 = recode(hreq.in1, hreq.in_fmt) 152 | io.rocc.req.bits.in1 := 153 | Mux(hreq.fromint, hreq.in1,//unboxing is unnecessary here 154 | Mux(hreq.in_fmt === 2.U, hIn(0), 155 | Mux(hreq.fma, Mux(hreq.in_fmt === 0.U, sfmaIn(0), dfmaIn(0)), fuIn(0)))) 156 | io.rocc.req.bits.in2 := 157 | Mux(hreq.in_fmt === 2.U, hIn(1), 158 | Mux(hreq.fma, Mux(hreq.in_fmt === 0.U, sfmaIn(1), dfmaIn(1)), fuIn(1))) 159 | io.rocc.req.bits.in3 := 160 | Mux(hreq.in_fmt === 2.U, hIn(2), 161 | Mux(hreq.fma, Mux(hreq.in_fmt === 0.U, sfmaIn(2), dfmaIn(2)), fuIn(2))) 162 | 163 | respq.io.enq.valid := io.rocc.resp.valid || fire(mask_respq_enq_ready, !enq_rocc) 164 | respq.io.enq.bits := io.rocc.resp.bits 165 | when (fire(null, !enq_rocc)) { 166 | respq.io.enq.bits.data := Mux(hreq.in_fmt === UInt(0), rec_s_in1, box(h2s(0), typeTag(FType.S).U)) 167 | } 168 | 169 | respq.io.deq.ready := io.hwacha.resp.ready 170 | io.hwacha.resp.valid := respq.io.deq.valid 171 | io.rocc.resp.ready := respq.io.enq.ready 172 | 173 | when (respq.io.deq.fire) { 174 | pending_fpu := Bool(false) 175 | } 176 | 177 | private val rresp = respq.io.deq.bits 178 | private val hresp = io.hwacha.resp.bits 179 | 180 | val s2h = Module(new hardfloat.RecFNToRecFN(8, 24, 5, 11)) 181 | s2h.suggestName("s2hInst") 182 | s2h.io.in := rresp.data 183 | s2h.io.roundingMode := pending_fpu_req.rm 184 | // XXX: use s2h.io.exceptionFlags 185 | 186 | val unrec_h = ieee_hp(s2h.io.out) 187 | val unrec_s = ieee(rresp.data)(31,0) 188 | val unrec_d = ieee(rresp.data) 189 | val fsgnj_s = unrec_s.asSInt.pad(64).asUInt 190 | val unrec_fpu_resp = 191 | Mux(pending_fpu_typ === UInt(0), unrec_s, 192 | Mux(pending_fpu_typ === UInt(1), unrec_d, 193 | expand_float_h(unrec_h))) 194 | 195 | hresp.tag := pending_fpu_req.tag 196 | hresp.data := 197 | Mux(pending_fpu_req.toint, rresp.data(63, 0), 198 | Mux(pending_fpu_req.fastpipe && !pending_fpu_req.wflags && (pending_fpu_req.typeTagIn === S), fsgnj_s, 199 | unrec_fpu_resp)) 200 | } 201 | -------------------------------------------------------------------------------- /src/main/scala/scalar-fpu.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config._ 6 | import freechips.rocketchip.tile.FPConstants._ 7 | import freechips.rocketchip.tile.{FPResult, FPUCtrlSigs, 
HasFPUParameters} 8 | import freechips.rocketchip.util._ 9 | 10 | class ScalarFPU(implicit p: Parameters) extends HwachaModule()(p) with HasFPUParameters { 11 | val io = IO(new Bundle { 12 | val req = Flipped(Decoupled(new freechips.rocketchip.tile.FPInput())) 13 | val resp = Decoupled(new FPResult()) 14 | }) 15 | //buffer for simple back-pressure model 16 | val resp_reg = Reg(UInt()) 17 | val resp_reg_val = RegInit(false.B) 18 | 19 | io.req.ready := !resp_reg_val 20 | 21 | val ex_ctrl = Wire(new FPUCtrlSigs) 22 | ex_ctrl <> io.req.bits 23 | val wb_ctrl = RegEnable(ex_ctrl, io.req.valid) 24 | val wb_reg_valid = RegNext(io.req.valid, init=false.B) 25 | 26 | val req = new freechips.rocketchip.tile.FPInput 27 | req := io.req.bits 28 | req.in2 := Mux(io.req.bits.swap23, io.req.bits.in3, io.req.bits.in2) 29 | req.in3 := Mux(io.req.bits.swap23, io.req.bits.in2, io.req.bits.in3) 30 | 31 | val sfma = Module(new freechips.rocketchip.tile.FPUFMAPipe(p(HwachaStagesSFMA), freechips.rocketchip.tile.FType.S)) 32 | sfma.suggestName("sfmaInst") 33 | sfma.io.in.valid := io.req.valid && io.req.bits.fma && 34 | (io.req.bits.typeTagIn === S) 35 | sfma.io.in.bits := req 36 | 37 | val dfma = Module(new freechips.rocketchip.tile.FPUFMAPipe(p(HwachaStagesDFMA), freechips.rocketchip.tile.FType.D)) 38 | dfma.suggestName("dfmaInst") 39 | dfma.io.in.valid := io.req.valid && io.req.bits.fma && 40 | (io.req.bits.typeTagOut === D) 41 | dfma.io.in.bits := req 42 | 43 | val fpiu = Module(new freechips.rocketchip.tile.FPToInt) 44 | fpiu.suggestName("fpiuInst") 45 | fpiu.io.in.valid := io.req.valid && (io.req.bits.toint || io.req.bits.div || io.req.bits.sqrt || (io.req.bits.fastpipe && io.req.bits.wflags)) 46 | fpiu.io.in.bits := req 47 | 48 | val ifpu = Module(new freechips.rocketchip.tile.IntToFP(3)) 49 | ifpu.suggestName("ifpuInst") 50 | ifpu.io.in.valid := io.req.valid && io.req.bits.fromint 51 | ifpu.io.in.bits := req 52 | 53 | //ifpu.io.in.bits.in1 := io.dpath.fromint_data 54 | 55 | val fpmu = Module(new freechips.rocketchip.tile.FPToFP(2)) 56 | fpmu.suggestName("fpmuInst") 57 | fpmu.io.in.valid := io.req.valid && io.req.bits.fastpipe 58 | fpmu.io.in.bits := req 59 | fpmu.io.lt := fpiu.io.out.bits.lt 60 | 61 | // No writeback arbitration since ScalarUnit can't put backpressure on us 62 | // writeback arbitration 63 | case class Pipe(p: Module, lat: Int, cond: (FPUCtrlSigs) => Bool, res: FPResult) 64 | val pipes = List( 65 | Pipe(fpmu, fpmu.latency, (c: FPUCtrlSigs) => c.fastpipe, fpmu.io.out.bits), 66 | Pipe(ifpu, ifpu.latency, (c: FPUCtrlSigs) => c.fromint, ifpu.io.out.bits), 67 | Pipe(sfma, sfma.latency, (c: FPUCtrlSigs) => c.fma && (c.typeTagOut === S), sfma.io.out.bits), 68 | Pipe(dfma, dfma.latency, (c: FPUCtrlSigs) => c.fma && (c.typeTagOut === D), dfma.io.out.bits) 69 | ) 70 | def latencyMask(c: FPUCtrlSigs, offset: Int) = { 71 | require(pipes.forall(_.lat >= offset)) 72 | pipes.map(p => Mux(p.cond(c), (1.U << (1 << p.lat-offset)), 0.U)).reduce(_|_) 73 | } 74 | def pipeid(c: FPUCtrlSigs) = pipes.zipWithIndex.map(p => Mux(p._1.cond(c), p._2.U, 0.U)).reduce(_|_) 75 | val maxLatency = pipes.map(_.lat).max 76 | val wbLatencyMask = latencyMask(wb_ctrl, 2) 77 | 78 | class WBInfo extends Bundle { 79 | val single = Bool() 80 | val pipeid = UInt(log2Ceil(pipes.size).W) 81 | } 82 | 83 | val wen = RegInit(0.U((maxLatency-1).W)) 84 | val wbInfo = Reg(Vec(maxLatency-1, new WBInfo)) 85 | val wb_wen = wb_reg_valid && (wb_ctrl.fma || wb_ctrl.fastpipe || wb_ctrl.fromint) 86 | val write_port_busy = RegEnable(wb_wen && 
(wbLatencyMask & latencyMask(ex_ctrl, 1)).orR || (wen & latencyMask(ex_ctrl, 0)).orR, io.req.valid) 87 | val wb_winfo = pipeid(wb_ctrl) 88 | 89 | for (i <- 0 until maxLatency-2) { 90 | when (wen(i+1)) { wbInfo(i) := wbInfo(i+1) } 91 | } 92 | wen := wen >> 1.U 93 | when (wb_wen) { 94 | wen := wen >> 1.U | wbLatencyMask 95 | for (i <- 0 until maxLatency-1) { 96 | when (!write_port_busy && wbLatencyMask(i)) { 97 | wbInfo(i).single := wb_ctrl.typeTagOut === S 98 | wbInfo(i).pipeid := pipeid(wb_ctrl) 99 | } 100 | } 101 | } 102 | 103 | val wsrc = wbInfo(0).pipeid 104 | val wdata = (pipes.map(_.res.data): Seq[UInt])(wsrc) 105 | val wexc = (pipes.map(_.res.exc): Seq[UInt])(wsrc) 106 | val resp_data = Mux(!fpiu.io.out.valid, wdata, fpiu.io.out.bits.toint) 107 | io.resp.bits.data := resp_data 108 | when (wen(0) || fpiu.io.out.valid ) { 109 | when(!io.resp.ready){ 110 | resp_reg := resp_data 111 | resp_reg_val := true.B 112 | } 113 | } 114 | when(io.resp.ready && resp_reg_val){ 115 | io.resp.bits.data := resp_reg 116 | resp_reg_val := false.B 117 | } 118 | io.resp.valid := wen(0) || fpiu.io.out.valid || resp_reg_val 119 | } 120 | -------------------------------------------------------------------------------- /src/main/scala/smu.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import freechips.rocketchip.tilelink._ 6 | import freechips.rocketchip.diplomacy._ 7 | import freechips.rocketchip.rocket.{TLBPTWIO, TLBConfig} 8 | 9 | case object HwachaNSMUEntries extends Field[Int] 10 | 11 | abstract class SMUBundle(implicit p: Parameters) 12 | extends HwachaBundle()(p) with SMUParameters 13 | 14 | trait SMUParameters extends MemParameters { 15 | val nSMU = p(HwachaNSMUEntries) 16 | val bSMUTag = log2Up(nSRegs) 17 | } 18 | 19 | class SMUFn extends Bundle { 20 | val cmd = Bits(width = SZ_SMU_CMD) 21 | val mt = Bits(width = MT_SZ) 22 | } 23 | 24 | trait SMUTag extends SMUBundle { 25 | val tag = UInt(width = bSMUTag) 26 | } 27 | trait SMUData extends SMUTag { 28 | val data = Bits(width = regLen) 29 | } 30 | 31 | class SMUReq(implicit p: Parameters) extends SMUBundle()(p) 32 | with SMUData { 33 | val fn = new SMUFn 34 | val addr = UInt(width = bVAddrExtended) 35 | val status = new freechips.rocketchip.rocket.MStatus 36 | } 37 | 38 | class SMUResp(implicit p: Parameters) extends SMUBundle()(p) 39 | with SMUData { 40 | val store = Bool() 41 | } 42 | 43 | class SMUIO(implicit p: Parameters) extends HwachaBundle()(p) { 44 | val req = Decoupled(new SMUReq) 45 | val resp = Decoupled(new SMUResp).flip 46 | val confirm = Bool(INPUT) 47 | } 48 | 49 | 50 | class SMUEntry(implicit p: Parameters) extends SMUBundle()(p) 51 | with SMUTag { 52 | val mt = Bits(width = MT_SZ) 53 | val offset = UInt(width = tlByteAddrBits) 54 | } 55 | 56 | class SMU(implicit p: Parameters) extends LazyModule { 57 | lazy val module = new SMUModule(this) 58 | val masterNode = TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLMasterParameters.v1(name = "HwachaSMU", sourceId = IdRange(0, p(HwachaNSMUEntries))))))) 59 | } 60 | 61 | class SMUModule(outer: SMU)(implicit p: Parameters) extends LazyModuleImp(outer) 62 | with SMUParameters { 63 | 64 | val io = IO(new Bundle { 65 | val scalar = new SMUIO().flip 66 | 67 | val ptw = new TLBPTWIO 68 | val irq = new IRQIO 69 | }) 70 | val (dmem, edge) = outer.masterNode.out.head 71 | 72 | val table = Module(new Table(nSMU, new SMUEntry)) 73 | table.suggestName("tableInst") 74 | private 
val tw = table.io.w 75 | private val tr = table.io.r 76 | 77 | //--------------------------------------------------------------------\\ 78 | // request 79 | //--------------------------------------------------------------------\\ 80 | private val acquire = dmem.a 81 | 82 | val req = Reg(io.scalar.req.bits) 83 | val req_mt = DecodedMemType(req.fn.mt) 84 | val req_store = (req.fn.cmd === SM_S) 85 | 86 | val tbox = Module(new TBox(1)) 87 | tbox.suggestName("tboxInst") 88 | private val tlb = tbox.io.inner(0) 89 | tlb.req.valid := Bool(false) 90 | tlb.req.bits.vaddr := req.addr 91 | tlb.req.bits.passthrough := Bool(false) 92 | tlb.req.bits.size := req_mt.shift() 93 | tlb.req.bits.cmd := Mux(req_store, M_XWR, M_XRD) 94 | tlb.status := req.status 95 | io.irq <> tbox.io.irq 96 | 97 | val ptlb = Module(new freechips.rocketchip.rocket.TLB(instruction = false, lgMaxSize = log2Ceil(regBytes), TLBConfig(nSets=nptlb, nWays=1, nSectors=1))(edge, p)) 98 | ptlb.io.req <> tbox.io.outer.req 99 | tbox.io.outer.resp <> ptlb.io.resp 100 | io.ptw <> ptlb.io.ptw 101 | ptlb.io.ptw.status := tbox.io.outer.status 102 | ptlb.io.sfence.valid := false.B 103 | 104 | val addr_offset = req.addr(tlByteAddrBits-1, 0) 105 | 106 | private def mts(mt: DecodedMemType) = Seq(mt.b, mt.h, mt.w, mt.d) 107 | private def mask(mt: DecodedMemType) = 108 | Mux1H(mts(mt).zipWithIndex.map { case (s, i) => 109 | // TODO FIXME COLIN: this is a workaround for zero width wires in chisel3 110 | if(i == 0) (s, UInt(0, width=1)) else (s, Fill((1 << i) - 1, UInt(1, width=1))) 111 | }) 112 | 113 | val req_mask_base = Cat(mask(req_mt), Bool(true)) 114 | val req_mask = req_mask_base << addr_offset 115 | val req_data = req.data << Cat(addr_offset, UInt(0,3)) 116 | 117 | tw.valid := Bool(false) 118 | tw.bits.tag := req.tag 119 | tw.bits.mt := req.fn.mt 120 | tw.bits.offset := addr_offset 121 | 122 | val SMUID = 1.U(2.W) 123 | acquire.bits := Mux(req_store, 124 | edge.Put(tw.tag, req.addr, req_mt.shift(), req_data, req_mask)._2, 125 | edge.Get(tw.tag, req.addr, req_mt.shift())._2) 126 | 127 | private def fire(exclude: Bool, include: Bool*) = { 128 | val rvs = Seq(acquire.ready, tw.ready) 129 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 130 | } 131 | 132 | val s_idle :: s_tlb :: s_mem :: Nil = Enum(UInt(), 3) 133 | val state = Reg(init = s_idle) 134 | 135 | io.scalar.confirm := Bool(false) 136 | io.scalar.req.ready := Bool(false) 137 | acquire.valid := Bool(false) 138 | 139 | switch (state) { 140 | is (s_idle) { 141 | io.scalar.req.ready := Bool(true) 142 | when (io.scalar.req.valid) { 143 | state := s_tlb 144 | req := io.scalar.req.bits 145 | } 146 | } 147 | 148 | is (s_tlb) { 149 | tlb.req.valid := Bool(true) 150 | when (tlb.req.ready) { 151 | state := Mux(tlb.resp.xcpt, s_idle, s_mem) 152 | io.scalar.confirm := !tlb.resp.xcpt 153 | req.addr := tlb.paddr() 154 | } 155 | } 156 | 157 | is (s_mem) { 158 | acquire.valid := fire(acquire.ready) 159 | tw.valid := fire(tw.ready) 160 | when (fire(null)) { 161 | state := s_idle 162 | } 163 | } 164 | } 165 | 166 | //--------------------------------------------------------------------\\ 167 | // response 168 | //--------------------------------------------------------------------\\ 169 | private val grant = dmem.d 170 | 171 | io.scalar.resp.valid := grant.valid 172 | grant.ready := io.scalar.resp.ready 173 | tr.valid := io.scalar.resp.fire 174 | tr.bits := grant.bits.source(log2Up(nSMU)-1,0) 175 | 176 | val resp_mt = DecodedMemType(tr.record.mt) 177 | val resp_shift = Cat(tr.record.offset, UInt(0,3)) 178 
| val resp_data = grant.bits.data >> resp_shift 179 | 180 | val resp_mask = FillInterleaved(8, mask(resp_mt)) 181 | val resp_sign = Mux1H(mts(resp_mt).zipWithIndex.map { case (s, i) => 182 | val w = 1 << (i + 3) 183 | (s, resp_data(w - 1)) 184 | }) 185 | val resp_extend = Fill(regLen-8, resp_mt.signed && resp_sign) 186 | 187 | io.scalar.resp.bits.store := grant.bits.opcode === TLMessages.AccessAck 188 | io.scalar.resp.bits.tag := tr.record.tag 189 | io.scalar.resp.bits.data := Cat( 190 | (resp_data(regLen-1, 8) & resp_mask) | 191 | (resp_extend & (~resp_mask)), 192 | resp_data(7, 0)) 193 | 194 | //Tie off unused channels 195 | dmem.b.ready := Bool(true) 196 | dmem.c.valid := Bool(false) 197 | dmem.e.valid := Bool(false) 198 | } 199 | -------------------------------------------------------------------------------- /src/main/scala/types-vmu.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | abstract class VMUModule(clock: Clock = null, _reset: Bool = null)(implicit p: Parameters) 7 | extends HwachaModule(clock, _reset)(p) with VMUParameters 8 | abstract class VMUBundle(implicit p: Parameters) 9 | extends HwachaBundle()(p) with VMUParameters 10 | 11 | class VMUMemFn extends Bundle { 12 | val cmd = Bits(width = M_SZ) 13 | val mt = Bits(width = MT_SZ) 14 | } 15 | 16 | class VMUFn extends Bundle { 17 | val mode = Bits(width = SZ_VMU_MODE) 18 | val cmd = Bits(width = M_SZ) 19 | val mt = Bits(width = MT_SZ) 20 | } 21 | 22 | class VMUOpBase(implicit p: Parameters) extends VMUBundle()(p) { 23 | val fn = new VMUFn 24 | val base = UInt(width = bVAddrExtended) 25 | val stride = UInt(width = regLen) 26 | val status = new freechips.rocketchip.rocket.MStatus 27 | } 28 | 29 | class VMUOp(implicit p: Parameters) extends VMUOpBase()(p) with SingleLaneVLen 30 | class VMUOpML(implicit p: Parameters) extends VMUOpBase()(p) with MultiLaneVLen 31 | 32 | class DecodedMemCommand extends Bundle { 33 | val bits = Bits(width = M_SZ) 34 | 35 | val load = Bool() 36 | val store = Bool() 37 | val amo = Bool() 38 | val pf = Bool() 39 | 40 | val read = Bool() 41 | val write = Bool() 42 | } 43 | 44 | object DecodedMemCommand { 45 | def apply[T <: UInt](cmd: T): DecodedMemCommand = { 46 | cmd.suggestName("cmdWire") 47 | val dec = Wire(new DecodedMemCommand) 48 | dec.suggestName("decWire") 49 | dec.bits := cmd 50 | dec.load := (cmd === M_XRD).suggestName("loadWire") 51 | dec.store := (cmd === M_XWR).suggestName("storeWire") 52 | dec.amo := isAMO(cmd) 53 | dec.pf := isPrefetch(cmd) 54 | dec.read := (dec.load || dec.amo).suggestName("readWire") 55 | dec.write := (dec.store || dec.amo).suggestName("writeWire") 56 | dec 57 | } 58 | } 59 | 60 | class DecodedMemType extends Bundle { 61 | val b = Bool() // byte 62 | val h = Bool() // halfword 63 | val w = Bool() // word 64 | val d = Bool() // doubleword 65 | val signed = Bool() 66 | 67 | def shift(dummy: Int = 0): UInt = 68 | Cat(this.w || this.d, this.h || this.d).asUInt 69 | } 70 | 71 | object DecodedMemType { 72 | def apply[T <: UInt](mt: T): DecodedMemType = { 73 | mt.suggestName("mtWire") 74 | val b = (mt === MT_B).suggestName("bWire") 75 | val h = (mt === MT_H).suggestName("hWire") 76 | val w = (mt === MT_W).suggestName("wWire") 77 | val d = (mt === MT_D).suggestName("dWire") 78 | val bu = (mt === MT_BU).suggestName("buWire") 79 | val hu = (mt === MT_HU).suggestName("huWire") 80 | val wu = (mt === MT_WU).suggestName("wuWire") 81 | 82 | val dec = 
Wire(new DecodedMemType) 83 | dec.suggestName("decWire") 84 | dec.b := (b || bu).suggestName("decbWire") 85 | dec.h := (h || hu).suggestName("dechWire") 86 | dec.w := (w || wu).suggestName("decwWire") 87 | dec.d := d 88 | dec.signed := ((b || h).suggestName("bhWire") || (w || d).suggestName("wdWire")).suggestName("signedWire") 89 | dec 90 | } 91 | } 92 | 93 | /**********************************************************************/ 94 | 95 | class VVAQEntry(implicit p: Parameters) extends VMUBundle()(p) { 96 | val addr = UInt(width = bVAddrExtended) 97 | } 98 | class VVAQIO(implicit p: Parameters) extends DecoupledIO(new VVAQEntry()(p)) { 99 | } 100 | 101 | trait VMUAddr extends VMUBundle { 102 | val addr = UInt(width = bPAddr) 103 | } 104 | class VPAQEntry(implicit p: Parameters) extends VMUAddr 105 | class VPAQIO(implicit p: Parameters) extends DecoupledIO(new VPAQEntry()(p)) { 106 | } 107 | 108 | 109 | trait VMUData extends VMUBundle { 110 | val data = Bits(width = tlDataBits) 111 | } 112 | 113 | class VSDQEntry(implicit p: Parameters) extends VMUData 114 | class VSDQIO(implicit p: Parameters) extends DecoupledIO(new VSDQEntry()(p)) { 115 | } 116 | 117 | class VCUEntry(implicit p: Parameters) extends VMUBundle()(p) { 118 | val ecnt = UInt(width = bVLen) 119 | } 120 | class VCUIO(implicit p: Parameters) extends ValidIO(new VCUEntry()(p)) 121 | 122 | 123 | trait VMUTag extends VMUBundle { 124 | val tag = UInt(width = bVMUTag) 125 | } 126 | 127 | class VMULoadData(implicit p: Parameters) extends VMUData with VMUTag 128 | 129 | class VMTLoadEntry(implicit p: Parameters) extends VMUBundle()(p) 130 | with VMUMetaIndex with VMUMetaPadding with VMUMetaMask with VLUSelect 131 | 132 | class VMTStoreEntry(implicit p: Parameters) extends VMUBundle()(p) 133 | with VMUMetaCount 134 | 135 | class VMTEntry(implicit p: Parameters) extends VMUBundle()(p) { 136 | val union = Bits(math.max( 137 | new VMTLoadEntry().getWidth, 138 | new VMTStoreEntry().getWidth).W) 139 | 140 | def load(d: Int = 0) = new VMTLoadEntry().fromBits(this.union) 141 | def store(d: Int = 0) = new VMTStoreEntry().fromBits(this.union) 142 | } 143 | 144 | class VLDQEntry(implicit p: Parameters) extends VMUData { 145 | val meta = new VMTLoadEntry 146 | } 147 | class VLDQIO(implicit p: Parameters) extends DecoupledIO(new VLDQEntry()(p)) { 148 | } 149 | 150 | /**********************************************************************/ 151 | 152 | class PredEntry(implicit p: Parameters) extends HwachaBundle()(p) { 153 | val pred = Bits(width = nStrip) 154 | } 155 | 156 | class VMUMaskEntry_0(implicit p: Parameters) extends VMUBundle()(p) { 157 | val pred = Bool() 158 | val ecnt = UInt(width = bVLen) 159 | val last = Bool() 160 | 161 | val unit = new Bundle { 162 | val page = Bool() /* Entry is final for current page */ 163 | } 164 | val nonunit = new Bundle { 165 | val shift = UInt(width = bStrip) 166 | } 167 | } 168 | class VMUMaskIO_0(implicit p: Parameters) extends DecoupledIO(new VMUMaskEntry_0()(p)) { 169 | } 170 | 171 | class VMUMaskEntry_1(implicit p: Parameters) extends VMUBundle()(p) { 172 | val data = Bits(width = tlDataBytes >> 1) 173 | val vsdq = Bool() 174 | } 175 | class VMUMaskIO_1(implicit p: Parameters) extends DecoupledIO(new VMUMaskEntry_1()(p)) { 176 | val meta = new VMUBundle { 177 | val eoff = UInt(INPUT, tlByteAddrBits - 1) 178 | val last = Bool(INPUT) 179 | } 180 | } 181 | 182 | /**********************************************************************/ 183 | 184 | /* Encodes 2^n as 0; values 1 to (2^n-1) are represented as 
normal. */ 185 | class CInt(n: Int) extends Bundle { 186 | val raw = UInt(width = n) 187 | def encode[T <: UInt](x: T) { 188 | //COLIN FIXME: this was not being emitted in chisel2 and always fires in chisel3 189 | //assert(x != UInt(0), "CInt: invalid value") 190 | raw := x 191 | } 192 | def decode(dummy: Int = 0): UInt = Cat(raw === UInt(0), raw) 193 | } 194 | 195 | trait VMUMetaCount extends VMUBundle { 196 | val ecnt = new CInt(tlByteAddrBits-1) 197 | } 198 | trait VMUMetaPadding extends VMUBundle { 199 | val epad = UInt(width = tlByteAddrBits) 200 | } 201 | trait VMUMetaIndex extends VMUBundle { 202 | val eidx = UInt(width = bVLen) 203 | } 204 | trait VMUMetaMask extends VMUBundle { 205 | val mask = Bits(width = nStrip) 206 | } 207 | trait VMUMetaStore extends VMUBundle { 208 | val last = Bool() 209 | val vsdq = Bool() 210 | } 211 | 212 | /**********************************************************************/ 213 | 214 | trait VMUMemOp extends VMUAddr { 215 | val fn = new VMUMemFn 216 | } 217 | 218 | class VMUMetaAddr(implicit p: Parameters) extends VMUMetaCount 219 | with VMUMetaPadding with VMUMetaMask with VMUMetaStore with VLUSelect 220 | 221 | class VMUAddrEntry(implicit p: Parameters) extends VMUMemOp { 222 | val meta = new VMUMetaAddr with VMUMetaIndex 223 | } 224 | class VMUAddrIO(implicit p: Parameters) extends DecoupledIO(new VMUAddrEntry()(p)) { 225 | } 226 | -------------------------------------------------------------------------------- /src/main/scala/util-confprec.scala: -------------------------------------------------------------------------------- 1 | 2 | package hwacha 3 | 4 | import Chisel._ 5 | import org.chipsalliance.cde.config._ 6 | 7 | trait PrecLogic { 8 | def confprec_decode(prec: UInt): Seq[(Bool, Int)] = 9 | Seq(PREC_D, PREC_W, PREC_H).map(_ === prec).zipWithIndex 10 | def confprec_stride(prec: Seq[(Bool, Int)], cfg: HwachaConfigIO): UInt = 11 | Mux1H(prec.map(_._1), Seq(cfg.vstride.d, cfg.vstride.w, cfg.vstride.h)) 12 | def confprec_step(prec: UInt, idx: UInt, cfg: HwachaConfigIO): (Bool, UInt) = { 13 | val selp = confprec_decode(prec) 14 | val stride = confprec_stride(selp, cfg) 15 | val update = Mux1H(selp.map { case (p, i) => 16 | p -> (if (i > 0) idx(i-1, 0) === UInt(0) else Bool(true)) }) 17 | (update, stride) 18 | } 19 | } 20 | 21 | trait RateLogic extends LaneParameters { 22 | def rate_decode(rate: UInt): Seq[(Bool, Int)] = 23 | (0 to bPack).map(r => rate === UInt(r)).zipWithIndex 24 | 25 | def unpack_pred(n: UInt, i: Int, rate: UInt): Bits = { 26 | require(i <= nSlices) 27 | if (confprec) { 28 | val shift = UInt(i) << rate 29 | val mask = Mux1H(rate_decode(rate).map { case (r, k) => 30 | r -> Fill(1 << k, UInt(1,1)) }) 31 | ((n >> shift) & mask)(nPack-1, 0) 32 | } else n(i) 33 | } 34 | def repack_pred(n: Bits, rate: UInt): Bits = 35 | if (confprec) 36 | Mux1H(rate_decode(rate).map { case (r, i) => 37 | val w = (1 << i) - 1 38 | r -> Vec((0 until wPred by nPack).map(k => n(w+k, k))).asUInt }) 39 | else n 40 | 41 | def splat_scalar(uop: SRegMicroOp) = 42 | if (confprec) 43 | Mux1H(rate_decode(uop.rate).map { case (r, i) => 44 | r -> Fill(nSlices << i, uop.operand((regLen >> i)-1, 0)) }) 45 | else Fill(nSlices, uop.operand) 46 | } 47 | 48 | trait PackLogic extends PrecLogic with RateLogic with Packing { 49 | private def _prologue(pack: PackInfo, rate: UInt) = { 50 | val selp = confprec_decode(pack.prec) 51 | val shift = Mux1H(selp.map { case (p, i) => 52 | p -> (pack.idx << bPack-i)(bPack-1, 0) }) 53 | (selp, rate_decode(rate), shift) 54 | } 55 | 56 
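// unpack_bank (descriptive note): with confprec enabled, the packed sub-word elements selected by pack.idx are shifted out and each is widened to the element width implied by the current rate; without confprec, or when the packed precision already matches the rate, the bank data is passed through unchanged.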
| def unpack_bank(pack: PackInfo, rate: UInt, in: BankData) = { 57 | val out = Wire(new BankDataEntry) 58 | if (confprec) { 59 | val (selp, selr, shift) = _prologue(pack, rate) 60 | val data = in.data >> Cat(shift, UInt(0, bSlices + 4)) 61 | val fn = Seq( 62 | (unpack_d _, expand_d _), 63 | (unpack_w _, expand_w _), 64 | (unpack_h _, expand_h _)) 65 | 66 | val pass = selp.map { case (p, n) => p && selr(n)._1 } 67 | val opts = for { 68 | ((p, n), (unpack, expand)) <- selp.zip(fn) 69 | (r, i) <- selr.take(n) 70 | } yield (p && r) -> { 71 | val msb = (regLen >> i) - 1 72 | Vec((0 until (nSlices << i)).map(k => 73 | expand(unpack(data, k))(msb, 0))).asUInt } 74 | 75 | out.data := Mux1H((pass.reduce(_ || _), in.data) +: opts) 76 | } else { 77 | out.data := in.data 78 | } 79 | out 80 | } 81 | def unpack_bank(op: MicroOp with BankPack, in: Bits): BankDataEntry = 82 | unpack_bank(op.pack, op.rate, new BankDataEntry().fromBits(in)) 83 | 84 | def repack_bank(pack: PackInfo, rate: UInt, in: BankData with BankPred) = { 85 | val out = Wire(new BankDataMaskEntry) 86 | if (confprec) { 87 | val (selp, selr, shift) = _prologue(pack, rate) 88 | val shift_data = Cat(shift, UInt(0, bSlices + 4)) 89 | val shift_mask = Cat(shift, UInt(0, bSlices)) 90 | 91 | val pass = selp.map { case (p, n) => p && selr(n)._1 } 92 | val opts = for ((p, n) <- selp; (r, i) <- selr.take(n)) 93 | yield (p && r) -> { 94 | val (width, period) = (regLen >> n, regLen >> i) 95 | Vec((0 until (nSlices << i)).map(k => 96 | _unpack(in.data, k, wBank, period, width))).asUInt } 97 | val data = Mux1H((pass.reduce(_ || _), in.data) +: opts) 98 | 99 | val _mask = Mux1H(selp.map { case (p, i) => 100 | p -> FillInterleaved(1 << (bPack-i), in.pred) }) 101 | 102 | val mask = (_mask << shift_mask)(wPred-1, 0) 103 | out.data := (data << shift_data)(wBank-1, 0) 104 | out.mask := FillInterleaved(SZ_H/SZ_B, mask) 105 | } else { 106 | out.data := in.data 107 | out.mask := FillInterleaved(regLen/SZ_B, in.pred) 108 | } 109 | out 110 | } 111 | def repack_bank(op: MicroOp with BankPack, in: BankData with BankPred) 112 | : BankDataMaskEntry = { 113 | val tmp = Wire(new BankDataPredEntry) 114 | tmp.data := in.data 115 | tmp.pred := op.pred & in.pred 116 | repack_bank(op.pack, op.rate, tmp) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/util.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import scala.math._ 6 | 7 | abstract trait Packing extends LaneParameters { 8 | def splat_d(n: Bits) = Fill(SZ_D/SZ_D, n.asUInt) 9 | def splat_w(n: Bits) = Fill(SZ_D/SZ_W, n.asUInt) 10 | def splat_h(n: Bits) = Fill(SZ_D/SZ_H, n.asUInt) 11 | def splat_b(n: Bits) = Fill(SZ_D/SZ_B, n.asUInt) 12 | 13 | def _expand(n: Bits, s: Bits, width: Int) = { 14 | Cat(Fill(SZ_D - width, s.asUInt), n) 15 | } 16 | 17 | def expand_d(n: Bits) = n 18 | def expand_w(n: Bits) = _expand(n, n(SZ_W-1), SZ_W) 19 | def expand_h(n: Bits) = _expand(n, n(SZ_H-1), SZ_H) 20 | def expand_b(n: Bits) = _expand(n, n(SZ_B-1), SZ_B) 21 | def expand_float_d(n: Bits) = expand_d(n) 22 | def expand_float_s(n: Bits) = expand_w(n) 23 | def expand_float_h(n: Bits) = expand_h(n) 24 | 25 | def _repack(n: Seq[Bits], len: Int) = { 26 | require(n.length == len) 27 | Cat(n.reverse) 28 | } 29 | 30 | def repack_d(n: Seq[Bits]) = _repack(n, SZ_D/SZ_D) 31 | def repack_w(n: Seq[Bits]) = _repack(n, SZ_D/SZ_W) 32 | def repack_h(n: 
Seq[Bits]) = _repack(n, SZ_D/SZ_H) 33 | def repack_b(n: Seq[Bits]) = _repack(n, SZ_D/SZ_B) 34 | 35 | def _unpack(n: Bits, idx: Int, extent: Int, period: Int, width: Int): UInt = { 36 | require((idx+1)*period <= extent) 37 | val base = idx*period 38 | n(width+base-1, base) 39 | } 40 | def _unpack(n: Bits, idx: Int, extent: Int, period: Int): UInt = 41 | _unpack(n, idx, extent, period, period) 42 | 43 | def unpack_d(n: Bits, idx: Int) = _unpack(n, idx, SZ_D, SZ_D) 44 | def unpack_w(n: Bits, idx: Int) = _unpack(n, idx, SZ_D, SZ_W) 45 | def unpack_h(n: Bits, idx: Int) = _unpack(n, idx, SZ_D, SZ_H) 46 | def unpack_b(n: Bits, idx: Int) = _unpack(n, idx, SZ_D, SZ_B) 47 | 48 | def splat_slice(n: Bits) = Fill(nSlices, n.asUInt) 49 | def repack_slice(n: Seq[Bits]) = _repack(n, nSlices) 50 | def unpack_slice(n: Bits, idx: Int) = 51 | _unpack(n, idx, wBank, p(HwachaRegLen)) 52 | } 53 | 54 | abstract trait BankLogic extends LaneParameters { 55 | def strip_to_bcnt(strip: UInt) = { 56 | val stripp1 = strip + UInt(1) 57 | if (nSlices > 1) stripp1 >> UInt(bSlices) else strip 58 | } 59 | 60 | def strip_to_bmask(strip: UInt) = { 61 | EnableDecoder(strip_to_bcnt(strip), nBanks).asUInt 62 | } 63 | } 64 | 65 | abstract trait MinMax { 66 | def min(x: UInt, y: UInt) = Mux(x > y, y, x) 67 | def max(x: UInt, y: UInt) = Mux(x > y, x, y) 68 | } 69 | 70 | abstract trait SeqLogic extends SeqParameters { 71 | def find_first(v: Vec[Bool], head: UInt, fn: Int=>Bool) = { 72 | val internal = Wire(Vec(2*nSeq, Bool())) 73 | for (i <- 0 until nSeq) { 74 | internal(i+nSeq) := v(i) && fn(i) 75 | internal(i) := internal(i+nSeq) && (UInt(i) >= head) 76 | } 77 | val priority_oh = PriorityEncoderOH(internal) 78 | val out = Wire(Vec(nSeq, Bool())) 79 | for (i <- 0 until nSeq) { 80 | out(i) := priority_oh(i) | priority_oh(i+nSeq) 81 | } 82 | out 83 | } 84 | 85 | def mreadfn[T <: Data](sched: Vec[Bool], me: Vec[MasterSeqEntry], rfn: MasterSeqEntry=>T) = 86 | rfn(me(0)).cloneType.fromBits(Mux1H(sched, me.map(rfn(_).asUInt))) 87 | 88 | def readfn[T <: Data](sched: Vec[Bool], e: Vec[SeqEntry], rfn: SeqEntry=>T) = 89 | rfn(e(0)).cloneType.fromBits(Mux1H(sched, e.map(rfn(_).asUInt))) 90 | 91 | def step(ptr: UInt, n: Int): UInt = { 92 | require(n < nSeq) 93 | if (isPow2(nSeq)) 94 | ptr + UInt(n) 95 | else if (n == 1) 96 | Mux(ptr === UInt(nSeq-1), UInt(0), ptr + UInt(1)) 97 | else 98 | ptr + Mux(ptr < UInt(nSeq-n), UInt(n), -UInt(nSeq-n, log2Up(nSeq))) 99 | } 100 | } 101 | 102 | object DataGating { 103 | def dgate(valid: Bool, b: UInt) = Fill(b.getWidth, valid) & b 104 | } 105 | 106 | object HardFloatHelper { 107 | def recode_dp(n: Bits) = hardfloat.recFNFromFN(11, 53, n.asUInt) 108 | def recode_sp(n: Bits) = hardfloat.recFNFromFN(8, 24, n.asUInt) 109 | def recode_hp(n: Bits) = hardfloat.recFNFromFN(5, 11, n.asUInt) 110 | def ieee_dp(n: Bits) = hardfloat.fNFromRecFN(11, 53, n.asUInt) 111 | def ieee_sp(n: Bits) = hardfloat.fNFromRecFN(8, 24, n.asUInt) 112 | def ieee_hp(n: Bits) = hardfloat.fNFromRecFN(5, 11, n.asUInt) 113 | } 114 | 115 | class MaskStall[T <: Data](data: => T) extends Module { 116 | val io = new Bundle { 117 | val input = Decoupled(data).flip 118 | val output = Decoupled(data) 119 | val stall = Bool(INPUT) 120 | } 121 | 122 | io.output.valid := io.input.valid && !io.stall 123 | io.output.bits := io.input.bits 124 | io.input.ready := io.output.ready && !io.stall 125 | } 126 | 127 | object MaskStall { 128 | def apply[T <: Data](deq: DecoupledIO[T], stall: Bool) = { 129 | val ms = Module(new MaskStall(deq.bits.cloneType)) 130 | 
ms.suggestName("msInst") 131 | ms.io.input <> deq 132 | ms.io.stall := stall 133 | ms.io.output 134 | } 135 | } 136 | 137 | class QCounter(reset_cnt: Int, max_cnt: Int) extends Module() { 138 | val sz = log2Down(max_cnt)+1 139 | val io = new Bundle { 140 | val inc = Bool(INPUT) 141 | val dec = Bool(INPUT) 142 | val qcnt = UInt(INPUT, sz) 143 | val watermark = Bool(OUTPUT) 144 | val full = Bool(OUTPUT) 145 | val empty = Bool(OUTPUT) 146 | } 147 | 148 | val count = Reg(init = UInt(reset_cnt, sz)) 149 | 150 | when (io.inc ^ io.dec) { 151 | when (io.inc) { count := count + UInt(1) } 152 | when (io.dec) { count := count - UInt(1) } 153 | } 154 | 155 | io.watermark := count >= io.qcnt 156 | io.full := count === UInt(max_cnt) 157 | io.empty := count === UInt(0) 158 | } 159 | 160 | trait LookAheadIO extends HwachaBundle { 161 | val reserve = Bool(OUTPUT) 162 | val available = Bool(INPUT) 163 | } 164 | 165 | class CounterLookAheadIO(implicit p: Parameters) extends LookAheadIO with SeqParameters { 166 | val cnt = UInt(OUTPUT, bLookAhead) 167 | } 168 | 169 | class CounterUpdateIO(sz: Int) extends Bundle { 170 | val cnt = UInt(OUTPUT, sz) 171 | val update = Bool(OUTPUT) 172 | 173 | } 174 | 175 | class LookAheadCounter(reset_cnt: Int, max_cnt: Int, resetSignal: Bool = null)(implicit p: Parameters) extends HwachaModule(_reset = resetSignal)(p) with LaneParameters { 176 | require(reset_cnt <= max_cnt) 177 | val sz = log2Down(max_cnt)+1 178 | val io = new Bundle { 179 | val inc = new CounterUpdateIO(sz).flip 180 | val dec = new CounterLookAheadIO().flip 181 | val full = Bool(OUTPUT) 182 | } 183 | 184 | val count = Reg(init = UInt(reset_cnt, sz)) 185 | io.dec.available := (count >= io.dec.cnt) 186 | 187 | val add = (io.inc.cnt & Fill(sz, io.inc.update)) 188 | val sub = (io.dec.cnt & Fill(sz, io.dec.reserve)) 189 | count := count + add - sub 190 | 191 | io.full := (count === UInt(max_cnt)) 192 | } 193 | -------------------------------------------------------------------------------- /src/main/scala/vector-unit.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import freechips.rocketchip.diplomacy._ 6 | import freechips.rocketchip.tilelink._ 7 | import freechips.rocketchip.rocket.{TLBPTWIO, TLBConfig} 8 | 9 | class VectorUnit(implicit p: Parameters) extends LazyModule { 10 | lazy val module = new VectorUnitModule(this) 11 | val masterNode = TLClientNode(Seq(TLMasterPortParameters.v1( 12 | Seq(TLMasterParameters.v1(name = "HwachaVMU", sourceId = IdRange(0, p(HwachaNVMTEntries))))))) 13 | } 14 | 15 | class VectorUnitModule(outer: VectorUnit)(implicit p: Parameters) extends LazyModuleImp(outer) with SeqParameters { 16 | val io = IO(new Bundle { 17 | val id = UInt(INPUT) 18 | val cfg = new HwachaConfigIO().flip 19 | val issue = new Bundle { 20 | val vxu = Decoupled(new IssueOp).flip 21 | val vmu = Decoupled(new VMUOp).flip 22 | } 23 | val mseq = new MasterSequencerIO().flip 24 | val mocheck = Vec(nSeq, new MOCheck).asInput 25 | val red = new ReduceResultIO 26 | val ptw = new TLBPTWIO 27 | val pending = new MRTPending().asOutput 28 | 29 | val complete_memop = Bool(OUTPUT) 30 | }) 31 | val (dmem, edge) = outer.masterNode.out.head 32 | 33 | val vxu = Module(new VXU) 34 | vxu.suggestName("vxuInst") 35 | val vmu = Module(new VMU) 36 | vmu.suggestName("vmuInst") 37 | val memif = Module(new VMUTileLink(edge)) 38 | memif.suggestName("memifInst") 39 | val mrt = Module(new MemTracker(nvlreq, 
nvsreq)) 40 | mrt.suggestName("mrtInst") 41 | val dtlb = Module(new freechips.rocketchip.rocket.TLB(instruction = false, lgMaxSize = log2Ceil(regBytes), TLBConfig(nSets=1, nWays=ndtlb))(edge, p)) 42 | 43 | vxu.io.id := io.id 44 | vxu.io.cfg <> io.cfg 45 | vxu.io.issue <> io.issue.vxu 46 | vxu.io.mseq <> io.mseq 47 | vxu.io.mocheck <> io.mocheck 48 | vmu.io.op <> io.issue.vmu 49 | 50 | vmu.io.id := io.id 51 | vmu.io.cfg <> io.cfg 52 | vmu.io.lane <> vxu.io.vmu 53 | memif.io.vmu <> vmu.io.memif 54 | 55 | io.complete_memop := vmu.io.memif.resp.ready && vmu.io.memif.resp.valid 56 | 57 | mrt.io.lreq <> vxu.io.mrt.lreq 58 | mrt.io.lret <> vxu.io.mrt.lret 59 | mrt.io.sreq <> vxu.io.mrt.sreq 60 | mrt.io.areq <> vxu.io.mrt.areq 61 | mrt.io.sret <> vmu.io.sret 62 | mrt.io.aret <> vmu.io.aret 63 | 64 | dtlb.io.req <> vmu.io.tlb.req 65 | vmu.io.tlb.resp <> dtlb.io.resp 66 | io.ptw <> dtlb.io.ptw 67 | dtlb.io.ptw.status := vmu.io.tlb.status 68 | dtlb.io.sfence.valid := false.B 69 | 70 | io.red <> vxu.io.red 71 | dmem <> memif.io.dmem 72 | io.pending <> mrt.io.pending 73 | 74 | vmu.io.xcpt.prop.vmu.stall := Bool(false) 75 | vmu.io.xcpt.prop.vmu.drain := Bool(false) 76 | vmu.io.xcpt.prop.top.stall := Bool(false) 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/vfu-alu.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class ALUOperand(implicit p: Parameters) extends VXUBundle()(p) 7 | with LanePred with Rate { 8 | val fn = new VIUFn 9 | val eidx = Bits(width = bMLVLen - bStrip) 10 | val in0 = Bits(width = SZ_D) 11 | val in1 = Bits(width = SZ_D) 12 | } 13 | 14 | class ALUResult(implicit p: Parameters) extends VXUBundle()(p) { 15 | val out = Bits(width = SZ_D) 16 | val cmp = Bits(width = nPack) 17 | } 18 | 19 | class ALUSlice(aid: Int)(implicit p: Parameters) extends VXUModule()(p) with Packing { 20 | val io = new Bundle { 21 | val cfg = new HwachaConfigIO().flip 22 | val req = Valid(new ALUOperand).flip 23 | val resp = Valid(new ALUResult) 24 | } 25 | 26 | val fn = io.req.bits.fn 27 | val eidx = io.req.bits.eidx 28 | val in0 = io.req.bits.in0 29 | val in1 = io.req.bits.in1 30 | val (in0_hi, in0_lo) = (in0(63, 32), in0(31, 0)) 31 | val (in1_hi, in1_lo) = (in1(63, 32), in1(31, 0)) 32 | 33 | val (fn_dw64, fn_dw32) = (fn.dw_is(DW64), fn.dw_is(DW32)) 34 | 35 | // TODO: Support x4 rate with halfwords 36 | val rate_x2 = if (confprec) (io.req.bits.rate === UInt(1)) else Bool(false) 37 | 38 | val sub = fn.op_is(I_SUB) 39 | class Adder(in0: UInt, in1: UInt, cin: UInt, w: Int = SZ_W) { 40 | private val bits = 41 | Cat(UInt(0, 1), in0, cin).asUInt + 42 | Cat(UInt(0, 1), in1 ^ Fill(w, sub), cin).asUInt 43 | val (cout, out) = (bits(w+1), bits(w, 1)) 44 | } 45 | val adder_out = if (confprec) { 46 | val lo = new Adder(in0_lo, in1_lo, sub) 47 | val hi = new Adder(in0_hi, in1_hi, Mux(rate_x2, sub, lo.cout)) 48 | Cat(hi.out, lo.out) 49 | } else new Adder(in0, in1, sub, SZ_D).out 50 | 51 | // SLL, SRL, SRA 52 | val sra = fn.op_is(I_SRA) 53 | val shright = sra || fn.op_is(I_SRL) 54 | val shamt_lo = Cat(in1(5) & fn_dw64, in1(4,0)).asUInt 55 | val shamt_hi = Cat(in1(37) & fn_dw64, in1(36,32)).asUInt 56 | // Swap shift amounts if left shift and x2 rate 57 | val shamt_hi_swap = Mux(rate_x2 && shright, shamt_hi, shamt_lo) 58 | val shamt_lo_swap = Mux(rate_x2 && !shright, shamt_hi, shamt_lo) 59 | val shfill_lo = sra & in0(31) 60 | val shin_hi = 
Mux(fn_dw32 && !rate_x2, Fill(32, shfill_lo), in0_hi) 61 | val shin_r = Cat(shin_hi, in0_lo) 62 | val shin = Mux(shright, shin_r, Reverse(shin_r)) 63 | val shfill_hi = sra & shin_r(63) 64 | val shout_r = (0 to 5).foldLeft(shin) { case (bits, i) => 65 | val n = 1 << i 66 | val pad_hi = Fill(n, shfill_hi) 67 | if (confprec) { 68 | val pad_lo = Mux(rate_x2, Fill(n, shfill_lo), bits(31+n, 32)) 69 | Cat(Mux(shamt_hi_swap(i), if (i < 5) Cat(pad_hi, bits(63, 32+n)) else pad_hi, bits(63, 32)), 70 | Mux(shamt_lo_swap(i), if (i < 5) Cat(pad_lo, bits(31, n)) else pad_lo, bits(31, 0))) 71 | } else Mux(shamt_lo(i), Cat(pad_hi, bits(63, n)), bits) 72 | } 73 | val shift_out = Mux(fn.op_is(I_SLL), Reverse(shout_r), shout_r) 74 | 75 | val slt = fn.op_is(I_SLT) 76 | val sltu = fn.op_is(I_SLTU) 77 | trait CmpResult { 78 | val ltu, lt, eq: Bool 79 | val set = (slt && lt) || (sltu && ltu) 80 | } 81 | class Comparator(in0: UInt, in1: UInt, w: Int = SZ_W) extends { 82 | private val (neg0, neg1) = (in0(w-1), in1(w-1)) 83 | val ltu = (in0 < in1) 84 | val lt = ((neg0 === neg1) && ltu) || (neg0 && !neg1) 85 | val eq = (in0 === in1) 86 | } with CmpResult 87 | val cmp_lo = new Comparator(in0_lo, in1_lo) 88 | val cmp_hi = new Comparator(in0_hi, in1_hi) 89 | val cmp_d = new { 90 | private val _ltu = cmp_hi.eq && cmp_lo.ltu 91 | val ltu = cmp_hi.ltu || _ltu 92 | val lt = cmp_hi.lt || _ltu 93 | val eq = cmp_hi.eq && cmp_lo.eq 94 | } with CmpResult 95 | private def cmp(fn: CmpResult => Bool) = 96 | Mux(rate_x2, Cat(fn(cmp_hi), fn(cmp_lo)), fn(cmp_d)) 97 | val set = cmp(_.set) 98 | val set_out = if (confprec) Cat(set(1), UInt(0, 31), set(0)) else cmp_d.set 99 | 100 | val in0_sp = unpack_w(in0, 0) 101 | val in1_sp = unpack_w(in1, 0) 102 | 103 | val in0_dp = unpack_d(in0, 0) 104 | val in1_dp = unpack_d(in1, 0) 105 | 106 | val sj_sp = 107 | fn.op_is(I_FSJ) & in1_sp(31) | 108 | fn.op_is(I_FSJN) & ~in1_sp(31) | 109 | fn.op_is(I_FSJX) & (in1_sp(31) ^ in0_sp(31)) 110 | 111 | val sj_dp = 112 | fn.op_is(I_FSJ) & in1_dp(63) | 113 | fn.op_is(I_FSJN) & ~in1_dp(63) | 114 | fn.op_is(I_FSJX) & (in1_dp(63) ^ in0_dp(63)) 115 | 116 | val s0_result64 = Mux1H(Seq( 117 | fn.op_is(I_IDX) -> Cat(eidx, UInt(aid, bStrip)), 118 | fn.op_is(I_MOV0) -> in0, 119 | fn.op_is(I_ADD,I_ADDU,I_SUB) -> adder_out, 120 | fn.op_is(I_SLL,I_SRL,I_SRA) -> shift_out, 121 | fn.op_is(I_SLT,I_SLTU) -> set_out, 122 | fn.op_is(I_AND) -> (in0 & in1), 123 | fn.op_is(I_OR) -> (in0 | in1), 124 | fn.op_is(I_XOR) -> (in0 ^ in1), 125 | (fn.op_is(I_FSJ,I_FSJN,I_FSJX) && fn.fp_is(FPS)) -> expand_float_s(Cat(sj_sp, in0_sp(30,0))), 126 | (fn.op_is(I_FSJ,I_FSJN,I_FSJX) && fn.fp_is(FPD)) -> expand_float_d(Cat(sj_dp, in0_dp(62,0))) 127 | )) 128 | 129 | val s0_result = MuxCase( 130 | Bits(0, SZ_D), Array( 131 | (fn_dw64 || rate_x2) -> s0_result64, 132 | fn_dw32 -> expand_w(s0_result64(31,0)) 133 | )) 134 | 135 | val s0_cmp = Mux1H(Seq( 136 | fn.op_is(I_CEQ) -> cmp(_.eq), 137 | fn.op_is(I_CLT) -> cmp(_.lt), 138 | fn.op_is(I_CLTU) -> cmp(_.ltu) 139 | )) 140 | 141 | val result = Wire(new ALUResult) 142 | result.out := s0_result 143 | result.cmp := s0_cmp 144 | 145 | io.resp := Pipe(io.req.valid && io.req.bits.active(), result, stagesALU) 146 | } 147 | -------------------------------------------------------------------------------- /src/main/scala/vfu-fcmp.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import DataGating._ 6 | import HardFloatHelper._ 7 | import 
freechips.rocketchip.tile.FType 8 | 9 | class FCmpOperand(implicit p: Parameters) extends VXUBundle()(p) { 10 | val fn = new VFCUFn 11 | val in0 = Bits(width = SZ_D) 12 | val in1 = Bits(width = SZ_D) 13 | } 14 | 15 | class FCmpResult(implicit p: Parameters) extends VXUBundle()(p) { 16 | val out = Bits(width = SZ_D) 17 | val cmp = Bits(width = nPack) 18 | } 19 | 20 | class FCmpSlice(implicit p: Parameters) extends VXUModule()(p) with Packing with freechips.rocketchip.tile.HasFPUParameters { 21 | val io = new Bundle { 22 | val req = Valid(new FCmpOperand).flip 23 | val resp = Valid(new FCmpResult) 24 | } 25 | 26 | val fn = io.req.bits.fn.dgate(io.req.valid) 27 | val in0 = dgate(io.req.valid, io.req.bits.in0) 28 | val in1 = dgate(io.req.valid, io.req.bits.in1) 29 | 30 | val wdp = (11, 53) 31 | val wsp = (8, 24) 32 | val whp = (5, 11) 33 | 34 | val val_cmp = fn.op_is(FC_CEQ,FC_CLT,FC_CLE,FC_MIN,FC_MAX) 35 | 36 | val ins = 37 | List((FPD, recode_dp _, unpack_d _), 38 | (FPS, recode_sp _, unpack_w _), 39 | (FPH, recode_hp _, unpack_h _)) map { 40 | case (fp, recode, unpack) => { 41 | val valid = fn.fp_is(fp) 42 | val input0 = recode(dgate(valid, unpack(in0, 0))) 43 | val input1 = recode(dgate(valid, unpack(in1, 0))) 44 | (input0, input1) 45 | } 46 | } 47 | 48 | val cmps = 49 | ins zip List(wdp, wsp, whp) map { 50 | case ((input0, input1), (exp, sig)) => { 51 | val comp = Module(new hardfloat.CompareRecFN(exp, sig)) 52 | comp.suggestName("compInst") 53 | comp.io.a := input0 54 | comp.io.b := input1 55 | comp.io.signaling := Bool(true) 56 | comp.io 57 | } 58 | } 59 | 60 | val classifys = 61 | ins zip List(wdp, wsp, whp) map { 62 | case ((input0, input1), (exp, sig)) => { 63 | val c0 = FType(exp, sig).classify(input0) 64 | val c1 = FType(exp, sig).classify(input1) 65 | (c0, c1) 66 | } 67 | } 68 | 69 | val results = 70 | List((unpack_d _, expand_float_d _, FType.D), 71 | (unpack_w _, expand_float_s _, FType.S), 72 | (unpack_h _, expand_float_h _, FType(whp._1, whp._2))) zip cmps zip ins zip classifys map { 73 | case ((((unpack, expand, fType), cmp), (input0, input1)), classify) => { 74 | val less = cmp.lt || (input0.asSInt < 0.S && input1.asSInt >= 0.S) 75 | val in0_nan = fType.isNaN(input0) 76 | val in1_nan = fType.isNaN(input1) 77 | val isInvalid = fType.isSNaN(input0) || fType.isSNaN(input1) 78 | val isNaNOut = (in0_nan && in1_nan) 79 | val want_min = in1_nan || (fn.op_is(FC_MIN) === less) && !in0_nan 80 | val in0_minmax = expand(unpack(in0, 0)) 81 | val in1_minmax = expand(unpack(in1, 0)) 82 | val qnan = fType.qNaN 83 | val ieeeNaN = if(fType == FType.S || fType == FType.D) ieee(qnan, fType) else qnan 84 | val minmax = 85 | Mux(isNaNOut, ieeeNaN, Mux(want_min, in0_minmax, in1_minmax)) 86 | val sel = List(FC_MIN,FC_MAX,FC_CLASS).map(fn.op_is(_)) 87 | val in = List( 88 | minmax, // FC_MIN 89 | minmax, // FC_MAX 90 | classify._1) // FC_CLASS 91 | Mux1H(sel, in) 92 | } 93 | } 94 | 95 | val cmp_results = 96 | List(expand_float_d _, expand_float_s _, expand_float_h _) zip cmps zip classifys map { 97 | case ((expand, cmp), classify) => { 98 | val less = cmp.lt 99 | val equal = cmp.eq 100 | val sel = List(FC_CEQ,FC_CLT,FC_CLE).map(fn.op_is(_)) 101 | val in = List( 102 | equal, // FC_CEQ 103 | less, // FC_CLT 104 | equal || less) // FC_CLE 105 | Mux1H(sel, in) 106 | } 107 | } 108 | 109 | 110 | val fpmatch = List(FPD, FPS, FPH).map { fn.fp_is(_) } 111 | val result = Wire(new FCmpResult) 112 | result.out := Mux1H(fpmatch, results) 113 | result.cmp := Mux1H(fpmatch, cmp_results) 114 | 115 | io.resp := 
Pipe(io.req.valid, result, stagesFCmp) 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/vfu-fconv.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import DataGating._ 6 | import HardFloatHelper._ 7 | import scala.collection.mutable.ArrayBuffer 8 | 9 | class FConvOperand(implicit p: Parameters) extends VXUBundle()(p) 10 | with LanePred with Rate { 11 | val fn = new VFVUFn 12 | val in = Bits(width = SZ_D) 13 | } 14 | 15 | class FConvResult extends Bundle { 16 | val out = Bits(OUTPUT, SZ_D) 17 | val exc = Bits(OUTPUT, freechips.rocketchip.tile.FPConstants.FLAGS_SZ) 18 | } 19 | 20 | class FConvSlice(implicit p: Parameters) extends VXUModule()(p) with Packing { 21 | val io = new Bundle { 22 | val req = Valid(new FConvOperand).flip 23 | val resp = Valid(new FConvResult) 24 | } 25 | 26 | val pred = Mux(io.req.valid, io.req.bits.pred, Bits(0)) 27 | val active = io.req.valid && io.req.bits.active() 28 | val fn = io.req.bits.fn.dgate(active) 29 | val in = io.req.bits.in 30 | 31 | val op_int2float = MuxCase( 32 | Bits(0), Array( 33 | fn.op_is(FV_CLTF) -> UInt("b11"), 34 | fn.op_is(FV_CLUTF) -> UInt("b10"), 35 | fn.op_is(FV_CWTF) -> UInt("b01"), 36 | fn.op_is(FV_CWUTF) -> UInt("b00") 37 | )) 38 | 39 | val op_float2int = MuxCase( 40 | Bits(0), Array( 41 | fn.op_is(FV_CFTL) -> UInt("b11"), 42 | fn.op_is(FV_CFTLU) -> UInt("b10"), 43 | fn.op_is(FV_CFTW) -> UInt("b01"), 44 | fn.op_is(FV_CFTWU) -> UInt("b00") 45 | )) 46 | 47 | val val_int2float = fn.op_is(FV_CLTF,FV_CLUTF,FV_CWTF,FV_CWUTF) 48 | val val_float2int32 = fn.op_is(FV_CFTW,FV_CFTWU) 49 | val val_float2int64 = fn.op_is(FV_CFTL,FV_CFTLU) 50 | val val_float2int = val_float2int32 || val_float2int64 51 | 52 | val wdp = (11, 53) 53 | val wsp = (8, 24) 54 | val whp = (5, 11) 55 | 56 | private def pipe[T <: Data](in: T) = Pipe(active, in, stagesFConv).bits 57 | private def pipe(valid: Bool, out: Bits, exc: Bits, fn: Bits=>Bits = identity) = 58 | (fn(Pipe(valid, out, stagesFConv).bits), Pipe(valid, exc, stagesFConv).bits) 59 | 60 | val results_int2float = 61 | List((FPD, ieee_dp _, expand_float_d _, wdp), 62 | (FPS, ieee_sp _, expand_float_s _, wsp), 63 | (FPH, ieee_hp _, expand_float_h _, whp)) map { 64 | case (fp, ieee, expand, (exp, sig)) => { 65 | val valid = fn.fp_is(fp) && val_int2float && pred(0) 66 | val input = dgate(valid, in) 67 | val rm = dgate(valid, fn.rm) 68 | val op = dgate(valid, op_int2float) 69 | val l2fp = Module(new hardfloat.INToRecFN(SZ_D, exp, sig)) 70 | l2fp.suggestName("l2fpInst") 71 | val w2fp = Module(new hardfloat.INToRecFN(SZ_W, exp, sig)) 72 | w2fp.suggestName("w2fpInst") 73 | l2fp.io.signedIn := op(0) 74 | l2fp.io.in := input 75 | l2fp.io.roundingMode := rm 76 | w2fp.io.signedIn := op(0) 77 | w2fp.io.in := input 78 | w2fp.io.roundingMode := rm 79 | val output = Mux(op(1), l2fp.io.out, w2fp.io.out) 80 | val exc = Mux(op(1), l2fp.io.exceptionFlags, w2fp.io.exceptionFlags) 81 | pipe(valid, ieee(output), exc, expand) 82 | } 83 | } 84 | 85 | val results_float2int = 86 | List((FPD, recode_dp _, unpack_d _, wdp), 87 | (FPS, recode_sp _, unpack_w _, wsp), 88 | (FPH, recode_hp _, unpack_h _, whp)) map { 89 | case (fp, recode, unpack, (exp, sig)) => { 90 | val valid = fn.fp_is(fp) && val_float2int && pred(0) 91 | val input = recode(dgate(valid, unpack(in, 0))) 92 | val rm = dgate(valid, fn.rm) 93 | val op = dgate(valid, op_float2int) 94 | val fp2l 
= Module(new hardfloat.RecFNToIN(exp, sig, SZ_D)) 95 | fp2l.suggestName("fp2lInst") 96 | val fp2w = Module(new hardfloat.RecFNToIN(exp, sig, SZ_W)) 97 | fp2w.suggestName("fp2wInst") 98 | fp2l.io.signedOut := op(0) 99 | fp2l.io.in := input 100 | fp2l.io.roundingMode := rm 101 | fp2w.io.signedOut := op(0) 102 | fp2w.io.in := input 103 | fp2w.io.roundingMode := rm 104 | val output = Mux(op(1), fp2l.io.out, expand_w(fp2w.io.out)) 105 | val iexc = Mux(op(1), fp2l.io.intExceptionFlags, fp2w.io.intExceptionFlags) 106 | pipe(valid, output, Cat(iexc(2, 1).orR, UInt(0, 3), iexc(0))) 107 | } 108 | } 109 | 110 | val results_float2float = 111 | List((FV_CSTD, recode_sp _, unpack_w _, ieee_dp _, expand_float_d _, wsp, wdp), 112 | (FV_CHTD, recode_hp _, unpack_h _, ieee_dp _, expand_float_d _, whp, wdp), 113 | (FV_CDTS, recode_dp _, unpack_d _, ieee_sp _, expand_float_s _, wdp, wsp), 114 | (FV_CHTS, recode_hp _, unpack_h _, ieee_sp _, expand_float_s _, whp, wsp), 115 | (FV_CDTH, recode_dp _, unpack_d _, ieee_hp _, expand_float_h _, wdp, whp), 116 | (FV_CSTH, recode_sp _, unpack_w _, ieee_hp _, expand_float_h _, wsp, whp)) map { 117 | case (op, recode, unpack, ieee, expand, (exps, sigs), (expd, sigd)) => { 118 | val (szs, szd) = (exps + sigs, expd + sigd) 119 | val sz = math.max(szs, szd) 120 | val (m, n) = (math.max(szd / szs, 1), regLen / sz) 121 | val val_op = fn.op_is(op) 122 | val results = for (i <- (0 until n) if (confprec || i == 0)) yield { 123 | val fp2fp = Module(new hardfloat.RecFNToRecFN(exps, sigs, expd, sigd)) 124 | fp2fp.suggestName("fp2fpInst") 125 | val valid = pred(i) && val_op 126 | fp2fp.io.in := recode(dgate(valid, unpack(in, i * m))) 127 | fp2fp.io.roundingMode := dgate(valid, fn.rm) 128 | pipe(valid, ieee(fp2fp.io.out), fp2fp.io.exceptionFlags, expand) 129 | } 130 | val valid = active && val_op 131 | val output = if (results.size > 1) { 132 | val rmatch = (io.req.bits.rate === UInt(log2Ceil(n))) 133 | Mux(Pipe(valid, rmatch, stagesFConv).bits, 134 | Vec(results.map(_._1(sz-1, 0))).asUInt, results.head._1) 135 | } else results.head._1 136 | (output, results.map(_._2).reduce(_.asUInt | _.asUInt )) 137 | } 138 | } 139 | 140 | val val_int2float_pipe = pipe(val_int2float) 141 | val val_float2int_pipe = pipe(val_float2int) 142 | val fn_pipe = pipe(fn) 143 | 144 | val results = 145 | List((FV_CSTD, FV_CHTD), (FV_CDTS, FV_CHTS), (FV_CDTH, FV_CSTH)).zipWithIndex.map { 146 | case ((op0, op1), i) => Mux1H(Seq( 147 | val_int2float_pipe -> results_int2float(i), 148 | val_float2int_pipe -> results_float2int(i), 149 | fn_pipe.op_is(op0) -> results_float2float(2*i), 150 | fn_pipe.op_is(op1) -> results_float2float(2*i+1) 151 | ) map { case (sel, (out, exc)) => 152 | val result = Wire(new FConvResult) 153 | result.out := out 154 | result.exc := exc 155 | sel -> result 156 | }) 157 | } 158 | 159 | val fpmatch = Seq(FPD, FPS, FPH).map(fn_pipe.fp_is(_)) 160 | io.resp.valid := ShiftRegister(active, stagesFConv) 161 | io.resp.bits := Mux1H(fpmatch, results) 162 | } 163 | -------------------------------------------------------------------------------- /src/main/scala/vfu-fdiv.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import DataGating._ 6 | import HardFloatHelper._ 7 | 8 | class FDivOperand(implicit p: Parameters) extends VXUBundle()(p) { 9 | val fn = new VFDUFn 10 | val in0 = Bits(width = SZ_D) 11 | val in1 = Bits(width = SZ_D) 12 | } 13 | 14 | class FDivResult extends 
Bundle { 15 | val out = Bits(width = SZ_D) 16 | val exc = Bits(width = freechips.rocketchip.tile.FPConstants.FLAGS_SZ) 17 | } 18 | 19 | class FDivIO(implicit p: Parameters) extends VXUBundle()(p) { 20 | val req = Decoupled(new FDivOperand) 21 | val resp = Decoupled(new FDivResult).flip 22 | } 23 | 24 | class FDivTag(implicit p: Parameters) extends VXUBundle()(p) { 25 | val fn = new VFDUFn 26 | val exc = Bits(width = freechips.rocketchip.tile.FPConstants.FLAGS_SZ) 27 | } 28 | 29 | class FDivSlice(implicit p: Parameters) extends VXUModule()(p) with Packing { 30 | val io = new FDivIO().flip 31 | 32 | val qcnt = Module(new QCounter(nDecoupledUnitWBQueue, nDecoupledUnitWBQueue)) 33 | qcnt.suggestName("qcntInst") 34 | 35 | qcnt.io.dec := io.req.fire 36 | qcnt.io.inc := io.resp.fire 37 | 38 | // stage0 39 | val ins = List(io.req.bits.in0, io.req.bits.in1) map { in => 40 | val dp = recode_dp(in) 41 | val sp = Module(new hardfloat.RecFNToRecFN(8, 24, 11, 53)) 42 | sp.suggestName("spInst") 43 | val hp = Module(new hardfloat.RecFNToRecFN(5, 11, 11, 53)) 44 | hp.suggestName("hpInst") 45 | sp.io.in := recode_sp(in) 46 | sp.io.roundingMode := io.req.bits.fn.rm 47 | hp.io.in := recode_hp(in) 48 | hp.io.roundingMode := io.req.bits.fn.rm 49 | val out = Mux(io.req.bits.fn.fp_is(FPD), dp, 50 | Mux(io.req.bits.fn.fp_is(FPS), sp.io.out, hp.io.out)) 51 | val exc = Mux(io.req.bits.fn.fp_is(FPD), Bits(0), 52 | Mux(io.req.bits.fn.fp_is(FPS), sp.io.exceptionFlags, hp.io.exceptionFlags)) 53 | (out, exc) 54 | } 55 | 56 | val in0q = Module(new Queue(Bits(width = 65), 2)) 57 | in0q.suggestName("in0qInst") 58 | val in1q = Module(new Queue(Bits(width = 65), 2)) 59 | in1q.suggestName("in1qInst") 60 | val intagq = Module(new Queue(new FDivTag, 2)) 61 | intagq.suggestName("intagqInst") 62 | 63 | val s0_op_div = io.req.bits.fn.op_is(FD_DIV) 64 | 65 | in0q.io.enq.valid := io.req.valid 66 | in0q.io.enq.bits := ins(0)._1 67 | in1q.io.enq.valid := io.req.valid && s0_op_div 68 | in1q.io.enq.bits := ins(1)._1 69 | intagq.io.enq.valid := io.req.valid 70 | intagq.io.enq.bits.fn := io.req.bits.fn 71 | intagq.io.enq.bits.exc := ins(0)._2 | dgate(s0_op_div, ins(1)._2) 72 | 73 | io.req.ready := 74 | !qcnt.io.empty && intagq.io.enq.ready && 75 | in0q.io.enq.ready && (!s0_op_div || in1q.io.enq.ready) 76 | 77 | // stage1 78 | val div = Module(new hardfloat.DivSqrtRecF64) 79 | div.suggestName("divInst") 80 | val outtagq = Module(new Queue(new FDivTag, nDecoupledUnitWBQueue)) 81 | outtagq.suggestName("outtagqInst") 82 | 83 | val s1_op_div = intagq.io.deq.bits.fn.op_is(FD_DIV) 84 | val mask_in1q_valid = !s1_op_div || in1q.io.deq.valid 85 | val mask_div_ready = s1_op_div && div.io.inReady_div || !s1_op_div && div.io.inReady_sqrt 86 | 87 | def fire(exclude: Bool, include: Bool*) = { 88 | val rvs = Seq( 89 | in0q.io.deq.valid, mask_in1q_valid, intagq.io.deq.valid, 90 | mask_div_ready, outtagq.io.enq.ready) 91 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 92 | } 93 | 94 | in0q.io.deq.ready := fire(in0q.io.deq.valid) 95 | in1q.io.deq.ready := fire(mask_in1q_valid, s1_op_div) 96 | intagq.io.deq.ready := fire(intagq.io.deq.valid) 97 | div.io.inValid := fire(mask_div_ready) 98 | outtagq.io.enq.valid := fire(outtagq.io.enq.ready) 99 | 100 | div.io.sqrtOp := !s1_op_div 101 | div.io.a := in0q.io.deq.bits 102 | div.io.b := Mux(s1_op_div, in1q.io.deq.bits, in0q.io.deq.bits) 103 | div.io.roundingMode := intagq.io.deq.bits.fn.rm 104 | 105 | outtagq.io.enq.bits := intagq.io.deq.bits 106 | 107 | // output 108 | val s0_result_valid = 
div.io.outValid_div || div.io.outValid_sqrt 109 | 110 | // stage output+1 111 | val s1_result_valid = Reg(next=s0_result_valid) 112 | val s1_result_out = RegEnable(div.io.out, s0_result_valid).asUInt 113 | val s1_result_exc = RegEnable(div.io.exceptionFlags, s0_result_valid) 114 | 115 | val rq = Module(new Queue(new FDivResult, nDecoupledUnitWBQueue)) 116 | rq.suggestName("rqInst") 117 | 118 | val s1_result_dp = ieee_dp(s1_result_out) 119 | val s1_result_sp = Module(new hardfloat.RecFNToRecFN(11, 53, 8, 24)) 120 | s1_result_sp.suggestName("s1_result_spInst") 121 | val s1_result_hp = Module(new hardfloat.RecFNToRecFN(11, 53, 5, 11)) 122 | s1_result_hp.suggestName("s1_result_hpInst") 123 | s1_result_sp.io.in := s1_result_out 124 | s1_result_sp.io.roundingMode := outtagq.io.deq.bits.fn.rm 125 | s1_result_hp.io.in := s1_result_out 126 | s1_result_hp.io.roundingMode := outtagq.io.deq.bits.fn.rm 127 | 128 | val s1_out = Mux(outtagq.io.deq.bits.fn.fp_is(FPD), s1_result_dp, 129 | Mux(outtagq.io.deq.bits.fn.fp_is(FPS), expand_float_s(ieee_sp(s1_result_sp.io.out)), 130 | expand_float_h(ieee_hp(s1_result_hp.io.out)))) 131 | val s1_exc = Mux(outtagq.io.deq.bits.fn.fp_is(FPD), Bits(0), 132 | Mux(outtagq.io.deq.bits.fn.fp_is(FPS), s1_result_sp.io.exceptionFlags, 133 | s1_result_hp.io.exceptionFlags)) 134 | 135 | rq.io.enq.valid := s1_result_valid 136 | outtagq.io.deq.ready := s1_result_valid 137 | 138 | rq.io.enq.bits.out := s1_out 139 | rq.io.enq.bits.exc := s1_result_exc | s1_exc | outtagq.io.deq.bits.exc 140 | 141 | assert(!s1_result_valid || rq.io.enq.ready, "result queue should always be ready when a result is about to enqueue") 142 | assert(!io.req.fire || rq.io.enq.ready, "result queue should always be ready when a request fires") 143 | 144 | io.resp <> rq.io.deq 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/vfu-fma.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import DataGating._ 6 | import HardFloatHelper._ 7 | import scala.collection.mutable.ArrayBuffer 8 | 9 | class FMAOperand(implicit p: Parameters) extends VXUBundle()(p) 10 | with LanePred with Rate { 11 | val fn = new VFMUFn 12 | val in0 = Bits(width = SZ_D) 13 | val in1 = Bits(width = SZ_D) 14 | val in2 = Bits(width = SZ_D) 15 | } 16 | 17 | class FMAResult extends Bundle { 18 | val out = Bits(OUTPUT, SZ_D) 19 | val exc = Bits(OUTPUT, freechips.rocketchip.tile.FPConstants.FLAGS_SZ) 20 | } 21 | 22 | class FMASlice(implicit p: Parameters) extends VXUModule()(p) with Packing { 23 | val io = new Bundle { 24 | val req = Valid(new FMAOperand).flip 25 | val resp = Valid(new FMAResult) 26 | } 27 | 28 | val pred = Mux(io.req.valid, io.req.bits.pred, Bits(0)) 29 | val active = io.req.valid && io.req.bits.active() 30 | val fn = io.req.bits.fn.dgate(active) 31 | val in0 = io.req.bits.in0 32 | val in1 = io.req.bits.in1 33 | val in2 = io.req.bits.in2 34 | 35 | val fma_op = MuxCase( 36 | Bits("b00",2), Array( 37 | fn.op_is(FM_SUB, FM_MSUB) -> Bits("b01",2), 38 | fn.op_is(FM_NMSUB) -> Bits("b10",2), 39 | fn.op_is(FM_NMADD) -> Bits("b11",2) 40 | )) 41 | 42 | val one_dp = splat_d(Bits("h3FF0_0000_0000_0000", SZ_D)) 43 | val one_sp = splat_w(Bits("h3F80_0000", SZ_W)) 44 | val one_hp = splat_h(Bits("h3C00", SZ_H)) 45 | val fma_multiplicand = in0 46 | val fma_multiplier = MuxCase( 47 | in1, Array( 48 | (fn.fp_is(FPD) && fn.op_is(FM_ADD, FM_SUB)) -> one_dp, 49 | 
(fn.fp_is(FPS) && fn.op_is(FM_ADD, FM_SUB)) -> one_sp, 50 | (fn.fp_is(FPH) && fn.op_is(FM_ADD, FM_SUB)) -> one_hp 51 | )) 52 | 53 | val fma_addend = MuxCase( 54 | in2, Array( 55 | fn.op_is(FM_ADD, FM_SUB) -> in1, 56 | fn.op_is(FM_MUL) -> Bits(0, SZ_D) 57 | )) 58 | 59 | val results = 60 | List((stagesDFMA, SZ_D, FPD, recode_dp _, unpack_d _, ieee_dp _, repack_d _, expand_float_d _, (11, 53)), 61 | (stagesSFMA, SZ_W, FPS, recode_sp _, unpack_w _, ieee_sp _, repack_w _, expand_float_s _, (8, 24)), 62 | (stagesHFMA, SZ_H, FPH, recode_hp _, unpack_h _, ieee_hp _, repack_h _, expand_float_h _, (5, 11))) map { 63 | case (stages, sz, fp, recode, unpack, ieee, repack, expand, (exp, sig)) => { 64 | val n = SZ_D / sz 65 | val val_fp = fn.fp_is(fp) 66 | val results = for (i <- (0 until n) if (confprec || i == 0)) yield { 67 | val fma = Module(new freechips.rocketchip.tile.MulAddRecFNPipe(stages min 2, exp, sig)) 68 | fma.suggestName("fmaInst") 69 | val valid = pred(i) && val_fp 70 | fma.io.validin := valid 71 | fma.io.op := dgate(valid, fma_op) 72 | fma.io.a := recode(dgate(valid, unpack(fma_multiplicand, i))) 73 | fma.io.b := recode(dgate(valid, unpack(fma_multiplier, i))) 74 | fma.io.c := recode(dgate(valid, unpack(fma_addend, i))) 75 | fma.io.roundingMode := dgate(valid, fn.rm) 76 | val out = Pipe(fma.io.validout, ieee(fma.io.out), (stages - 2) max 0).bits 77 | val exc = Pipe(fma.io.validout, fma.io.exceptionFlags, (stages - 2) max 0).bits 78 | (out, exc) 79 | } 80 | val valid = active && val_fp 81 | val out_head = expand(results.head._1) 82 | val out = if (results.size > 1) { 83 | val rmatch = (io.req.bits.rate === UInt(log2Ceil(n))) 84 | Mux(Pipe(valid, rmatch, stages).bits, repack(results.map(_._1)), out_head) 85 | } else out_head 86 | val exc = results.map(_._2).reduce(_|_) 87 | (ShiftRegister(valid, stages), out, exc) 88 | } 89 | } 90 | 91 | val fpmatch = results.map(_._1) 92 | io.resp.valid := fpmatch.reduce(_ || _) 93 | io.resp.bits.out := Mux1H(fpmatch, results.map(_._2)) 94 | io.resp.bits.exc := Mux1H(fpmatch, results.map(_._3)) 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/vfu-idiv.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import freechips.rocketchip.rocket._ 6 | 7 | case object FastMulDiv extends Field[Boolean] 8 | 9 | object RocketConstants extends freechips.rocketchip.rocket.constants.ScalarOpConstants 10 | 11 | class IDivOperand(implicit p: Parameters) extends VXUBundle()(p) { 12 | val fn = new VIDUFn 13 | val in0 = Bits(width = SZ_D) 14 | val in1 = Bits(width = SZ_D) 15 | } 16 | 17 | class IDivResult extends Bundle { 18 | val out = Bits(width = SZ_D) 19 | } 20 | 21 | class IDivIO(implicit p: Parameters) extends VXUBundle()(p) { 22 | val req = Decoupled(new IDivOperand) 23 | val resp = Decoupled(new IDivResult).flip 24 | } 25 | 26 | class IDivSlice(implicit p: Parameters) extends VXUModule()(p) { 27 | val io = new IDivIO().flip 28 | 29 | implicit def BitPatToUInt(x: BitPat): UInt = { 30 | require(x.mask == (BigInt(1) << x.getWidth)-1) 31 | UInt(x.value, x.getWidth) 32 | } 33 | 34 | val qcnt = Module(new QCounter(nDecoupledUnitWBQueue, nDecoupledUnitWBQueue)) 35 | qcnt.suggestName("qcntInst") 36 | 37 | qcnt.io.dec := io.req.fire 38 | qcnt.io.inc := io.resp.fire 39 | 40 | val div = Module(new MulDiv(cfg = MulDivParams(mulUnroll = 8, mulEarlyOut = true, divEarlyOut = true), width = 
p(HwachaRegLen))) 41 | div.suggestName("divInst") 42 | 43 | div.io.req.valid := io.req.valid 44 | io.req.ready := !qcnt.io.empty && div.io.req.ready 45 | div.io.req.bits.dw := 46 | Mux(io.req.bits.fn.dw_is(DW32), RocketConstants.DW_32, 47 | RocketConstants.DW_64) 48 | div.io.req.bits.fn := 49 | Mux(io.req.bits.fn.op_is(ID_DIV), aluFn.FN_DIV, 50 | Mux(io.req.bits.fn.op_is(ID_DIVU), aluFn.FN_DIVU, 51 | Mux(io.req.bits.fn.op_is(ID_REM), aluFn.FN_REM, 52 | aluFn.FN_REMU))) 53 | div.io.req.bits.in1 := io.req.bits.in0 54 | div.io.req.bits.in2 := io.req.bits.in1 55 | div.io.kill := Bool(false) 56 | 57 | val rq = Module(new Queue(new IDivResult, nDecoupledUnitWBQueue)) 58 | rq.suggestName("rqInst") 59 | 60 | rq.io.enq.valid := div.io.resp.valid 61 | rq.io.enq.bits.out := div.io.resp.bits.data 62 | div.io.resp.ready := rq.io.enq.ready 63 | 64 | assert(!div.io.resp.valid || rq.io.enq.ready, "result queue should always be ready when a result is about to enqueue") 65 | assert(!io.req.fire || rq.io.enq.ready, "result queue should always be ready when a request fires") 66 | 67 | io.resp <> rq.io.deq 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/vfu-imul.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import DataGating._ 6 | 7 | class IMulOperand(implicit p: Parameters) extends VXUBundle()(p) { 8 | val fn = new VIMUFn 9 | val in0 = Bits(width = SZ_D) 10 | val in1 = Bits(width = SZ_D) 11 | } 12 | 13 | class IMulResult extends Bundle { 14 | val out = Bits(width = SZ_D) 15 | } 16 | 17 | class IMulSlice(implicit p: Parameters) extends VXUModule()(p) { 18 | val io = new Bundle { 19 | val req = Valid(new IMulOperand).flip 20 | val resp = Valid(new IMulResult) 21 | } 22 | 23 | val fn = io.req.bits.fn.dgate(io.req.valid) 24 | val in0 = dgate(io.req.valid, io.req.bits.in0) 25 | val in1 = dgate(io.req.valid, io.req.bits.in1) 26 | 27 | val sxl64 = fn.is(DW64, IM_MH, IM_MHSU) 28 | val sxr64 = fn.is(DW64, IM_MH) 29 | val zxl32 = fn.is(DW32, IM_MHU) 30 | val zxr32 = fn.is(DW32, IM_MHU, IM_MHSU) 31 | val sxl32 = fn.is(DW32, IM_MH, IM_MHSU) 32 | val sxr32 = fn.is(DW32, IM_MH) 33 | 34 | val lhs = Cat( 35 | in0(63) & sxl64, 36 | Fill(32, ~zxl32)&in0(63,32) | Fill(32, sxl32&in0(31)), 37 | in0(31,0)) //TODO: 65 bits 38 | val rhs = Cat( 39 | in1(63) & sxr64, 40 | Fill(32, ~zxr32)&in1(63,32) | Fill(32, sxr32&in1(31)), 41 | in1(31,0)) //TODO: 65 bits 42 | 43 | val mul_result = lhs.asSInt * rhs.asSInt //TODO:130 bits 44 | 45 | val result = Wire(new IMulResult) 46 | result.out := MuxCase( 47 | Bits(0), Array( 48 | fn.is(DW64, IM_M) -> mul_result(63,0), 49 | fn.is(DW64, IM_MH) -> mul_result(127,64), 50 | fn.is(DW64, IM_MHU) -> mul_result(127,64), 51 | fn.is(DW64, IM_MHSU) -> mul_result(127,64), 52 | fn.is(DW32, IM_M) -> Cat(Fill(32, mul_result(31)), mul_result(31,0)), 53 | fn.is(DW32, IM_MH) -> Cat(Fill(32, mul_result(63)), mul_result(63,32)), 54 | fn.is(DW32, IM_MHU) -> Cat(Fill(32, mul_result(63)), mul_result(63,32)), 55 | fn.is(DW32, IM_MHSU) -> Cat(Fill(32, mul_result(63)), mul_result(63,32)) 56 | )) 57 | 58 | io.resp := Pipe(io.req.valid, result, stagesIMul) 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/vfu-plu.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import 
org.chipsalliance.cde.config._ 5 | 6 | class PLUOperand(implicit p: Parameters) extends VXUBundle()(p) { 7 | val fn = new VIPUFn 8 | val in0 = Bool() 9 | val in1 = Bool() 10 | val in2 = Bool() 11 | } 12 | 13 | class PLUResult(implicit p: Parameters) extends VXUBundle()(p) { 14 | val out = Bool() 15 | } 16 | 17 | class PLUSlice(implicit p: Parameters) extends VXUModule()(p) { 18 | val io = new Bundle { 19 | val req = Valid(new PLUOperand).flip 20 | val resp = Valid(new PLUResult) 21 | } 22 | 23 | val op = io.req.bits.fn.op 24 | val s2 = Mux(io.req.bits.in2, op(7,4), op(3,0)) 25 | val s1 = Mux(io.req.bits.in1, s2(3,2), s2(1,0)) 26 | val s0 = Mux(io.req.bits.in0, s1(1), s1(0)) 27 | 28 | val result = Wire(new PLUResult) 29 | result.out := s0 30 | 31 | //io.resp := Pipe(io.req.valid, result, stagesPLU) 32 | //TODO COLIN FIXME: Bug in chisel Pipe falsely? generating chisel3 compat error 33 | io.resp.valid := io.req.valid 34 | io.resp.bits := result 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/vfu-rfirst.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import DataGating._ 6 | 7 | class RFirstOperand(implicit p: Parameters) extends VXUBundle()(p) { 8 | val active = Bits(width = nSlices) 9 | val pred = Bits(width = nSlices) 10 | val lsidx = UInt(width = bVLen - bStrip) 11 | val in = Vec(nSlices, Bits(width = SZ_D)) 12 | } 13 | 14 | class RFirstResult(implicit p: Parameters) extends VXUBundle()(p) { 15 | val found = Bool() 16 | val lsidx = Bits(width = bVLen - bStrip) 17 | val first = Bits(width = SZ_D) 18 | val sd = UInt(width = bSRegs) 19 | } 20 | 21 | class RFirstIO(implicit p: Parameters) extends VXUBundle()(p) { 22 | val op = Valid(new VRFUFn) 23 | val req = Decoupled(new RFirstOperand) 24 | val result = Decoupled(new RFirstResult).flip 25 | } 26 | 27 | class RFirstLane(implicit p: Parameters) extends VXUModule()(p) { 28 | val io = new RFirstIO().flip 29 | 30 | val result = Reg(new RFirstResult) 31 | 32 | when (io.op.valid) { 33 | result.found := Bool(false) 34 | result.sd := io.op.bits.sd 35 | } 36 | 37 | io.req.ready := Bool(true) 38 | val pred = PriorityEncoderOH((io.req.bits.active & io.req.bits.pred).asBools) 39 | val found = pred.reduce(_ || _) 40 | when (io.req.fire && !result.found && found) { 41 | result.found := Bool(true) 42 | result.lsidx := io.req.bits.lsidx 43 | result.first := Mux1H(pred, io.req.bits.in) 44 | } 45 | 46 | io.result.bits := result 47 | } 48 | 49 | class RFirstMaster(implicit p: Parameters) extends VXUModule()(p) { 50 | val io = new Bundle { 51 | val op = Decoupled(new IssueOpML).flip 52 | val lane = Vec(nLanes, Decoupled(new RFirstResult)).flip 53 | val result = Decoupled(new RFirstResult) 54 | } 55 | 56 | val opq = Module(new Queue(new IssueOpML, 2)) 57 | opq.suggestName("opqInst") 58 | opq.io.enq <> io.op 59 | 60 | val s_idle :: s_busy :: Nil = Enum(UInt(), 2) 61 | val state = Reg(init = s_idle) 62 | val deq_lane = Reg(Vec(nLanes, Bool())) 63 | val fn = Reg(new VRFUFn) 64 | 65 | val mask_lane_valid = (deq_lane zip io.lane) map { case (deq, lane) => !deq || lane.valid } 66 | 67 | def fire(exclude: Bool, include: Bool*) = { 68 | val rvs = Seq(state === s_busy, io.result.ready) ++ mask_lane_valid 69 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 70 | } 71 | 72 | io.result.valid := fire(io.result.ready) 73 | (io.lane.zipWithIndex) map { case (lane, i) => 74 | lane.ready 
:= fire(mask_lane_valid(i), deq_lane(i)) } 75 | 76 | opq.io.deq.ready := Bool(false) 77 | 78 | switch (state) { 79 | is (s_idle) { 80 | opq.io.deq.ready := Bool(true) 81 | when (opq.io.deq.valid) { 82 | state := s_busy 83 | deq_lane := Vec(opq.io.deq.bits.lane.map(_.active)) 84 | fn := opq.io.deq.bits.fn.vrfu() 85 | } 86 | } 87 | is (s_busy) { 88 | when (fire(null)) { state := s_idle } 89 | } 90 | } 91 | 92 | def find_min(n: Int, s: Int): Tuple2[UInt, UInt] = { 93 | if (n == 1) { 94 | return (io.lane(s).valid && io.lane(s).bits.found, io.lane(s).bits.lsidx) 95 | } else { 96 | require(isPow2(n)) 97 | val half = n/2 98 | val left = find_min(half, s) 99 | val right = find_min(half, s+half) 100 | val left_found = left._1.orR 101 | val right_found = right._1.orR 102 | val left_min = (left._2 <= right._2) 103 | val left_mask = left_found && (!right_found || left_min) 104 | val right_mask = right_found && (!left_found || !left_min) 105 | assert(!left_mask || !right_mask, "left and right can't be turned on at the same time") 106 | return (Cat(dgate(right_mask, right._1), dgate(left_mask, left._1)), 107 | Mux(left_mask, left._2, right._2)) 108 | } 109 | } 110 | 111 | val m = find_min(nLanes, 0) 112 | io.result.bits.found := m._1.orR 113 | io.result.bits.lsidx := m._2 // approximate eidx 114 | io.result.bits.first := 115 | (m._1.asBools zip io.lane.map(_.bits.first)) map { case (v, f) => dgate(v, f) } reduce(_ | _) 116 | io.result.bits.sd := fn.sd 117 | } 118 | -------------------------------------------------------------------------------- /src/main/scala/vfu-rpred.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class RPredOperand(implicit p: Parameters) extends VXUBundle()(p) { 7 | val active = Bits(width = nSlices) 8 | val pred = Bits(width = nSlices) 9 | } 10 | 11 | class RPredResult(implicit p: Parameters) extends VXUBundle()(p) { 12 | val cond = Bool() 13 | } 14 | 15 | class RPredIO(implicit p: Parameters) extends VXUBundle()(p) { 16 | val op = Valid(new VRPUFn) 17 | val req = Decoupled(new RPredOperand) 18 | val result = Decoupled(new RPredResult).flip 19 | } 20 | 21 | class RPredLane(implicit p: Parameters) extends VXUModule()(p) { 22 | val io = new RPredIO().flip 23 | 24 | val fn = Reg(new VRPUFn) 25 | val cond = Reg(Bool()) 26 | 27 | when (io.op.valid) { 28 | fn := io.op.bits 29 | when (io.op.bits.op_is(FR_ALL)) { cond := Bool(true) } 30 | when (io.op.bits.op_is(FR_ANY)) { cond := Bool(false) } 31 | } 32 | 33 | io.req.ready := Bool(true) 34 | when (io.req.fire) { 35 | when (fn.op_is(FR_ALL)) { cond := cond & (io.req.bits.pred | ~io.req.bits.active).orR } 36 | when (fn.op_is(FR_ANY)) { cond := cond | (io.req.bits.pred & io.req.bits.active).orR } 37 | } 38 | 39 | io.result.bits.cond := cond 40 | } 41 | 42 | class RPredMaster(implicit p: Parameters) extends VXUModule()(p) { 43 | val io = new Bundle { 44 | val op = Decoupled(new IssueOpML).flip 45 | val lane = Vec(nLanes, Decoupled(new RPredResult)).flip 46 | val result = Decoupled(new RPredResult) 47 | } 48 | 49 | val opq = Module(new Queue(new IssueOpML, 2)) 50 | opq.suggestName("opqInst") 51 | opq.io.enq <> io.op 52 | 53 | val s_idle :: s_busy :: Nil = Enum(UInt(), 2) 54 | val state = Reg(init = s_idle) 55 | val deq_lane = Reg(Vec(nLanes, Bool())) 56 | val fn = Reg(new VRPUFn) 57 | 58 | val mask_lane_valid = (deq_lane zip io.lane) map { case (deq, lane) => !deq || lane.valid } 59 | 60 | def fire(exclude: Bool, 
include: Bool*) = { 61 | val rvs = Seq(state === s_busy, io.result.ready) ++ mask_lane_valid 62 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 63 | } 64 | 65 | io.result.valid := fire(io.result.ready) 66 | (io.lane.zipWithIndex) map { case (lane, i) => 67 | lane.ready := fire(mask_lane_valid(i), deq_lane(i)) } 68 | 69 | opq.io.deq.ready := Bool(false) 70 | 71 | switch (state) { 72 | is (s_idle) { 73 | opq.io.deq.ready := Bool(true) 74 | when (opq.io.deq.valid) { 75 | state := s_busy 76 | deq_lane := Vec(opq.io.deq.bits.lane.map(_.active)) 77 | fn := opq.io.deq.bits.fn.vrpu() 78 | } 79 | } 80 | is (s_busy) { 81 | when (fire(null)) { state := s_idle } 82 | } 83 | } 84 | 85 | val cond = 86 | fn.op_is(FR_ALL) && io.lane.map(r => r.bits.cond | ~r.valid).reduce(_ && _) || 87 | fn.op_is(FR_ANY) && io.lane.map(r => r.bits.cond & r.valid).reduce(_ || _) 88 | 89 | io.result.bits.cond := cond 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/vmu-memif.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import freechips.rocketchip.tilelink._ 6 | 7 | class VMUMemReq(implicit p: Parameters) extends VMUMemOp 8 | with VMUTag with VMUData { 9 | val mask = UInt(width = tlDataBytes) 10 | val pred = Bool() 11 | } 12 | 13 | class VMUMemResp(implicit p: Parameters) extends VMULoadData()(p) { 14 | val store = Bool() 15 | } 16 | 17 | class VMUMemIO(implicit p: Parameters) extends VMUBundle()(p) { 18 | val req = Decoupled(new VMUMemReq) 19 | val resp = Decoupled(new VMUMemResp).flip 20 | } 21 | 22 | class MBox(implicit p: Parameters) extends VMUModule()(p) { 23 | val io = new Bundle { 24 | val inner = new Bundle { 25 | val abox = new VMUAddrIO().flip 26 | val sbox = new VMUStoreIO().flip 27 | val lbox = new VLDQIO 28 | } 29 | val outer = new VMUMemIO 30 | 31 | val sret = new CounterUpdateIO(bSRet) 32 | } 33 | 34 | val vmt = Module(new Table(nVMT, new VMTEntry)) 35 | vmt.suggestName("vmtInst") 36 | 37 | private val abox = io.inner.abox 38 | private val sbox = io.inner.sbox 39 | private val lbox = io.inner.lbox 40 | private val req = io.outer.req 41 | private val resp = io.outer.resp 42 | 43 | val mt = DecodedMemType(abox.bits.fn.mt) 44 | val cmd = DecodedMemCommand(abox.bits.fn.cmd) 45 | val read = cmd.read 46 | val write = cmd.write 47 | 48 | val pred = abox.bits.meta.mask.orR 49 | 50 | val sbox_valid = !write || sbox.valid 51 | val vmt_ready = !pred || vmt.io.w.ready 52 | val req_ready = !pred || req.ready 53 | 54 | private def fire(exclude: Bool, include: Bool*) = { 55 | val rvs = Seq(abox.valid, sbox_valid, vmt_ready, req_ready) 56 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 57 | } 58 | 59 | abox.ready := fire(abox.valid) 60 | sbox.ready := fire(sbox_valid, write) 61 | vmt.io.w.valid := fire(vmt_ready, pred) 62 | req.valid := fire(req_ready, pred) 63 | 64 | /* Data mask */ 65 | val mask_shift = abox.bits.meta.epad(tlByteAddrBits - 1) 66 | val mask = Mux(mask_shift, 67 | Cat(abox.bits.meta.mask, UInt(0, tlDataBytes >> 1)), 68 | abox.bits.meta.mask) 69 | 70 | /* Metadata table */ 71 | val vlt = Wire(new VMTLoadEntry) 72 | val vst = Wire(new VMTStoreEntry) 73 | vlt.vidx := abox.bits.meta.vidx 74 | vlt.eidx := abox.bits.meta.eidx 75 | vlt.epad := abox.bits.meta.epad 76 | vlt.mask := abox.bits.meta.mask 77 | vst.ecnt := abox.bits.meta.ecnt 78 | vmt.io.w.bits.union := Mux(cmd.store, vst.toBits, vlt.toBits) 79 | 80 | 
/* Store metadata */ 81 | sbox.meta.offset := abox.bits.addr(tlByteAddrBits-1, 0) 82 | sbox.meta.ecnt := abox.bits.meta.ecnt 83 | sbox.meta.last := abox.bits.meta.last 84 | sbox.meta.vsdq := abox.bits.meta.vsdq 85 | 86 | /* Request */ 87 | req.bits.fn := abox.bits.fn 88 | req.bits.addr := abox.bits.addr 89 | req.bits.mask := PredicateByteMask(mask, mt) 90 | req.bits.data := sbox.bits.data 91 | req.bits.pred := pred 92 | req.bits.tag := vmt.io.w.tag 93 | 94 | /* Response */ 95 | lbox.bits.data := resp.bits.data 96 | lbox.bits.meta := vmt.io.r.record.load() 97 | lbox.valid := resp.valid && !resp.bits.store 98 | resp.ready := resp.bits.store || lbox.ready 99 | vmt.io.r.valid := resp.fire 100 | vmt.io.r.bits := resp.bits.tag 101 | 102 | /* Store acknowledgement */ 103 | val sret_req_en = fire(vmt_ready, cmd.store, !pred) 104 | val sret_req_cnt = abox.bits.meta.ecnt.decode() 105 | val sret_resp_en = vmt.io.r.valid && resp.bits.store 106 | val sret_resp_cnt = vmt.io.r.record.store().ecnt.decode() 107 | 108 | io.sret.update := Bool(true) 109 | io.sret.cnt := 110 | Mux(sret_req_en, sret_req_cnt, 0.U) + 111 | Mux(sret_resp_en, sret_resp_cnt, 0.U) 112 | } 113 | 114 | class VMUTileLink(edge: TLEdgeOut)(implicit p: Parameters) extends VMUModule()(p) { 115 | val io = new Bundle { 116 | val vmu = new VMUMemIO().flip 117 | val dmem = TLBundle(edge.bundle) 118 | } 119 | 120 | private val req = io.vmu.req 121 | private val resp = io.vmu.resp 122 | private val acquire = io.dmem.a 123 | private val grant = io.dmem.d 124 | 125 | val cmd = DecodedMemCommand(req.bits.fn.cmd) 126 | assert(!req.valid || cmd.load || cmd.store || cmd.amo, 127 | "memif: unknown memory command") 128 | 129 | req.ready := acquire.ready 130 | acquire.valid := req.valid 131 | val req_tag = Cat(cmd.read, req.bits.tag) 132 | 133 | val acq_amo = MuxLookup(req.bits.fn.cmd, Wire(new TLBundleA(edge.bundle)), Seq( 134 | M_XA_SWAP -> edge.Logical(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.SWAP)._2, 135 | M_XA_XOR -> edge.Logical(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.XOR)._2, 136 | M_XA_OR -> edge.Logical(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.OR)._2, 137 | M_XA_AND -> edge.Logical(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.AND)._2, 138 | M_XA_ADD -> edge.Arithmetic(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.ADD)._2, 139 | M_XA_MIN -> edge.Arithmetic(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.MIN)._2, 140 | M_XA_MAX -> edge.Arithmetic(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.MAX)._2, 141 | M_XA_MINU -> edge.Arithmetic(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.MINU)._2, 142 | M_XA_MAXU -> edge.Arithmetic(req_tag, req.bits.addr, req.bits.fn.mt, req.bits.data, TLAtomics.MAXU)._2 143 | )) 144 | 145 | // Unclear what the tradeoff is for us not requesting the smallest size transaction we need. 
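// Editorial sketch (not part of the original source): the element-sized
// alternative would presumably reuse the DecodedMemType(...).shift() helpers
// already used by MBox/SBox above to derive lgSize and issue the unaligned
// address, roughly:
//   val elt_lgsize = DecodedMemType(req.bits.fn.mt).shift()
//   edge.Get(req_tag, req.bits.addr, elt_lgsize)._2
// (assuming element-aligned addresses), trading smaller transfers for giving
// up the fixed beat-aligned request shape that the mask/epad bookkeeping in
// MBox appears to rely on.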
146 | val req_size = log2Up(tlDataBytes).U 147 | val req_addr_beat_aligned = (req.bits.addr >> UInt(tlByteAddrBits)) << UInt(tlByteAddrBits) 148 | acquire.bits := Mux1H(Seq( 149 | cmd.load -> edge.Get(req_tag, req_addr_beat_aligned, req_size)._2, 150 | cmd.store -> edge.Put(req_tag, req_addr_beat_aligned, req_size, req.bits.data, req.bits.mask)._2, 151 | cmd.amo -> acq_amo 152 | )) 153 | 154 | val resp_en = edge.hasData(grant.bits) || resp.bits.store 155 | grant.ready := !resp_en || resp.ready 156 | 157 | resp.valid := grant.valid && resp_en 158 | resp.bits.tag := grant.bits.source(bVMUTag-1,0) 159 | resp.bits.data := grant.bits.data 160 | resp.bits.store := grant.bits.opcode === TLMessages.AccessAck 161 | 162 | //Tie off unused channels 163 | io.dmem.b.ready := Bool(true) 164 | io.dmem.c.valid := Bool(false) 165 | io.dmem.e.valid := Bool(false) 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/vmu-pred.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class VMUMaskIO(implicit p: Parameters) extends VMUBundle()(p) { 7 | val ante = new VMUMaskIO_0 8 | val post = new VMUMaskIO_1 9 | } 10 | 11 | class PBox0(implicit p: Parameters) extends VMUModule()(p) { 12 | val io = new VMUIssueIO { 13 | val ingress = Decoupled(new PredEntry).flip 14 | val egress = Decoupled(new PredEntry) 15 | val sample = new VMUMaskIO_0 16 | } 17 | 18 | val op = Reg(new VMUDecodedOp) 19 | 20 | val index = Reg(UInt(width = bStrip)) 21 | val head = io.ingress.bits.pred >> index 22 | val step_max = UInt(nStrip) - index 23 | 24 | /* Density-time approximation for non-coalesced operations: 25 | * Skip ahead by power-of-2 lengths when the next 2^i predicates are 26 | * uniformly false 27 | * 28 | * c.f. Smith et al., "Vector Instruction Set Support for Conditional 29 | * Operations," in Proc. 27th Annual International Symp. on 30 | * Computer Architecture, New York, NY, 2000, pp. 260-269. 
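 *
 * Worked example (editorial addition): with nStrip = 8, index = 0, and
 * head = b00010000, the scan below sees that the low four predicates are
 * uniformly false, so lgecnt_n = 2 and the stream advances by ecnt_n = 4
 * elements in a single step with pred_n = 0; the following step then handles
 * the set predicate with ecnt_n = 1.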
31 | */ 32 | private val scan = (1 to bStrip).scanLeft( 33 | (0, Bool(true))) { 34 | case ((j, zero_tail), i) => 35 | val m = (1 << i) 36 | val n = (1 << j) - 1 37 | // Ensure that sufficient number of predicates remain in the set 38 | val valid = (index <= UInt(nStrip - m)) 39 | val zero = zero_tail && (head(m-1, n) === UInt(0)) && valid 40 | (i, zero) 41 | }.tail 42 | 43 | val pred_n = MuxCase(head(0), 44 | scan.reverse.map { case (_, zero) => (zero -> Bool(false)) }) 45 | 46 | val lgecnt_n = MuxCase(UInt(0), 47 | scan.reverse.map { case (i, zero) => (zero -> UInt(i)) }) 48 | val ecnt_n = UInt(1) << lgecnt_n 49 | val step_n = ecnt_n 50 | 51 | val lead = Wire(Bool()) 52 | lead := Bool(false) 53 | 54 | val pglen = Reg(UInt(width = bPgIdx + 1)) 55 | val pglen_next = pglen.zext - step_max.zext 56 | val pglen_end = (pglen_next <= SInt(0)) 57 | val pglen_skip = Mux(lead, io.op.bits.base(bPgIdx-1, 0), UInt(0)) 58 | val pglen_shift = Mux(lead, io.op.bits.mt.shift(), op.mt.shift()) 59 | val pglen_max = (UInt(pgSize) - pglen_skip) >> pglen_shift 60 | val pglen_final = Reg(Bool()) 61 | val pglen_reset = Wire(Bool()) 62 | pglen_reset := Bool(false) 63 | 64 | val head_mask = EnableDecoder(pglen, nStrip) 65 | val pred_u = (head & head_mask).orR 66 | val pred = Mux(op.mode.unit, pred_u, pred_n) 67 | 68 | val vlen_end = Wire(Bool()) 69 | val ecnt_u_page = pred_u || pglen_end 70 | val ecnt_u = Mux(ecnt_u_page, pglen, step_max) 71 | val step_u = Mux(pglen_end, pglen(bStrip, 0), step_max) 72 | val ecnt_n_bounded = Mux(vlen_end, op.vlen, ecnt_n) 73 | val ecnt = Mux(op.mode.unit, ecnt_u, ecnt_n_bounded) 74 | 75 | val step = Mux(op.mode.unit, step_u, step_n) 76 | val vlen_next = op.vlen.zext - step.zext 77 | vlen_end := (vlen_next <= SInt(0)) 78 | 79 | io.sample.bits.pred := pred 80 | io.sample.bits.ecnt := ecnt 81 | io.sample.bits.last := vlen_end || 82 | (op.mode.unit && pglen_final && ecnt_u_page) 83 | io.sample.bits.unit.page := ecnt_u_page 84 | io.sample.bits.nonunit.shift := lgecnt_n 85 | io.egress.bits := io.ingress.bits 86 | 87 | val index_next = index + step 88 | val index_end = (index_next === UInt(nStrip)) 89 | 90 | val egress_en = index_end || vlen_end 91 | val egress_ready = !egress_en || io.egress.ready 92 | 93 | val sample_en = Reg(Bool()) 94 | val sample_ready = !sample_en || io.sample.ready 95 | 96 | private def fire(exclude: Bool, include: Bool*) = { 97 | val rvs = Seq(io.ingress.valid, egress_ready, sample_ready) 98 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 99 | } 100 | 101 | io.op.ready := Bool(false) 102 | io.ingress.ready := Bool(false) 103 | io.egress.valid := Bool(false) 104 | io.sample.valid := Bool(false) 105 | 106 | val s_idle :: s_busy :: Nil = Enum(UInt(), 2) 107 | val state = Reg(init = s_idle) 108 | 109 | switch (state) { 110 | is (s_idle) { 111 | io.op.ready := Bool(true) 112 | pglen_reset := Bool(true) 113 | } 114 | 115 | is (s_busy) { 116 | io.ingress.ready := fire(io.ingress.valid, egress_en) 117 | io.egress.valid := fire(egress_ready, egress_en) 118 | io.sample.valid := fire(sample_ready, sample_en) 119 | 120 | when (fire(null)) { 121 | index := index_next 122 | assert(index_next <= UInt(nStrip), "PBox0: index overflow") 123 | 124 | when (op.mode.unit) { 125 | when (pred_u) { 126 | sample_en := Bool(false) 127 | } 128 | pglen := pglen_next.asUInt 129 | when (pglen_end) { 130 | pglen_reset := Bool(true) 131 | } 132 | } 133 | 134 | op.vlen := vlen_next.asUInt 135 | when (vlen_end) { 136 | state := s_idle 137 | assert(!op.mode.unit || pglen_end, 138 | "PBox0: 
desynchronized vlen and pglen counters"); 139 | 140 | io.op.ready := Bool(true) 141 | pglen_reset := Bool(true) 142 | } 143 | } 144 | } 145 | } 146 | 147 | val _vlen_next = Mux(lead, io.op.bits.vlen, vlen_next(bVLen-1, 0)) 148 | val _pglen_final = (_vlen_next <= pglen_max) 149 | when (pglen_reset) { 150 | pglen := Mux(_pglen_final, _vlen_next, pglen_max) 151 | pglen_final := _pglen_final 152 | sample_en := Bool(true) 153 | } 154 | 155 | when (io.op.fire) { /* initialization */ 156 | state := s_busy 157 | op := io.op.bits 158 | index := UInt(0) 159 | lead := Bool(true) 160 | } 161 | } 162 | 163 | class PBox1(implicit p: Parameters) extends VMUModule()(p) { 164 | val io = new VMUIssueIO { 165 | val ingress = Decoupled(new PredEntry).flip 166 | val sample = new VMUMaskIO_1 167 | } 168 | 169 | private val limit = tlDataBytes >> 1 170 | private val lglimit = tlByteAddrBits - 1 171 | require(limit == nStrip) 172 | require(isPow2(limit)) 173 | 174 | /* Number of doublewords per TileLink subblock */ 175 | private val lgslices = tlByteAddrBits - 3 176 | private val nslices = 1 << lgslices 177 | require(lgslices >= 0) 178 | 179 | val op = Reg(new VMUDecodedOp) 180 | private val mts = Seq((op.mt.b || op.mt.h), op.mt.w, op.mt.d) 181 | private val meta = io.sample.meta 182 | 183 | val index = Reg(UInt(width = lglimit)) 184 | val step_u = Cat(mts :+ UInt(0, lgslices)) 185 | val step = Mux(op.mode.unit, step_u, UInt(1)) 186 | val index_next = (index + step)(lglimit-1, 0) 187 | val index_end = (index_next === UInt(0)) 188 | 189 | when (io.sample.fire) { 190 | index := index_next 191 | } 192 | 193 | val hold = Reg(Bits(width = limit - 1)) 194 | when (io.ingress.fire) { 195 | hold := io.ingress.bits.pred(nStrip-1, nStrip-limit+1) 196 | } 197 | 198 | val surplus = Wire(Bool()) 199 | val pred = Mux(surplus, Bits(0), io.ingress.bits.pred) 200 | val preds = (0 until nStrip).map(pred(_)).toSeq 201 | 202 | val head = Cat(pred, hold) 203 | /* Equivalence: index + (UInt(limit) - 1 - eoff) */ 204 | val shift = Cat(UInt(0,1), index) + (~meta.eoff) 205 | val mask_data = (head >> shift)(limit-1, 0) 206 | 207 | /* Clear predicates unrelated to the current request */ 208 | //val mask_type_base = Seq(!op.mt.d, mts.head).map(_ && op.mode.unit) 209 | //val mask_type_head = (op.mode.unit, nslices - 1) 210 | //val mask_type_tail = mask_type_base.zipWithIndex.map { 211 | // case (m, i) => (m, 1 << (i + lgslices)) 212 | //} 213 | //val mask_type_full = Cat(( 214 | // Bool(true) +: /* First predicate is always relevant */ 215 | // ((mask_type_head +: mask_type_tail).map { 216 | // case (m, w) => Fill(w, m) /* Trailing predicates */ 217 | // })).reverse) 218 | //val mask_type = (mask_type_full << meta.epad)(limit-1, 0) 219 | 220 | /* For each possible VLDQ entry, generate a flag indicating its 221 | presence or absence, collated by data width */ 222 | val mask_vsdq_all = (1 until mts.size).scanRight( 223 | preds.grouped(nslices).map(_.reduce(_ || _)).toSeq) { 224 | case (_, xs) => xs.grouped(2).map(_.reduce(_ || _)).toSeq 225 | } 226 | /* Select flag associated with the current VSDQ entry */ 227 | val mask_vsdq_mux = mask_vsdq_all.map { xs => 228 | if (xs.size > 1) { 229 | val n = log2Up(xs.size) 230 | Vec(xs)(index(lglimit-1, lglimit-n)) 231 | } else xs.head 232 | } 233 | val mask_vsdq = Mux1H(mts, mask_vsdq_mux) 234 | 235 | /* Truncation condition for coalesced operations: 236 | * After the first strip, if the number of remaining elements is less 237 | * than or equal to the (non-zero) offset, then the final mask is 238 | * 
derived exclusively from the hold register. 239 | * Note that truncation only needs to be handled for the end of a 240 | * vector and not premature termination after an exception, since the 241 | * predicates are already available in the latter case. 242 | * 243 | * Alternative formulation: 244 | * (possibly higher logic depth due to ecnt signal) 245 | * val truncate = op.mode.unit && !meta.first && 246 | * (meta.ecnt <= meta.eoff) 247 | */ 248 | val remnant = (op.vlen + meta.eoff)(lglimit-1, 0) 249 | val truncate = op.mode.unit && 250 | (remnant <= meta.eoff) && (remnant =/= UInt(0)) 251 | surplus := meta.last && truncate 252 | 253 | io.sample.bits.data := mask_data 254 | io.sample.bits.vsdq := mask_vsdq && !surplus 255 | 256 | val ingress_deq = (index_end || meta.last) && !surplus 257 | val ingress_valid = surplus || io.ingress.valid 258 | 259 | private def fire(exclude: Bool, include: Bool*) = { 260 | val rvs = Seq(ingress_valid, io.sample.ready) 261 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 262 | } 263 | 264 | io.op.ready := Bool(false) 265 | io.ingress.ready := Bool(false) 266 | io.sample.valid := Bool(false) 267 | 268 | val s_idle :: s_busy :: Nil = Enum(UInt(), 2) 269 | val state = Reg(init = s_idle) 270 | 271 | switch (state) { 272 | is (s_idle) { 273 | io.op.ready := Bool(true) 274 | } 275 | 276 | is (s_busy) { 277 | io.ingress.ready := fire(ingress_valid, ingress_deq) 278 | io.sample.valid := fire(io.sample.ready) 279 | when (fire(null) && meta.last) { 280 | state := s_idle 281 | io.op.ready := Bool(true) 282 | } 283 | } 284 | } 285 | 286 | when (io.op.fire) { /* initialization */ 287 | state := s_busy 288 | op := io.op.bits 289 | hold := Bits(0) 290 | index := UInt(0) 291 | } 292 | } 293 | 294 | class PBox(implicit p: Parameters) extends VMUModule()(p) { 295 | val io = new Bundle { 296 | val op = Vec(2, Decoupled(new VMUDecodedOp)).flip 297 | val pred = Decoupled(new PredEntry).flip 298 | val mask = new VMUMaskIO 299 | } 300 | 301 | /* NOTE: As a consequence of limited buffer space, predicates enqueued 302 | * in predq1 must be matched by at least an equal increment of the VCU 303 | * counter in order to avoid potential deadlock. 
304 | */ 305 | val predq0 = Module(new Queue(io.pred.bits, nVMUPredQ)) 306 | predq0.suggestName("predq0Inst") 307 | val predq1 = Module(new Queue(io.pred.bits, nVMUPredQ)) 308 | predq1.suggestName("predq1Inst") 309 | val pbox0 = Module(new PBox0) 310 | pbox0.suggestName("pbox0Inst") 311 | val pbox1 = Module(new PBox1) 312 | pbox1.suggestName("pbox1Inst") 313 | 314 | predq0.io.enq <> io.pred 315 | 316 | pbox0.io.op <> io.op(0) 317 | pbox0.io.ingress <> predq0.io.deq 318 | 319 | predq1.io.enq <> pbox0.io.egress 320 | 321 | pbox1.io.op <> io.op(1) 322 | pbox1.io.ingress <> predq1.io.deq 323 | 324 | val anteq = Module(new Queue(pbox0.io.sample.bits, 2)) 325 | anteq.suggestName("anteqInst") 326 | anteq.io.enq <> pbox0.io.sample 327 | io.mask.ante <> anteq.io.deq 328 | io.mask.post <> pbox1.io.sample 329 | } 330 | 331 | /* Expand predicate set to byte-granular mask */ 332 | object PredicateByteMask { 333 | def apply[T <: Bits](pred: T, mt: DecodedMemType): Bits = { 334 | val sel = Seq(mt.b, mt.h, mt.w, mt.d) 335 | Mux1H(sel.zipWithIndex.map { case (s, i) => 336 | val len = pred.getWidth 337 | val w = 1 << i 338 | val n = (len >> i) - 1 339 | require((len > 0) && (n >= 0)) 340 | (s, FillInterleaved(w, pred(n, 0))) 341 | }) 342 | } 343 | } 344 | -------------------------------------------------------------------------------- /src/main/scala/vmu-sdata.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class VMUStoreCtrl(implicit p: Parameters) extends VMUBundle()(p) { 7 | val mode = new Bundle { 8 | val unit = Bool() 9 | } 10 | val base = UInt(width = tlByteAddrBits) 11 | val mt = new DecodedMemType 12 | } 13 | 14 | class VMUStoreMeta(implicit p: Parameters) extends VMUMetaStore with VMUMetaCount { 15 | val offset = UInt(width = tlByteAddrBits) 16 | } 17 | class VMUStoreIO(implicit p: Parameters) extends VSDQIO()(p) { 18 | val meta = new VMUStoreMeta().asInput 19 | 20 | } 21 | 22 | class SBox(implicit p: Parameters) extends VMUModule()(p) { 23 | val io = new Bundle { 24 | val ctrl = Valid(new VMUStoreCtrl).flip 25 | val lane = new VSDQIO().flip 26 | val mem = new VMUStoreIO 27 | } 28 | 29 | private val op = io.ctrl.bits 30 | private val mts = Seq(op.mt.d, op.mt.w, op.mt.h, op.mt.b) 31 | private val meta = io.mem.meta 32 | 33 | val vsdq = Module(new Queue(io.lane.bits, nVSDQ)) 34 | vsdq.suggestName("vsdqInst") 35 | vsdq.io.enq <> io.lane 36 | 37 | /* Byte mode: Ignore MSB */ 38 | private def saturate[T <: Bits](x: T) = 39 | Cat(x(tlByteAddrBits-1) & (!op.mt.b), x(tlByteAddrBits-2, 0)) 40 | 41 | val lead = Reg(init = Bool(true)) 42 | val index = Reg(init = UInt(0, tlByteAddrBits)) 43 | val index_step = Cat(mts) 44 | val index_next = saturate(index + index_step) 45 | val index_end = (index_next === UInt(0)) 46 | 47 | val offset_u = saturate(op.base) 48 | val offset = Mux(op.mode.unit, offset_u, meta.offset) 49 | 50 | /* NOTE: Due to read bandwidth constraints, only the lower half of a 51 | VSDQ entry (width tlDataBits/2) is populated for byte operations 52 | ("byte mode"). 
*/ 53 | private val bbyte = tlDataBits >> 1 54 | /* Byte mode: Number of relevant data bits in a partial VSDQ entry to 55 | save into the hold register */ 56 | private val bpart = bbyte - 8 57 | 58 | val hold = Reg(Bits(width = tlDataBits - 16)) 59 | when (vsdq.io.deq.fire) { 60 | /* Byte mode: Align relevant data bits to the upper end of the 61 | hold register */ 62 | hold := Cat(Mux(op.mt.b, 63 | vsdq.io.deq.bits.data(bpart+7, 8), 64 | vsdq.io.deq.bits.data(tlDataBits-1, tlDataBits-bpart)), 65 | /* Lower bits ignored during byte mode */ 66 | vsdq.io.deq.bits.data(tlDataBits-bpart-1, 16)) 67 | } 68 | 69 | val data_head = Cat(vsdq.io.deq.bits.data, hold, Bits(0, 8) /* pad */) 70 | /* Equivalence: index + (UInt(tlDataBytes) - 1 - offset) */ 71 | val shift = Cat(UInt(0,1), index) + (~offset) 72 | 73 | val data_align = (data_head >> Cat(shift, UInt(0,3)))(tlDataBits-1, 0) 74 | val data_hi = data_align(tlDataBits-1, bbyte) 75 | val data_lo = data_align(bbyte-1, 0) 76 | val data = Cat(Mux(op.mode.unit && op.mt.b, data_lo, data_hi), data_lo) 77 | io.mem.bits.data := data 78 | 79 | val bcnt = meta.ecnt.decode() << op.mt.shift() 80 | assert(!op.mt.b || (bcnt <= UInt(tlDataBytes >> 1)), 81 | "SBox: bcnt exceeds limit") 82 | val truncate = (bcnt <= offset_u) && !lead && meta.last 83 | 84 | val vsdq_deq = meta.vsdq && 85 | Mux(op.mode.unit, !truncate, index_end || meta.last) 86 | val vsdq_valid = !meta.vsdq || (op.mode.unit && truncate) || 87 | vsdq.io.deq.valid 88 | 89 | private def fire(exclude: Bool, include: Bool*) = { 90 | val rvs = Seq(vsdq_valid, io.mem.ready, io.ctrl.valid) 91 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 92 | } 93 | 94 | vsdq.io.deq.ready := fire(vsdq_valid, vsdq_deq) 95 | io.mem.valid := fire(io.mem.ready) 96 | 97 | when (fire(null)) { 98 | index := Mux(op.mode.unit || meta.last, UInt(0), index_next) 99 | lead := meta.last 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/vmu-table.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class TableWIO[T <: Data](gen: => T, sztag: Int) 7 | extends DecoupledIO(gen) { 8 | val tag = UInt(INPUT, sztag) 9 | 10 | } 11 | 12 | class TableRIO[T <: Data](gen: => T, sztag: Int) 13 | extends ValidIO(UInt(width = sztag)) { 14 | val record = gen.asInput 15 | 16 | } 17 | 18 | class Table[T <: Data](n: Int, gen: => T) extends Module { 19 | private val sztag = log2Up(n) 20 | val io = new Bundle { 21 | val r = new TableRIO(gen, sztag).flip 22 | val w = new TableWIO(gen, sztag).flip 23 | } 24 | 25 | val valid = Reg(init = Bits(0, n)) 26 | val array = Mem(n, gen) 27 | 28 | io.w.ready := !valid.andR 29 | 30 | private val rtag = io.r.bits 31 | private val wtag = io.w.tag 32 | wtag := CTZ(~valid, n) 33 | 34 | val wen = io.w.fire 35 | val ren = io.r.valid 36 | val valid_mask_r = ren << rtag 37 | val valid_mask_w = wen << wtag 38 | valid := (valid & (~valid_mask_r)) | valid_mask_w 39 | 40 | assert(!ren || valid(rtag), "table: invalid read tag") 41 | 42 | io.r.record := array(rtag) 43 | when (wen) { 44 | array(io.w.tag) := io.w.bits 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/vmu-tlb.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import 
freechips.rocketchip.rocket.{TLBReq, TLBResp, MStatus} 6 | 7 | abstract class TLBReqIO(implicit p: Parameters) extends VMUBundle()(p) { 8 | val status = new MStatus().asOutput 9 | val req = Decoupled(new TLBReq(log2Ceil(regBytes))) 10 | } 11 | 12 | class TLBIO(implicit p: Parameters) extends TLBReqIO()(p) { 13 | val resp = new Bundle { 14 | val ppn = UInt(INPUT, bPPN) 15 | val xcpt = Bool(INPUT) 16 | } 17 | 18 | def pgidx(dummy: Int = 0): UInt = this.req.bits.vaddr(bPgIdx-1, 0) 19 | def vpn(dummy: Int = 0): UInt = this.req.bits.vaddr(bVAddrExtended-1, bPgIdx) 20 | def paddr(dummy: Int = 0): UInt = Cat(this.resp.ppn, this.pgidx()) 21 | } 22 | 23 | class RocketTLBIO(implicit p: Parameters) extends TLBReqIO()(p) { 24 | val resp = new TLBResp().flip 25 | 26 | def bridge(client: TLBIO) { 27 | this.status := client.status 28 | this.req.bits := client.req.bits 29 | this.req.valid := client.req.valid 30 | client.req.ready := this.req.ready && !this.resp.miss 31 | client.resp.ppn := this.resp.paddr(bPAddr-1, bPgIdx) 32 | } 33 | } 34 | 35 | class TBox(n: Int)(implicit p: Parameters) extends VMUModule()(p) { 36 | val io = new Bundle { 37 | val inner = Vec(n, new TLBIO()).flip 38 | val outer = new RocketTLBIO 39 | val irq = new IRQIO 40 | } 41 | 42 | val arb = Wire(new TLBIO()) 43 | io.outer.bridge(arb) 44 | 45 | arb.status := PriorityMux(io.inner.map(x => x.req.valid -> x.status)) 46 | arb.req.bits := PriorityMux(io.inner.map(x => x.req.valid -> x.req.bits)) 47 | arb.req.valid := io.inner.map(_.req.valid).reduce(_ || _) 48 | 49 | val ready = io.inner.init.map(!_.req.valid).scanLeft(arb.req.ready)(_ && _) 50 | io.inner.zip(ready).foreach { case (i, r) => 51 | i.req.ready := r 52 | i.resp.ppn <> arb.resp.ppn 53 | i.resp.xcpt <> arb.resp.xcpt 54 | } 55 | 56 | val xcpts = Seq( 57 | io.outer.resp.ma.ld, 58 | io.outer.resp.ma.st, 59 | io.outer.resp.pf.ld, 60 | io.outer.resp.pf.st, 61 | io.outer.resp.ae.ld, 62 | io.outer.resp.ae.st) 63 | val irqs = Seq( 64 | io.irq.vmu.ma_ld, 65 | io.irq.vmu.ma_st, 66 | io.irq.vmu.pf_ld, 67 | io.irq.vmu.pf_st, 68 | io.irq.vmu.ae_ld, 69 | io.irq.vmu.ae_st) 70 | 71 | val fire = arb.req.fire 72 | irqs.zip(xcpts).foreach { case (irq, xcpt) => 73 | irq := xcpt && fire 74 | } 75 | io.irq.vmu.aux := arb.req.bits.vaddr 76 | arb.resp.xcpt := xcpts.reduce(_ || _) 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/vmu-util.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | 5 | // Bidirectional barrel shifter 6 | // Selects n-element field from 2n-element input 7 | class FunnelShifter[T <: Data](gen: T, n: Int) extends Module { 8 | private val lgn = log2Up(n) 9 | require(n == (1 << lgn)) 10 | 11 | val io = new Bundle { 12 | val in0 = Vec(n, gen.cloneType).asInput 13 | val in1 = Vec(n, gen.cloneType).asInput // left-shift input 14 | val out = Vec(n, gen.cloneType).asOutput 15 | val shift = SInt(INPUT, lgn + 1) 16 | } 17 | 18 | // Right shift by n 19 | private var data = Vec((1 until n).map(i => 20 | Mux(io.shift(lgn), io.in0(i), io.in1(i))) ++ io.in0) 21 | 22 | // Left shift by (n - 1) .. 
0 23 | for (stage <- (lgn - 1) to 0 by -1) { 24 | val m = (0 until stage).map(1 << _).sum 25 | val k = 1 << stage 26 | data = Vec.tabulate(n + m){ i => 27 | Mux(io.shift(stage), data(i), data(i + k)) 28 | } 29 | } 30 | io.out := data 31 | } 32 | 33 | // Rotates n input elements into m output slots 34 | class Rotator[T <: Data](gen: T, n: Int, m: Int, rev: Boolean = false) extends Module { 35 | require(n <= m) 36 | val io = new Bundle { 37 | val in = Vec(n, gen.cloneType).asInput 38 | val out = Vec(m, gen.cloneType).asOutput 39 | val sel = UInt(INPUT, log2Up(m)) 40 | } 41 | 42 | var barrel = io.in 43 | for (stage <- 0 until log2Up(m)) { 44 | val shift = 1 << stage 45 | val len = math.min(barrel.length + shift, m) 46 | barrel = Vec.tabulate(len){ i => { 47 | // k: source index with rotation enabled 48 | // i: source index with rotation disabled 49 | val k = if (rev) (i + shift) % m // shift backward 50 | else ((i - shift) + m) % m // shift forward 51 | if (i < barrel.length && k < barrel.length) { 52 | Mux(io.sel(stage), barrel(k), barrel(i)) 53 | } else { 54 | // If either entry does not exist, use the other. 55 | if (i < barrel.length) barrel(i) else barrel(k) 56 | } 57 | }} 58 | } 59 | io.out := barrel 60 | } 61 | 62 | object EnableDecoder { 63 | def apply[T <: UInt](in: T, n: Int): UInt = { 64 | val lgn = log2Up(n) 65 | val lut = Vec( 66 | (0 until n).map(i => Bits((1 << i) - 1, n)) ++ 67 | Seq.fill((1 << lgn) - n)(Fill(n, Bool(true)))) 68 | val mask = ((in >> lgn) =/= UInt(0)) 69 | lut(in(lgn-1, 0)) | Fill(n, mask) 70 | } 71 | } 72 | 73 | object Ceil { 74 | def apply[T <: UInt](in: T, shift: Int): UInt = 75 | if (shift == 0) in else 76 | ((in >> shift) + in(shift-1, 0).orR.asUInt) 77 | } 78 | 79 | /* Count trailing zeroes */ 80 | object CTZ { 81 | private def mux[T <: Data](in: Seq[(Bool, T)]): (Bool, T) = { 82 | /* Returns the last (lowest-priority) item if none are selected */ 83 | val elt = in.init.foldRight(in.last._2) { 84 | case ((sel, elt0), elt1) => Mux(sel, elt0, elt1) 85 | } 86 | val sel = in.map(_._1).reduce(_ || _) 87 | (sel, elt) 88 | } 89 | 90 | private def tree[T <: Data](in: Seq[(Bool, T)]): Seq[(Bool, T)] = { 91 | val stage = in.grouped(2).map(mux(_)).toSeq 92 | if (stage.size > 1) tree(stage) else stage 93 | } 94 | 95 | def apply[T <: Bits](in: T, n: Int): UInt = { 96 | val init = (0 until n).map(i => (in(i), UInt(i))) :+ 97 | (Bool(true), UInt(n)) /* Result for zero input */ 98 | tree(init).head._2 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/main/scala/vmu.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | import freechips.rocketchip.tilelink.TLEdgeOut 6 | import freechips.rocketchip.tile.TileVisibilityNodeKey 7 | 8 | case object HwachaNVVAQEntries extends Field[Int] 9 | case object HwachaNVPAQEntries extends Field[Int] 10 | case object HwachaNVSDQEntries extends Field[Int] 11 | case object HwachaNVLDQEntries extends Field[Int] 12 | case object HwachaNVMTEntries extends Field[Int] 13 | 14 | trait MemParameters extends UsesHwachaParameters 15 | with freechips.rocketchip.tile.HasCoreParameters { 16 | val bVAddr = vaddrBits 17 | val bPAddr = paddrBits 18 | val bVAddrExtended = bVAddr + (if (bVAddr < regLen) 1 else 0) 19 | 20 | val bVPN = vpnBits 21 | val bPPN = ppnBits 22 | val bPgIdx = pgIdxBits 23 | val pgSize = 1 << bPgIdx 24 | 25 | val tlDataBytes = 16 26 | val tlByteAddrBits = 
log2Up(tlDataBytes) 27 | val tlDataBits = tlDataBytes*8 28 | 29 | 30 | require(((tlDataBytes*8) & (regLen - 1)) == 0) 31 | } 32 | 33 | trait VMUParameters extends MemParameters { 34 | val nVMUQ = 2 35 | val nVMUIQ = 2 36 | val nVVAQ = p(HwachaNVVAQEntries) 37 | val nVPAQ = p(HwachaNVPAQEntries) 38 | val nVSDQ = p(HwachaNVSDQEntries) 39 | val nVLDQ = p(HwachaNVLDQEntries) 40 | val nVMUPredQ = 4 41 | 42 | val nVMT = p(HwachaNVMTEntries) 43 | val bVMUTag = log2Up(nVMT) 44 | 45 | /* Maximum of two ongoing operations in the VMU */ 46 | val maxVCU = maxVLen << 1 47 | val bVCU = bVLen + 1 48 | 49 | val bSRet = log2Down(tlDataBytes) + 1 50 | } 51 | 52 | class VMUIO(implicit p: Parameters) extends HwachaBundle()(p) { 53 | val vaq = new VVAQIO 54 | val vsdq = new VSDQIO 55 | val vldq = new VLDQIO().flip 56 | 57 | val pred = Decoupled(new PredEntry) 58 | val pala = new CounterLookAheadIO 59 | val vlu = new VLUSelectIO 60 | } 61 | 62 | class VMUDecodedOp(implicit p: Parameters) extends VMUOp()(p) with VMUMetaIndex { 63 | val mode = new Bundle { 64 | val unit = Bool() 65 | val indexed = Bool() 66 | } 67 | val cmd = new DecodedMemCommand 68 | val mt = new DecodedMemType 69 | 70 | val first = Bool() 71 | } 72 | 73 | object VMUDecodedOp extends HwachaConstants { 74 | def apply(op: VMUOp)(implicit p: Parameters): VMUDecodedOp = { 75 | val dec = Wire(new VMUDecodedOp) 76 | dec.suggestName("decWire") 77 | dec.fn := op.fn 78 | dec.vlen := op.vlen 79 | dec.base := op.base 80 | dec.stride := op.stride 81 | dec.status := op.status 82 | 83 | dec.mode.unit := vmu_unit(op.fn.mode) 84 | dec.mode.indexed := vmu_indexed(op.fn.mode) 85 | 86 | dec.cmd := DecodedMemCommand(op.fn.cmd) 87 | dec.mt := DecodedMemType(op.fn.mt) 88 | 89 | dec.eidx := UInt(0) 90 | dec.first := Bool(true) 91 | dec 92 | } 93 | } 94 | 95 | class VMUIssueIO(implicit p: Parameters) extends HwachaBundle()(p) { 96 | val op = Decoupled(new VMUDecodedOp).flip 97 | } 98 | class IBoxIO(implicit p: Parameters) extends VMUIssueIO()(p) { 99 | val issue = Vec(4, Decoupled(new VMUDecodedOp)) 100 | val aret = Bool() 101 | 102 | def span(sink: DecoupledIO[VMUDecodedOp]*) = { 103 | val src = Wire(Decoupled(new VMUDecodedOp)).suggestName("srcWire") 104 | val rvs = (src.ready, src.valid) +: sink.map(x => (x.valid, x.ready)) 105 | rvs.foreach { case (x, y) => 106 | x := rvs.map(_._2).filter(_ ne y).reduce((a,b) => (a && b).suggestName("andWire")) 107 | x.suggestName("andRvs") 108 | } 109 | sink.foreach { case x => x.bits := src.bits } 110 | src 111 | } 112 | } 113 | 114 | class IBox(implicit p: Parameters) extends VMUModule()(p) { 115 | val io = new Bundle { 116 | val id = UInt(INPUT) 117 | val op = Decoupled(new VMUOp).flip 118 | val cfg = new HwachaConfigIO().flip 119 | val agu = new AGUIO 120 | val abox = Vec(3, Decoupled(new VMUDecodedOp)) 121 | val pbox = Vec(2, Decoupled(new VMUDecodedOp)) 122 | val aret = Bool() 123 | } 124 | 125 | val opq = Module(new Queue(io.op.bits, nVMUQ)) 126 | opq.suggestName("opqInst") 127 | opq.io.enq <> io.op 128 | val op = VMUDecodedOp(opq.io.deq.bits) 129 | 130 | val agent = if (nLanes > 1) { 131 | val _agent = Module(new IBoxML) 132 | _agent.suggestName("_agentInst") 133 | _agent.io.id := io.id 134 | _agent.io.cfg <> io.cfg 135 | io.agu <> _agent.io.agu 136 | _agent.io 137 | } else (Module(new IBoxSL)).io 138 | 139 | agent.op.bits := op 140 | agent.op.valid := opq.io.deq.valid 141 | opq.io.deq.ready := agent.op.ready 142 | io.aret := agent.aret 143 | 144 | val issue = Seq(io.abox(0), io.pbox(0), 145 | agent.span(io.abox(1), 
io.pbox(1)), io.abox(2)) 146 | issue zip agent.issue map {case(s,d) => s <> d} 147 | } 148 | 149 | class IBoxSL(implicit p: Parameters) extends VMUModule()(p) { 150 | val io = new IBoxIO 151 | 152 | val mask = Reg(init = Vec.fill(io.issue.size){Bool(false)}) 153 | io.issue.zipWithIndex.map { case (box, i) => 154 | val _mask = mask(i) 155 | box.bits := io.op.bits 156 | box.valid := io.op.valid && !_mask 157 | val fire = io.op.valid && box.ready 158 | _mask := _mask || fire 159 | } 160 | 161 | io.op.ready := mask.asUInt.andR 162 | when (io.op.ready) { 163 | mask.map(_ := Bool(false)) 164 | io.aret := true.B 165 | } 166 | } 167 | 168 | class IBoxML(implicit p: Parameters) extends VMUModule()(p) { 169 | val io = new IBoxIO { 170 | val id = UInt(INPUT, width = bLanes) 171 | val cfg = new HwachaConfigIO().flip 172 | val agu = new AGUIO 173 | } 174 | 175 | val op = Reg(new VMUDecodedOp) 176 | val indexed = io.op.bits.mode.indexed 177 | 178 | val idMask = Reg(UInt(width = bLanes)) 179 | 180 | // Each bit position of id is put into priority mux with priority of its bit position 181 | // whether its requested in the queue is dependent on the value at that bit position 182 | // we keep track of which was the most recent position issued and mask any higher bit positions off 183 | val pMux = PriorityMux( 184 | Reverse(io.id) & Reverse(idMask), 185 | ((bLanes - 1) to 0 by -1).map(UInt(_))) 186 | 187 | 188 | val shift = Wire(UInt(width = io.agu.in.bits.shift.getWidth)) 189 | shift := UInt(bLanes) 190 | 191 | io.agu.in.valid := Bool(false) 192 | io.agu.in.bits.base := op.base 193 | io.agu.in.bits.offset := Cat(io.op.bits.stride, UInt(0, bStrip)) 194 | io.agu.in.bits.shift := io.cfg.lstride + shift 195 | 196 | val ecnt_max = io.cfg.lstrip 197 | val eidx_next = op.eidx + ecnt_max 198 | val vlen_next = op.vlen.zext - ecnt_max.zext 199 | val vlen_end = (vlen_next <= SInt(0)) 200 | val ecnt = Mux(vlen_end, op.vlen(bfLStrip-1, 0), ecnt_max) 201 | 202 | 203 | val qcntr = Reg(init = 0.U((log2Up(nVMUIQ + 2)).W)) 204 | val qcnts = Wire(Vec(io.issue.size, UInt(width = log2Up(nVMUIQ + 1)))) 205 | val aret_pending = Reg(init = Bool(false)) 206 | val enq = io.span(io.issue.zipWithIndex.map { case (deq, i) => 207 | val q = Module(new Queue(new VMUDecodedOp, nVMUIQ)) 208 | qcnts(i) := q.io.count 209 | q.suggestName("qInst") 210 | deq <> q.io.deq 211 | q.io.enq 212 | }:_*) 213 | enq.bits := io.op.bits 214 | enq.bits.vlen := ecnt 215 | enq.bits.base := op.base 216 | enq.bits.eidx := op.eidx 217 | enq.bits.first := op.first 218 | enq.bits.status := op.status 219 | 220 | io.op.ready := Bool(false) 221 | io.aret := Bool(false) 222 | enq.valid := Bool(false) 223 | 224 | when(io.issue(3).fire) { 225 | qcntr := Mux(qcntr === 0.U, 0.U, (qcntr.zext - 1.S).asUInt) 226 | io.aret := qcntr === 1.U || aret_pending 227 | aret_pending := Bool(false) 228 | } 229 | 230 | val s_idle :: s_busy :: s_setup :: Nil = Enum(UInt(), 3) 231 | val state = Reg(init = s_idle) 232 | 233 | switch (state) { 234 | is (s_idle) { 235 | when (io.op.valid) { 236 | op := io.op.bits 237 | when(io.id =/= UInt(0)) { 238 | idMask := ~UInt(0, bLanes) 239 | state := Mux(indexed, s_busy, s_setup) 240 | } .otherwise { 241 | state := s_busy 242 | } 243 | } 244 | } 245 | 246 | is (s_busy) { 247 | io.agu.in.valid := !indexed 248 | enq.valid := !aret_pending && (indexed || io.agu.out.valid) 249 | 250 | when (enq.fire) { 251 | when (!indexed) { 252 | op.base := io.agu.out.bits.addr 253 | } 254 | op.vlen := vlen_next.asUInt 255 | op.eidx := eidx_next 256 | op.first := 
Bool(false) 257 | when (vlen_end) { 258 | state := s_idle 259 | io.op.ready := Bool(true) 260 | // Last queue is abox2 deepest stage 261 | // +1+1 because we are enqing this cycle and need to wait for the next op to be eaten by abox2 262 | qcntr := qcnts(3) + 1.U + Mux(io.issue(3).fire, 0.U, 1.U) 263 | // aret after next issue3.fire 264 | aret_pending := qcntr =/= 0.U 265 | assert(qcntr <= UInt(1), "IBox: qcntr too large. aret broken") 266 | } 267 | } 268 | } 269 | 270 | is (s_setup) { 271 | when (io.id =/= UInt(0)) { 272 | shift := pMux 273 | io.agu.in.valid := Bool(true) 274 | when (io.agu.out.valid) { 275 | op.base := io.agu.out.bits.addr 276 | val newMask = idMask & ((UInt(1) << pMux) - UInt(1)) 277 | idMask := newMask 278 | when (newMask === UInt(0) || !((newMask & io.id).orR)) { 279 | state := s_busy 280 | } 281 | } 282 | } 283 | } 284 | } 285 | } 286 | 287 | class VMU(resetSignal: Bool = null)(implicit p: Parameters) 288 | extends VMUModule(_reset = resetSignal)(p) { 289 | val io = new Bundle { 290 | val id = UInt(INPUT) 291 | val op = Decoupled(new VMUOp).flip 292 | val cfg = new HwachaConfigIO().flip 293 | val lane = new VMUIO().flip 294 | val tlb = new RocketTLBIO 295 | val memif = new VMUMemIO 296 | 297 | val sret = new CounterUpdateIO(bSRet) 298 | val aret = Bool() 299 | val irq = new IRQIO 300 | val xcpt = new XCPTIO().flip 301 | } 302 | 303 | private val confml = (nLanes > 1) 304 | val ibox = Module(new IBox) 305 | ibox.suggestName("iboxInst") 306 | val pbox = Module(new PBox) 307 | pbox.suggestName("pboxInst") 308 | val abox = Module(new ABox) 309 | abox.suggestName("aboxInst") 310 | val tbox = Module(new TBox(1)) 311 | tbox.suggestName("tboxInst") 312 | val sbox = Module(new SBox) 313 | sbox.suggestName("sboxInst") 314 | val vldq = Module(new Queue(io.lane.vldq.bits, nVLDQ)) 315 | vldq.suggestName("vldqInst") 316 | val mbox = Module(new MBox) 317 | mbox.suggestName("mboxInst") 318 | val agu = Module(new AGU(if (confml) 2 else 1)) 319 | agu.suggestName("aguInst") 320 | 321 | ibox.io.id := io.id 322 | ibox.io.op <> io.op 323 | ibox.io.cfg <> io.cfg 324 | io.aret <> ibox.io.aret 325 | if (confml) agu.io.ports(1) <> ibox.io.agu 326 | 327 | pbox.io.op <> ibox.io.pbox 328 | pbox.io.pred <> io.lane.pred 329 | 330 | abox.io.op <> ibox.io.abox 331 | abox.io.mask <> pbox.io.mask 332 | abox.io.lane <> io.lane.vaq 333 | abox.io.xcpt <> io.xcpt 334 | abox.io.la <> io.lane.pala 335 | abox.io.load <> io.lane.vlu 336 | agu.io.ports(0) <> abox.io.agu 337 | 338 | tbox.io.inner(0) <> abox.io.tlb 339 | io.tlb <> tbox.io.outer 340 | 341 | io.irq <> tbox.io.irq 342 | 343 | sbox.io.ctrl <> abox.io.store 344 | sbox.io.lane <> io.lane.vsdq 345 | io.lane.vldq <> vldq.io.deq 346 | 347 | mbox.io.inner.abox <> abox.io.mem 348 | mbox.io.inner.sbox <> sbox.io.mem 349 | vldq.io.enq <> mbox.io.inner.lbox 350 | io.sret <> mbox.io.sret 351 | 352 | io.memif <> mbox.io.outer 353 | } 354 | -------------------------------------------------------------------------------- /src/main/scala/vxu.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class VXU(implicit p: Parameters) extends VXUModule()(p) { 7 | val io = new Bundle { 8 | val id = UInt(INPUT) 9 | val cfg = new HwachaConfigIO().flip 10 | val issue = Decoupled(new IssueOp).flip 11 | val mseq = new MasterSequencerIO().flip 12 | val mocheck = Vec(nSeq, new MOCheck).asInput 13 | val red = new ReduceResultIO 14 | val vmu = new VMUIO 15 
| val mrt = new LaneMRTIO 16 | } 17 | 18 | val seq = Module(new LaneSequencer) 19 | seq.suggestName("seqInst") 20 | val exp = Module(new Expander) 21 | exp.suggestName("expInst") 22 | val lane = Module(new Lane) 23 | lane.suggestName("laneInst") 24 | val dcc = Module(new DecoupledCluster) 25 | dcc.suggestName("dccInst") 26 | 27 | seq.io.cfg <> io.cfg 28 | seq.io.lid := io.id 29 | exp.io.cfg <> io.cfg 30 | lane.io.cfg <> io.cfg 31 | lane.io.id := io.id 32 | dcc.io.cfg <> io.cfg 33 | 34 | val enq_dcc = io.issue.bits.active.enq_dcc() 35 | val mask_dcc_ready = !enq_dcc || dcc.io.op.ready 36 | 37 | def fire(exclude: Bool, include: Bool*) = { 38 | val rvs = Seq(io.issue.valid, mask_dcc_ready) 39 | (rvs.filter(_ ne exclude) ++ include).reduce(_ && _) 40 | } 41 | 42 | io.issue.ready := fire(io.issue.valid) 43 | seq.io.op.valid := fire(null) 44 | dcc.io.op.valid := fire(mask_dcc_ready, enq_dcc) 45 | 46 | seq.io.op.bits := io.issue.bits 47 | dcc.io.op.bits.vlen := io.issue.bits.vlen 48 | dcc.io.op.bits.active := io.issue.bits.active 49 | dcc.io.op.bits.fn := io.issue.bits.fn 50 | dcc.io.op.bits.vd := io.issue.bits.base.vd 51 | dcc.io.op.bits.vd.id := io.issue.bits.reg.vd.id 52 | 53 | seq.io.master <> io.mseq 54 | seq.io.mocheck <> io.mocheck 55 | 56 | exp.io.seq <> seq.io.seq 57 | lane.io.op <> exp.io.lane 58 | 59 | seq.io.ticker <> exp.io.ticker 60 | seq.io.lack <> lane.io.ack 61 | seq.io.dack <> dcc.io.ack 62 | 63 | dcc.io.dpla <> seq.io.dpla 64 | dcc.io.dqla <> seq.io.dqla 65 | dcc.io.dila <> seq.io.dila 66 | dcc.io.dfla <> seq.io.dfla 67 | dcc.io.gpla <> seq.io.gpla 68 | dcc.io.gqla <> seq.io.gqla 69 | dcc.io.pla <> seq.io.pla 70 | dcc.io.lla <> seq.io.lla 71 | dcc.io.sla <> seq.io.sla 72 | 73 | dcc.io.lpqs <> lane.io.lpqs 74 | dcc.io.lrqs <> lane.io.lrqs 75 | dcc.io.bpqs <> lane.io.bpqs 76 | dcc.io.brqs <> lane.io.brqs 77 | lane.io.bwqs.mem <> dcc.io.bwqs.mem 78 | lane.io.bwqs.fu <> dcc.io.bwqs.fu 79 | //lane.io.bwqs <> dcc.io.bwqs 80 | io.red <> dcc.io.red 81 | 82 | io.vmu <> dcc.io.vmu 83 | io.vmu.pala <> seq.io.vmu.pala 84 | 85 | io.mrt.lreq <> seq.io.lreq 86 | io.mrt.sreq <> seq.io.sreq 87 | io.mrt.areq <> seq.io.areq 88 | io.mrt.lret.cnt := dcc.io.lla.cnt 89 | io.mrt.lret.update := dcc.io.lla.reserve 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/xcpt.scala: -------------------------------------------------------------------------------- 1 | package hwacha 2 | 3 | import Chisel._ 4 | import org.chipsalliance.cde.config._ 5 | 6 | class XCPTIO(implicit p: Parameters) extends HwachaBundle()(p) { 7 | val prop = new Bundle { 8 | val vu = new Bundle { 9 | val busy = Bool(OUTPUT) 10 | val flush_top = Bool(OUTPUT) 11 | val flush_kill = Bool(OUTPUT) 12 | val flush_aiw = Bool(OUTPUT) 13 | val flush_vxu = Bool(OUTPUT) 14 | val flush_vru = Bool(OUTPUT) 15 | val flush_vmu = Bool(OUTPUT) 16 | } 17 | 18 | val top = new Bundle { 19 | val stall = Bool(OUTPUT) 20 | } 21 | 22 | val issue = new Bundle { 23 | val stall = Bool(OUTPUT) 24 | } 25 | 26 | val seq = new Bundle { 27 | val stall = Bool(OUTPUT) 28 | } 29 | 30 | val vmu = new Bundle { 31 | val stall = Bool(OUTPUT) 32 | val drain = Bool(OUTPUT) 33 | } 34 | 35 | val evac = new Bundle { 36 | val start = Bool(OUTPUT) 37 | val addr = UInt(OUTPUT, regLen) 38 | } 39 | } 40 | val report = new Bundle { 41 | val exp = new Bundle { 42 | val empty = Bool(INPUT) 43 | } 44 | 45 | val mrt = new Bundle { 46 | val pending = Bool(INPUT) 47 | } 48 | 49 | val evac = new Bundle { 50 | val done = Bool(INPUT) 51 | 
} 52 | } 53 | } 54 | 55 | class XCPT(implicit p: Parameters) extends HwachaModule()(p) { 56 | val io = new HwachaBundle { 57 | val rocc = new Bundle { 58 | val exception = Bool(INPUT) 59 | val evac = Bool(INPUT) 60 | val evac_addr = UInt(INPUT, regLen) 61 | val hold = Bool(INPUT) 62 | val kill = Bool(INPUT) 63 | } 64 | 65 | val vu = new XCPTIO 66 | } 67 | 68 | val hold_top = Reg(init = Bool(false)) 69 | val hold_vu = Reg(init = Bool(false)) 70 | val hold_tlb = Reg(init = Bool(false)) 71 | 72 | // output assignments 73 | io.vu.prop.top.stall := hold_top 74 | io.vu.prop.issue.stall := hold_vu 75 | io.vu.prop.seq.stall := hold_vu 76 | io.vu.prop.vmu.stall := hold_tlb 77 | io.vu.prop.vmu.drain := Bool(false) 78 | 79 | val NORMAL = Bits(0, 3) 80 | val XCPT_DRAIN = Bits(1, 3) 81 | val XCPT_FLUSH = Bits(2, 3) 82 | val XCPT_EVAC = Bits(3, 3) 83 | val XCPT_DRAIN_EVAC = Bits(4, 3) 84 | val HOLD = Bits(5, 3) 85 | 86 | val state = Reg(init = NORMAL) 87 | val addr = Reg(init = UInt(0, regLen)) 88 | val evac = Reg(init = Bool(false)) 89 | val kill = Reg(init = Bool(false)) 90 | 91 | when (io.rocc.evac) { 92 | evac := Bool(true) 93 | addr := io.rocc.evac_addr 94 | } 95 | 96 | when (io.rocc.kill) { 97 | kill := Bool(true) 98 | } 99 | 100 | io.vu.prop.vu.busy := (state =/= NORMAL) && (state =/= HOLD) 101 | io.vu.prop.vu.flush_top := Bool(false) 102 | io.vu.prop.vu.flush_kill := Bool(false) 103 | io.vu.prop.vu.flush_aiw := Bool(false) 104 | io.vu.prop.vu.flush_vxu := Bool(false) 105 | io.vu.prop.vu.flush_vru := Bool(false) 106 | io.vu.prop.vu.flush_vmu := Bool(false) 107 | 108 | io.vu.prop.evac.start := Bool(false) 109 | io.vu.prop.evac.addr := addr 110 | 111 | switch (state) { 112 | 113 | is (NORMAL) { 114 | when (io.rocc.exception) { 115 | hold_top := Bool(true) 116 | hold_vu := Bool(true) 117 | hold_tlb := Bool(true) 118 | 119 | evac := Bool(false) 120 | kill := Bool(false) 121 | 122 | state := XCPT_DRAIN 123 | } 124 | 125 | when (io.rocc.hold) { 126 | hold_vu := Bool(true) 127 | hold_tlb := Bool(true) 128 | 129 | state := HOLD 130 | } 131 | } 132 | 133 | is (XCPT_DRAIN) { 134 | when (io.vu.report.exp.empty && !io.vu.report.mrt.pending) { 135 | hold_top := Bool(false) 136 | 137 | state := XCPT_FLUSH 138 | } 139 | } 140 | 141 | is (XCPT_FLUSH) { 142 | io.vu.prop.vu.flush_top := Bool(true) 143 | io.vu.prop.vu.flush_vxu := Bool(true) 144 | io.vu.prop.vu.flush_vru := Bool(true) 145 | io.vu.prop.vu.flush_vmu := Bool(true) 146 | 147 | when (kill) { 148 | io.vu.prop.vu.flush_kill := Bool(true) 149 | io.vu.prop.vu.flush_aiw := Bool(true) 150 | } 151 | 152 | when (evac) { 153 | hold_tlb := Bool(false) 154 | 155 | state := XCPT_EVAC 156 | } 157 | 158 | when (kill) { 159 | hold_vu := Bool(false) 160 | hold_tlb := Bool(false) 161 | kill := Bool(false) 162 | 163 | state := NORMAL 164 | } 165 | } 166 | 167 | is (XCPT_EVAC) { 168 | io.vu.prop.evac.start := Bool(true) 169 | io.vu.prop.vmu.drain := Bool(true) 170 | 171 | when (io.vu.report.evac.done) { 172 | state := XCPT_DRAIN_EVAC 173 | } 174 | } 175 | 176 | is (XCPT_DRAIN_EVAC) { 177 | io.vu.prop.vmu.drain := Bool(true) 178 | 179 | when (!io.vu.report.mrt.pending) { 180 | hold_vu := Bool(false) 181 | hold_tlb := Bool(false) 182 | evac := Bool(false) 183 | 184 | state := NORMAL 185 | } 186 | } 187 | 188 | is (HOLD) { 189 | when (!io.rocc.hold) { 190 | hold_vu := Bool(false) 191 | hold_tlb := Bool(false) 192 | 193 | state := NORMAL 194 | } 195 | } 196 | 197 | } 198 | } 199 | --------------------------------------------------------------------------------
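Editor's note: RPredMaster, MBox, PBox0/PBox1, SBox, and VXU above each define a local `fire(exclude: Bool, include: Bool*)` helper for the same multi-way ready/valid handshake: a transfer occurs only when every participating condition holds, each caller's own term is filtered out of the conjunction so that `ready` never depends combinationally on its matching `valid`, and the `include` varargs adds caller-specific qualifiers (e.g. `deq_lane(i)` in RPredMaster or `pred` in MBox). The following is a minimal, self-contained sketch of the idiom in the same Chisel dialect; the module, its ports, and the routing condition are invented for illustration and are not part of the repository.

```scala
package hwacha

import Chisel._

// Hypothetical two-sink dispatcher illustrating the fire(exclude, include*)
// handshake idiom used throughout the VMU/VXU sources above.
class FireIdiomExample extends Module {
  val io = new Bundle {
    val in   = Decoupled(UInt(width = 8)).flip
    val sel  = Bool(INPUT)            // out1 participates only when set
    val out0 = Decoupled(UInt(width = 8))
    val out1 = Decoupled(UInt(width = 8))
  }

  // out1's readiness only matters when it is actually being targeted.
  val out1_ready = !io.sel || io.out1.ready

  // Every condition that must hold simultaneously for the transfer to occur.
  private def fire(exclude: Bool, include: Bool*) = {
    val rvs = Seq(io.in.valid, io.out0.ready, out1_ready)
    // Drop the caller's own term, then AND in any caller-specific qualifiers.
    (rvs.filter(_ ne exclude) ++ include).reduce(_ && _)
  }

  io.in.ready   := fire(io.in.valid)
  io.out0.valid := fire(io.out0.ready)
  io.out1.valid := fire(out1_ready, io.sel) // qualified, like fire(req_ready, pred) in MBox
  io.out0.bits  := io.in.bits
  io.out1.bits  := io.in.bits
}
```

Filtering by reference (`_ ne exclude`) rather than by value is what lets each caller drop exactly its own term from the shared conjunction.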