├── .github └── workflows │ └── scala.yml ├── .gitmodules ├── .scalafmt.conf ├── LICENSE ├── Makefile ├── README.md ├── build.sc ├── common.sc └── src ├── main └── scala │ ├── Consts.scala │ ├── config │ ├── Parameters.scala │ ├── SubsystemConfig.scala │ └── SystemConfig.scala │ ├── core │ ├── ALU.scala │ ├── AMOALU.scala │ ├── BranchUnit.scala │ ├── Cache.scala │ ├── DataStruct.scala │ ├── Dispatch.scala │ ├── ICache.scala │ ├── IDecodeUnit.scala │ ├── IFetch.scala │ ├── Instructions.scala │ ├── Issue.scala │ ├── PTW.scala │ ├── SGPR.scala │ ├── SIMTStack.scala │ ├── Scoreboard.scala │ ├── TLB.scala │ ├── VGPR.scala │ ├── VectorALU.scala │ ├── WarpScheduler.scala │ └── Writeback.scala │ ├── dispatcher │ ├── DispatcherBundle.scala │ ├── JobDispatcher.scala │ └── TLQM.scala │ ├── lib │ └── Sram.scala │ ├── package.scala │ ├── smmu │ └── SMMU.scala │ ├── system │ └── SoC.scala │ ├── tile │ ├── CuTaskBundle.scala │ └── WorkGroupScheduler.scala │ └── util │ ├── AddrBits.scala │ └── PipelineReg.scala └── test ├── data └── test1 │ ├── add.asm │ ├── add.hex │ ├── add_0.txt │ ├── add_1.txt │ ├── add_2.txt │ └── add_3.txt └── scala ├── AXI4RamTest.scala ├── DCacheTest.scala ├── ICacheTest.scala ├── IFetchTest.scala ├── PTWTest.scala ├── TLBTest.scala └── WarpSchedulerTest.scala /.github/workflows/scala.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | name: Scala CI 7 | 8 | on: 9 | push: 10 | branches: [ "main" ] 11 | pull_request: 12 | branches: [ "main" ] 13 | 14 | permissions: 15 | contents: read 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Set up JDK 11 25 | uses: actions/setup-java@v3 26 | with: 27 | java-version: '11' 28 | distribution: 'temurin' 29 | - name: Download mill 30 | run: 31 | curl -L https://github.com/com-lihaoyi/mill/releases/download/0.10.10/0.10.10 > mill.0.10 && chmod +x mill.0.10 32 | - name: Run tests 33 | run: 34 | export PATH=`pwd`:${PATH} && make test 35 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "rocket-chip"] 2 | path = rocket-chip 3 | url = https://github.com/chipsalliance/rocket-chip.git 4 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.7.11 2 | maxColumn = 120 3 | runner.dialect=scala213 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OpenGPGPU 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 
in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | init:
2 | 	git submodule update --init
3 | 	cd rocket-chip && git submodule update --init
4 | 
5 | test: init format fix
6 | 	mill -i ogpu\[chisel\].test
7 | 
8 | format:
9 | 	mill -i ogpu\[chisel\].reformat
10 | 	mill -i ogpu\[chisel\].test.reformat
11 | 
12 | fix:
13 | 	mill -i ogpu\[chisel\].fix
14 | 	mill -i ogpu\[chisel\].test.fix
15 | 
16 | count:
17 | 	mill -i ogpu\[chisel\].printLineCount
18 | 
19 | .PHONY: test
20 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OPENGPGPU
2 | 
3 | A simple GPGPU for study purposes.
4 | 
--------------------------------------------------------------------------------
/build.sc:
--------------------------------------------------------------------------------
1 | import $ivy.`com.goyeau::mill-scalafix_mill0.11:0.3.1`
2 | import com.goyeau.mill.scalafix.ScalafixModule
3 | import coursier.maven.MavenRepository
4 | 
5 | import mill._
6 | import scalalib._
7 | import scalafmt._
8 | import $file.`rocket-chip`.common
9 | import $file.`rocket-chip`.dependencies.cde.common
10 | import $file.`rocket-chip`.dependencies.hardfloat.common
11 | import $file.`rocket-chip`.dependencies.diplomacy.common
12 | import $file.common
13 | 
14 | 
15 | val defaultScalaVersion = "2.13.12"
16 | 
17 | 
18 | def defaultVersions(chiselVersion: String) = chiselVersion match {
19 | case "chisel" => Map(
20 | "chisel" -> ivy"org.chipsalliance::chisel:6.1.0",
21 | "chisel-plugin" -> ivy"org.chipsalliance:::chisel-plugin:6.1.0",
22 | "chiseltest" -> ivy"edu.berkeley.cs::chiseltest:6.0.0",
23 | "sourcecode" -> ivy"com.lihaoyi::sourcecode:0.3.1"
24 | )
25 | }
26 | 
27 | trait HasChisel extends ScalaModule with Cross.Module[String] {
28 | 
29 | 
30 | def repositoriesTask = T.task {
31 | super.repositoriesTask() ++ Seq(MavenRepository("https://oss.sonatype.org/content/repositories/snapshots"))
32 | }
33 | 
34 | def chiselModule: Option[ScalaModule] = None
35 | 
36 | def chiselPluginJar: T[Option[PathRef]] = None
37 | 
38 | def chiselIvy: Option[Dep] = Some(defaultVersions(crossValue)("chisel"))
39 | 
40 | def chiselPluginIvy: Option[Dep] = Some(defaultVersions(crossValue)("chisel-plugin"))
41 | 
42 | override def scalaVersion = defaultScalaVersion
43 | 
44 | override def scalacOptions = super.scalacOptions() ++
45 | Agg("-language:reflectiveCalls", "-Ymacro-annotations", "-Ytasty-reader", "-Ywarn-unused")
46 | 
47 | override def ivyDeps = super.ivyDeps() ++ Agg(chiselIvy.get)
48 | 
49 | override def scalacPluginIvyDeps = super.scalacPluginIvyDeps() ++ Agg(chiselPluginIvy.get)
50 | }
51 | 
52 | object macros extends Macros
53 | 
54 | trait Macros
55 | extends millbuild.`rocket-chip`.common.MacrosModule
56 | with SbtModule {
57 | 
58 | def scalaVersion: T[String] =
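// (added usage note, inferred from the Makefile rather than this file) the cross
// value "chisel" selects the Chisel 6 stack from defaultVersions above, so the
// modules are driven from the shell exactly as the Makefile does, e.g.:
//   mill -i 'ogpu[chisel].test'       # run the chiseltest suites
//   mill -i 'ogpu[chisel].reformat'   # scalafmt over the main sources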
T(defaultScalaVersion) 59 | 60 | def scalaReflectIvy = ivy"org.scala-lang:scala-reflect:${defaultScalaVersion}" 61 | } 62 | 63 | object hardfloat extends Cross[Hardfloat]("chisel") 64 | 65 | trait Hardfloat 66 | extends millbuild.`rocket-chip`.dependencies.hardfloat.common.HardfloatModule with HasChisel with SbtModule { 67 | 68 | def scalaVersion: T[String] = T(defaultScalaVersion) 69 | 70 | override def millSourcePath = os.pwd / "rocket-chip" / "dependencies" / "hardfloat" / "hardfloat" 71 | 72 | } 73 | 74 | object cde extends CDE 75 | 76 | trait CDE extends millbuild.`rocket-chip`.dependencies.cde.common.CDEModule with ScalaModule { 77 | 78 | def scalaVersion: T[String] = T(defaultScalaVersion) 79 | 80 | override def millSourcePath = os.pwd / "rocket-chip" / "dependencies" / "cde" / "cde" 81 | } 82 | 83 | object diplomacy extends Cross[Diplomacy]("chisel") 84 | 85 | trait Diplomacy 86 | extends millbuild.`rocket-chip`.dependencies.diplomacy.common.DiplomacyModule 87 | with HasChisel { 88 | 89 | override def scalaVersion: T[String] = T(defaultScalaVersion) 90 | 91 | def cdeModule = cde 92 | 93 | def sourcecodeIvy = defaultVersions(crossValue)("sourcecode") 94 | 95 | override def millSourcePath = os.pwd / "rocket-chip" / "dependencies" / "diplomacy" / "diplomacy" 96 | } 97 | 98 | object rocketchip extends Cross[RocketChip]("chisel") 99 | 100 | trait RocketChip 101 | extends millbuild.`rocket-chip`.common.RocketChipModule 102 | with HasChisel with SbtModule { 103 | 104 | override def millSourcePath = os.pwd / "rocket-chip" 105 | 106 | def macrosModule = macros 107 | 108 | def hardfloatModule = hardfloat(crossValue) 109 | 110 | def cdeModule = cde 111 | 112 | def diplomacyModule = diplomacy(crossValue) 113 | 114 | def mainargsIvy = ivy"com.lihaoyi::mainargs:0.5.4" 115 | 116 | def json4sJacksonIvy = ivy"org.json4s::json4s-jackson:4.0.6" 117 | } 118 | 119 | object ogpu extends Cross[OGPU]("chisel") 120 | 121 | trait OGPU extends millbuild.common.OGPUModule 122 | with HasChisel 123 | with SbtModule 124 | with ScalafixModule 125 | with ScalafmtModule { 126 | 127 | override def millSourcePath = os.pwd 128 | 129 | def rocketModule = rocketchip(crossValue) 130 | 131 | override def forkArgs = Seq("-Xmx8G", "-Xss256m") 132 | 133 | override def sources = T.sources { 134 | super.sources() ++ Seq(PathRef(this.millSourcePath / "src" / crossValue / "main" / "scala")) 135 | } 136 | 137 | def lineCount = T { 138 | this.sources().filter(ref => os.exists(ref.path)).flatMap(ref => os.walk(ref.path)).filter(os.isFile).flatMap(os.read.lines).size 139 | } 140 | 141 | def printLineCount() = T.command { 142 | println(s"Lines of code(LOC): ${lineCount()} !!!") 143 | } 144 | 145 | object test extends SbtModuleTests 146 | with TestModule.ScalaTest with ScalafixModule 147 | with ScalafmtModule { 148 | 149 | override def forkArgs = Seq("-Xmx8G", "-Xss256m") 150 | 151 | override def sources = T.sources { 152 | super.sources() ++ Seq(PathRef(this.millSourcePath / "src" / crossValue / "test" / "scala")) 153 | } 154 | 155 | override def ivyDeps = super.ivyDeps() ++ Agg( 156 | defaultVersions(crossValue)("chiseltest") 157 | ) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /common.sc: -------------------------------------------------------------------------------- 1 | import mill._ 2 | import mill.scalalib._ 3 | 4 | trait OGPUModule extends ScalaModule { 5 | 6 | def rocketModule: ScalaModule 7 | 8 | override def moduleDeps = super.moduleDeps ++ Seq( 9 | rocketModule, 10 
| ) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/Consts.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE.Berkeley for license details. 2 | 3 | package ogpu.constants 4 | 5 | import chisel3._ 6 | import chisel3.util._ 7 | import freechips.rocketchip.util._ 8 | 9 | trait ScalarOpConstants { 10 | val SZ_BR = 3 11 | def BR_X = BitPat("b???") 12 | def BR_EQ = 0.U(3.W) 13 | def BR_NE = 1.U(3.W) 14 | def BR_J = 2.U(3.W) 15 | def BR_N = 3.U(3.W) 16 | def BR_LT = 4.U(3.W) 17 | def BR_GE = 5.U(3.W) 18 | def BR_LTU = 6.U(3.W) 19 | def BR_GEU = 7.U(3.W) 20 | 21 | def A1_X = BitPat("b??") 22 | def A1_ZERO = 0.U(2.W) 23 | def A1_RS1 = 1.U(2.W) 24 | def A1_PC = 2.U(2.W) 25 | 26 | def IMM_X = BitPat("b???") 27 | def IMM_S = 0.U(3.W) 28 | def IMM_SB = 1.U(3.W) 29 | def IMM_U = 2.U(3.W) 30 | def IMM_UJ = 3.U(3.W) 31 | def IMM_I = 4.U(3.W) 32 | def IMM_Z = 5.U(3.W) 33 | 34 | def A2_X = BitPat("b??") 35 | def A2_ZERO = 0.U(2.W) 36 | def A2_SIZE = 1.U(2.W) 37 | def A2_RS2 = 2.U(2.W) 38 | def A2_IMM = 3.U(2.W) 39 | 40 | def X = BitPat("b?") 41 | def N = BitPat("b0") 42 | def Y = BitPat("b1") 43 | 44 | val SZ_DW = 1 45 | def DW_X = X 46 | def DW_32 = false.B 47 | def DW_64 = true.B 48 | def DW_XPR = DW_64 49 | } 50 | 51 | trait MemoryOpConstants { 52 | val NUM_XA_OPS = 9 53 | val M_SZ = 5 54 | def M_X = BitPat("b?????"); 55 | def M_XRD = "b00000".U; // int load 56 | def M_XWR = "b00001".U; // int store 57 | def M_PFR = "b00010".U; // prefetch with intent to read 58 | def M_PFW = "b00011".U; // prefetch with intent to write 59 | def M_XA_SWAP = "b00100".U 60 | def M_FLUSH_ALL = "b00101".U // flush all lines 61 | def M_XLR = "b00110".U 62 | def M_XSC = "b00111".U 63 | def M_XA_ADD = "b01000".U 64 | def M_XA_XOR = "b01001".U 65 | def M_XA_OR = "b01010".U 66 | def M_XA_AND = "b01011".U 67 | def M_XA_MIN = "b01100".U 68 | def M_XA_MAX = "b01101".U 69 | def M_XA_MINU = "b01110".U 70 | def M_XA_MAXU = "b01111".U 71 | def M_FLUSH = "b10000".U // write back dirty data and cede R/W permissions 72 | def M_PWR = "b10001".U // partial (masked) store 73 | def M_PRODUCE = "b10010".U // write back dirty data and cede W permissions 74 | def M_CLEAN = "b10011".U // write back dirty data and retain R/W permissions 75 | def M_SFENCE = "b10100".U // SFENCE.VMA 76 | def M_HFENCEV = "b10101".U // HFENCE.VVMA 77 | def M_HFENCEG = "b10110".U // HFENCE.GVMA 78 | def M_WOK = "b10111".U // check write permissions but don't perform a write 79 | def M_HLVX = "b10000".U // HLVX instruction 80 | 81 | def isAMOLogical(cmd: UInt) = cmd.isOneOf(M_XA_SWAP, M_XA_XOR, M_XA_OR, M_XA_AND) 82 | def isAMOArithmetic(cmd: UInt) = cmd.isOneOf(M_XA_ADD, M_XA_MIN, M_XA_MAX, M_XA_MINU, M_XA_MAXU) 83 | def isAMO(cmd: UInt) = isAMOLogical(cmd) || isAMOArithmetic(cmd) 84 | def isPrefetch(cmd: UInt) = cmd === M_PFR || cmd === M_PFW 85 | def isRead(cmd: UInt) = cmd.isOneOf(M_XRD, M_HLVX, M_XLR, M_XSC) || isAMO(cmd) 86 | def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_PWR || cmd === M_XSC || isAMO(cmd) 87 | def isWriteIntent(cmd: UInt) = isWrite(cmd) || cmd === M_PFW || cmd === M_XLR 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/config/Parameters.scala: -------------------------------------------------------------------------------- 1 | package ogpu.config 2 | 3 | import org.chipsalliance.cde.config._ 4 | 5 | case object XLen extends Field[Int] 6 | case object DTSModel extends 
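// (added sketch with hypothetical class names, composing configs via the CDE API
// imported above) any field in this file can be overridden without editing the
// defaults, e.g.:
//   class With16Threads extends Config((_, _, _) => { case ThreadNum => 16 })
//   class My16ThreadConfig extends Config(new With16Threads ++ new OGPUDefaultConfig())
// where OGPUDefaultConfig comes from SystemConfig.scala below.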
Field[String]
7 | case object DTSCompat extends Field[Seq[String]]
8 | case object ThreadNum extends Field[Int]
9 | case object WarpNum extends Field[Int]
10 | case object RegNum extends Field[Int]
11 | case object RegIDWidth extends Field[Int]
12 | case object WarpIDWidth extends Field[Int]
13 | case object AddrWidth extends Field[Int]
14 | case object StackDepth extends Field[Int]
15 | case object DimWidth extends Field[Int]
16 | 
--------------------------------------------------------------------------------
/src/main/scala/config/SubsystemConfig.scala:
--------------------------------------------------------------------------------
1 | // See LICENSE.SiFive for license details.
2 | // See LICENSE.Berkeley for license details.
3 | 
4 | package ogpu.config
5 | 
6 | import org.chipsalliance.cde.config._
7 | import freechips.rocketchip.diplomacy._
8 | 
9 | class BaseSubsystemConfig extends Config((_, _, _) => { case XLen => 64 })
10 | 
11 | class WithDTS(model: String, compat: Seq[String])
12 | extends Config((_, _, _) => {
13 | case DTSModel => model
14 | case DTSCompat => compat
15 | })
16 | 
--------------------------------------------------------------------------------
/src/main/scala/config/SystemConfig.scala:
--------------------------------------------------------------------------------
1 | // See LICENSE.SiFive for license details.
2 | // See LICENSE.Berkeley for license details.
3 | 
4 | package ogpu.config
5 | 
6 | import chisel3.util._
7 | import org.chipsalliance.cde.config._
8 | 
9 | class OGPUBaseConfig(n: Int)
10 | extends Config((site, _, _) => {
11 | case DTSModel => ""
12 | case ThreadNum => 32
13 | case XLen => 64
14 | case WarpNum => 8
15 | case RegNum => 32
16 | case RegIDWidth => log2Ceil(site(RegNum))
17 | case WarpIDWidth => log2Ceil(site(WarpNum))
18 | case AddrWidth => 64
19 | case StackDepth => 16
20 | case DimWidth => 16
21 | })
22 | 
23 | class OGPUDefaultConfig(n: Int = 1)
24 | extends Config(
25 | new OGPUBaseConfig(n)
26 | )
27 | 
--------------------------------------------------------------------------------
/src/main/scala/core/ALU.scala:
--------------------------------------------------------------------------------
1 | package ogpu.core
2 | 
3 | import chisel3._
4 | import chisel3.util._
5 | import org.chipsalliance.cde.config.Parameters
6 | import ogpu.config._
7 | import freechips.rocketchip.rocket.ALU._
8 | import freechips.rocketchip.rocket.{SZ_DW, DW_64, DW_32}
9 | 
10 | class ScalarALU(
11 | implicit p: Parameters)
12 | extends Module {
13 | val xLen = p(XLen)
14 | val io = IO(new Bundle() {
15 | val dw = Input(UInt(SZ_DW.W))
16 | val fn = Input(UInt(SZ_ALU_FN.W))
17 | val in1 = Input(UInt(xLen.W))
18 | val in2 = Input(UInt(xLen.W))
19 | val out = Output(UInt(xLen.W))
20 | val adder_out = Output(UInt(xLen.W))
21 | val cmp_out = Output(Bool())
22 | })
23 | 
24 | override def desiredName = "CHIPALU"
25 | 
26 | // ADD, SUB
27 | val in2_inv = Mux(isSub(io.fn), ~io.in2, io.in2)
28 | val in1_xor_in2 = io.in1 ^ in2_inv
29 | val in1_and_in2 = io.in1 & in2_inv
30 | io.adder_out := io.in1 + in2_inv + isSub(io.fn)
31 | 
32 | // SLT, SLTU
33 | val slt =
34 | Mux(io.in1(xLen-1) === io.in2(xLen-1), io.adder_out(xLen-1),
35 | Mux(cmpUnsigned(io.fn), io.in2(xLen-1), io.in1(xLen-1)))
36 | io.cmp_out := cmpInverted(io.fn) ^ Mux(cmpEq(io.fn), in1_xor_in2 === 0.U, slt)
37 | 
38 | // SLL, SRL, SRA
39 | val (shamt, shin_r) =
40 | if (xLen == 32) (io.in2(4,0), io.in1)
41 | else {
42 | require(xLen == 64)
43 | val shin_hi_32 = Fill(32, isSub(io.fn) && io.in1(31))
44 | val shin_hi =
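// (added note) the block below serves SLL, SRL and SRA with a single arithmetic
// right shifter: for left shifts the operand is bit-reversed on the way in (shin)
// and the result reversed back (shout_l), so no separate left shifter is elaborated.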
Mux(io.dw === DW_64, io.in1(63,32), shin_hi_32) 45 | val shamt = Cat(io.in2(5) & (io.dw === DW_64), io.in2(4,0)) 46 | (shamt, Cat(shin_hi, io.in1(31,0))) 47 | } 48 | val shin = Mux(shiftReverse(io.fn), Reverse(shin_r), shin_r) 49 | val shout_r = (Cat(isSub(io.fn) & shin(xLen-1), shin).asSInt >> shamt)(xLen-1,0) 50 | val shout_l = Reverse(shout_r) 51 | val shout = Mux(io.fn === FN_SR || io.fn === FN_SRA || io.fn === FN_BEXT, shout_r, 0.U) | 52 | Mux(io.fn === FN_SL, shout_l, 0.U) 53 | 54 | // CZEQZ, CZNEZ 55 | val in2_not_zero = io.in2.orR 56 | val usingConditionalZero = true 57 | val cond_out = Option.when(usingConditionalZero)( 58 | Mux((io.fn === FN_CZEQZ && in2_not_zero) || (io.fn === FN_CZNEZ && !in2_not_zero), io.in1, 0.U) 59 | ) 60 | 61 | // AND, OR, XOR 62 | val logic = Mux(io.fn === FN_XOR || io.fn === FN_OR || io.fn === FN_ORN || io.fn === FN_XNOR, in1_xor_in2, 0.U) | 63 | Mux(io.fn === FN_OR || io.fn === FN_AND || io.fn === FN_ORN || io.fn === FN_ANDN, in1_and_in2, 0.U) 64 | 65 | val useZbs = true 66 | val bext_mask = Mux(useZbs.B && io.fn === FN_BEXT, 1.U, ~(0.U(xLen.W))) 67 | val shift_logic = (isCmp (io.fn) && slt) | logic | (shout & bext_mask) 68 | val shift_logic_cond = cond_out match { 69 | case Some(co) => shift_logic | co 70 | case _ => shift_logic 71 | } 72 | 73 | // CLZ, CTZ, CPOP 74 | val tz_in = MuxLookup((io.dw === DW_32) ## !io.in2(0), 0.U)(Seq( 75 | 0.U -> io.in1, 76 | 1.U -> Reverse(io.in1), 77 | 2.U -> 1.U ## io.in1(31,0), 78 | 3.U -> 1.U ## Reverse(io.in1(31,0)) 79 | )) 80 | val popc_in = Mux(io.in2(1), 81 | Mux(io.dw === DW_32, io.in1(31,0), io.in1), 82 | PriorityEncoderOH(1.U ## tz_in) - 1.U)(xLen-1,0) 83 | val count = PopCount(popc_in) 84 | val in1_bytes = io.in1.asTypeOf(Vec(xLen / 8, UInt(8.W))) 85 | val orcb = VecInit(in1_bytes.map(b => Fill(8, b =/= 0.U))).asUInt 86 | val rev8 = VecInit(in1_bytes.reverse).asUInt 87 | val unary = MuxLookup(io.in2(11,0), count)(Seq( 88 | 0x287.U -> orcb, 89 | (if (xLen == 32) 0x698 else 0x6b8).U -> rev8, 90 | 0x080.U -> io.in1(15,0), 91 | 0x604.U -> Fill(xLen-8, io.in1(7)) ## io.in1(7,0), 92 | 0x605.U -> Fill(xLen-16, io.in1(15)) ## io.in1(15,0) 93 | )) 94 | 95 | // MAX, MIN, MAXU, MINU 96 | val maxmin_out = Mux(io.cmp_out, io.in2, io.in1) 97 | 98 | // ROL, ROR 99 | val rot_shamt = Mux(io.dw === DW_32, 32.U, xLen.U) - shamt 100 | val rotin = Mux(io.fn(0), shin_r, Reverse(shin_r)) 101 | val rotout_r = (rotin >> rot_shamt)(xLen-1,0) 102 | val rotout_l = Reverse(rotout_r) 103 | val rotout = Mux(io.fn(0), rotout_r, rotout_l) | Mux(io.fn(0), shout_l, shout_r) 104 | 105 | val useZbb = true 106 | val out = MuxLookup(io.fn, shift_logic_cond)(Seq( 107 | FN_ADD -> io.adder_out, 108 | FN_SUB -> io.adder_out 109 | ) ++ (if (useZbb) Seq( 110 | FN_UNARY -> unary, 111 | FN_MAX -> maxmin_out, 112 | FN_MIN -> maxmin_out, 113 | FN_MAXU -> maxmin_out, 114 | FN_MINU -> maxmin_out, 115 | FN_ROL -> rotout, 116 | FN_ROR -> rotout, 117 | ) else Nil)) 118 | 119 | 120 | io.out := out 121 | if (xLen > 32) { 122 | require(xLen == 64) 123 | when (io.dw === DW_32) { io.out := Cat(Fill(32, out(31)), out(31,0)) } 124 | } 125 | } 126 | 127 | // object ALURTL extends App { 128 | // implicit val p = new CoreConfig 129 | // emitVerilog(new ScalarALU(), Array("--target-dir", "generated")) 130 | // } 131 | -------------------------------------------------------------------------------- /src/main/scala/core/AMOALU.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE.SiFive for license details. 
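// (added worked example, not in the original file) StoreGen.mask below derives a
// byte-enable mask from (size, addr). With maxSize = 8, a 4-byte store (typ = 2.U)
// at addr = "b100".U walks the loop to b1111_0000, i.e. the upper word lane:
//   val sg = new StoreGen(typ = 2.U, addr = "b100".U, dat = 0.U, maxSize = 8)
//   // sg.mask elaborates to "b11110000".U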
2 | // See LICENSE.Berkeley for license details. 3 | 4 | package ogpu.core 5 | 6 | import chisel3._ 7 | import chisel3.util._ 8 | import org.chipsalliance.cde.config.Parameters 9 | 10 | import ogpu.ogpu._ 11 | 12 | class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) { 13 | val size = Wire(UInt(log2Up(log2Up(maxSize) + 1).W)) 14 | size := typ 15 | 16 | def misaligned: Bool = 17 | (addr & ((1.U << size) - 1.U)(log2Up(maxSize) - 1, 0)).orR 18 | 19 | def mask = { 20 | var res = 1.U 21 | for (i <- 0 until log2Up(maxSize)) { 22 | val upper = 23 | Mux(addr(i), res, 0.U) | Mux(size >= (i + 1).U, ((BigInt(1) << (1 << i)) - 1).U, 0.U) 24 | val lower = Mux(addr(i), 0.U, res) 25 | res = Cat(upper, lower) 26 | } 27 | res 28 | } 29 | 30 | protected def genData(i: Int): UInt = 31 | if (i >= log2Up(maxSize)) dat 32 | else Mux(size === i.U, Fill(1 << (log2Up(maxSize) - i), dat((8 << i) - 1, 0)), genData(i + 1)) 33 | 34 | def data = genData(0) 35 | def wordData = genData(2) 36 | } 37 | 38 | class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSize: Int) { 39 | private val size = new StoreGen(typ, addr, dat, maxSize).size 40 | 41 | private def genData(logMinSize: Int): UInt = { 42 | var res = dat 43 | for (i <- log2Up(maxSize) - 1 to logMinSize by -1) { 44 | val pos = 8 << i 45 | val shifted = Mux(addr(i), res(2 * pos - 1, pos), res(pos - 1, 0)) 46 | val doZero = (i == 0).B && zero 47 | val zeroed = Mux(doZero, 0.U, shifted) 48 | res = Cat( 49 | Mux( 50 | size === i.U || doZero, 51 | Fill(8 * maxSize - pos, signed && zeroed(pos - 1)), 52 | res(8 * maxSize - 1, pos) 53 | ), 54 | zeroed 55 | ) 56 | } 57 | res 58 | } 59 | 60 | def wordData = genData(2) 61 | def data = genData(0) 62 | } 63 | 64 | class AMOALU( 65 | operandBits: Int 66 | )( 67 | implicit p: Parameters) 68 | extends Module { 69 | val minXLen = 32 70 | val widths = (0 to log2Ceil(operandBits / minXLen)).map(minXLen << _) 71 | 72 | val io = IO(new Bundle { 73 | val mask = Input(UInt((operandBits / 8).W)) 74 | val cmd = Input(UInt(M_SZ.W)) 75 | val lhs = Input(UInt(operandBits.W)) 76 | val rhs = Input(UInt(operandBits.W)) 77 | val out = Output(UInt(operandBits.W)) 78 | val out_unmasked = Output(UInt(operandBits.W)) 79 | }) 80 | 81 | val max = io.cmd === M_XA_MAX || io.cmd === M_XA_MAXU 82 | val min = io.cmd === M_XA_MIN || io.cmd === M_XA_MINU 83 | val add = io.cmd === M_XA_ADD 84 | val logic_and = io.cmd === M_XA_OR || io.cmd === M_XA_AND 85 | val logic_xor = io.cmd === M_XA_XOR || io.cmd === M_XA_OR 86 | 87 | val adder_out = { 88 | // partition the carry chain to support sub-xLen addition 89 | val mask = 90 | ~(0.U(operandBits.W) +: widths.init.map(w => !io.mask(w / 8 - 1) << (w - 1))).reduce(_ | _) 91 | (io.lhs & mask) + (io.rhs & mask) 92 | } 93 | 94 | val less = { 95 | // break up the comparator so the lower parts will be CSE'd 96 | def isLessUnsigned(x: UInt, y: UInt, n: Int): Bool = { 97 | if (n == minXLen) x(n - 1, 0) < y(n - 1, 0) 98 | else 99 | x(n - 1, n / 2) < y(n - 1, n / 2) || x(n - 1, n / 2) === y(n - 1, n / 2) && isLessUnsigned( 100 | x, 101 | y, 102 | n / 2 103 | ) 104 | } 105 | 106 | def isLess(x: UInt, y: UInt, n: Int): Bool = { 107 | val signed = { 108 | val mask = M_XA_MIN ^ M_XA_MINU 109 | (io.cmd & mask) === (M_XA_MIN & mask) 110 | } 111 | Mux(x(n - 1) === y(n - 1), isLessUnsigned(x, y, n), Mux(signed, x(n - 1), y(n - 1))) 112 | } 113 | 114 | PriorityMux(widths.reverse.map(w => (io.mask(w / 8 / 2), isLess(io.lhs, io.rhs, w)))) 115 | } 116 | 117 | val minmax = Mux(Mux(less, min, max), io.lhs, 
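// (added) lhs is chosen when it already wins the comparison: less for MIN/MINU and,
// since !less, greater-or-equal for MAX/MAXU; otherwise the result is rhs below.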
io.rhs) 118 | 119 | val logic = 120 | Mux(logic_and, io.lhs & io.rhs, 0.U) | 121 | Mux(logic_xor, io.lhs ^ io.rhs, 0.U) 122 | 123 | val out = 124 | Mux(add, adder_out, Mux(logic_and || logic_xor, logic, minmax)) 125 | 126 | val wmask = FillInterleaved(8, io.mask) 127 | io.out := wmask & out | ~wmask & io.lhs 128 | io.out_unmasked := out 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/core/BranchUnit.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | 8 | class BranchUnit( 9 | implicit p: Parameters) 10 | extends Module { 11 | val numThread = p(ThreadNum) 12 | val io = IO(new Bundle { 13 | val branch_data = Flipped(DecoupledIO(new BranchData())) 14 | val branch_ctl = DecoupledIO(new BranchControlData()) 15 | }) 16 | 17 | val branch_result = Module(new Queue(new BranchControlData(), 1, pipe = true)) 18 | 19 | io.branch_data.ready := branch_result.io.enq.ready 20 | branch_result.io.enq.valid := io.branch_data.valid 21 | 22 | // default 23 | branch_result.io.enq.bits.pc := 0.U 24 | branch_result.io.enq.bits.mask := 0.U.asTypeOf(branch_result.io.enq.bits.mask) 25 | branch_result.io.enq.bits.wid := 0.U 26 | branch_result.io.enq.bits.diverge := 0.U 27 | 28 | branch_result.io.enq.bits.data.mask := 0.U.asTypeOf(branch_result.io.enq.bits.mask) 29 | branch_result.io.enq.bits.data.pc := 0.U 30 | branch_result.io.enq.bits.data.orig_mask := 0.U.asTypeOf(branch_result.io.enq.bits.mask) 31 | 32 | val pc_imm = io.branch_data.bits.pc + io.branch_data.bits.imm 33 | val pc_rs1 = io.branch_data.bits.rs1_data + io.branch_data.bits.imm 34 | val pc_next = io.branch_data.bits.pc + 4.U 35 | 36 | val taken_all = io.branch_data.bits.mask === io.branch_data.bits.orig_mask 37 | val taken_none = io.branch_data.bits.mask === 0.U.asTypeOf(io.branch_data.bits.orig_mask) 38 | val diverge = !(taken_all | taken_none) 39 | 40 | when(io.branch_data.bits.branch.jal) { 41 | branch_result.io.enq.bits.pc := pc_imm 42 | branch_result.io.enq.bits.mask := io.branch_data.bits.orig_mask 43 | branch_result.io.enq.bits.wid := io.branch_data.bits.wid 44 | branch_result.io.enq.bits.diverge := false.B 45 | 46 | }.elsewhen(io.branch_data.bits.branch.jalr) { 47 | branch_result.io.enq.bits.pc := pc_rs1 48 | branch_result.io.enq.bits.mask := io.branch_data.bits.orig_mask 49 | branch_result.io.enq.bits.wid := io.branch_data.bits.wid 50 | branch_result.io.enq.bits.diverge := false.B 51 | }.elsewhen(io.branch_data.bits.branch.branch) { 52 | branch_result.io.enq.bits.pc := Mux(taken_none, pc_next, pc_imm) 53 | branch_result.io.enq.bits.mask := Mux(diverge, io.branch_data.bits.mask, io.branch_data.bits.orig_mask) 54 | branch_result.io.enq.bits.wid := io.branch_data.bits.wid 55 | branch_result.io.enq.bits.diverge := diverge 56 | 57 | branch_result.io.enq.bits.data.mask := io.branch_data.bits.orig_mask.zip(io.branch_data.bits.mask).map { 58 | case (a, b) => a & !b 59 | } 60 | branch_result.io.enq.bits.data.pc := pc_next 61 | branch_result.io.enq.bits.data.orig_mask := io.branch_data.bits.orig_mask 62 | } 63 | 64 | io.branch_ctl <> branch_result.io.deq 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/core/DataStruct.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import 
chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import freechips.rocketchip.rocket._ 7 | import ogpu.config._ 8 | import freechips.rocketchip.rocket.ALU._ 9 | 10 | class BranchSignal( 11 | implicit p: Parameters) 12 | extends Bundle { 13 | val jal = Bool() 14 | val jalr = Bool() 15 | val branch = Bool() 16 | } 17 | 18 | class VALUData( 19 | implicit p: Parameters) 20 | extends Bundle { 21 | val numThreads = p(ThreadNum) 22 | val numWarps = p(WarpNum) 23 | val xLen = p(XLen) 24 | val addrWidth = p(AddrWidth) 25 | val regIDWidth = p(RegIDWidth) 26 | 27 | val op1 = Vec(numThreads, UInt(xLen.W)) 28 | val op2 = Vec(numThreads, UInt(xLen.W)) 29 | val func = UInt(SZ_ALU_FN.W) 30 | val mask = Vec(numThreads, Bool()) 31 | val wid = UInt(log2Ceil(numWarps).W) 32 | val pc = UInt(addrWidth.W) 33 | val rd = UInt(regIDWidth.W) 34 | val branch = new BranchSignal() 35 | val imm = UInt(xLen.W) 36 | val rs1_data = UInt(xLen.W) 37 | } 38 | 39 | class SALUData( 40 | implicit p: Parameters) 41 | extends Bundle { 42 | val numThreads = p(ThreadNum) 43 | val numWarps = p(WarpNum) 44 | val xLen = p(XLen) 45 | val addrWidth = p(AddrWidth) 46 | val regIDWidth = p(RegIDWidth) 47 | 48 | val op1 = UInt(xLen.W) 49 | val op2 = UInt(xLen.W) 50 | val func = UInt(SZ_ALU_FN.W) 51 | val wid = UInt(log2Ceil(numWarps).W) 52 | val pc = UInt(addrWidth.W) 53 | val rd = UInt(regIDWidth.W) 54 | val branch = new BranchSignal() 55 | val imm = UInt(xLen.W) 56 | val rs1_data = UInt(xLen.W) 57 | } 58 | 59 | class BranchData( 60 | implicit p: Parameters) 61 | extends Bundle { 62 | val numThreads = p(ThreadNum) 63 | val numWarps = p(WarpNum) 64 | val addrWidth = p(AddrWidth) 65 | val xLen = p(XLen) 66 | 67 | val branch = new BranchSignal() 68 | val mask = Vec(numThreads, Bool()) 69 | val orig_mask = Vec(numThreads, Bool()) 70 | val wid = UInt(log2Ceil(numWarps).W) 71 | val pc = UInt(addrWidth.W) 72 | val imm = UInt(xLen.W) 73 | val rs1_data = UInt(xLen.W) 74 | } 75 | 76 | class LSUData( 77 | implicit p: Parameters) 78 | extends Bundle { 79 | val numThreads = p(ThreadNum) 80 | val xLen = p(XLen) 81 | val addrWidth = p(AddrWidth) 82 | val numWarps = p(WarpNum) 83 | val regIDWidth = p(RegIDWidth) 84 | 85 | val addr = Vec(numThreads, UInt(addrWidth.W)) 86 | val data = Vec(numThreads, UInt(xLen.W)) 87 | val mask = Vec(numThreads, Bool()) 88 | val func = UInt(1.W) 89 | val wid = UInt(log2Ceil(numWarps).W) 90 | 91 | val pc = UInt(addrWidth.W) 92 | // val fence = Bool() 93 | val offset = UInt(xLen.W) 94 | val rd = UInt(regIDWidth.W) 95 | } 96 | 97 | class CommitVData( 98 | implicit p: Parameters) 99 | extends Bundle { 100 | val numThreads = p(ThreadNum) 101 | val xLen = p(XLen) 102 | val addrWidth = p(AddrWidth) 103 | val numWarps = p(WarpNum) 104 | val regIDWidth = p(RegIDWidth) 105 | 106 | val wid = UInt(log2Ceil(numWarps).W) 107 | val mask = Vec(numThreads, Bool()) 108 | val pc = UInt(addrWidth.W) 109 | val eop = Bool() 110 | val rd = UInt(regIDWidth.W) 111 | val data = Vec(numThreads, UInt(xLen.W)) 112 | } 113 | 114 | class CommitSData( 115 | implicit p: Parameters) 116 | extends Bundle { 117 | val xLen = p(XLen) 118 | val addrWidth = p(AddrWidth) 119 | val numWarps = p(WarpNum) 120 | val regIDWidth = p(RegIDWidth) 121 | 122 | val wid = UInt(log2Ceil(numWarps).W) 123 | val mask = Bool() 124 | val pc = UInt(addrWidth.W) 125 | val eop = Bool() 126 | val rd = UInt(regIDWidth.W) 127 | val data = UInt(xLen.W) 128 | } 129 | 130 | class StackData( 131 | implicit p: Parameters) 132 | extends Bundle { 133 | 
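// (added note) StackData is the SIMT reconvergence record: on a divergent branch,
// BranchUnit pushes the not-taken lanes (orig_mask & ~taken mask) together with the
// fall-through pc and the pre-branch mask, to be restored at the matching JOIN.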
val numThreads = p(ThreadNum) 134 | val addrWidth = p(AddrWidth) 135 | 136 | val mask = Vec(numThreads, Bool()) 137 | val pc = UInt(addrWidth.W) 138 | val orig_mask = Vec(numThreads, Bool()) 139 | } 140 | 141 | class InstData( 142 | implicit p: Parameters) 143 | extends Bundle { 144 | val numThreads = p(ThreadNum) 145 | val addrWidth = p(AddrWidth) 146 | val numWarps = p(WarpNum) 147 | 148 | val mask = Vec(numThreads, Bool()) 149 | val wid = UInt(log2Ceil(numWarps).W) 150 | val pc = UInt(addrWidth.W) 151 | val data = UInt(32.W) 152 | } 153 | 154 | class ExType( 155 | implicit p: Parameters) 156 | extends Bundle { 157 | val lsu = Bool() 158 | val alu = Bool() 159 | } 160 | 161 | class DecodeData( 162 | implicit p: Parameters) 163 | extends Bundle { 164 | val numThreads = p(ThreadNum) 165 | val xLen = p(XLen) 166 | val addrWidth = p(AddrWidth) 167 | val numWarps = p(WarpNum) 168 | val regIDWidth = p(RegIDWidth) 169 | 170 | val wid = UInt(log2Ceil(numWarps).W) 171 | val mask = Vec(numThreads, Bool()) 172 | val wb = Bool() 173 | val imm = UInt(xLen.W) 174 | val sel_alu1 = UInt(A1_X.getWidth.W) 175 | val sel_alu2 = UInt(A2_X.getWidth.W) 176 | val ex_type = new ExType() 177 | val func = UInt(SZ_ALU_FN.W) 178 | val mem_cmd = UInt(1.W) 179 | val branch = new BranchSignal() 180 | val pc = UInt(addrWidth.W) 181 | val rd = UInt(regIDWidth.W) 182 | val rs1 = UInt(regIDWidth.W) 183 | val rs2 = UInt(regIDWidth.W) 184 | } 185 | 186 | class WarpControlData( 187 | implicit p: Parameters) 188 | extends Bundle { 189 | val numWarps = p(WarpNum) 190 | 191 | val wid = UInt(log2Ceil(numWarps).W) 192 | val active = Bool() 193 | val join = Bool() 194 | val end = Bool() 195 | } 196 | 197 | class InstFetchData( 198 | implicit p: Parameters) 199 | extends Bundle { 200 | val numThreads = p(ThreadNum) 201 | val addrWidth = p(AddrWidth) 202 | val numWarps = p(WarpNum) 203 | 204 | val mask = Vec(numThreads, Bool()) 205 | val wid = UInt(log2Ceil(numWarps).W) 206 | val pc = UInt(addrWidth.W) 207 | } 208 | 209 | class WarpCommandData( 210 | implicit p: Parameters) 211 | extends Bundle { 212 | val numThreads = p(ThreadNum) 213 | val addrWidth = p(AddrWidth) 214 | val numWarps = p(WarpNum) 215 | val dimWidth = p(DimWidth) 216 | val xLen = p(XLen) 217 | 218 | val mask = Vec(numThreads, Bool()) 219 | // max threads num in a workgroup 220 | val thread_dims = Vec(3, UInt(dimWidth.W)) 221 | val vgpr_num = UInt(2.W) 222 | val sgprs = Vec(16, UInt(xLen.W)) 223 | val sgpr_num = UInt(4.W) 224 | val reg_index = UInt(p(RegIDWidth).W) 225 | val pc = UInt(addrWidth.W) 226 | } 227 | 228 | class WarpEndData( 229 | implicit p: Parameters) 230 | extends Bundle { 231 | val numThreads = p(ThreadNum) 232 | val addrWidth = p(AddrWidth) 233 | val numWarps = p(WarpNum) 234 | 235 | val wid = UInt(log2Ceil(numWarps).W) 236 | } 237 | 238 | class BranchControlData( 239 | implicit p: Parameters) 240 | extends Bundle { 241 | val numThreads = p(ThreadNum) 242 | val addrWidth = p(AddrWidth) 243 | val numWarps = p(WarpNum) 244 | 245 | val mask = Vec(numThreads, Bool()) 246 | val wid = UInt(log2Ceil(numWarps).W) 247 | val pc = UInt(addrWidth.W) 248 | val data = new StackData() 249 | val diverge = Bool() 250 | } 251 | 252 | class WritebackData( 253 | implicit p: Parameters) 254 | extends Bundle { 255 | val numThreads = p(ThreadNum) 256 | val xLen = p(XLen) 257 | val addrWidth = p(AddrWidth) 258 | val numWarps = p(WarpNum) 259 | val regIDWidth = p(RegIDWidth) 260 | 261 | val wid = UInt(log2Ceil(numWarps).W) 262 | val mask = Vec(numThreads, Bool()) 263 | val pc 
= UInt(addrWidth.W) 264 | val eop = Bool() 265 | val rd = UInt(regIDWidth.W) 266 | val data = Vec(numThreads, UInt(xLen.W)) 267 | } 268 | 269 | class ReadGPRReq( 270 | implicit p: Parameters) 271 | extends Bundle { 272 | val numWarps = p(WarpNum) 273 | val regIDWidth = p(RegIDWidth) 274 | 275 | val wid = UInt(log2Ceil(numWarps).W) 276 | val rs1 = UInt(regIDWidth.W) 277 | val rs2 = UInt(regIDWidth.W) 278 | } 279 | 280 | class ReadSGPRRsp( 281 | implicit p: Parameters) 282 | extends Bundle { 283 | val xLen = p(XLen) 284 | 285 | val rs1_data = UInt(xLen.W) 286 | val rs2_data = UInt(xLen.W) 287 | } 288 | 289 | class ReadVGPRRsp( 290 | implicit p: Parameters) 291 | extends Bundle { 292 | val numThreads = p(ThreadNum) 293 | val xLen = p(XLen) 294 | 295 | val rs1_data = Vec(numThreads, UInt(xLen.W)) 296 | val rs2_data = Vec(numThreads, UInt(xLen.W)) 297 | } 298 | 299 | class ThreadMask( 300 | implicit p: Parameters) 301 | extends Bundle { 302 | val numThreads = p(ThreadNum) 303 | 304 | val mask = Vec(numThreads, Bool()) 305 | } 306 | -------------------------------------------------------------------------------- /src/main/scala/core/Dispatch.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | import freechips.rocketchip.rocket._ 8 | 9 | class Dispatch( 10 | implicit p: Parameters) 11 | extends Module { 12 | val numThread = p(ThreadNum) 13 | val xLen = p(XLen) 14 | val io = IO(new Bundle { 15 | val ibuffer = Flipped(DecoupledIO(new DecodeData())) 16 | val vgpr_rsp = Flipped(new ReadVGPRRsp()) 17 | 18 | val alu = DecoupledIO(new VALUData()) 19 | val lsu = DecoupledIO(new LSUData()) 20 | }) 21 | 22 | val buffer = Module( 23 | new Queue( 24 | new Bundle { 25 | val decode = new DecodeData 26 | val vgpr_rsp = new ReadVGPRRsp() 27 | }, 28 | 1, 29 | pipe = true 30 | ) 31 | ) 32 | 33 | val buffer_deq = buffer.io.deq.bits 34 | buffer.io.enq.valid := io.ibuffer.valid 35 | buffer.io.enq.bits.decode := io.ibuffer.bits 36 | buffer.io.enq.bits.vgpr_rsp := io.vgpr_rsp 37 | io.ibuffer.ready := buffer.io.enq.ready 38 | 39 | val ex_op1 = Wire(Vec(numThread, UInt(xLen.W))) 40 | val ex_op2 = Wire(Vec(numThread, UInt(xLen.W))) 41 | val pc_vec = Wire(Vec(numThread, UInt(xLen.W))) 42 | val imm_vec = Wire(Vec(numThread, UInt(xLen.W))) 43 | val const_vec = Wire(Vec(numThread, UInt(xLen.W))) 44 | 45 | pc_vec := VecInit.tabulate(numThread) { _ => buffer_deq.decode.pc } 46 | imm_vec := VecInit.tabulate(numThread) { _ => buffer_deq.decode.imm } 47 | const_vec := VecInit.tabulate(numThread) { _ => 4.U } 48 | 49 | ex_op1 := MuxLookup(buffer_deq.decode.sel_alu1.asUInt, 0.U.asTypeOf(ex_op1))( 50 | Seq(A1_RS1.asUInt -> buffer_deq.vgpr_rsp.rs1_data, A1_PC.asUInt -> pc_vec) 51 | ) 52 | ex_op2 := MuxLookup(buffer_deq.decode.sel_alu2.asUInt, 0.U.asTypeOf(ex_op1))( 53 | Seq(A2_RS2.asUInt -> buffer_deq.vgpr_rsp.rs2_data, A2_IMM.asUInt -> imm_vec, A2_SIZE.asUInt -> const_vec) 54 | ) 55 | 56 | io.alu.valid := buffer.io.deq.valid && buffer_deq.decode.ex_type.alu 57 | io.alu.bits.op1 := ex_op1 58 | io.alu.bits.op2 := ex_op2 59 | io.alu.bits.func := buffer_deq.decode.func 60 | io.alu.bits.mask := buffer_deq.decode.mask 61 | io.alu.bits.wid := buffer_deq.decode.wid 62 | io.alu.bits.pc := buffer_deq.decode.pc 63 | io.alu.bits.rd := buffer_deq.decode.rd 64 | io.alu.bits.imm := buffer_deq.decode.imm 65 | io.alu.bits.rs1_data := 
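// (added note) the scalar base used by JALR in BranchUnit is read from the first
// active lane, hence the PriorityEncoder over the thread mask: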
buffer_deq.vgpr_rsp.rs1_data(PriorityEncoder(buffer_deq.decode.mask)) 66 | io.alu.bits.branch := buffer_deq.decode.branch 67 | 68 | io.lsu.valid := buffer.io.deq.valid && buffer_deq.decode.ex_type.lsu 69 | io.lsu.bits.func := buffer_deq.decode.mem_cmd 70 | io.lsu.bits.wid := buffer_deq.decode.wid 71 | io.lsu.bits.mask := buffer_deq.decode.mask 72 | io.lsu.bits.addr := buffer_deq.vgpr_rsp.rs1_data 73 | io.lsu.bits.rd := buffer_deq.decode.rd 74 | io.lsu.bits.data := buffer_deq.vgpr_rsp.rs2_data 75 | io.lsu.bits.offset := buffer_deq.decode.imm 76 | io.lsu.bits.pc := buffer_deq.decode.pc 77 | 78 | val mapping = Seq((1.U, io.alu.ready), (2.U, io.lsu.ready)) 79 | buffer.io.deq.ready := MuxLookup(Cat(buffer_deq.decode.ex_type.lsu, buffer_deq.decode.ex_type.alu), 1.B)(mapping) 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/core/ICache.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import freechips.rocketchip.util._ 6 | import freechips.rocketchip.diplomacy._ 7 | import freechips.rocketchip.tilelink._ 8 | import freechips.rocketchip.amba._ 9 | import org.chipsalliance.cde.config.Parameters 10 | import chisel3.util.random.LFSR 11 | 12 | case class ICacheParams( 13 | nSets: Int = 64, 14 | nWays: Int = 4, 15 | rowBits: Int = 128, 16 | paddrBits: Int = 48, 17 | vaddrBits: Int = 48, 18 | pgIdxBits: Int = 12, 19 | dataBits: Int = 64, 20 | nTLBSets: Int = 32, 21 | nTLBWays: Int = 4, 22 | coreId: Int = 0, 23 | tagECC: Option[String] = None, 24 | dataECC: Option[String] = None, 25 | prefetch: Boolean = false, 26 | pgLevels: Int = 3, 27 | blockBytes: Int = 64, 28 | latency: Int = 2) { 29 | def tagCode: Code = Code.fromString(tagECC) 30 | def dataCode: Code = Code.fromString(dataECC) 31 | def replacement = new RandomReplacement(nWays) 32 | def blockOffBits: Int = log2Ceil(blockBytes) 33 | def lgCacheBlockBytes = blockOffBits 34 | def untagBits: Int = log2Ceil(nSets) + blockOffBits 35 | def tagBits: Int = vaddrBits - untagBits 36 | def pgUntagBits: Int = untagBits 37 | def idxBits = log2Up(nSets) 38 | def isDM = nWays == 1 39 | def cacheDataBeats = (blockBytes * 8) / dataBits 40 | def fetchBytes = dataBits / 8 41 | def refillCycles = cacheDataBeats 42 | def vpnBits: Int = vaddrBits - pgIdxBits 43 | } 44 | 45 | class ICacheReq(vaddrBits: Int) extends Bundle { 46 | val addr = UInt(vaddrBits.W) 47 | } 48 | 49 | class ICacheResp(dataBits: Int) extends Bundle { 50 | val data = UInt(dataBits.W) 51 | val replay = Bool() 52 | val ae = Bool() 53 | } 54 | 55 | class ICacheBundle(cfg: ICacheParams) extends Bundle { 56 | val req = Flipped(Decoupled(new ICacheReq(cfg.vaddrBits))) 57 | val s1_paddr = Input(UInt(cfg.paddrBits.W)) 58 | val s1_kill = Input(Bool()) 59 | val s2_kill = Input(Bool()) 60 | val s2_cacheable = Input(Bool()) 61 | val s2_prefetch = Input(Bool()) 62 | val resp = Valid(new ICacheResp(cfg.dataBits)) 63 | val invalidate = Input(Bool()) 64 | } 65 | 66 | class ICache( 67 | val cfg: ICacheParams 68 | )( 69 | implicit p: Parameters) 70 | extends LazyModule { 71 | lazy val module = new ICacheModule(this) 72 | val masterNode = TLClientNode( 73 | Seq( 74 | TLMasterPortParameters.v1( 75 | clients = Seq( 76 | TLMasterParameters.v1( 77 | sourceId = IdRange(0, 2), // 0=refill, 1=hint 78 | name = s"Core ${cfg.coreId} ICache" 79 | ) 80 | ), 81 | requestFields = Seq() 82 | ) 83 | ) 84 | ) 85 | 86 | val size = cfg.nSets * cfg.nWays * 
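// (added) with the default ICacheParams this elaborates to 64 sets * 4 ways * 64 B = 16 KiB: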
cfg.blockBytes 87 | 88 | } 89 | 90 | class ICacheModule(outer: ICache) extends LazyModuleImp(outer) { 91 | val cfg = outer.cfg 92 | val io = IO(new ICacheBundle(cfg)) 93 | val (tl_out, edge_out) = outer.masterNode.out(0) 94 | 95 | val tECC = cfg.tagCode 96 | val dECC = cfg.dataCode 97 | require(isPow2(cfg.nSets) && isPow2(cfg.nWays)) 98 | 99 | /** valid signal for CPU accessing cache in stage 0. */ 100 | val s0_valid = io.req.fire 101 | 102 | /** virtual address from CPU in stage 0. */ 103 | val s0_vaddr = io.req.bits.addr 104 | 105 | /** valid signal for stage 1, drived by s0_valid. */ 106 | val s1_valid = RegInit(false.B) 107 | 108 | /** virtual address from CPU in stage 1. */ 109 | val s1_vaddr = RegEnable(s0_vaddr, s0_valid) 110 | 111 | /** tag hit vector to indicate hit which way. */ 112 | val s1_tag_hit = Wire(Vec(cfg.nWays, Bool())) 113 | 114 | val s1_hit = s1_tag_hit.reduce(_ || _) 115 | dontTouch(s1_hit) 116 | val s2_valid = RegNext(s1_valid && !io.s1_kill, false.B) 117 | val s2_hit = RegNext(s1_hit) 118 | 119 | /** status register to indicate a cache flush. */ 120 | val invalidated = Reg(Bool()) 121 | val refill_valid = RegInit(false.B) 122 | 123 | /** register to indicate [[tl_out]] is performing a hint. prefetch only happens after refilling 124 | */ 125 | val send_hint = RegInit(false.B) 126 | 127 | /** indicate [[tl_out]] is performing a refill. */ 128 | val refill_fire = tl_out.a.fire && !send_hint 129 | 130 | /** register to indicate there is a outstanding hint. */ 131 | val hint_outstanding = RegInit(false.B) 132 | 133 | /** [[io]] access L1 I$ miss. */ 134 | val s2_miss = s2_valid && !s2_hit && !io.s2_kill 135 | 136 | /** forward signal to stage 1, permit stage 1 refill. */ 137 | val s1_can_request_refill = !(s2_miss || refill_valid) 138 | 139 | /** real refill signal, stage 2 miss, and was permit to refill in stage 1. Since a miss will trigger burst. miss under 140 | * miss won't trigger another burst. 141 | */ 142 | val s2_request_refill = s2_miss && RegNext(s1_can_request_refill) 143 | val refill_paddr = RegEnable(io.s1_paddr, s1_valid && s1_can_request_refill) 144 | val refill_vaddr = RegEnable(s1_vaddr, s1_valid && s1_can_request_refill) 145 | val refill_tag = refill_paddr >> cfg.pgUntagBits 146 | val refill_idx = index(refill_vaddr, refill_paddr) 147 | 148 | /** AccessAckData, is refilling I$, it will block request from CPU. */ 149 | val refill_one_beat = tl_out.d.fire && edge_out.hasData(tl_out.d.bits) 150 | 151 | /** block request from CPU when refill or scratch pad access. */ 152 | io.req.ready := !(refill_one_beat) 153 | s1_valid := s0_valid 154 | 155 | val (_, _, d_done, refill_cnt) = edge_out.count(tl_out.d) 156 | 157 | /** at last beat of `tl_out.d.fire`, finish refill. */ 158 | val refill_done = refill_one_beat && d_done 159 | 160 | /** scratchpad is writing data. block refill. 
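* (added note) the sentence above is inherited from rocket-chip's ITIM/scratchpad
* support; that path is removed in this port, so the D channel is accepted
* unconditionally below.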
*/ 161 | tl_out.d.ready := true.B 162 | 163 | require(edge_out.manager.minLatency > 0) 164 | 165 | /** way to be replaced, implemented with a hardcoded random replacement algorithm */ 166 | val repl_way = 167 | if (cfg.isDM) 0.U 168 | else { 169 | // pick a way that is not used by the scratchpad 170 | val v0 = LFSR(16, refill_fire)(log2Up(cfg.nWays) - 1, 0) 171 | v0 172 | } 173 | 174 | /** Tag SRAM, indexed with virtual memory, content with `refillError ## tag[19:0]` after ECC 175 | */ 176 | val tag_array = DescribedSRAM( 177 | name = "tag_array", 178 | desc = "ICache Tag Array", 179 | size = cfg.nSets, 180 | data = Vec(cfg.nWays, UInt(tECC.width(1 + cfg.tagBits).W)) 181 | ) 182 | val tag_rdata = tag_array.read(s0_vaddr(cfg.untagBits - 1, cfg.blockOffBits), !refill_done && s0_valid) 183 | 184 | /** register indicates the ongoing GetAckData transaction is corrupted. */ 185 | val accruedRefillError = Reg(Bool()) 186 | 187 | /** wire indicates the ongoing GetAckData transaction is corrupted. */ 188 | val refillError = tl_out.d.bits.corrupt || (refill_cnt > 0.U && accruedRefillError) 189 | when(refill_done) { 190 | // For AccessAckData, denied => corrupt 191 | /** data written to [[tag_array]]. ECC encoded `refillError ## refill_tag` 192 | */ 193 | val enc_tag = tECC.encode(Cat(refillError, refill_tag)) 194 | tag_array.write(refill_idx, VecInit(Seq.fill(cfg.nWays) { enc_tag }), Seq.tabulate(cfg.nWays)(repl_way === _.U)) 195 | 196 | } 197 | // notify CPU, I$ has corrupt. 198 | // io.errors.bus.valid := tl_out.d.fire && (tl_out.d.bits.denied || tl_out.d.bits.corrupt) 199 | // io.errors.bus.bits := (refill_paddr >> blockOffBits) << blockOffBits 200 | 201 | /** true indicate this cacheline is valid, indexed by (wayIndex ## setIndex) after refill_done and not FENCE.I, 202 | * (repl_way ## refill_idx) set to true. 203 | */ 204 | val vb_array = RegInit(0.U((cfg.nSets * cfg.nWays).W)) 205 | when(refill_one_beat) { 206 | accruedRefillError := refillError 207 | // clear bit when refill starts so hit-under-miss doesn't fetch bad data 208 | vb_array := vb_array.bitSet(Cat(repl_way, refill_idx), refill_done && !invalidated) 209 | } 210 | 211 | /** flush cache when invalidate is true. */ 212 | val invalidate = WireDefault(io.invalidate) 213 | when(invalidate) { 214 | vb_array := 0.U 215 | invalidated := true.B 216 | } 217 | 218 | /** wire indicates that tag is correctable or uncorrectable. will trigger CPU to replay and I$ invalidating, if 219 | * correctable. 220 | */ 221 | val s1_tag_disparity = Wire(Vec(cfg.nWays, Bool())) 222 | 223 | /** wire indicates that bus has an uncorrectable error. respond to CPU [[io.resp.bits.ae]], cause 224 | * [[Causes.fetch_access]]. 225 | */ 226 | val s1_tl_error = Wire(Vec(cfg.nWays, Bool())) 227 | 228 | /** how many bits will be fetched by CPU for each fetch. */ 229 | val wordBits = cfg.fetchBytes * 8 230 | 231 | /** a set of raw data read from [[data_arrays]]. */ 232 | val s1_dout = Wire(Vec(cfg.nWays, UInt(dECC.width(wordBits).W))) 233 | s1_dout := DontCare 234 | 235 | // /** address accessed by [[tl_in]] for ITIM. */ 236 | // val s0_slaveAddr = tl_in.map(_.a.bits.address).getOrElse(0.U) 237 | // /** address used at stage 1 and 3. 238 | // * {{{ 239 | // * In stage 1, it caches TileLink data, store in stage 2 if ECC passed. 240 | // * In stage 3, it caches corrected data from stage 2, and store in stage 4.}}} 241 | // */ 242 | // val s1s3_slaveAddr = Reg(UInt(log2Ceil(outer.size).W)) 243 | // /** data used at stage 1 and 3. 
244 | // * {{{ 245 | // * In stage 1, it caches TileLink data, store in stage 2. 246 | // * In stage 3, it caches corrected data from data ram, and return to d channel.}}} 247 | // */ 248 | // val s1s3_slaveData = Reg(UInt(wordBits.W)) 249 | 250 | for (i <- 0 until cfg.nWays) { 251 | val s1_idx = index(s1_vaddr, io.s1_paddr) 252 | val s1_tag = io.s1_paddr >> cfg.pgUntagBits 253 | 254 | /** this way is used by scratchpad. [[tag_array]] corrupted. 255 | */ 256 | // val scratchpadHit = scratchpadWayValid(i.U) && 257 | // Mux(s1_slaveValid, 258 | // // scratchpad accessing form [[tl_in]]. 259 | // // @todo I think XBar will guarantee there won't be an illegal access on the bus? 260 | // // so why did have this check `lineInScratchpad(scratchpadLine(s1s3_slaveAddr))`? 261 | // // I think it will always be true. 262 | // lineInScratchpad(scratchpadLine(s1s3_slaveAddr)) && scratchpadWay(s1s3_slaveAddr) === i.U, 263 | // // scratchpad accessing from [[io]]. 264 | // // @todo Accessing ITIM correspond address will be able to read cacheline? 265 | // // is this desired behavior? 266 | // addrInScratchpad(io.s1_paddr) && scratchpadWay(io.s1_paddr) === i.U) 267 | val s1_vb = vb_array(Cat(i.U, s1_idx)) 268 | // printf(cf"width is variable because of i.U ${Cat(i.U, s1_idx).getWidth}\n") 269 | val enc_tag = tECC.decode(tag_rdata(i)) 270 | 271 | /** [[tl_error]] ECC error bit. [[tag]] of [[tag_array]] access. 272 | */ 273 | val (tl_error, tag) = Split(enc_tag.uncorrected, cfg.tagBits) 274 | val tagMatch = s1_vb && tag === s1_tag 275 | 276 | /** tag error happens. */ 277 | s1_tag_disparity(i) := s1_vb && enc_tag.error 278 | 279 | /** if tag matched but ecc checking failed, this access will trigger [[Causes.fetch_access]] exception. */ 280 | s1_tl_error(i) := tagMatch && tl_error.asBool 281 | s1_tag_hit(i) := tagMatch 282 | } 283 | assert(!(s1_valid) || PopCount(s1_tag_hit.zip(s1_tag_disparity).map { case (h, d) => h && !d }) <= 1.U) 284 | 285 | println(s"tl width ${tl_out.d.bits.data.getWidth}") 286 | println(s"tl mask width ${tl_out.a.bits.mask.getWidth}") 287 | require(tl_out.d.bits.data.getWidth % wordBits == 0) 288 | 289 | /** Data SRAM 290 | * 291 | * banked with TileLink beat bytes / CPU fetch bytes, indexed with [[index]] and multi-beats cycle, content with 292 | * `eccError ## wordBits` after ECC. 293 | * {{{ 294 | * │ │xx│xxxxxx│xxx│x│xx│ 295 | * ↑word 296 | * ↑bank 297 | * ↑way 298 | * └─set──┴─offset─┘ 299 | * └────row───┘ 300 | * }}} 301 | * Note: Data SRAM is indexed with virtual memory(vaddr[11:2]), 302 | * - vaddr[11:3]->row, 303 | * - vaddr[2]->bank=i 304 | * - Cache line size = refillCycels(8) * bank(2) * datasize(4 bytes) = 64 bytes 305 | * - data width = 32 306 | * 307 | * read: read happens in stage 0 308 | * 309 | * write: It takes 8 beats to refill 16 instruction in each refilling cycle. 
Data_array receives data[63:0](2 310 | * instructions) at once,they will be allocated in deferent bank according to vaddr[2] 311 | */ 312 | val data_arrays = Seq.tabulate(tl_out.d.bits.data.getWidth / wordBits) { i => 313 | DescribedSRAM( 314 | name = s"data_arrays_${i}", 315 | desc = "ICache Data Array", 316 | size = cfg.nSets * cfg.refillCycles, 317 | data = Vec(cfg.nWays, UInt(dECC.width(wordBits).W)) 318 | ) 319 | } 320 | 321 | for ((data_array, i) <- data_arrays.zipWithIndex) { 322 | 323 | /** bank match (vaddr[2]) */ 324 | def wordMatch(addr: UInt) = 325 | addr.extract(log2Ceil(tl_out.d.bits.data.getWidth / 8) - 1, log2Ceil(wordBits / 8)) === i.U 326 | def row(addr: UInt) = addr(cfg.untagBits - 1, cfg.blockOffBits - log2Ceil(cfg.refillCycles)) 327 | 328 | /** read_enable signal */ 329 | val s0_ren = (s0_valid && wordMatch(s0_vaddr)) 330 | 331 | /** write_enable signal refill from [[tl_out]] or ITIM write. 332 | */ 333 | val wen = (refill_one_beat && !invalidated) 334 | 335 | /** index to access [[data_array]]. */ 336 | val mem_idx = 337 | // I$ refill. refill_idx[2:0] is the beats 338 | Mux( 339 | refill_one_beat, 340 | (refill_idx << log2Ceil(cfg.refillCycles)) | refill_cnt, 341 | // CPU read. 342 | row(s0_vaddr) 343 | ) 344 | when(wen) { 345 | // wr_data 346 | val data = tl_out.d.bits.data(wordBits * (i + 1) - 1, wordBits * i) 347 | // the way to be replaced/written 348 | val way = repl_way 349 | data_array.write( 350 | mem_idx, 351 | VecInit(Seq.fill(cfg.nWays) { dECC.encode(data) }), 352 | (0 until cfg.nWays).map(way === _.U) 353 | ) 354 | } 355 | // write access 356 | /** data read from [[data_array]]. */ 357 | val dout = data_array.read(mem_idx, !wen && s0_ren) 358 | // Mux to select a way to [[s1_dout]] 359 | when(wordMatch(io.s1_paddr)) { 360 | s1_dout := dout 361 | } 362 | } 363 | 364 | /** When writing full words to ITIM, ECC errors are correctable. When writing a full scratchpad word, suppress the 365 | * read so Xs don't leak out 366 | */ 367 | // val s1s2_full_word_write = WireDefault(false.B) 368 | // val s1_dont_read = s1_slaveValid && s1s2_full_word_write 369 | 370 | /** clock gate signal for [[s2_tag_hit]], [[s2_dout]], [[s2_tag_disparity]], [[s2_tl_error]], [[s2_scratchpad_hit]]. 371 | */ 372 | val s1_clk_en = s1_valid 373 | val s2_tag_hit = RegEnable(s1_tag_hit, s1_clk_en) 374 | 375 | /** way index to access [[data_arrays]]. */ 376 | val s2_hit_way = OHToUInt(s2_tag_hit) 377 | 378 | /** ITIM index to access [[data_arrays]]. replace tag with way, word set to 0. 379 | */ 380 | val s2_dout = RegEnable(s1_dout, s1_clk_en) 381 | val s2_way_mux = Mux1H(s2_tag_hit, s2_dout) 382 | val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_clk_en).asUInt.orR 383 | val s2_tl_error = RegEnable(s1_tl_error.asUInt.orR, s1_clk_en) 384 | 385 | /** ECC decode result for [[data_arrays]]. */ 386 | val s2_data_decoded = dECC.decode(s2_way_mux) 387 | 388 | /** ECC error happened, correctable or uncorrectable, ask CPU to replay. */ 389 | val s2_disparity = s2_tag_disparity || s2_data_decoded.error 390 | 391 | /** access hit in ITIM, if [[s1_slaveValid]], this access is from [[tl_in]], else from CPU [[io]]. */ 392 | // val s1_scratchpad_hit = Mux(s1_slaveValid, lineInScratchpad(scratchpadLine(s1s3_slaveAddr)), addrInScratchpad(io.s1_paddr)) 393 | /** stage 2 of [[s1_scratchpad_hit]]. */ 394 | // val s2_scratchpad_hit = RegEnable(s1_scratchpad_hit, s1_clk_en) 395 | /** ITIM uncorrectable read. 
`s2_scratchpad_hit`: processing a scratchpad read(from [[tl_in]] or [[io]]) 396 | * `s2_data_decoded.uncorrectable`: read a uncorrectable data. `s2_valid`: [[io]] non-canceled read. `(s2_slaveValid 397 | * && !s2_full_word_write)`: [[tl_in]] read or write a word with wormhole. if write a full word, even stage 2 read 398 | * uncorrectable. stage 3 full word write will recovery this. 399 | */ 400 | // val s2_report_uncorrectable_error = s2_scratchpad_hit && s2_data_decoded.uncorrectable && (s2_valid || (s2_slaveValid && !s1s2_full_word_write)) 401 | /** ECC uncorrectable address, send to Bus Error Unit. */ 402 | // val s2_error_addr = scratchpadBase.map(base => Mux(s2_scratchpad_hit, base + s2_scratchpad_word_addr, 0.U)).getOrElse(0.U) 403 | 404 | // output signals 405 | outer.cfg.latency match { 406 | // if I$ latency is 1, no ITIM, no ECC. 407 | case 1 => 408 | require(tECC.isInstanceOf[IdentityCode]) 409 | require(dECC.isInstanceOf[IdentityCode]) 410 | // reply data to CPU at stage 2. no replay. 411 | io.resp.bits.data := Mux1H(s1_tag_hit, s1_dout) 412 | io.resp.bits.ae := s1_tl_error.asUInt.orR 413 | io.resp.valid := s1_valid && s1_hit 414 | io.resp.bits.replay := false.B 415 | 416 | // if I$ latency is 2, can have ITIM and ECC. 417 | case 2 => 418 | // when some sort of memory bit error have occurred 419 | // @todo why so aggressive to invalidate all when ecc corrupted. 420 | when(s2_valid && s2_disparity) { invalidate := true.B } 421 | 422 | // reply data to CPU at stage 2. 423 | io.resp.bits.data := s2_data_decoded.uncorrected 424 | io.resp.bits.ae := s2_tl_error 425 | io.resp.bits.replay := s2_disparity 426 | io.resp.valid := s2_valid && s2_hit 427 | 428 | // // report correctable error to BEU at stage 2. 429 | // io.errors.correctable.foreach { c => 430 | // c.valid := (s2_valid || s2_slaveValid) && s2_disparity 431 | // c.bits := s2_error_addr 432 | // } 433 | // // report uncorrectable error to BEU at stage 2. 434 | // io.errors.uncorrectable.foreach { u => 435 | // u.valid := false.B 436 | // u.bits := s2_error_addr 437 | // } 438 | 439 | } 440 | 441 | println(s"edge out bundle ${edge_out.bundle}") 442 | tl_out.a.valid := s2_request_refill 443 | tl_out.a.bits := edge_out 444 | .Get( 445 | fromSource = 0.U, 446 | toAddress = (refill_paddr >> cfg.blockOffBits) << cfg.blockOffBits, 447 | lgSize = cfg.lgCacheBlockBytes.U 448 | ) 449 | ._2 450 | 451 | // // prefetch when next-line access does not cross a page 452 | // if (cacheParams.prefetch) { 453 | // /** [[crosses_page]] indicate if there is a crosses page access 454 | // * [[next_block]] : the address to be prefetched. 455 | // */ 456 | // val (crosses_page, next_block) = Split(refill_paddr(pgIdxBits-1, blockOffBits) +& 1.U, pgIdxBits-blockOffBits) 457 | 458 | // when (tl_out.a.fire) { 459 | // send_hint := !hint_outstanding && io.s2_prefetch && !crosses_page 460 | // when (send_hint) { 461 | // send_hint := false.B 462 | // hint_outstanding := true.B 463 | // } 464 | // } 465 | 466 | // // @todo why refill_done will kill hint at this cycle? 467 | // when (refill_done) { 468 | // send_hint := false.B 469 | // } 470 | 471 | // // D channel reply with HintAck. 
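// (added note) this commented-out prefetch logic is gated by ICacheParams.prefetch
// (default false); when enabled, the hint transaction uses sourceId 1, which the
// masterNode above already reserves ("0=refill, 1=hint").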
472 | // when (tl_out.d.fire && !refill_one_beat) { 473 | // hint_outstanding := false.B 474 | // } 475 | 476 | // when (send_hint) { 477 | // tl_out.a.valid := true.B 478 | // tl_out.a.bits := edge_out.Hint( 479 | // fromSource = 1.U, 480 | // toAddress = Cat(refill_paddr >> pgIdxBits, next_block) << blockOffBits, 481 | // lgSize = lgCacheBlockBytes.U, 482 | // param = TLHints.PREFETCH_READ)._2 483 | // } 484 | 485 | // ccover(send_hint && !tl_out.a.ready, "PREFETCH_A_STALL", "I$ prefetch blocked by A-channel") 486 | // ccover(refill_valid && (tl_out.d.fire && !refill_one_beat), "PREFETCH_D_BEFORE_MISS_D", "I$ prefetch resolves before miss") 487 | // ccover(!refill_valid && (tl_out.d.fire && !refill_one_beat), "PREFETCH_D_AFTER_MISS_D", "I$ prefetch resolves after miss") 488 | // ccover(tl_out.a.fire && hint_outstanding, "PREFETCH_D_AFTER_MISS_A", "I$ prefetch resolves after second miss") 489 | // } 490 | 491 | // Drive APROT information 492 | tl_out.a.bits.user.lift(AMBAProt).foreach { x => 493 | // Rocket caches all fetch requests, and it's difficult to differentiate privileged/unprivileged on 494 | // cached data, so mark as privileged 495 | x.fetch := true.B 496 | x.secure := true.B 497 | x.privileged := true.B 498 | x.bufferable := true.B 499 | x.modifiable := true.B 500 | x.readalloc := io.s2_cacheable 501 | x.writealloc := io.s2_cacheable 502 | } 503 | tl_out.b.ready := true.B 504 | tl_out.c.valid := false.B 505 | tl_out.e.valid := false.B 506 | 507 | // if there is an outstanding refill, cannot flush I$. 508 | when(!refill_valid) { invalidated := false.B } 509 | when(refill_fire) { refill_valid := true.B } 510 | when(refill_done) { refill_valid := false.B } 511 | 512 | // io.perf.acquire := refill_fire 513 | // don't gate I$ clock since there are outstanding transcations. 514 | // io.keep_clock_enabled := 515 | // tl_in 516 | // .map(tl => tl.a.valid || tl.d.valid || s1_slaveValid || s2_slaveValid || s3_slaveValid) 517 | // .getOrElse(false.B) || // ITIM 518 | // s1_valid || s2_valid || refill_valid || send_hint || hint_outstanding // I$ 519 | 520 | /** index to access [[data_arrays]] and [[tag_array]]. 521 | * @note 522 | * if [[untagBits]] > [[pgIdxBits]] in 523 | * {{{ 524 | * ┌──idxBits──┐ 525 | * ↓ ↓ 526 | * │ tag │ set │offset│ 527 | * │ pageTag │ pageIndex│ 528 | * ↑ ↑ ↑ │ 529 | * untagBits│ blockOffBits│ 530 | * pgIdxBits │ 531 | * └msb┴──lsb──┘ 532 | * vaddr paddr 533 | * }}} 534 | * 535 | * else use paddr directly. Note: if [[untagBits]] > [[pgIdxBits]], there will be a alias issue which isn't 536 | * addressend by the icache yet. 537 | */ 538 | def index(vaddr: UInt, paddr: UInt) = { 539 | 540 | /** [[paddr]] as LSB to be used for VIPT. */ 541 | val lsbs = paddr(cfg.pgUntagBits - 1, cfg.blockOffBits) 542 | 543 | /** if [[untagBits]] > [[pgIdxBits]], append [[vaddr]] to higher bits of index as [[msbs]]. 
*/ 544 | val msbs = (cfg.idxBits + cfg.blockOffBits > cfg.pgUntagBits) 545 | .option(vaddr(cfg.idxBits + cfg.blockOffBits - 1, cfg.pgUntagBits)) 546 | msbs ## lsbs 547 | } 548 | } 549 | -------------------------------------------------------------------------------- /src/main/scala/core/IDecodeUnit.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | import freechips.rocketchip.rocket._ 8 | import freechips.rocketchip.util._ 9 | import freechips.rocketchip.rocket.ALU._ 10 | 11 | // add join inst based on jal, use wxd to distinguish 12 | /* Automatically generated by parse_opcodes */ 13 | object GPUInstructions { 14 | def JOIN = BitPat("b?????????????????000?????1101011") 15 | 16 | } 17 | 18 | import GPUInstructions._ 19 | class GPUDecode( 20 | implicit val p: Parameters) 21 | extends DecodeConstants { 22 | val aluFn = ALU 23 | val table: Array[(BitPat, List[BitPat])] = Array( 24 | // format: off 25 | JOIN -> List( 26 | Y,N,N,N,Y,N,N,N,N,N,N,A2_SIZE,A1_PC,IMM_UJ,DW_XPR,aluFn.FN_ADD,N,M_X,N,N,N,N,N,N,N,CSR.N,N,N,N,N)) 27 | // format: on 28 | } 29 | 30 | // reuse rocketchip decode 31 | class IDecodeUnit( 32 | implicit p: Parameters) 33 | extends Module() { 34 | val io = IO(new Bundle { 35 | val inst = Flipped(DecoupledIO(new InstData())) 36 | val decode = DecoupledIO(new DecodeData()) 37 | val wcontrol = DecoupledIO(new WarpControlData()) 38 | }) 39 | 40 | val aluFn = ALU 41 | val decode_table = { 42 | Seq(new GPUDecode()) ++: 43 | Seq(new IDecode()) ++: 44 | Seq(new I64Decode()) 45 | }.flatMap(_.table) 46 | val id_ctrl = Wire(new IntCtrlSigs()).decode(io.inst.bits.data, decode_table) 47 | val exec_ctrl = RegInit(0.U.asTypeOf(new IntCtrlSigs())) 48 | val ctrl_valid = RegInit(0.B) 49 | val decode_valid = RegInit(0.B) 50 | val inst_reg = RegInit(0.U.asTypeOf(io.inst.bits)) 51 | when(io.inst.fire) { 52 | exec_ctrl := id_ctrl 53 | ctrl_valid := 1.B 54 | decode_valid := 1.B 55 | inst_reg := io.inst.bits 56 | }.otherwise { 57 | ctrl_valid := 0.B 58 | } 59 | 60 | when(io.decode.fire & !io.inst.fire) { 61 | decode_valid := 0.B 62 | } 63 | io.inst.ready := !io.decode.valid || io.decode.fire 64 | io.decode.valid := decode_valid 65 | io.wcontrol.valid := ctrl_valid 66 | 67 | val ctrl = exec_ctrl 68 | val is_alu = ctrl.wxd && !(ctrl.mem || ctrl.fp || ctrl.mul || ctrl.div || ctrl.csr =/= CSR.N) 69 | val is_lsu = ctrl.mem 70 | val is_csr = ctrl.csr =/= CSR.N 71 | 72 | // branch inst break warp schedule 73 | val is_jal = ctrl.wxd && ctrl.jal 74 | val is_jalr = ctrl.jalr 75 | val is_join = !ctrl.wxd && ctrl.jal 76 | val is_branch = ctrl.branch 77 | val is_end = inst_reg.data === 0x10500073.U 78 | val imm = ImmGen(ctrl.sel_imm, inst_reg.data) 79 | 80 | // output 81 | io.decode.bits.wid := inst_reg.wid 82 | io.decode.bits.mask := inst_reg.mask 83 | io.decode.bits.pc := inst_reg.pc 84 | io.decode.bits.ex_type.lsu := is_lsu 85 | io.decode.bits.ex_type.alu := is_alu 86 | 87 | io.decode.bits.func := ctrl.alu_fn 88 | io.decode.bits.mem_cmd := ctrl.mem_cmd(0) 89 | io.decode.bits.wb := ctrl.wxd 90 | io.decode.bits.sel_alu2 := ctrl.sel_alu2.asUInt 91 | io.decode.bits.sel_alu1 := ctrl.sel_alu1.asUInt 92 | io.decode.bits.imm := imm.asUInt 93 | io.decode.bits.branch.jal := is_jal 94 | io.decode.bits.branch.jalr := is_jalr 95 | io.decode.bits.branch.branch := is_branch 96 | io.decode.bits.rd := inst_reg.data(11, 7) 97 | 
io.decode.bits.rs1 := inst_reg.data(19, 15) 98 | io.decode.bits.rs2 := inst_reg.data(24, 20) 99 | 100 | io.wcontrol.bits.wid := inst_reg.wid 101 | io.wcontrol.bits.join := is_join 102 | io.wcontrol.bits.active := !(is_branch || is_jal || is_jalr || is_end) 103 | // wfi inst as end inst 104 | io.wcontrol.bits.end := is_end 105 | } 106 | -------------------------------------------------------------------------------- /src/main/scala/core/IFetch.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | 7 | class InstFetch( 8 | cfg: ICacheParams 9 | )( 10 | implicit p: Parameters) 11 | extends Module { 12 | 13 | val io = IO(new Bundle { 14 | val inst_fetch = Flipped(Decoupled(new InstFetchData())) 15 | val to_icache = Flipped(new ICacheBundle(cfg)) 16 | val to_ptw = new TLBPTWIO(cfg.vpnBits, cfg.vaddrBits, cfg.pgLevels) 17 | val inst_out = Decoupled(new InstData()) 18 | }) 19 | 20 | val tlb_param = 21 | TLBParameter(nSets = cfg.nTLBSets, nWays = cfg.nTLBWays, paddrBits = cfg.paddrBits, vaddrBits = cfg.vaddrBits) 22 | val tlb = Module(new TLB(true, tlb_param)) 23 | 24 | // fetch inst fetch 25 | val fetch_idle :: fetch_req :: fetch_wait1 :: fetch_wait2 :: dispatch_wait :: Nil = Enum(5) 26 | val s_state = RegInit(fetch_idle) 27 | 28 | val ifetch_data = RegInit(0.U.asTypeOf(new InstFetchData)) 29 | 30 | switch(s_state) { 31 | is(fetch_idle) { 32 | when(io.inst_fetch.fire) { 33 | s_state := fetch_req 34 | ifetch_data := io.inst_fetch.bits 35 | } 36 | } 37 | is(fetch_req) { 38 | when(io.to_icache.req.fire) { 39 | s_state := fetch_wait1 40 | } 41 | } 42 | is(fetch_wait1) { 43 | when(io.to_icache.resp.valid) { 44 | s_state := dispatch_wait 45 | }.otherwise { 46 | s_state := fetch_wait2 47 | } 48 | } 49 | is(fetch_wait2) { 50 | when(io.to_icache.resp.valid) { 51 | s_state := dispatch_wait 52 | }.otherwise { 53 | s_state := fetch_req 54 | } 55 | } 56 | is(dispatch_wait) { 57 | when(io.inst_out.fire) { 58 | s_state := fetch_idle 59 | } 60 | } 61 | } 62 | 63 | io.to_icache.req.valid := s_state === fetch_req 64 | io.to_icache.req.bits.addr := ifetch_data.pc 65 | io.to_icache.s1_paddr := tlb.io.resp.paddr 66 | io.to_icache.s1_kill := tlb.io.resp.miss 67 | io.to_icache.s2_kill := false.B 68 | io.to_icache.s2_cacheable := true.B 69 | io.to_icache.s2_prefetch := false.B 70 | io.to_icache.invalidate := false.B 71 | 72 | tlb.io.req.valid := (s_state === fetch_req) 73 | tlb.io.req.bits.vaddr := ifetch_data.pc 74 | tlb.io.req.bits.passthrough := false.B 75 | tlb.io.req.bits.size := 2.U 76 | tlb.io.req.bits.cmd := 0.U 77 | tlb.io.req.bits.prv := 0.U 78 | 79 | tlb.io.sfence := 0.U.asTypeOf(tlb.io.sfence) 80 | tlb.io.kill := false.B 81 | io.to_ptw <> tlb.io.ptw 82 | 83 | io.inst_fetch.ready := (s_state === fetch_idle) 84 | 85 | val cache_data = RegInit(0.U.asTypeOf(new ICacheResp(cfg.dataBits))) 86 | 87 | when(io.to_icache.resp.valid) { 88 | cache_data := io.to_icache.resp.bits 89 | } 90 | 91 | io.inst_out.valid := s_state === dispatch_wait 92 | io.inst_out.bits.pc := ifetch_data.pc 93 | io.inst_out.bits.mask := ifetch_data.mask 94 | io.inst_out.bits.wid := ifetch_data.wid 95 | io.inst_out.bits.data := (cache_data.data >> (ifetch_data.pc(5, 2) * 32.U))(31, 0) 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/core/Issue.scala: 
-------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | 8 | class Issue( 9 | implicit p: Parameters) 10 | extends Module { 11 | val numThreads = p(ThreadNum) 12 | val addrWidth = p(AddrWidth) 13 | 14 | val io = IO(new Bundle { 15 | val writeback = Flipped(DecoupledIO(new CommitVData())) 16 | val writeback_cmd = Flipped(DecoupledIO(new CommitVData())) 17 | val decode = Flipped(DecoupledIO(new DecodeData())) 18 | 19 | val alu = DecoupledIO(new VALUData()) 20 | val lsu = DecoupledIO(new LSUData()) 21 | 22 | }) 23 | 24 | val decode_n = Reg(new DecodeData()) 25 | val decode_valid_n = RegInit(0.B) 26 | when(io.decode.fire) { 27 | decode_n := io.decode.bits 28 | decode_valid_n := 1.B 29 | }.otherwise { 30 | decode_valid_n := 0.B 31 | } 32 | val vgpr = Module(new VGPR()) 33 | val score_board = Module(new ScoreBoard()) 34 | val dispatch = Module(new Dispatch()) 35 | 36 | io.decode.ready := score_board.io.ibuffer.ready && dispatch.io.ibuffer.ready && !decode_valid_n 37 | 38 | vgpr.io.writeback.bits := io.writeback.bits 39 | vgpr.io.writeback.valid := io.writeback.valid 40 | vgpr.io.read_req.wid := io.decode.bits.wid 41 | vgpr.io.read_req.rs1 := io.decode.bits.rs1 42 | vgpr.io.read_req.rs2 := io.decode.bits.rs2 43 | vgpr.io.writeback_cmd <> io.writeback_cmd 44 | 45 | score_board.io.writeback <> io.writeback 46 | score_board.io.ibuffer.bits := io.decode.bits 47 | score_board.io.ibuffer.valid := io.decode.valid 48 | 49 | dispatch.io.ibuffer.valid := decode_valid_n 50 | dispatch.io.ibuffer.bits := decode_n 51 | dispatch.io.vgpr_rsp := vgpr.io.read_rsp 52 | 53 | io.alu <> dispatch.io.alu 54 | io.lsu <> dispatch.io.lsu 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/core/PTW.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import freechips.rocketchip.rocket.MStatus 6 | import freechips.rocketchip.util._ 7 | import freechips.rocketchip.rocket.{M_XRD, PRV} 8 | 9 | class PTE() extends Bundle { 10 | val reserved_for_future = UInt(10.W) 11 | val ppn = UInt(44.W) 12 | val reserved_for_software = Bits(2.W) 13 | 14 | /** dirty bit */ 15 | val d = Bool() 16 | 17 | /** access bit */ 18 | val a = Bool() 19 | 20 | /** global mapping */ 21 | val g = Bool() 22 | 23 | /** user mode accessible */ 24 | val u = Bool() 25 | 26 | /** whether the page is executable */ 27 | val x = Bool() 28 | 29 | /** whether the page is writable */ 30 | val w = Bool() 31 | 32 | /** whether the page is readable */ 33 | val r = Bool() 34 | 35 | /** valid bit */ 36 | val v = Bool() 37 | 38 | /** return true if find a pointer to next level page table */ 39 | def table(dummy: Int = 0) = v && !r && !w && !x && !d && !a && !u && reserved_for_future === 0.U 40 | 41 | /** return true if find a leaf PTE */ 42 | def leaf(dummy: Int = 0) = v && (r || (x && !w)) && a 43 | 44 | /** user read */ 45 | def ur(dummy: Int = 0) = sr() && u 46 | 47 | /** user write */ 48 | def uw(dummy: Int = 0) = sw() && u 49 | 50 | /** user execute */ 51 | def ux(dummy: Int = 0) = sx() && u 52 | 53 | /** supervisor read */ 54 | def sr(dummy: Int = 0) = leaf() && r 55 | 56 | /** supervisor write */ 57 | def sw(dummy: Int = 0) = leaf() && w && d 58 | 59 | /** supervisor execute */ 60 | def sx(dummy: Int = 0) = leaf() && x 61 | 62 
| /** full permission: writable and executable in user mode */ 63 | def isFullPerm(dummy: Int = 0) = uw() && ux() 64 | } 65 | 66 | class PTWReq(vpnBits: Int) extends Bundle { 67 | val addr = UInt(vpnBits.W) 68 | val vstage1 = Bool() 69 | val stage2 = Bool() 70 | } 71 | 72 | class PTWResp(vaddrBits: Int, pgLevels: Int) extends Bundle { 73 | 74 | /** ptw access exception */ 75 | val ae_ptw = Bool() 76 | 77 | /** final access exception */ 78 | val ae_final = Bool() 79 | 80 | /** page fault */ 81 | val pf = Bool() 82 | 83 | // /** guest page fault */ 84 | // val gf = Bool() 85 | 86 | // /** hypervisor read */ 87 | // val hr = Bool() 88 | 89 | // /** hypervisor write */ 90 | // val hw = Bool() 91 | 92 | // /** hypervisor execute */ 93 | // val hx = Bool() 94 | 95 | /** PTE to refill L1TLB 96 | * 97 | * source: L2TLB 98 | */ 99 | val pte = new PTE 100 | 101 | /** pte pglevel */ 102 | val level = UInt(log2Ceil(pgLevels).W) 103 | 104 | /** fragmented_superpage support */ 105 | // val fragmented_superpage = Bool() 106 | 107 | /** homogeneous for both pma and pmp */ 108 | val homogeneous = Bool() 109 | // val gpa = Valid(UInt(vaddrBits.W)) 110 | // val gpa_is_pte = Bool() 111 | } 112 | 113 | class PTBR() extends Bundle { 114 | val mode = UInt(4.W) 115 | val asid = UInt(16.W) 116 | val ppn = UInt(44.W) 117 | } 118 | 119 | /** IO between TLB and PTW 120 | * 121 | * PTW receives : 122 | * - PTE request 123 | * - CSRs info 124 | * - pmp results from PMP(in TLB) 125 | */ 126 | class TLBPTWIO(vpnBits: Int, vaddrBits: Int, pgLevels: Int) extends Bundle { 127 | val req = Decoupled(Valid(new PTWReq(vpnBits))) 128 | val resp = Flipped(Valid(new PTWResp(vaddrBits, pgLevels))) 129 | val ptbr = Input(new PTBR()) 130 | 131 | // val hgatp = Input(new PTBR()) 132 | // val vsatp = Input(new PTBR()) 133 | val status = Input(new MStatus()) 134 | // val hstatus = Input(new HStatus()) 135 | // val gstatus = Input(new MStatus()) 136 | // val customCSRs = Flipped(coreParams.customCSRs) 137 | } 138 | 139 | class DatapathPTWIO(vaddrBits: Int) extends Bundle { 140 | val ptbr = Input(new PTBR()) 141 | val sfence = Flipped(Valid(new SFenceReq(vaddrBits))) 142 | val status = Input(new MStatus()) 143 | val clock_enabled = Output(Bool()) 144 | } 145 | 146 | case class PTWParameter( 147 | paddrBits: Int, 148 | vaddrBits: Int, 149 | pgIdxBits: Int = 12, 150 | pgLevelBits: Int = 9, 151 | nSectors: Int = 4, 152 | xLen: Int = 64, 153 | pgLevels: Int = 3) { 154 | 155 | def ppnBits: Int = paddrBits - pgIdxBits 156 | def vpnBits: Int = vaddrBits - pgIdxBits 157 | } 158 | 159 | /** PTW contains L2TLB, and performs page table walk for high level TLB, and cache queries from L1 TLBs(I$, D$, RoCC) 160 | * 161 | * It performs hierarchy page table query to mem for the desired leaf PTE and cache them in l2tlb. Besides leaf PTEs, 162 | * it also caches non-leaf PTEs in pte_cache to accerlerate the process. 
163 | * 164 | * ==Structure== 165 | * - l2tlb : for leaf PTEs 166 | * - set-associative (configurable with [[CoreParams.nL2TLBEntries]]and [[CoreParams.nL2TLBWays]])) 167 | * - PLRU 168 | * - pte_cache: for non-leaf PTEs 169 | * - set-associative 170 | * - LRU 171 | * - s2_pte_cache: for non-leaf PTEs in 2-stage translation 172 | * - set-associative 173 | * - PLRU 174 | * 175 | * l2tlb Pipeline: 3 stage 176 | * {{{ 177 | * stage 0 : read 178 | * stage 1 : decode 179 | * stage 2 : hit check 180 | * }}} 181 | * ==State Machine== 182 | * s_ready: ready to reveive request from TLB s_req: request mem; pte_cache hit judge s_wait1: deal with l2tlb error 183 | * s_wait2: final hit judge s_wait3: receive mem response s_fragment_superpage: for superpage PTE 184 | * 185 | * @note 186 | * l2tlb hit happens in s_req or s_wait1 187 | * @see 188 | * RV-priv spec 4.3-4.6 for Virtual-Memory System 189 | * @see 190 | * RV-priv spec 8.5 for Two-Stage Address Translation 191 | * @todo 192 | * details in two-stage translation 193 | */ 194 | class PTW(n: Int, cfg: PTWParameter, cache_cfg: CacheParameter) extends Module { 195 | val io = IO(new Bundle { 196 | 197 | /** to n TLB */ 198 | val requestor = Flipped(Vec(n, new TLBPTWIO(cfg.vpnBits, cfg.vaddrBits, cfg.pgLevels))) 199 | 200 | /** to Cache */ 201 | val mem = new CacheIO(cache_cfg) 202 | 203 | /** to Core 204 | * 205 | * contains CSRs info and performance statistics 206 | */ 207 | val dpath = new DatapathPTWIO(cfg.vaddrBits) 208 | }) 209 | 210 | val s_ready :: s_req :: s_wait1 :: s_dummy1 :: s_wait2 :: s_wait3 :: s_dummy2 :: s_fragment_superpage :: Nil = Enum(8) 211 | val state = RegInit(s_ready) 212 | val l2_refill_wire = Wire(Bool()) 213 | 214 | /** Arbiter to arbite request from nTLB */ 215 | val arb = Module(new Arbiter(Valid(new PTWReq(cfg.vpnBits)), n)) 216 | // use TLB req as arbitor's input 217 | arb.io.in <> io.requestor.map(_.req) 218 | // receive req only when s_ready and not in refill 219 | arb.io.out.ready := (state === s_ready) && !l2_refill_wire 220 | 221 | val resp_valid = RegNext(VecInit(Seq.fill(io.requestor.size)(false.B))) 222 | 223 | val clock_en = 224 | state =/= s_ready || l2_refill_wire || arb.io.out.valid || io.dpath.sfence.valid 225 | io.dpath.clock_enabled := clock_en 226 | 227 | val invalidated = Reg(Bool()) 228 | 229 | /** current PTE level 230 | * {{{ 231 | * 0 <= count <= pgLevel-1 232 | * count = pgLevel - 1 : leaf PTE 233 | * count < pgLevel - 1 : non-leaf PTE 234 | * }}} 235 | */ 236 | val count = Reg(UInt(log2Ceil(cfg.pgLevels).W)) 237 | val resp_ae_ptw = Reg(Bool()) 238 | val resp_ae_final = Reg(Bool()) 239 | val resp_pf = Reg(Bool()) 240 | 241 | /** tlb request */ 242 | val r_req = Reg(new PTWReq(cfg.vpnBits)) 243 | 244 | /** current selected way in arbitor */ 245 | val r_req_dest = Reg(Bits()) 246 | // to construct mem.req.addr 247 | val r_pte = Reg(new PTE) 248 | 249 | val aux_pte = Reg(new PTE) 250 | 251 | val satp = io.dpath.ptbr 252 | val vpn = r_req.addr 253 | 254 | val mem_resp_valid = RegNext(io.mem.resp.valid) 255 | val mem_resp_data = RegNext(io.mem.resp.bits.data) 256 | // io.mem.uncached_resp.map { resp => 257 | // assert(!(resp.valid && io.mem.resp.valid)) 258 | // resp.ready := true.B 259 | // when(resp.valid) { 260 | // mem_resp_valid := true.B 261 | // mem_resp_data := resp.bits.data 262 | // } 263 | // } 264 | // construct pte from mem.resp 265 | val (pte, invalid_paddr) = { 266 | val tmp = mem_resp_data.asTypeOf(new PTE()) 267 | val res = WireDefault(tmp) 268 | res.ppn := tmp.ppn(cfg.ppnBits - 1, 0) 
269 | when(tmp.r || tmp.w || tmp.x) { 270 | // for superpage mappings, make sure PPN LSBs are zero 271 | for (i <- 0 until cfg.pgLevels - 1) 272 | when( 273 | count <= i.U && tmp.ppn( 274 | (cfg.pgLevels - 1 - i) * cfg.pgLevelBits - 1, 275 | (cfg.pgLevels - 2 - i) * cfg.pgLevelBits 276 | ) =/= 0.U 277 | ) { res.v := false.B } 278 | } 279 | (res, (tmp.ppn >> cfg.ppnBits) =/= 0.U) 280 | } 281 | // find non-leaf PTE, need traverse 282 | val traverse = pte.table() && !invalid_paddr && count < (cfg.pgLevels - 1).U 283 | 284 | /** address send to mem for enquerry */ 285 | val pte_addr = { 286 | val vpn_idxs = (0 until cfg.pgLevels).map { i => 287 | val width = cfg.pgLevelBits 288 | (vpn >> (cfg.pgLevels - i - 1) * cfg.pgLevelBits)(width - 1, 0) 289 | } 290 | val mask = ((1 << cfg.pgLevelBits) - 1).U 291 | val vpn_idx = vpn_idxs(count) & mask 292 | val raw_pte_addr = ((r_pte.ppn << cfg.pgLevelBits) | vpn_idx) << log2Ceil(cfg.xLen / 8) 293 | val size = cfg.paddrBits 294 | // use r_pte.ppn as page table base address 295 | // use vpn slice as offset 296 | raw_pte_addr.apply(size.min(raw_pte_addr.getWidth) - 1, 0) 297 | } 298 | 299 | /** pte_cache input addr */ 300 | val pte_cache_addr = pte_addr 301 | 302 | /** PTECache caches non-leaf PTE 303 | * @param s2 304 | * true: 2-stage address translation 305 | */ 306 | def makePTECache(s2: Boolean): (Bool, UInt) = (false.B, 0.U) 307 | // generate pte_cache 308 | val (pte_cache_hit, pte_cache_data) = makePTECache(false) 309 | // pte_cache hit or 2-stage pte_cache hit 310 | val pte_hit = RegNext(false.B) 311 | // l2_refill happens when find the leaf pte 312 | val l2_refill = RegNext(false.B) 313 | l2_refill_wire := l2_refill 314 | // l2tlb 315 | val (l2_hit, l2_error, l2_pte, l2_tlb_ram) = (false.B, false.B, WireDefault(0.U.asTypeOf(new PTE)), None) 316 | 317 | // if SFENCE occurs during walk, don't refill PTE cache or L2 TLB until next walk 318 | invalidated := io.dpath.sfence.valid || (invalidated && state =/= s_ready) 319 | // mem request 320 | // io.mem.keep_clock_enabled := false.B 321 | 322 | io.mem.req.valid := state === s_req 323 | io.mem.req.bits.tag := 0.U 324 | io.mem.req.bits.phys := true.B 325 | io.mem.req.bits.cmd := M_XRD 326 | io.mem.req.bits.size := log2Ceil(cfg.xLen / 8).U 327 | io.mem.req.bits.signed := false.B 328 | io.mem.req.bits.addr := pte_addr 329 | // io.mem.req.bits.idx.foreach(_ := pte_addr) 330 | io.mem.req.bits.dprv := PRV.S.U // PTW accesses are S-mode by definition 331 | io.mem.req.bits.dv := false.B 332 | // io.mem.req.bits.tag := DontCare 333 | io.mem.req.bits.no_alloc := DontCare 334 | io.mem.req.bits.no_xcpt := DontCare 335 | io.mem.req.bits.data := DontCare 336 | io.mem.req.bits.mask := DontCare 337 | 338 | io.mem.s1_kill := l2_hit || state =/= s_wait1 339 | io.mem.s1_data := DontCare 340 | io.mem.s2_kill := false.B 341 | 342 | val homogeneous = true.B 343 | // response to tlb 344 | for (i <- 0 until io.requestor.size) { 345 | io.requestor(i).resp.valid := resp_valid(i) 346 | io.requestor(i).resp.bits.ae_ptw := resp_ae_ptw 347 | io.requestor(i).resp.bits.ae_final := resp_ae_final 348 | io.requestor(i).resp.bits.pf := resp_pf 349 | io.requestor(i).resp.bits.pte := r_pte 350 | io.requestor(i).resp.bits.level := count 351 | io.requestor(i).resp.bits.homogeneous := homogeneous 352 | io.requestor(i).ptbr := io.dpath.ptbr 353 | // io.requestor(i).customCSRs <> io.dpath.customCSRs 354 | io.requestor(i).status := io.dpath.status 355 | // io.requestor(i).pmp := io.dpath.pmp 356 | } 357 | 358 | // control state machine 359 | val 
next_state = WireDefault(state) 360 | state := OptimizationBarrier(next_state) 361 | 362 | switch(state) { 363 | is(s_ready) { 364 | when(arb.io.out.fire) { 365 | val aux_ppn = arb.io.out.bits.bits.addr 366 | 367 | r_req := arb.io.out.bits.bits 368 | r_req_dest := arb.io.chosen 369 | next_state := Mux(arb.io.out.bits.valid, s_req, s_ready) 370 | count := 0.U 371 | aux_pte.ppn := aux_ppn 372 | aux_pte.reserved_for_future := 0.U 373 | resp_ae_ptw := false.B 374 | resp_ae_final := false.B 375 | } 376 | } 377 | is(s_req) { 378 | // pte_cache hit 379 | when(pte_cache_hit) { 380 | count := count + 1.U 381 | pte_hit := true.B 382 | }.otherwise { 383 | next_state := Mux(io.mem.req.ready, s_wait1, s_req) 384 | } 385 | } 386 | is(s_wait1) { 387 | // This Mux is for the l2_error case; the l2_hit && !l2_error case is overriden below 388 | next_state := Mux(l2_hit, s_req, s_wait2) 389 | } 390 | is(s_wait2) { 391 | next_state := s_wait3 392 | when(io.mem.s2_xcpt.ae.ld) { 393 | resp_ae_ptw := true.B 394 | next_state := s_ready 395 | resp_valid(r_req_dest) := true.B 396 | } 397 | } 398 | } 399 | 400 | r_pte := OptimizationBarrier( 401 | // l2tlb hit->find a leaf PTE(l2_pte), respond to L1TLB 402 | Mux( 403 | l2_hit && !l2_error, 404 | l2_pte, 405 | // pte cache hit->find a non-leaf PTE(pte_cache),continue to request mem 406 | Mux( 407 | state === s_req && pte_cache_hit, 408 | makePTE(pte_cache_data, l2_pte), 409 | // when mem respond, store mem.resp.pte 410 | Mux( 411 | mem_resp_valid, 412 | pte, 413 | // when tlb request come->request mem, use root address in satp(or vsatp,hgatp) 414 | Mux(arb.io.out.fire, makePTE(satp.ppn, r_pte), r_pte) 415 | ) 416 | ) 417 | ) 418 | ) 419 | 420 | when(l2_hit && !l2_error) { 421 | assert(state === s_req || state === s_wait1) 422 | next_state := s_ready 423 | resp_valid(r_req_dest) := true.B 424 | count := (cfg.pgLevels - 1).U 425 | } 426 | when(mem_resp_valid) { 427 | assert(state === s_wait3) 428 | next_state := s_req 429 | when(traverse) { 430 | count := count + 1.U 431 | }.otherwise { 432 | val ae = pte.v && invalid_paddr 433 | val pf = pte.v && pte.reserved_for_future =/= 0.U 434 | val success = pte.v && !ae && !pf 435 | 436 | // find a leaf pte, start l2 refill 437 | l2_refill := success && count === (cfg.pgLevels - 1).U 438 | count := 0.U 439 | 440 | next_state := s_ready 441 | resp_valid(r_req_dest) := true.B 442 | 443 | resp_ae_ptw := ae && count < (cfg.pgLevels - 1).U && pte.table() 444 | resp_ae_final := ae 445 | resp_pf := pf 446 | } 447 | } 448 | when(io.mem.s2_nack) { 449 | assert(state === s_wait2) 450 | next_state := s_req 451 | } 452 | 453 | /** Relace PTE.ppn with ppn */ 454 | private def makePTE(ppn: UInt, default: PTE) = { 455 | val pte = WireDefault(default) 456 | pte.ppn := ppn 457 | pte 458 | } 459 | } 460 | -------------------------------------------------------------------------------- /src/main/scala/core/SGPR.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | import chisel3._ 3 | import chisel3.util._ 4 | import org.chipsalliance.cde.config.Parameters 5 | import ogpu.lib._ 6 | import ogpu.config._ 7 | 8 | class SGPR( 9 | implicit p: Parameters) 10 | extends Module() { 11 | val numWarps = p(WarpNum) 12 | val numRegs = p(RegNum) 13 | val xLen = p(XLen) 14 | 15 | val io = IO(new Bundle { 16 | val writeback = Flipped(DecoupledIO(new CommitSData())) 17 | val writeback_cmd = Flipped(DecoupledIO(new CommitSData())) 18 | val read_req = Flipped(new ReadGPRReq()) 19 | val read_rsp = new 
ReadSGPRRsp() 20 | }) 21 | 22 | val gpr_ram = VecInit(Seq.fill(numWarps)((Module(new MaskedSmem_2R1W(xLen, numRegs, 1)).io))) 23 | val raddr_reg = RegInit(0.U) 24 | val raddr2_reg = RegInit(0.U) 25 | val rwid_reg = RegInit(0.U(log2Ceil(numWarps).W)) 26 | val ready_reg = RegInit(0.B) 27 | val cmd_ready_reg = RegInit(0.B) 28 | 29 | raddr_reg := io.read_req.rs1 30 | raddr2_reg := io.read_req.rs2 31 | rwid_reg := io.read_req.wid 32 | 33 | io.writeback.ready := ready_reg 34 | io.writeback_cmd.ready := cmd_ready_reg 35 | 36 | for (i <- 0 until numWarps) { 37 | // init 38 | gpr_ram(i).write_en := 0.B 39 | gpr_ram(i).waddr := 0.U 40 | gpr_ram(i).raddr := io.read_req.rs1 41 | gpr_ram(i).raddr2 := io.read_req.rs2 42 | gpr_ram(i).mask := 0.U.asTypeOf(io.writeback.bits.mask) 43 | gpr_ram(i).dataIn := 0.U.asTypeOf(io.writeback.bits.data) 44 | 45 | when(io.writeback_cmd.valid && i.U === io.writeback_cmd.bits.wid) { 46 | gpr_ram(i).write_en := io.writeback_cmd.valid 47 | gpr_ram(i).waddr := io.writeback_cmd.bits.rd 48 | gpr_ram(i).mask := io.writeback_cmd.bits.mask 49 | gpr_ram(i).dataIn := io.writeback_cmd.bits.data 50 | }.elsewhen(io.writeback.valid && i.U === io.writeback.bits.wid) { 51 | gpr_ram(i).write_en := io.writeback.valid 52 | gpr_ram(i).waddr := io.writeback.bits.rd 53 | gpr_ram(i).dataIn := io.writeback.bits.data 54 | } 55 | 56 | } 57 | 58 | ready_reg := 0.B 59 | cmd_ready_reg := 0.B 60 | when(io.writeback_cmd.valid) { 61 | cmd_ready_reg := 1.B 62 | }.elsewhen(io.writeback.valid) { 63 | ready_reg := 1.B 64 | } 65 | 66 | io.read_rsp.rs1_data := Mux(raddr_reg === 0.U, 0.U.asTypeOf(gpr_ram(0).dataOut), gpr_ram(rwid_reg).dataOut) 67 | io.read_rsp.rs2_data := Mux(raddr2_reg === 0.U, 0.U.asTypeOf(gpr_ram(0).dataOut), gpr_ram(rwid_reg).dataOut2) 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/core/SIMTStack.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | import ogpu.lib._ 8 | 9 | class SIMTStack( 10 | implicit p: Parameters) 11 | extends Module { 12 | val numThreads = p(ThreadNum) 13 | val addrWidth = p(AddrWidth) 14 | val stackDepth = p(StackDepth) 15 | 16 | val io = IO(new Bundle { 17 | val in_diverge = Input(Bool()) 18 | val in_data = Input(new StackData()) 19 | val out_data = Output(new StackData()) 20 | val push = Input(Bool()) 21 | val pop = Input(Bool()) 22 | val out_diverge = Output(Bool()) 23 | val empty = Output(Bool()) 24 | val full = Output(Bool()) 25 | }) 26 | 27 | val stack_addr = RegInit(0.U(log2Ceil(stackDepth + 1).W)) 28 | val stack_pop_addr = RegInit(0.U(log2Ceil(stackDepth + 1).W)) 29 | val out_diverge = RegInit(0.B) 30 | val out_data = Wire(new StackData()) 31 | val diverge_status = RegInit(VecInit(Seq.fill(stackDepth)(false.B))) 32 | val stack_sram = Module(new ReadWriteSmem(io.in_data.getWidth, stackDepth)) 33 | 34 | stack_pop_addr := stack_addr - 1.U 35 | stack_sram.io.enable := io.push || io.pop 36 | stack_sram.io.write := io.push 37 | stack_sram.io.addr := Mux(io.push, stack_addr, stack_pop_addr) 38 | stack_sram.io.dataIn := io.in_data.asUInt 39 | out_data := stack_sram.io.dataOut.asTypeOf(new StackData()) 40 | 41 | when(io.push) { 42 | stack_addr := stack_addr + 1.U 43 | stack_pop_addr := stack_addr 44 | }.elsewhen(io.pop && ~diverge_status(stack_pop_addr)) { 45 | stack_addr := stack_addr - 1.U 46 | stack_pop_addr := 
stack_pop_addr - 1.U 47 | } 48 | 49 | when(io.push) { 50 | diverge_status(stack_addr) := io.in_diverge 51 | }.elsewhen(io.pop) { 52 | diverge_status(stack_pop_addr) := 0.B 53 | out_diverge := diverge_status(stack_pop_addr) 54 | } 55 | 56 | io.empty := stack_addr === 0.U 57 | io.full := stack_addr === stackDepth.U 58 | io.out_diverge := out_diverge 59 | io.out_data := out_data 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/core/Scoreboard.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | 8 | class ScoreBoard( 9 | implicit p: Parameters) 10 | extends Module { 11 | val numWarps = p(WarpNum) 12 | val numRegs = p(RegNum) 13 | val io = IO(new Bundle { 14 | val ibuffer = Flipped(DecoupledIO(new DecodeData())) 15 | val writeback = Flipped(DecoupledIO(new WritebackData())) 16 | }) 17 | 18 | // Registers to hold the state of inuse_regs 19 | val inuseRegs = RegInit(VecInit(Seq.fill(numWarps)(VecInit(Seq.fill(numRegs)(false.B))))) 20 | 21 | // Wires to get the state immediately 22 | val inuseRegsCurrent = Wire(Vec(numWarps, Vec(numRegs, Bool()))) 23 | 24 | // Reserve a register when instruction is valid, ready, and writeback is enabled 25 | val reserveReg = io.ibuffer.valid && io.ibuffer.ready && io.ibuffer.bits.wb 26 | 27 | // Release a register when writeback to a register is complete (and it is the last instruction of a packet) 28 | val releaseReg = io.writeback.valid && io.writeback.ready && io.writeback.bits.eop 29 | 30 | // Update `inuseRegsCurrent` with `reserveReg` and `releaseReg` 31 | for (i <- 0 until numWarps) { 32 | for (j <- 0 until numRegs) { 33 | inuseRegsCurrent(i)(j) := inuseRegs(i)(j) 34 | when(reserveReg && io.ibuffer.bits.wid === i.U && io.ibuffer.bits.rd === j.U) { 35 | inuseRegsCurrent(i)(j) := true.B 36 | } 37 | when(releaseReg && io.writeback.bits.wid === i.U && io.writeback.bits.rd === j.U) { 38 | inuseRegsCurrent(i)(j) := false.B 39 | } 40 | } 41 | } 42 | 43 | // Update `inuseRegs` with `inuseRegsCurrent` on rising edge of clock 44 | inuseRegs := inuseRegsCurrent 45 | 46 | // Check if the requested registers are free 47 | val deqInuseRd = RegInit(false.B) 48 | val deqInuseRs1 = RegInit(false.B) 49 | val deqInuseRs2 = RegInit(false.B) 50 | 51 | deqInuseRd := inuseRegsCurrent(io.ibuffer.bits.wid)(io.ibuffer.bits.rd) 52 | deqInuseRs1 := inuseRegsCurrent(io.ibuffer.bits.wid)(io.ibuffer.bits.rs1) 53 | deqInuseRs2 := inuseRegsCurrent(io.ibuffer.bits.wid)(io.ibuffer.bits.rs2) 54 | 55 | io.writeback.ready := true.B 56 | io.ibuffer.ready := !(deqInuseRd || deqInuseRs1 || deqInuseRs2) 57 | 58 | // Check and assert if any deadlock is detected 59 | val deadlockCtr = RegInit(0.U(32.W)) 60 | val deadlockTimeout = 100000.U 61 | 62 | when(io.ibuffer.valid && !io.ibuffer.ready) { 63 | deadlockCtr := deadlockCtr + 1.U 64 | assert( 65 | deadlockCtr < deadlockTimeout, 66 | cf"Deadlock detected - PC: 0x${Hexadecimal(io.ibuffer.bits.pc)}, wid: ${io.ibuffer.bits.wid}, rd: ${io.ibuffer.bits.rd}" 67 | ) 68 | }.elsewhen(io.ibuffer.valid && io.ibuffer.ready) { 69 | deadlockCtr := 0.U 70 | }.elsewhen(io.writeback.valid && io.writeback.ready && io.writeback.bits.eop) { 71 | assert( 72 | inuseRegs(io.writeback.bits.wid)(io.writeback.bits.rd), 73 | cf"Invalid writeback register - PC: 0x${Hexadecimal(io.writeback.bits.pc)}, wid: 
${io.writeback.bits.wid}, rd: ${io.writeback.bits.rd}" 74 | ) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/core/TLB.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import chisel3.experimental.SourceInfo 6 | import freechips.rocketchip.rocket.{ 7 | isAMOArithmetic, 8 | isAMOLogical, 9 | isRead, 10 | isWrite, 11 | M_FLUSH_ALL, 12 | M_PWR, 13 | M_SZ, 14 | M_WOK, 15 | M_XLR, 16 | M_XSC, 17 | PRV 18 | } 19 | import freechips.rocketchip.util._ 20 | 21 | case class TLBParameter( 22 | nSets: Int, 23 | nWays: Int, 24 | paddrBits: Int, 25 | vaddrBits: Int, 26 | xLen: Int = 32, 27 | pgIdxBits: Int = 12, 28 | minPgLevels: Int = 2, 29 | pgLevelBits: Int = 9, 30 | nSectors: Int = 4, 31 | pgLevels: Int = 3) { 32 | 33 | def ppnBits: Int = paddrBits - pgIdxBits 34 | def vpnBits: Int = vaddrBits - pgIdxBits 35 | 36 | } 37 | 38 | class SFenceReq(vaddrBits: Int) extends Bundle { 39 | val rs1 = Bool() 40 | val rs2 = Bool() 41 | val addr = UInt(vaddrBits.W) 42 | val asid = UInt(16.W) 43 | } 44 | 45 | class TLBReq(lgMaxSize: Int, vaddrBits: Int) extends Bundle { 46 | 47 | /** request address from CPU. */ 48 | val vaddr = UInt(vaddrBits.W) 49 | 50 | /** don't lookup TLB, bypass vaddr as paddr */ 51 | val passthrough = Bool() 52 | 53 | /** granularity */ 54 | val size = UInt(log2Ceil(lgMaxSize + 1).W) 55 | 56 | /** memory command. */ 57 | val cmd = Bits(M_SZ.W) 58 | val prv = UInt(PRV.SZ.W) 59 | 60 | /** virtualization mode */ 61 | // val v = Bool() 62 | 63 | } 64 | 65 | class TLBExceptions extends Bundle { 66 | val ld = Bool() 67 | val st = Bool() 68 | val inst = Bool() 69 | } 70 | 71 | class TLBResp(paddrBits: Int, vaddrBits: Int) extends Bundle { 72 | // lookup responses 73 | val miss = Bool() 74 | 75 | /** physical address */ 76 | val paddr = UInt(paddrBits.W) 77 | 78 | /** page fault exception */ 79 | val pf = new TLBExceptions 80 | 81 | /** access exception */ 82 | val ae = new TLBExceptions 83 | 84 | /** misaligned access exception */ 85 | val ma = new TLBExceptions 86 | } 87 | 88 | class TLBEntryData(tlbParam: TLBParameter) extends Bundle { 89 | val ppn = UInt(tlbParam.ppnBits.W) 90 | 91 | /** pte.u user */ 92 | val u = Bool() 93 | 94 | /** access exception. D$ -> PTW -> TLB AE Alignment failed. 
95 | */ 96 | val ae_ptw = Bool() 97 | val ae_final = Bool() 98 | 99 | /** page fault */ 100 | val pf = Bool() 101 | 102 | /** prot_w */ 103 | val pw = Bool() 104 | 105 | /** prot_x */ 106 | val px = Bool() 107 | 108 | /** prot_r */ 109 | val pr = Bool() 110 | 111 | } 112 | 113 | /** basic cell for TLB data */ 114 | class TLBEntry(val tlbParam: TLBParameter) extends Bundle { 115 | 116 | val level = UInt(log2Ceil(tlbParam.pgLevels).W) 117 | 118 | /** use vpn as tag */ 119 | val tag_vpn = UInt(tlbParam.vpnBits.W) 120 | 121 | val tag_asid = Vec(tlbParam.nSectors, UInt(16.W)) 122 | 123 | /** entry data */ 124 | val data = Vec(tlbParam.nSectors, UInt(new TLBEntryData(tlbParam).getWidth.W)) 125 | 126 | /** valid bit */ 127 | val valid = Vec(tlbParam.nSectors, Bool()) 128 | 129 | /** returns all entry data in this entry */ 130 | def entry_data = data.map(_.asTypeOf(new TLBEntryData(tlbParam))) 131 | 132 | /** returns the index of sector */ 133 | private def sectorIdx(vpn: UInt) = vpn.extract(tlbParam.nSectors.log2 - 1, 0) 134 | 135 | /** returns the entry data matched with this vpn */ 136 | def getData(vpn: UInt) = OptimizationBarrier(data(sectorIdx(vpn)).asTypeOf(new TLBEntryData(tlbParam))) 137 | 138 | /** returns whether a sector hits */ 139 | def sectorHit(vpn: UInt) = valid.orR && sectorTagMatch(vpn) 140 | 141 | /** returns whether tag matches vpn */ 142 | def sectorTagMatch(vpn: UInt) = 143 | (((tag_vpn ^ vpn) >> tlbParam.nSectors.log2) === 0.U) 144 | 145 | /** returns hit signal */ 146 | def hit(vpn: UInt, asid: UInt): Bool = { 147 | val idx = sectorIdx(vpn) 148 | (tag_asid(idx) === asid) && valid(idx) && sectorTagMatch(vpn) 149 | } 150 | 151 | /** returns the ppn of the input TLBEntryData */ 152 | def ppn(data: TLBEntryData) = { 153 | data.ppn 154 | } 155 | 156 | /** does the refill 157 | * 158 | * find the target entry with vpn tag and replace the target entry with the input entry data 159 | */ 160 | def insert(vpn: UInt, asid: UInt, level: UInt, entry: TLBEntryData): Unit = { 161 | this.tag_vpn := vpn 162 | this.level := level.extract(log2Ceil(tlbParam.pgLevels) - 1, 0) 163 | 164 | val idx = sectorIdx(vpn) 165 | valid(idx) := true.B 166 | data(idx) := entry.asUInt 167 | tag_asid(idx) := asid 168 | } 169 | 170 | def invalidate(): Unit = { valid.foreach(_ := false.B) } 171 | def invalidate(asid: UInt): Unit = { 172 | for (((v, id), e) <- valid.zip(tag_asid).zip(entry_data)) 173 | when(id === asid) { v := false.B } 174 | } 175 | def invalidateVPN(vpn: UInt, asid: UInt): Unit = { 176 | when(sectorTagMatch(vpn)) { 177 | for ((((v, id), e), i) <- (valid.zip(tag_asid).zip(entry_data)).zipWithIndex) 178 | when(id === asid && i.U === sectorIdx(vpn)) { v := false.B } 179 | } 180 | } 181 | def invalidateNonGlobal(asid: UInt): Unit = { 182 | for (((v, id), e) <- valid.zip(tag_asid).zip(entry_data)) 183 | when(id === asid) { v := false.B } 184 | } 185 | } 186 | 187 | /** =Overview= 188 | * [[TLB]] is a TLB template. 189 | * 190 | * TLB caches PTE and accelerates the address translation process. When tlb miss happens, ask PTW(L2TLB) for Page Table 191 | * Walk. 
192 | * 193 | * ==Cache Structure== 194 | * - Sectored Entry (PTE) 195 | * - set-associative or direct-mapped 196 | * - nsets = [[nSets]] 197 | * - nways = [[nWays]] / [[nSectors]] 198 | * - PTEEntry( sectors = [[nSectors]] ) 199 | * - LRU(if set-associative) 200 | * 201 | * ==Address structure== 202 | * {{{ 203 | * |vaddr | 204 | * |ppn/vpn | pgIndex | 205 | * | | | 206 | * | |nSets |nSector | | 207 | * }}} 208 | * 209 | * ==State Machine== 210 | * {{{ 211 | * s_ready: ready to accept request from EXE. 212 | * s_request: when L1TLB(this) miss, send request to PTW(L2TLB), . 213 | * s_wait: wait for PTW to refill L1TLB. 214 | * s_wait_invalidate: L1TLB is waiting for respond from PTW, but L1TLB will invalidate respond from PTW. 215 | * }}} 216 | * 217 | * ==Note== 218 | * Boom use Rocket ITLB, and its own DTLB. 219 | * 220 | * Accelerators:{{{ sha3: DTLB gemmini: DTLB hwacha: DTLB*2+ITLB}}} 221 | * @param instruction 222 | * true for ITLB, false for DTLB 223 | * @param lgMaxSize 224 | * \@todo seems granularity 225 | * @param cfg 226 | * [[TLBConfig]] 227 | * @param edge 228 | * collect SoC metadata. 229 | */ 230 | class TLB( 231 | instruction: Boolean, 232 | cfg: TLBParameter) //( 233 | // implicit edge: TLEdgeOut, 234 | // p: Parameters) 235 | extends Module { 236 | val io = IO(new Bundle { 237 | 238 | /** request from Core */ 239 | val req = Flipped(Decoupled(new TLBReq(cfg.xLen / 8, cfg.vaddrBits))) 240 | 241 | /** response to Core */ 242 | val resp = Output(new TLBResp(cfg.paddrBits, cfg.vaddrBits)) 243 | 244 | /** SFence Input */ 245 | val sfence = Flipped(Valid(new SFenceReq(cfg.vaddrBits))) 246 | 247 | /** IO to PTW */ 248 | val ptw = new TLBPTWIO(cfg.vpnBits, cfg.vaddrBits, cfg.pgLevels) 249 | 250 | /** suppress a TLB refill, one cycle after a miss */ 251 | val kill = Input(Bool()) 252 | }) 253 | 254 | val usingAtomicsInCache = true 255 | val usingAtomics = true 256 | val vpn = io.req.bits.vaddr(cfg.vaddrBits - 1, cfg.pgIdxBits) 257 | 258 | /** index for sectored_Entry */ 259 | val memIdx = vpn.extract(cfg.nSectors.log2 + cfg.nSets.log2 - 1, cfg.nSectors.log2) 260 | 261 | /** TLB Entry */ 262 | val sectored_entries = Reg(Vec(cfg.nSets, Vec(cfg.nWays / cfg.nSectors, new TLBEntry(cfg)))) 263 | def ordinary_entries = sectored_entries(memIdx) 264 | def all_entries = ordinary_entries 265 | def all_real_entries = sectored_entries.flatten 266 | 267 | val s_ready :: s_request :: s_wait :: s_wait_invalidate :: Nil = Enum(4) 268 | val state = RegInit(s_ready) 269 | // use vpn as refill_tag 270 | val r_refill_tag = Reg(UInt(cfg.vpnBits.W)) 271 | val r_sectored_repl_addr = Reg(UInt(log2Ceil(sectored_entries.head.size).W)) 272 | val r_sectored_hit = Reg(Valid(UInt(log2Ceil(sectored_entries.head.size).W))) 273 | 274 | /** privilege mode */ 275 | val priv = io.req.bits.prv 276 | val priv_v = false.B 277 | val priv_s = priv(0) 278 | // user mode and supervisor mode 279 | val priv_uses_vm = priv <= PRV.S.U 280 | val satp = io.ptw.ptbr 281 | val asid = satp.asid 282 | val stage1_en = satp.mode(satp.mode.getWidth - 1) 283 | 284 | /** Enable Virtual Memory when: 285 | * 1. statically configured 286 | * 1. satp highest bits enabled 287 | * i. RV32: 288 | * - 0 -> Bare 289 | * - 1 -> SV32 290 | * i. RV64: 291 | * - 0000 -> Bare 292 | * - 1000 -> SV39 293 | * - 1001 -> SV48 294 | * - 1010 -> SV57 295 | * - 1011 -> SV64 296 | * 1. In virtualization mode, vsatp highest bits enabled 297 | * 1. priv mode in U and S. 298 | * 1. in H & M mode, disable VM. 299 | * 1. no passthrough(micro-arch defined.) 
300 | * 301 | * @see 302 | * RV-priv spec 4.1.11 Supervisor Address Translation and Protection (satp) Register 303 | * @see 304 | * RV-priv spec 8.2.18 Virtual Supervisor Address Translation and Protection Register (vsatp) 305 | */ 306 | val vm_enabled = stage1_en && priv_uses_vm && !io.req.bits.passthrough 307 | 308 | // share a single physical memory attribute checker (unshare if critical path) 309 | val refill_ppn = io.ptw.resp.bits.pte.ppn(cfg.ppnBits - 1, 0) 310 | 311 | /** refill signal */ 312 | val do_refill = io.ptw.resp.valid 313 | 314 | /** sfence invalidate refill */ 315 | val invalidate_refill = state.isOneOf(s_request /* don't care */, s_wait_invalidate) || io.sfence.valid 316 | 317 | val mpu_ppn = refill_ppn 318 | val mpu_physaddr = Cat(mpu_ppn, io.req.bits.vaddr(cfg.pgIdxBits - 1, 0)) 319 | // PMA 320 | // check exist a slave can consume this address. 321 | // val legal_address = edge.manager.findSafe(mpu_physaddr).reduce(_ || _) 322 | // check utility to help check SoC property. 323 | // def fastCheck(member: TLManagerParameters => Boolean) = 324 | // legal_address && edge.manager.fastProperty(mpu_physaddr, member, (b: Boolean) => b.B) 325 | 326 | // val cacheable = fastCheck(_.supportsAcquireB) && (instruction).B 327 | // val cacheable = (instruction).B 328 | 329 | val homogeneous = false.B 330 | // TLBPageLookup(edge.manager.managers, cfg.xLen, cfg.CacheBlockBytes, BigInt(1) << cfg.pgIdxBits)( 331 | // mpu_physaddr 332 | // ).homogeneous 333 | val prot_r = true.B // fastCheck(_.supportsGet) 334 | val prot_w = true.B // fastCheck(_.supportsPutFull) 335 | val prot_pp = true.B // fastCheck(_.supportsPutPartial) 336 | val prot_al = true.B // fastCheck(_.supportsLogical) 337 | val prot_aa = true.B // fastCheck(_.supportsArithmetic) 338 | val prot_x = true.B // fastCheck(_.executable) 339 | val prot_eff = true.B // fastCheck(Seq(RegionType.PUT_EFFECTS, RegionType.GET_EFFECTS) contains _.regionType) 340 | 341 | // hit check 342 | val sector_hits = sectored_entries(memIdx).map(_.sectorHit(vpn)) 343 | val hitsVec = all_entries.map(vm_enabled && _.hit(vpn, asid)) 344 | val real_hits = hitsVec.asUInt 345 | val hits = Cat(!vm_enabled, real_hits) 346 | 347 | // use ptw response to refill 348 | // permission bit arrays 349 | when(do_refill) { 350 | val pte = io.ptw.resp.bits.pte 351 | // val refill_v = r_vstage1_en || r_stage2_en 352 | // val asid 353 | val newEntry = Wire(new TLBEntryData(cfg)) 354 | newEntry.ppn := pte.ppn 355 | // newEntry.c := cacheable 356 | newEntry.u := pte.u 357 | // newEntry.g := pte.g && pte.v 358 | newEntry.ae_ptw := io.ptw.resp.bits.ae_ptw 359 | newEntry.ae_final := io.ptw.resp.bits.ae_final 360 | newEntry.pf := io.ptw.resp.bits.pf 361 | newEntry.pr := prot_r 362 | newEntry.pw := prot_w 363 | newEntry.px := prot_x 364 | // newEntry.ppp := prot_pp 365 | // newEntry.pal := prot_al 366 | // newEntry.paa := prot_aa 367 | // newEntry.eff := prot_eff 368 | // refill sectored_hit 369 | val r_memIdx = r_refill_tag.extract(cfg.nSectors.log2 + cfg.nSets.log2 - 1, cfg.nSectors.log2) 370 | val waddr = Mux(r_sectored_hit.valid, r_sectored_hit.bits, r_sectored_repl_addr) 371 | for ((e, i) <- sectored_entries(r_memIdx).zipWithIndex) when(waddr === i.U) { 372 | when(!r_sectored_hit.valid) { e.invalidate() } 373 | e.insert(r_refill_tag, asid, 0.U, newEntry) 374 | when(invalidate_refill) { e.invalidate() } 375 | } 376 | } 377 | 378 | // get all entries data. 
379 | val entries = all_entries.map(_.getData(vpn)) 380 | val normal_entries = entries.take(ordinary_entries.size) 381 | // parallel query PPN from [[all_entries]], if VM not enabled return VPN instead 382 | val ppn = Mux1H( 383 | hitsVec :+ !vm_enabled, 384 | (all_entries.zip(entries)).map { case (entry, data) => entry.ppn(data) } :+ vpn(cfg.ppnBits - 1, 0) 385 | ) 386 | 387 | val nPhysicalEntries = 1 388 | // generally PTW misaligned load exception. 389 | val ptw_ae_array = Cat(false.B, entries.map(_.ae_ptw).asUInt) 390 | val final_ae_array = Cat(false.B, entries.map(_.ae_final).asUInt) 391 | val ptw_pf_array = Cat(false.B, entries.map(_.pf).asUInt) 392 | val sum = io.ptw.status.sum 393 | // if in hypervisor/machine mode, cannot read/write user entries. 394 | // if in superviosr/user mode, "If the SUM bit in the sstatus register is set, supervisor mode software may also access pages with U=1.(from spec)" 395 | val priv_rw_ok = entries.map(_.u).asUInt 396 | // if in hypervisor/machine mode, other than user pages, all pages are executable. 397 | // if in superviosr/user mode, only user page can execute. 398 | val priv_x_ok = entries.map(_.u).asUInt 399 | val mxr = io.ptw.status.mxr 400 | // "The vsstatus field MXR, which makes execute-only pages readable, only overrides VS-stage page protection.(from spec)" 401 | // val r_array = 402 | // Cat(true.B, (priv_rw_ok & (entries.map(_.sr).asUInt | Mux(mxr, entries.map(_.sx).asUInt, 0.U)))) 403 | // These array is for each TLB entries. 404 | // user mode can read: PMA OK, TLB OK, AE OK 405 | val pr_array = Cat(Fill(nPhysicalEntries, prot_r), normal_entries.map(_.pr).asUInt) & ~(ptw_ae_array | final_ae_array) 406 | // user mode can write: PMA OK, TLB OK, AE OK 407 | val pw_array = Cat(Fill(nPhysicalEntries, prot_w), normal_entries.map(_.pw).asUInt) & ~(ptw_ae_array | final_ae_array) 408 | // user mode can write: PMA OK, TLB OK, AE OK 409 | val px_array = Cat(Fill(nPhysicalEntries, prot_x), normal_entries.map(_.px).asUInt) & ~(ptw_ae_array | final_ae_array) 410 | // put effect 411 | // val eff_array = Cat(Fill(nPhysicalEntries, prot_eff), normal_entries.map(_.eff).asUInt) 412 | // cacheable 413 | // val c_array = Cat(Fill(nPhysicalEntries, cacheable), normal_entries.map(_.c).asUInt) 414 | // put partial 415 | // val ppp_array = Cat(Fill(nPhysicalEntries, prot_pp), normal_entries.map(_.ppp).asUInt) 416 | // // atomic arithmetic 417 | // val paa_array = Cat(Fill(nPhysicalEntries, prot_aa), normal_entries.map(_.paa).asUInt) 418 | // // atomic logic 419 | // val pal_array = Cat(Fill(nPhysicalEntries, prot_al), normal_entries.map(_.pal).asUInt) 420 | // val ppp_array_if_cached = ppp_array // | c_array 421 | // val paa_array_if_cached = paa_array // | (if (usingAtomicsInCache) c_array else 0.U) 422 | // val pal_array_if_cached = pal_array // | (if (usingAtomicsInCache) c_array else 0.U) 423 | 424 | // vaddr misaligned: vaddr[1:0]=b00 425 | val misaligned = (io.req.bits.vaddr & (UIntToOH(io.req.bits.size) - 1.U)).orR 426 | // def badVA(): Bool = { 427 | // val additionalPgLevels = satp.additionalPgLevels 428 | // val signed = 1 429 | // val nPgLevelChoices = cfg.pgLevels - cfg.minPgLevels + 1 430 | // val minVAddrBits = cfg.pgIdxBits + cfg.minPgLevels * cfg.pgLevelBits 431 | // (for (i <- 0 until nPgLevelChoices) yield { 432 | // val mask = 433 | // ((BigInt(1) << cfg.vaddrBits) - (BigInt(1) << (minVAddrBits + i * cfg.pgLevelBits - signed.toInt))).U 434 | // val maskedVAddr = io.req.bits.vaddr & mask 435 | // additionalPgLevels === i.U && !(maskedVAddr 
=== 0.U || signed.B && maskedVAddr === mask) 436 | // }).orR 437 | // } 438 | val bad_gpa = false.B 439 | val bad_va = false.B 440 | 441 | val cmd_lrsc = usingAtomics.B && io.req.bits.cmd.isOneOf(M_XLR, M_XSC) 442 | val cmd_amo_logical = usingAtomics.B && isAMOLogical(io.req.bits.cmd) 443 | val cmd_amo_arithmetic = usingAtomics.B && isAMOArithmetic(io.req.bits.cmd) 444 | val cmd_put_partial = io.req.bits.cmd === M_PWR 445 | val cmd_read = isRead(io.req.bits.cmd) 446 | val cmd_readx = false.B 447 | val cmd_write = isWrite(io.req.bits.cmd) 448 | val cmd_write_perms = cmd_write || 449 | io.req.bits.cmd.isOneOf(M_FLUSH_ALL, M_WOK) // not a write, but needs write permissions 450 | 451 | // val lrscAllowed = Mux((usingDataScratchpad || usingAtomicsOnlyForIO).B, 0.U, c_array) 452 | val lrscAllowed = 0.U 453 | val ae_array = 454 | // Mux(misaligned, eff_array, 0.U) | 455 | Mux(cmd_lrsc, ~lrscAllowed, 0.U) 456 | 457 | // access exception needs SoC information from PMA 458 | val ae_ld_array = Mux(cmd_read, ae_array | ~pr_array, 0.U) 459 | val ae_st_array = 460 | Mux(cmd_write_perms, ae_array | ~pw_array, 0.U) // | 461 | // Mux(cmd_put_partial, ~ppp_array_if_cached, 0.U) | 462 | // Mux(cmd_amo_logical, ~pal_array_if_cached, 0.U) | 463 | // Mux(cmd_amo_arithmetic, ~paa_array_if_cached, 0.U) 464 | // val must_alloc_array = 465 | // Mux(cmd_put_partial, ~ppp_array, 0.U) | 466 | // Mux(cmd_amo_logical, ~pal_array, 0.U) | 467 | // Mux(cmd_amo_arithmetic, ~paa_array, 0.U) | 468 | // Mux(cmd_lrsc, ~0.U(pal_array.getWidth.W), 0.U) 469 | val pf_ld_array = 470 | Mux(cmd_read, (ptw_ae_array | ptw_pf_array), 0.U) 471 | val pf_st_array = Mux(cmd_write_perms, (ptw_ae_array | ptw_pf_array), 0.U) 472 | val pf_inst_array = (ptw_ae_array | ptw_pf_array) 473 | 474 | val tlb_hit_if_not_gpa_miss = real_hits.orR 475 | val tlb_hit = real_hits.orR 476 | // leads to s_request 477 | val tlb_miss = vm_enabled && !tlb_hit 478 | 479 | val sectored_plru = new SetAssocLRU(cfg.nSets, sectored_entries.head.size, "plru") 480 | when(io.req.valid && vm_enabled) { 481 | // replace 482 | when(sector_hits.orR) { sectored_plru.access(memIdx, OHToUInt(sector_hits)) } 483 | } 484 | 485 | // Superpages create the possibility that two entries in the TLB may match. 486 | // This corresponds to a software bug, but we can't return complete garbage; 487 | // we must return either the old translation or the new translation. This 488 | // isn't compatible with the Mux1H approach. So, flush the TLB and report 489 | // a miss on duplicate entries. 490 | val multipleHits = PopCountAtLeast(real_hits, 2) 491 | 492 | // only pull up req.ready when this is s_ready state. 
493 | io.req.ready := state === s_ready 494 | // page fault 495 | io.resp.pf.ld := (bad_va && cmd_read) || (pf_ld_array & hits).orR 496 | io.resp.pf.st := (bad_va && cmd_write_perms) || (pf_st_array & hits).orR 497 | io.resp.pf.inst := bad_va || (pf_inst_array & hits).orR 498 | // access exception 499 | io.resp.ae.ld := (ae_ld_array & hits).orR 500 | io.resp.ae.st := (ae_st_array & hits).orR 501 | io.resp.ae.inst := (~px_array & hits).orR 502 | // misaligned 503 | io.resp.ma.ld := misaligned && cmd_read 504 | io.resp.ma.st := misaligned && cmd_write 505 | io.resp.ma.inst := false.B // this is up to the pipeline to figure out 506 | // io.resp.cacheable := (c_array & hits).orR 507 | // io.resp.must_alloc := (must_alloc_array & hits).orR 508 | // io.resp.prefetchable := (prefetchable_array & hits).orR // && edge.manager.managers 509 | // .forall(m => !m.supportsAcquireB || m.supportsHint) 510 | // .B 511 | io.resp.miss := do_refill || tlb_miss || multipleHits 512 | io.resp.paddr := Cat(ppn, io.req.bits.vaddr(cfg.pgIdxBits - 1, 0)) 513 | 514 | io.ptw.req.valid := state === s_request 515 | io.ptw.req.bits.valid := !io.kill 516 | io.ptw.req.bits.bits.addr := r_refill_tag 517 | io.ptw.req.bits.bits.vstage1 := false.B 518 | io.ptw.req.bits.bits.stage2 := false.B 519 | 520 | val sfence = io.sfence.valid 521 | // this is [[s_ready]] 522 | // handle miss/hit at the first cycle. 523 | // if miss, request PTW(L2TLB). 524 | when(io.req.fire && tlb_miss) { 525 | state := s_request 526 | r_refill_tag := vpn 527 | r_sectored_repl_addr := replacementEntry(sectored_entries(memIdx), sectored_plru.way(memIdx)) 528 | r_sectored_hit.valid := sector_hits.orR 529 | r_sectored_hit.bits := OHToUInt(sector_hits) 530 | } 531 | // Handle SFENCE.VMA when send request to PTW. 532 | // SFENCE.VMA io.ptw.req.ready kill 533 | // ? ? 1 534 | // 0 0 0 535 | // 0 1 0 -> s_wait 536 | // 1 0 0 -> s_wait_invalidate 537 | // 1 0 0 -> s_ready 538 | when(state === s_request) { 539 | // SFENCE.VMA will kill TLB entries based on rs1 and rs2. It will take 1 cycle. 540 | when(sfence) { state := s_ready } 541 | // here should be io.ptw.req.fire, but assert(io.ptw.req.ready === true.B) 542 | // fire -> s_wait 543 | when(io.ptw.req.ready) { state := Mux(sfence, s_wait_invalidate, s_wait) } 544 | // If CPU kills request(frontend.s2_redirect) 545 | when(io.kill) { state := s_ready } 546 | } 547 | // sfence in refill will results in invalidate 548 | when(state === s_wait && sfence) { 549 | state := s_wait_invalidate 550 | } 551 | // after CPU acquire response, go back to s_ready. 552 | when(io.ptw.resp.valid) { 553 | state := s_ready 554 | } 555 | 556 | // SFENCE processing logic. 
557 | when(sfence) { 558 | assert(!io.sfence.bits.rs1 || (io.sfence.bits.addr >> cfg.pgIdxBits) === vpn) 559 | for (e <- all_real_entries) { 560 | when(io.sfence.bits.rs1) { e.invalidateVPN(vpn, asid) } 561 | .elsewhen(io.sfence.bits.rs2) { e.invalidateNonGlobal(asid) } 562 | .otherwise { e.invalidate(asid) } 563 | } 564 | } 565 | 566 | when(multipleHits || reset.asBool) { 567 | all_real_entries.foreach(_.invalidate()) 568 | } 569 | 570 | ccover(io.ptw.req.fire, "MISS", "TLB miss") 571 | ccover(io.ptw.req.valid && !io.ptw.req.ready, "PTW_STALL", "TLB miss, but PTW busy") 572 | ccover(state === s_wait_invalidate, "SFENCE_DURING_REFILL", "flush TLB during TLB refill") 573 | ccover(sfence && !io.sfence.bits.rs1 && !io.sfence.bits.rs2, "SFENCE_ALL", "flush TLB") 574 | ccover(sfence && !io.sfence.bits.rs1 && io.sfence.bits.rs2, "SFENCE_ASID", "flush TLB ASID") 575 | ccover(sfence && io.sfence.bits.rs1 && !io.sfence.bits.rs2, "SFENCE_LINE", "flush TLB line") 576 | ccover(sfence && io.sfence.bits.rs1 && io.sfence.bits.rs2, "SFENCE_LINE_ASID", "flush TLB line/ASID") 577 | ccover(multipleHits, "MULTIPLE_HITS", "Two matching translations in TLB") 578 | 579 | def ccover( 580 | cond: Bool, 581 | label: String, 582 | desc: String 583 | )( 584 | implicit sourceInfo: SourceInfo 585 | ) = 586 | property.cover(cond, s"${if (instruction) "I" else "D"}TLB_$label", "MemorySystem;;" + desc) 587 | 588 | /** Decides which entry to be replaced 589 | * 590 | * If there is a invalid entry, replace it with priorityencoder; if not, replace the alt entry 591 | * 592 | * @return 593 | * mask for TLBEntry replacement 594 | */ 595 | def replacementEntry(set: Seq[TLBEntry], alt: UInt) = { 596 | val valids = set.map(_.valid.orR).asUInt 597 | Mux(valids.andR, alt, PriorityEncoder(~valids)) 598 | } 599 | } 600 | -------------------------------------------------------------------------------- /src/main/scala/core/VGPR.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.lib._ 7 | import ogpu.config._ 8 | 9 | class VGPR( 10 | implicit p: Parameters) 11 | extends Module() { 12 | val numWarps = p(WarpNum) 13 | val numRegs = p(RegNum) 14 | val numThreads = p(ThreadNum) 15 | val xLen = p(XLen) 16 | 17 | val io = IO(new Bundle { 18 | val writeback = Flipped(DecoupledIO(new CommitVData())) 19 | val writeback_cmd = Flipped(DecoupledIO(new CommitVData())) 20 | val read_req = Flipped(new ReadGPRReq()) 21 | val read_rsp = new ReadVGPRRsp() 22 | }) 23 | 24 | val gpr_ram = VecInit(Seq.fill(numWarps)((Module(new MaskedSmem_2R1W(xLen, numRegs, numThreads)).io))) 25 | val raddr_reg = RegInit(0.U) 26 | val raddr2_reg = RegInit(0.U) 27 | val rwid_reg = RegInit(0.U(log2Ceil(numWarps).W)) 28 | val ready_reg = RegInit(0.B) 29 | val cmd_ready_reg = RegInit(0.B) 30 | 31 | val need_forward1 = RegInit(0.B) 32 | val need_forward2 = RegInit(0.B) 33 | val forward_data = RegInit(0.U.asTypeOf(Vec(numThreads, UInt(xLen.W)))) 34 | need_forward1 := io.writeback.bits.rd === io.read_req.rs1 && io.writeback.bits.wid === io.read_req.wid 35 | need_forward2 := io.writeback.bits.rd === io.read_req.rs2 && io.writeback.bits.wid === io.read_req.wid 36 | forward_data := io.writeback_cmd.bits.data 37 | raddr_reg := io.read_req.rs1 38 | raddr2_reg := io.read_req.rs2 39 | rwid_reg := io.read_req.wid 40 | 41 | io.writeback.ready := ready_reg 42 | io.writeback_cmd.ready := cmd_ready_reg 43 | 44 | for 
(i <- 0 until numWarps) { 45 | // init 46 | gpr_ram(i).write_en := 0.B 47 | gpr_ram(i).waddr := 0.U 48 | gpr_ram(i).raddr := io.read_req.rs1 49 | gpr_ram(i).raddr2 := io.read_req.rs2 50 | gpr_ram(i).mask := 0.U.asTypeOf(io.writeback.bits.mask) 51 | gpr_ram(i).dataIn := 0.U.asTypeOf(io.writeback.bits.data) 52 | 53 | when(io.writeback_cmd.valid && i.U === io.writeback_cmd.bits.wid) { 54 | gpr_ram(i).write_en := io.writeback_cmd.valid 55 | gpr_ram(i).waddr := io.writeback_cmd.bits.rd 56 | gpr_ram(i).mask := io.writeback_cmd.bits.mask 57 | gpr_ram(i).dataIn := io.writeback_cmd.bits.data 58 | }.elsewhen(io.writeback.valid && i.U === io.writeback.bits.wid) { 59 | gpr_ram(i).write_en := io.writeback.valid 60 | gpr_ram(i).waddr := io.writeback.bits.rd 61 | gpr_ram(i).mask := io.writeback.bits.mask 62 | gpr_ram(i).dataIn := io.writeback.bits.data 63 | } 64 | 65 | } 66 | 67 | ready_reg := 0.B 68 | cmd_ready_reg := 0.B 69 | when(io.writeback_cmd.valid) { 70 | cmd_ready_reg := 1.B 71 | }.elsewhen(io.writeback.valid) { 72 | ready_reg := 1.B 73 | } 74 | 75 | io.read_rsp.rs1_data := Mux( 76 | raddr_reg === 0.U, 77 | 0.U.asTypeOf(gpr_ram(0).dataOut), 78 | Mux(need_forward1, forward_data, gpr_ram(rwid_reg).dataOut) 79 | ) 80 | io.read_rsp.rs2_data := Mux( 81 | raddr2_reg === 0.U, 82 | 0.U.asTypeOf(gpr_ram(0).dataOut), 83 | Mux(need_forward2, forward_data, gpr_ram(rwid_reg).dataOut2) 84 | ) 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/core/VectorALU.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | 8 | class VectorALU( 9 | implicit p: Parameters) 10 | extends Module { 11 | val numThread = p(ThreadNum) 12 | val io = IO(new Bundle { 13 | val in = Flipped(DecoupledIO(new VALUData())) 14 | val out = DecoupledIO(new CommitVData()) 15 | val branch_data = DecoupledIO(new BranchData()) 16 | }) 17 | 18 | val alu = VecInit(Seq.fill(numThread)((Module(new ScalarALU()).io))) 19 | 20 | val result = Module(new Queue(new CommitVData(), 1, pipe = true)) 21 | val branch_result = Module(new Queue(new BranchData(), 1, pipe = true)) 22 | 23 | for (x <- 0 until numThread) { 24 | alu(x).in1 := io.in.bits.op1(x) 25 | alu(x).in2 := io.in.bits.op2(x) 26 | alu(x).fn := io.in.bits.func 27 | result.io.enq.bits.data(x) := alu(x).out 28 | result.io.enq.bits.mask(x) := io.in.bits.mask(x) 29 | branch_result.io.enq.bits.mask(x) := alu(x).cmp_out & io.in.bits.mask(x) 30 | } 31 | 32 | branch_result.io.enq.bits.branch := io.in.bits.branch 33 | branch_result.io.enq.bits.wid := io.in.bits.wid 34 | branch_result.io.enq.bits.pc := io.in.bits.pc 35 | branch_result.io.enq.bits.orig_mask := io.in.bits.mask 36 | branch_result.io.enq.bits.imm := io.in.bits.imm 37 | branch_result.io.enq.bits.rs1_data := io.in.bits.rs1_data 38 | 39 | io.in.ready := result.io.enq.ready && branch_result.io.enq.ready 40 | 41 | result.io.enq.valid := io.in.valid 42 | result.io.enq.bits.wid := io.in.bits.wid 43 | result.io.enq.bits.pc := io.in.bits.pc 44 | result.io.enq.bits.rd := io.in.bits.rd 45 | result.io.enq.bits.eop := 1.B 46 | 47 | val is_branch = io.in.bits.branch.jal | io.in.bits.branch.jalr | io.in.bits.branch.branch 48 | branch_result.io.enq.valid := io.in.valid && is_branch 49 | 50 | io.out <> result.io.deq 51 | io.branch_data <> branch_result.io.deq 52 | } 53 | 54 | // object VectorALURTL extends App { 55 | // 
implicit val p = new CoreConfig 56 | // emitVerilog(new VectorALU(), Array("--target-dir", "generated")) 57 | // } 58 | // 59 | // object VectorALUFIR extends App { 60 | // // ChiselStage.emitFirrtl(new VectorALU()) 61 | // implicit val p = new CoreConfig 62 | // ChiselStage.emitCHIRRTL(new VectorALU()) 63 | // } 64 | 65 | // object VectorALUGraph extends App { 66 | // (new ChiselStage).emitGraphML(new VectorALU() , Array("--target-dir", "graphs")) 67 | // } 68 | -------------------------------------------------------------------------------- /src/main/scala/core/WarpScheduler.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | import ogpu.tile._ 8 | 9 | class VGPRWriter( 10 | implicit p: Parameters) 11 | extends Module { 12 | val numWarps = p(WarpNum) 13 | val numRegs = p(RegNum) 14 | val numThreads = p(ThreadNum) 15 | val xlen = p(XLen) 16 | val addrWidth = p(AddrWidth) 17 | 18 | val io = IO(new Bundle { 19 | val warp_cmd = Input(Valid(new CuTaskBundle())) 20 | val wid = Input(UInt(log2Ceil(numWarps).W)) 21 | val commit_data = DecoupledIO(new CommitVData()) 22 | val finish = DecoupledIO(new Bool()) 23 | val idle = Output(Bool()) 24 | }) 25 | 26 | val s_idle :: s_working :: s_finish :: Nil = Enum(3) 27 | val state = RegInit(s_idle) 28 | 29 | val commit_counter = RegInit(0.U(2.W)) 30 | io.idle := state === s_idle 31 | 32 | val counter_add1 = commit_counter + 1.U 33 | 34 | val tid_data = Wire(Vec(3, Vec(numThreads, UInt(xlen.W)))) 35 | tid_data(0) := VecInit.tabulate(numThreads) { i => io.warp_cmd.bits.thread_dims(0) | i.U } 36 | tid_data(1) := VecInit.tabulate(numThreads) { _ => io.warp_cmd.bits.thread_dims(1) } 37 | tid_data(2) := VecInit.tabulate(numThreads) { _ => io.warp_cmd.bits.thread_dims(2) } 38 | 39 | switch(state) { 40 | is(s_idle) { 41 | when(io.warp_cmd.valid) { 42 | state := s_working 43 | } 44 | } 45 | is(s_working) { 46 | when(((counter_add1 === io.warp_cmd.bits.vgpr_num) & io.commit_data.fire) | io.warp_cmd.bits.vgpr_num === 0.U) { 47 | state := s_finish 48 | } 49 | } 50 | is(s_finish) { 51 | when(io.finish.fire) { 52 | state := s_idle 53 | } 54 | } 55 | } 56 | 57 | io.commit_data.bits.wid := io.wid 58 | io.commit_data.bits.mask := io.warp_cmd.bits.mask 59 | io.commit_data.bits.rd := counter_add1 60 | io.commit_data.bits.eop := true.B 61 | io.commit_data.bits.pc := 0.U 62 | io.commit_data.valid := false.B 63 | io.commit_data.bits.data := tid_data(commit_counter) 64 | io.finish.bits := 0.U 65 | io.commit_data.valid := state === s_working 66 | io.finish.valid := state === s_finish 67 | switch(state) { 68 | is(s_idle) { 69 | commit_counter := 0.U 70 | } 71 | is(s_working) { 72 | when(io.commit_data.fire & counter_add1 =/= io.warp_cmd.bits.vgpr_num) { 73 | commit_counter := counter_add1 74 | } 75 | } 76 | } 77 | } 78 | 79 | class SGPRWriter( 80 | implicit p: Parameters) 81 | extends Module { 82 | val numWarps = p(WarpNum) 83 | val numRegs = p(RegNum) 84 | val addrWidth = p(AddrWidth) 85 | 86 | val io = IO(new Bundle { 87 | val warp_cmd = Input(Valid(new CuTaskBundle())) 88 | val wid = Input(UInt(log2Ceil(numWarps).W)) 89 | val commit_data = DecoupledIO(new CommitSData()) 90 | val finish = DecoupledIO(Bool()) 91 | val idle = Output(Bool()) 92 | }) 93 | 94 | val s_idle :: s_working :: s_finish :: Nil = Enum(3) 95 | 96 | val commit_counter = RegInit(0.U(5.W)) 97 | val state = RegInit(s_idle) 98 
| 99 | val counter_add1 = commit_counter + 1.U 100 | val commit_data = io.warp_cmd.bits.sgprs(commit_counter) 101 | 102 | io.idle := state === s_idle 103 | switch(state) { 104 | is(s_idle) { 105 | when(io.warp_cmd.valid) { 106 | state := s_working 107 | } 108 | } 109 | is(s_working) { 110 | when(((commit_counter === io.warp_cmd.bits.sgpr_num) & io.commit_data.fire) | io.warp_cmd.bits.sgpr_num === 0.U) { 111 | state := s_finish 112 | } 113 | } 114 | is(s_finish) { 115 | when(io.finish.fire) { 116 | state := s_idle 117 | } 118 | } 119 | } 120 | 121 | io.commit_data.bits.wid := io.wid 122 | io.commit_data.bits.rd := commit_counter 123 | io.commit_data.bits.eop := true.B 124 | io.commit_data.bits.pc := 0.U 125 | io.commit_data.bits.data := commit_data 126 | io.commit_data.bits.mask := io.warp_cmd.bits.mask(0) 127 | io.finish.valid := state === s_finish 128 | io.commit_data.valid := state === s_working 129 | io.finish.bits := 0.U 130 | switch(state) { 131 | is(s_idle) { 132 | io.commit_data.valid := false.B 133 | commit_counter := 0.U 134 | } 135 | is(s_working) { 136 | when(io.commit_data.fire & commit_counter =/= io.warp_cmd.bits.sgpr_num) { 137 | commit_counter := counter_add1 138 | } 139 | } 140 | } 141 | } 142 | 143 | class WarpScheduler( 144 | implicit p: Parameters) 145 | extends Module { 146 | val numWarps = p(WarpNum) 147 | val numRegs = p(RegNum) 148 | val numThreads = p(ThreadNum) 149 | val addrWidth = p(AddrWidth) 150 | 151 | val xLen = p(XLen) 152 | 153 | val io = IO(new Bundle { 154 | val warp_cmd = Flipped(DecoupledIO(new CuTaskBundle())) 155 | val warp_ctl = Flipped(DecoupledIO(new WarpControlData())) 156 | val branch_ctl = Flipped(DecoupledIO(new BranchControlData())) 157 | val inst_fetch = DecoupledIO(new InstFetchData()) 158 | val warp_end = DecoupledIO(new WarpEndData()) 159 | val sgpr_commit = DecoupledIO(new CommitSData()) 160 | val vgpr_commit = DecoupledIO(new CommitVData()) 161 | }) 162 | 163 | val warp_idle = RegInit(VecInit(Seq.fill(numWarps)(1.B))) 164 | val warp_active = RegInit(VecInit(Seq.fill(numWarps)(0.B))) 165 | val warp_pc = RegInit(VecInit(Seq.fill(numWarps)(0.U(addrWidth.W)))) 166 | val warp_tmask = RegInit(VecInit(Seq.fill(numWarps)(VecInit(Seq.fill(numThreads)(0.B))))) 167 | val pop_valid = RegInit(0.B) 168 | 169 | val pop_wid = RegInit(0.U(log2Ceil(numWarps).W)) 170 | io.warp_ctl.ready := true.B 171 | io.branch_ctl.ready := true.B 172 | 173 | val has_idle = warp_idle.asUInt.orR 174 | val has_active = warp_active.asUInt.orR 175 | val idle_id = PriorityEncoder(warp_idle) 176 | val active_id = PriorityEncoder(warp_active) 177 | 178 | val simt_stack = VecInit(Seq.fill(numWarps)(Module(new SIMTStack()).io)) 179 | 180 | val pop_diverge = Wire(Bool()) 181 | val pop_data = Wire(new StackData()) 182 | 183 | val vgpr_writer = Module(new VGPRWriter()) 184 | val sgpr_writer = Module(new SGPRWriter()) 185 | 186 | val lock_warp = RegInit(0.U(log2Ceil(numWarps).W)) 187 | 188 | vgpr_writer.io.warp_cmd.bits := io.warp_cmd.bits 189 | sgpr_writer.io.warp_cmd.bits := io.warp_cmd.bits 190 | vgpr_writer.io.wid := lock_warp 191 | sgpr_writer.io.wid := lock_warp 192 | 193 | val writer_finish = RegInit(false.B) 194 | writer_finish := sgpr_writer.io.finish.valid & vgpr_writer.io.finish.valid 195 | 196 | io.warp_cmd.ready := writer_finish 197 | vgpr_writer.io.finish.ready := writer_finish 198 | sgpr_writer.io.finish.ready := writer_finish 199 | 200 | sgpr_writer.io.wid := lock_warp 201 | vgpr_writer.io.wid := lock_warp 202 | io.sgpr_commit <> sgpr_writer.io.commit_data 203 | 
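/* the writer FSMs stream kernel arguments (SGPR) and per-thread indices (VGPR) out through these commit ports to preload the register files before a warp is released */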
io.vgpr_commit <> vgpr_writer.io.commit_data 204 | 205 | val s_idle :: s_waiting :: Nil = Enum(2) 206 | val state = RegInit(s_idle) 207 | 208 | switch(state) { 209 | is(s_idle) { 210 | when(io.warp_cmd.valid & sgpr_writer.io.idle & vgpr_writer.io.idle) { 211 | state := s_waiting 212 | } 213 | } 214 | is(s_waiting) { 215 | when(writer_finish) { 216 | state := s_idle 217 | } 218 | } 219 | } 220 | 221 | vgpr_writer.io.warp_cmd.valid := false.B 222 | sgpr_writer.io.warp_cmd.valid := false.B 223 | 224 | switch(state) { 225 | is(s_idle) { 226 | when(io.warp_cmd.valid & sgpr_writer.io.idle & vgpr_writer.io.idle & has_idle) { 227 | lock_warp := idle_id 228 | vgpr_writer.io.warp_cmd.valid := true.B 229 | sgpr_writer.io.warp_cmd.valid := true.B 230 | } 231 | } 232 | is(s_waiting) { 233 | vgpr_writer.io.warp_cmd.valid := false.B 234 | sgpr_writer.io.warp_cmd.valid := false.B 235 | } 236 | } 237 | 238 | for (i <- 0 until numWarps) { 239 | simt_stack(i).in_diverge := io.branch_ctl.bits.wid === i.U && io.branch_ctl.valid && io.branch_ctl.bits.diverge 240 | simt_stack(i).in_data := io.branch_ctl.bits.data 241 | simt_stack(i).push := io.branch_ctl.bits.wid === i.U && io.branch_ctl.valid 242 | simt_stack(i).pop := io.warp_ctl.bits.wid === i.U && io.warp_ctl.valid && io.warp_ctl.bits.join 243 | } 244 | 245 | io.warp_end.bits.wid := io.warp_ctl.bits.wid 246 | io.warp_end.valid := io.warp_ctl.valid & io.warp_ctl.bits.end 247 | 248 | pop_valid := io.warp_ctl.valid & io.warp_ctl.bits.join 249 | pop_wid := io.warp_ctl.bits.wid 250 | 251 | pop_diverge := simt_stack(pop_wid).out_diverge 252 | pop_data := simt_stack(pop_wid).out_data 253 | 254 | for (i <- 0 until numWarps) { 255 | when(io.warp_cmd.fire && i.U === lock_warp) { 256 | warp_idle(i) := false.B 257 | warp_active(i) := true.B 258 | warp_pc(i) := io.warp_cmd.bits.pc 259 | warp_tmask(i) := io.warp_cmd.bits.mask 260 | } 261 | 262 | when(io.warp_ctl.fire && io.warp_ctl.bits.end && i.U === io.warp_ctl.bits.wid) { 263 | warp_idle(i) := true.B 264 | warp_active(i) := false.B 265 | } 266 | 267 | when(io.warp_ctl.valid && i.U === io.warp_ctl.bits.wid) { 268 | warp_active(i) := io.warp_ctl.bits.active 269 | } 270 | 271 | when(io.branch_ctl.valid && i.U === io.branch_ctl.bits.wid) { 272 | warp_pc(i) := io.branch_ctl.bits.pc 273 | warp_active(i) := 1.B 274 | warp_tmask(i) := io.branch_ctl.bits.mask 275 | } 276 | 277 | when(pop_valid && i.U === pop_wid) { 278 | warp_active(i) := 1.B 279 | when(pop_diverge) { 280 | warp_pc(i) := pop_data.pc 281 | warp_tmask(i) := pop_data.mask 282 | }.otherwise { 283 | warp_tmask(i) := pop_data.orig_mask 284 | } 285 | } 286 | 287 | when(io.inst_fetch.fire && i.U === active_id) { 288 | warp_active(i) := 0.B 289 | warp_pc(i) := warp_pc(i) + 4.U 290 | } 291 | } // loop num warps 292 | 293 | io.inst_fetch.bits.pc := 0.U 294 | io.inst_fetch.bits.mask := VecInit(Seq.fill(numThreads)(0.B)) 295 | io.inst_fetch.bits.wid := 0.U 296 | when(has_active) { 297 | io.inst_fetch.valid := !warp_idle(active_id) 298 | io.inst_fetch.bits.pc := warp_pc(active_id) 299 | io.inst_fetch.bits.mask := warp_tmask(active_id) 300 | io.inst_fetch.bits.wid := active_id 301 | }.otherwise { 302 | io.inst_fetch.valid := 0.B 303 | } 304 | } 305 | -------------------------------------------------------------------------------- /src/main/scala/core/Writeback.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | 7 | 
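/** Writeback merges the ALU and LSU commit streams into the single register-file write port: a two-input round-robin arbiter picks a winner each cycle, and a one-entry pipe queue registers the result. */ /* A minimal elaboration sketch in the style of the commented-out driver objects in VectorALU.scala; the object name is hypothetical, and OGPUDefaultConfig comes from ogpu.config as used by the tests: object WritebackRTL extends App { implicit val p = new ogpu.config.OGPUDefaultConfig; emitVerilog(new Writeback(), Array("--target-dir", "generated")) } */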
class Writeback( 8 | implicit p: Parameters) 9 | extends Module { 10 | val io = IO(new Bundle { 11 | val alu_commit = Flipped(DecoupledIO(new CommitVData())) 12 | val lsu_commit = Flipped(DecoupledIO(new CommitVData())) 13 | val writeback = DecoupledIO(new CommitVData()) 14 | }) 15 | 16 | val rsp_data = VecInit( 17 | Seq( 18 | io.alu_commit, 19 | io.lsu_commit 20 | ) 21 | ) 22 | 23 | val rsp_arbiter = Module(new RRArbiter(new CommitVData(), 2)) 24 | rsp_arbiter.io.in <> rsp_data 25 | 26 | val outQue = Module(new Queue(new CommitVData(), 1, pipe = true)) 27 | outQue.io.enq <> rsp_arbiter.io.out 28 | 29 | io.writeback <> outQue.io.deq 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/dispatcher/DispatcherBundle.scala: -------------------------------------------------------------------------------- 1 | package ogpu.dispatcher 2 | 3 | import chisel3._ 4 | 5 | class AQLBundle() extends Bundle { 6 | val header = UInt(16.W) 7 | val dimensions = UInt(2.W) 8 | val reserved1 = UInt(14.W) 9 | val workgroup_size_x = UInt(16.W) 10 | val workgroup_size_y = UInt(16.W) 11 | val workgroup_size_z = UInt(16.W) 12 | val reserved2 = UInt(16.W) 13 | val grid_size_x = UInt(32.W) 14 | val grid_size_y = UInt(32.W) 15 | val grid_size_z = UInt(32.W) 16 | val private_segment_size = UInt(32.W) 17 | val group_segment_size = UInt(32.W) 18 | val kernel_object = UInt(64.W) 19 | val kernargs_address = UInt(64.W) 20 | val completion_signal = UInt(64.W) 21 | } 22 | 23 | class WorkGroupTaskBundle() extends Bundle { 24 | val workgroup_size_x = UInt(16.W) 25 | val workgroup_size_y = UInt(16.W) 26 | val workgroup_size_z = UInt(16.W) 27 | // val grid_size_x = UInt(32.W) 28 | // val grid_size_y = UInt(32.W) 29 | // val grid_size_z = UInt(32.W) 30 | val grid_id_x = UInt(32.W) 31 | val grid_id_y = UInt(32.W) 32 | val grid_id_z = UInt(32.W) 33 | val private_segment_size = UInt(32.W) 34 | val group_segment_size = UInt(32.W) 35 | val kernel_object = UInt(64.W) 36 | 37 | val kernargs_address = UInt(64.W) 38 | } 39 | 40 | class WorkGroupTaskRespBundle() extends Bundle { 41 | val finish = Bool() 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/dispatcher/JobDispatcher.scala: -------------------------------------------------------------------------------- 1 | package ogpu.dispatcher 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} 7 | 8 | case class JobDispatchParams() { 9 | def buffer_num = 1 10 | } 11 | 12 | class JobDispatcher( 13 | params: JobDispatchParams 14 | )( 15 | implicit p: Parameters) 16 | extends LazyModule { 17 | 18 | lazy val module = new Impl(this) 19 | 20 | class Impl( 21 | outer: JobDispatcher 22 | )( 23 | implicit p: Parameters) 24 | extends LazyModuleImp(outer) { 25 | val io = IO(new Bundle { 26 | val aql = Flipped(DecoupledIO(new AQLBundle)) 27 | val task = DecoupledIO(new WorkGroupTaskBundle) 28 | val task_resp = Flipped(DecoupledIO(new WorkGroupTaskRespBundle)) 29 | }) 30 | 31 | val s_idle :: s_working :: s_finish :: Nil = Enum(3) 32 | val state = RegInit(s_idle) 33 | 34 | io.aql.ready := state === s_idle 35 | io.task.valid := state === s_working; io.task.bits := DontCare // default drives; the state machine below fills in the fields it produces (the kernel-object fields are not forwarded yet) 36 | val grid_x = RegInit(0.U(32.W)) 37 | val grid_y = RegInit(0.U(32.W)) 38 | val grid_z = RegInit(0.U(32.W)) 39 | val workgroup_x = RegInit(0.U(16.W)) 40 | val workgroup_y = RegInit(0.U(16.W)) 41 | val workgroup_z = RegInit(0.U(16.W))
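// grid_counter_{x,y,z} walk the grid in x-major order: x advances on every dispatched task, y when x wraps, z when x and y both wrap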
42 | val grid_counter_x = RegInit(0.U(32.W)) 43 | val grid_counter_y = RegInit(0.U(32.W)) 44 | val grid_counter_z = RegInit(0.U(32.W)) 45 | 46 | val taskDone = grid_counter_x === (grid_x - 1.U) & 47 | grid_counter_y === (grid_y - 1.U) & 48 | grid_counter_z === (grid_z - 1.U) 49 | 50 | val grid_x_acc = (grid_counter_x =/= (grid_x - 1.U)) 51 | val grid_y_acc = (grid_counter_x === (grid_x - 1.U)) & (grid_counter_y =/= (grid_y - 1.U)) 52 | val grid_z_acc = (grid_counter_x === (grid_x - 1.U)) & (grid_counter_y === (grid_y - 1.U)) 53 | 54 | val grid_rcounter_x = RegInit(0.U(32.W)) 55 | val grid_rcounter_y = RegInit(0.U(32.W)) 56 | val grid_rcounter_z = RegInit(0.U(32.W)) 57 | 58 | val grid_x_racc = (grid_rcounter_x =/= (grid_x - 1.U)) 59 | val grid_y_racc = (grid_rcounter_x === (grid_x - 1.U)) & (grid_rcounter_y =/= (grid_y - 1.U)) 60 | val grid_z_racc = (grid_rcounter_x === (grid_x - 1.U)) & (grid_rcounter_y === (grid_y - 1.U)) 61 | 62 | val recDone = grid_rcounter_x === (grid_x - 1.U) & 63 | grid_rcounter_y === (grid_y - 1.U) & 64 | grid_rcounter_z === (grid_z - 1.U) 65 | val s_rec_idle :: s_rec_working :: s_rec_finish :: Nil = Enum(3); val state_rec = RegInit(s_rec_idle) // receive-side state, declared before its first use in the s_finish arm below 66 | // state transition 67 | switch(state) { 68 | is(s_idle) { 69 | when(io.aql.fire) { 70 | state := s_working 71 | } 72 | } 73 | is(s_working) { 74 | when(taskDone & io.task.fire) { 75 | state := s_finish 76 | } 77 | } 78 | is(s_finish) { 79 | when(state_rec === s_rec_finish) { 80 | state := s_idle 81 | } 82 | } 83 | } 84 | 85 | // state action 86 | switch(state) { 87 | is(s_idle) { 88 | when(io.aql.fire) { 89 | grid_x := io.aql.bits.grid_size_x 90 | grid_y := io.aql.bits.grid_size_y 91 | grid_z := io.aql.bits.grid_size_z 92 | workgroup_x := io.aql.bits.workgroup_size_x 93 | workgroup_y := io.aql.bits.workgroup_size_y 94 | workgroup_z := io.aql.bits.workgroup_size_z 95 | grid_counter_x := 0.U 96 | grid_counter_y := 0.U 97 | grid_counter_z := 0.U 98 | // io.task.valid is high throughout s_working via the default drive above 99 | io.task.bits.workgroup_size_x := io.aql.bits.workgroup_size_x 100 | io.task.bits.workgroup_size_y := io.aql.bits.workgroup_size_y 101 | io.task.bits.workgroup_size_z := io.aql.bits.workgroup_size_z 102 | io.task.bits.grid_id_x := 0.U 103 | io.task.bits.grid_id_y := 0.U 104 | io.task.bits.grid_id_z := 0.U 105 | } 106 | } 107 | is(s_working) { 108 | io.task.bits.workgroup_size_x := workgroup_x 109 | io.task.bits.workgroup_size_y := workgroup_y 110 | io.task.bits.workgroup_size_z := workgroup_z 111 | io.task.bits.grid_id_x := grid_counter_x 112 | io.task.bits.grid_id_y := grid_counter_y 113 | io.task.bits.grid_id_z := grid_counter_z 114 | when(io.task.fire) { 115 | when(grid_x_acc) { 116 | grid_counter_x := grid_counter_x + 1.U 117 | }.otherwise { 118 | grid_counter_x := 0.U 119 | } 120 | 121 | when(grid_y_acc) { 122 | grid_counter_y := grid_counter_y + 1.U 123 | }.otherwise { 124 | grid_counter_y := 0.U 125 | } 126 | 127 | when(grid_z_acc) { 128 | grid_counter_z := grid_counter_z + 1.U 129 | }.otherwise { 130 | grid_counter_z := 0.U 131 | } 132 | } 133 | 134 | // io.task.valid deasserts via the default drive once the FSM leaves s_working 135 | 136 | 137 | } 138 | } 139 | 140 | // (the receive-side s_rec_* states and state_rec are declared above, before their first use) 141 | 142 | 143 | io.task_resp.ready := state_rec === s_rec_working 144 | 145 | switch(state_rec) { 146 | is(s_rec_idle) { 147 | when(io.task.fire) { 148 | state_rec := s_rec_working 149 | } 150 | } 151 | is(s_rec_working) { 152 | when(recDone & io.task_resp.fire) { 153 | state_rec := s_rec_finish 154 | } 155 | } 156 | is(s_rec_finish) { 157 | state_rec := s_rec_idle 158 | } 159 | } 160 | 161 | switch(state_rec) { 162
| is(s_rec_idle) { 163 | grid_rcounter_x := 0.U 164 | grid_rcounter_y := 0.U 165 | grid_rcounter_z := 0.U 166 | } 167 | is(s_rec_working) { 168 | when(io.task_resp.fire) { 169 | when(grid_x_racc) { 170 | grid_rcounter_x := grid_rcounter_x + 1.U 171 | }.otherwise { 172 | grid_rcounter_x := 0.U 173 | } 174 | 175 | when(grid_y_racc) { 176 | grid_rcounter_y := grid_rcounter_y + 1.U 177 | }.otherwise { 178 | grid_rcounter_y := 0.U 179 | } 180 | 181 | when(grid_z_racc) { 182 | grid_rcounter_z := grid_rcounter_z + 1.U 183 | }.otherwise { 184 | grid_rcounter_z := 0.U 185 | } 186 | } 187 | when(recDone & io.task_resp.fire) { 188 | // io.intr.valid := true.B 189 | } 190 | } 191 | is(s_rec_finish) { 192 | // io.intr.valid := false.B 193 | } 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/main/scala/dispatcher/TLQM.scala: -------------------------------------------------------------------------------- 1 | package ogpu.dispatcher 2 | 3 | import chisel3._ 4 | import org.chipsalliance.cde.config.Parameters 5 | import freechips.rocketchip.diplomacy._ 6 | import freechips.rocketchip.tilelink._ 7 | import freechips.rocketchip.regmapper._ 8 | import chisel3.util.{is, switch, Enum} 9 | 10 | case class QMParams(baseAddress: BigInt = 0x03000000) { 11 | def address = AddressSet(baseAddress, 0xff) 12 | } 13 | 14 | class TLQM( 15 | params: QMParams 16 | )( 17 | implicit p: Parameters) 18 | extends LazyModule { 19 | 20 | val device = new SimpleDevice("qm", Seq("ogpu, qm")) { 21 | override val alwaysExtended = true 22 | } 23 | 24 | val node = TLRegisterNode(address = Seq(params.address), device = device, beatBytes = 8) 25 | 26 | val clientParameters = TLMasterPortParameters.v1( 27 | clients = Seq( 28 | TLMasterParameters.v1( 29 | "tlqm master", 30 | sourceId = IdRange(0, 16) 31 | ) 32 | ) 33 | ) 34 | val clientNode = TLClientNode(Seq(clientParameters)) 35 | 36 | lazy val module = new Impl(this) 37 | class Impl( 38 | outer: TLQM 39 | )( 40 | implicit p: Parameters) 41 | extends LazyModuleImp(outer) { 42 | val io = IO(new Bundle { 43 | // val tlb = new TlbRequestIO(1) 44 | }) 45 | 46 | // io.tlb.req_kill := false.B 47 | 48 | // val (tl_out, edge_out) = outer.clientNode.out(0) 49 | // val base_addr = RegInit(0.U(64.W)) 50 | // val rptr = RegInit(0.U(64.W)) 51 | // val wptr = RegInit(0.U(64.W)) 52 | // val size = RegInit(0.U(64.W)) 53 | // val enable = RegInit(0.B) 54 | // val data = RegInit(0.U(512.W)) 55 | 56 | // val pending = WireInit(rptr =/= wptr) 57 | 58 | // // step1 issue tlb request, update rptr 59 | // val s1_idle :: s1_req :: s1_ack :: Nil = Enum(3) 60 | 61 | // val s1_state = RegInit(s1_idle) 62 | // val s1_rptr = RegInit(0.U(64.W)) 63 | 64 | // // s1 state transition 65 | // switch(s1_state) { 66 | // is(s1_idle) { 67 | // when(pending & enable) { 68 | // s1_state := s1_req 69 | // } 70 | // } 71 | // is(s1_req) { 72 | // when(io.tlb.req.fire) { 73 | // s1_state := s1_ack 74 | // } 75 | // } 76 | // is(s1_ack) { 77 | // when(io.tlb.resp.fire) { 78 | // s1_state := s1_idle 79 | // } 80 | // } 81 | // } 82 | 83 | // // s1 state action 84 | // switch(s1_state) { 85 | // is(s1_idle) { 86 | // when(pending & enable) { 87 | // io.tlb.req.valid := true.B 88 | // io.tlb.req.bits.vaddr := base_addr + (rptr % size) 89 | // }.otherwise { 90 | // io.tlb.req.valid := false.B 91 | // } 92 | // } 93 | // is(s1_req) { 94 | // when(io.tlb.req.fire) { 95 | // io.tlb.req.valid := false.B 96 | // } 97 | // } 98 | // is(s1_ack) { 99 | // when(io.tlb.resp.fire) 
{ 100 | // rptr := rptr + 8.U 101 | // } 102 | // } 103 | // } 104 | 105 | // // s2 get paddr and read aql package 106 | // val s2_idle :: s2_req :: s2_ack :: Nil = Enum(3) 107 | // io.tlb.resp.ready := s2_state === s2_idle 108 | // val s2_state = RegInit(s2_idle) 109 | 110 | // // s2 state transition 111 | // switch(s2_state) { 112 | // is(s2_idle) { 113 | // when(io.tlb.resp.fire & enable) { 114 | // s2_state := s2_req 115 | // } 116 | // } 117 | // is(s2_req) { 118 | // when(tl_out.a.fire) { 119 | // s2_state := s2_ack 120 | // } 121 | // } 122 | // is(s2_ack) { 123 | // when(tl_out.d.fire) { 124 | // s2_state := s2_idle 125 | // } 126 | // } 127 | // } 128 | 129 | // // s2 state action 130 | // switch(s2_state) { 131 | // is(s2_idle) { 132 | // when(io.tlb.resp.fire) { 133 | // tl_out.a.valid := 1.B 134 | // tl_out.a.bits.address := 0.U 135 | // tl_out.a.bits.opcode := 0.U 136 | // tl_out.a.bits.size := 0.U 137 | // tl_out.a.bits.data := 0.U 138 | // tl_out.a.bits.mask := 0.U 139 | // } 140 | // } 141 | // is(s2_req) { 142 | // when(tl_out.a.fire) { 143 | // tl_out.a.valid := false.B 144 | // } 145 | // } 146 | // is(s2_ack) { 147 | // when(tl_out.d.fire) { 148 | // data := tl_out.d.bits.data 149 | // } 150 | // } 151 | // } 152 | 153 | // // s3 disptach aql data 154 | // val s3_idle :: s3_req :: s3_ack :: Nil = Enum(3) 155 | // val s3_state = RegInit(s3_idle) 156 | // tl_out.d.ready := s3_state === s3_idle 157 | 158 | // // ringbuffer base address 159 | // // ringbuffer rptr 160 | // // ringbuffer wptr, doorbell register 161 | // // ringbuffer size 162 | // // queue enable 163 | // node.regmap( 164 | // 0 -> Seq(RegField(64, base_addr, RegFieldDesc("base", "queue ring buffer base address", reset = Some(0)))), 165 | // 8 -> Seq(RegField(64, rptr, RegFieldDesc("rptr", "queue ring buffer read offset address", reset = Some(0)))), 166 | // 16 -> Seq( 167 | // RegField(64, wptr, RegFieldDesc("wptr", "queue ring buffer write offset", reset = Some(0))) 168 | // ), 169 | // 24 -> Seq(RegField(64, size, RegFieldDesc("size", "queue ring buffer size address", reset = Some(0)))), 170 | // 32 -> Seq(RegField(1, enable, RegFieldDesc("enable", "queue enable", reset = Some(0)))) 171 | // ) 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/main/scala/lib/Sram.scala: -------------------------------------------------------------------------------- 1 | package ogpu.lib 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | class ReadWriteSmem(width: Int = 32, depth: Int = 1024) extends Module { 7 | val io = IO(new Bundle { 8 | val enable = Input(Bool()) 9 | val write = Input(Bool()) 10 | val addr = Input(UInt(log2Ceil(depth).W)) 11 | val dataIn = Input(UInt(width.W)) 12 | val dataOut = Output(UInt(width.W)) 13 | }) 14 | 15 | val mem = SyncReadMem(depth, UInt(width.W)) 16 | when(io.enable && io.write) { 17 | mem.write(io.addr, io.dataIn) 18 | } 19 | io.dataOut := mem.read(io.addr, io.enable) 20 | } 21 | 22 | class MaskedSmem_2R1W(width: Int = 32, depth: Int = 1024, vecLen: Int = 32) extends Module { 23 | val io = IO(new Bundle { 24 | val write_en = Input(Bool()) 25 | val waddr = Input(UInt(log2Ceil(depth).W)) 26 | val raddr = Input(UInt(log2Ceil(depth).W)) 27 | val raddr2 = Input(UInt(log2Ceil(depth).W)) 28 | val mask = Input(Vec(vecLen, Bool())) 29 | val dataIn = Input(Vec(vecLen, UInt(width.W))) 30 | val dataOut = Output(Vec(vecLen, UInt(width.W))) 31 | val dataOut2 = Output(Vec(vecLen, UInt(width.W))) 32 | }) 33 | 34 | val mem = 
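/* one synchronous-read memory array shared by two read ports and a per-element masked write port; reads have one cycle of latency */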
SyncReadMem(depth, Vec(vecLen, UInt(width.W))) 35 | when(io.write_en) { 36 | mem.write(io.waddr, io.dataIn, io.mask) 37 | } 38 | io.dataOut := mem.read(io.raddr, 1.B) 39 | io.dataOut2 := mem.read(io.raddr2, 1.B) 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/package.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE.Berkeley for license details. 2 | 3 | package ogpu 4 | 5 | package object ogpu extends constants.ScalarOpConstants with constants.MemoryOpConstants 6 | -------------------------------------------------------------------------------- /src/main/scala/smmu/SMMU.scala: -------------------------------------------------------------------------------- 1 | package ogpu.smmu 2 | 3 | import chisel3._ 4 | 5 | import org.chipsalliance.cde.config.Parameters 6 | import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp, SimpleDevice} 7 | import freechips.rocketchip.tilelink.TLRegisterNode 8 | import freechips.rocketchip.regmapper.{RegField, RegFieldDesc} 9 | 10 | case class SMMUParams(baseAddress: BigInt = 0x02000000) { 11 | def address = AddressSet(baseAddress, 0xff) 12 | } 13 | 14 | class TLSMMU( 15 | params: SMMUParams 16 | )( 17 | implicit p: Parameters) 18 | extends LazyModule { 19 | 20 | val device = new SimpleDevice("smmu", Seq("ogpu, smmu0")) { 21 | override val alwaysExtended = true 22 | } 23 | 24 | val node = TLRegisterNode(address = Seq(params.address), device = device, beatBytes = 8) 25 | // val ptwlm = LazyModule(new L2TLBWrapper()) 26 | 27 | lazy val module = new Impl(this) 28 | class Impl( 29 | outer: TLSMMU 30 | )( 31 | implicit p: Parameters) 32 | extends LazyModuleImp(outer) { 33 | val io = IO(new Bundle { 34 | // val tlb = Flipped(new TlbRequestIO(1)) 35 | }) 36 | 37 | // val (mem, edge) = outer.ptwlm.node.out.head 38 | // val satp = RegInit(0.U(64.W)) 39 | 40 | // val ptwm = ptwlm.module 41 | // val tlbm = Module(new TLB(1, nRespDups = 1, Seq(true), new TLBParameters)) 42 | 43 | // val tlb_ptw = Wire(new VectorTlbPtwIO(1)) 44 | // tlb_ptw.connect(tlbm.io.ptw) 45 | 46 | // val sfence = WireInit(0.U.asTypeOf(new SfenceBundle)) 47 | // val tlbCsr = WireInit(0.U.asTypeOf(new TlbCsrBundle)) 48 | // tlbCsr.satp.apply(satp) 49 | 50 | // tlbm.io.requestor(0) <> io.tlb 51 | // tlbm.io.csr := tlbCsr 52 | // tlbm.io.sfence := sfence 53 | // tlbm.io.hartId := 0.U 54 | // tlbm.io.flushPipe := 0.U.asTypeOf(tlbm.io.flushPipe) 55 | 56 | // val tlbRepeater1 = PTWFilter(16, tlb_ptw, sfence, tlbCsr, 8) 57 | // val tlbRepeater2 = PTWRepeaterNB(passReady = false, 16, tlbRepeater1.io.ptw, ptwm.io.tlb(0), sfence, tlbCsr) 58 | 59 | // ptwm.io.csr.tlb.satp.apply(satp) 60 | // ptwm.io.csr.tlb.priv := 0.U.asTypeOf(ptwm.io.csr.tlb.priv) 61 | 62 | // ptwm.io.sfence := sfence 63 | // ptwm.io.tlb(1) <> 0.U.asTypeOf(ptwm.io.tlb(1)) 64 | 65 | // ptwm.io.hartId := 0.U 66 | 67 | // // CSR has been written by csr inst, copies of csr should be updated 68 | // // for pmp, we dont use it 69 | // ptwm.io.csr.distribute_csr := 0.U.asTypeOf(ptwm.io.csr.distribute_csr) 70 | 71 | // tlbRepeater1.io.debugTopDown := DontCare 72 | 73 | // // 0 satp.ppn sv39 and sv48 74 | // // bits 63:60 mode 75 | // // bits 59:44 asid 76 | // // bits 43:0 ppn 77 | // node.regmap( 78 | // 0 -> Seq(RegField(64, satp, RegFieldDesc("satp", "satp: SMMU satp rw register.", reset = Some(0)))) 79 | // ) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- 
/src/main/scala/system/SoC.scala: -------------------------------------------------------------------------------- 1 | package ogpu.system 2 | 3 | import org.chipsalliance.cde.config.{Field, Parameters} 4 | import freechips.rocketchip.diplomacy._ 5 | 6 | case object SoCParamsKey extends Field[SoCParameters] 7 | 8 | /** Global cache coherence granularity, which applies to all caches, for now. */ 9 | // case object CacheBlockBytes extends Field[Int](64) 10 | 11 | case class SoCParameters( 12 | EnableILA: Boolean = false, 13 | PAddrBits: Int = 36, 14 | extIntrs: Int = 64) { 15 | // L3 configurations 16 | val L3InnerBusWidth = 256 17 | val L3BlockSize = 64 18 | // on chip network configurations 19 | val L3OuterBusWidth = 256 20 | } 21 | 22 | trait HasSoCParameter { 23 | implicit val p: Parameters 24 | 25 | val soc = p(SoCParamsKey) 26 | // val debugOpts = p(DebugOptionsKey) 27 | // val tiles = p(XSTileKey) 28 | 29 | // val NumCores = tiles.size 30 | val EnableILA = soc.EnableILA 31 | 32 | // L3 configurations 33 | val L3InnerBusWidth = soc.L3InnerBusWidth 34 | val L3BlockSize = soc.L3BlockSize 35 | 36 | // on chip network configurations 37 | val L3OuterBusWidth = soc.L3OuterBusWidth 38 | 39 | val NrExtIntr = soc.extIntrs 40 | } 41 | 42 | abstract class OGPUSystem( 43 | implicit p: Parameters) 44 | extends LazyModule {} 45 | -------------------------------------------------------------------------------- /src/main/scala/tile/CuTaskBundle.scala: -------------------------------------------------------------------------------- 1 | package ogpu.tile 2 | 3 | import chisel3._ 4 | import org.chipsalliance.cde.config.Parameters 5 | import ogpu.config._ 6 | 7 | class CuTaskBundle( 8 | implicit p: Parameters) 9 | extends Bundle { 10 | val numThreads = p(ThreadNum) 11 | val addrWidth = p(AddrWidth) 12 | val numWarps = p(WarpNum) 13 | val dimWidth = p(DimWidth) 14 | val xLen = p(XLen) 15 | 16 | val mask = Vec(numThreads, Bool()) 17 | // base (x, y, z) thread indices of this slice within the workgroup 18 | val thread_dims = Vec(3, UInt(dimWidth.W)) 19 | val vgpr_num = UInt(2.W) 20 | val sgprs = Vec(16, UInt(xLen.W)) 21 | val sgpr_num = UInt(4.W) 22 | val reg_index = UInt(p(RegIDWidth).W) 23 | val pc = UInt(addrWidth.W) 24 | } 25 | 26 | class CuTaskRespBundle( 27 | implicit p: Parameters) 28 | extends Bundle { 29 | val finish = Bool() 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/tile/WorkGroupScheduler.scala: -------------------------------------------------------------------------------- 1 | package ogpu.tile 2 | 3 | import org.chipsalliance.cde.config.Parameters 4 | import freechips.rocketchip.diplomacy._ 5 | import chisel3._ 6 | import chisel3.util._ 7 | 8 | import ogpu.dispatcher._ 9 | 10 | // initializes registers and sends warp tasks to the warp scheduler in the CU 11 | 12 | case class WgSchedParams() { 13 | def haha = 1 14 | } 15 | 16 | class WorkGroupScheduler( 17 | params: WgSchedParams 18 | )( 19 | implicit p: Parameters) 20 | extends LazyModule { 21 | lazy val module = new Impl(this) 22 | 23 | class Impl( 24 | outer: WorkGroupScheduler 25 | )( 26 | implicit p: Parameters) 27 | extends LazyModuleImp(outer) { 28 | val io = IO(new Bundle { 29 | val task = Flipped(DecoupledIO(new WorkGroupTaskBundle)) 30 | val task_resp = DecoupledIO(new WorkGroupTaskRespBundle) 31 | val cu_task = DecoupledIO(new CuTaskBundle) 32 | val cu_task_resp = Flipped(DecoupledIO(new CuTaskRespBundle)) 33 | }) 34 | 35 | val workgroup_x = RegInit(0.U(16.W)) 36 | val workgroup_y = RegInit(0.U(16.W))
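// the x dimension is dispatched to the CU in warp-sized (32-thread) slices; y and z advance one index at a time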
37 | val workgroup_z = RegInit(0.U(16.W)) 38 | val workgroup_counter_x = RegInit(0.U(16.W)) 39 | val workgroup_counter_y = RegInit(0.U(16.W)) 40 | val workgroup_counter_z = RegInit(0.U(16.W)) 41 | 42 | val workgroup_rcounter_x = RegInit(0.U(16.W)) 43 | val workgroup_rcounter_y = RegInit(0.U(16.W)) 44 | val workgroup_rcounter_z = RegInit(0.U(16.W)) 45 | 46 | val s_idle :: s_working :: s_finish :: Nil = Enum(3) 47 | val state = RegInit(s_idle) 48 | io.task.ready := state === s_idle // accept a new workgroup task only when idle 49 | val taskDone = workgroup_counter_x === (workgroup_x - 32.U) & 50 | workgroup_counter_y === (workgroup_y - 1.U) & 51 | workgroup_counter_z === (workgroup_z - 1.U) 52 | 53 | val recDone = workgroup_rcounter_x === (workgroup_x - 32.U) & 54 | workgroup_rcounter_y === (workgroup_y - 1.U) & 55 | workgroup_rcounter_z === (workgroup_z - 1.U) 56 | 57 | val workgroup_x_acc = (workgroup_counter_x =/= (workgroup_x - 32.U)) 58 | val workgroup_y_acc = (workgroup_counter_x === (workgroup_x - 32.U)) & (workgroup_counter_y =/= (workgroup_y - 1.U)) 59 | val workgroup_z_acc = (workgroup_counter_x === (workgroup_x - 32.U)) & (workgroup_counter_y === (workgroup_y - 1.U)) 60 | 61 | val workgroup_x_racc = (workgroup_rcounter_x =/= (workgroup_x - 32.U)) 62 | val workgroup_y_racc = 63 | (workgroup_rcounter_x === (workgroup_x - 32.U)) & (workgroup_rcounter_y =/= (workgroup_y - 1.U)) 64 | val workgroup_z_racc = 65 | (workgroup_rcounter_x === (workgroup_x - 32.U)) & (workgroup_rcounter_y === (workgroup_y - 1.U)) 66 | 67 | // state transition 68 | switch(state) { 69 | is(s_idle) { 70 | when(io.task.fire) { 71 | state := s_working 72 | } 73 | } 74 | is(s_working) { 75 | when(taskDone & io.cu_task.fire) { 76 | state := s_finish 77 | } 78 | } 79 | is(s_finish) { 80 | when(io.task_resp.fire) { 81 | state := s_idle 82 | } 83 | } 84 | } 85 | 86 | io.cu_task.bits := DontCare; io.cu_task.bits.thread_dims := VecInit(Seq(workgroup_counter_x, workgroup_counter_y, workgroup_counter_z)) // only thread_dims is produced here; the remaining CuTaskBundle fields are not wired up yet 87 | io.cu_task.valid := state === s_working 88 | // state action 89 | switch(state) { 90 | is(s_idle) { 91 | when(io.task.fire) { 92 | workgroup_x := io.task.bits.workgroup_size_x 93 | workgroup_y := io.task.bits.workgroup_size_y 94 | workgroup_z := io.task.bits.workgroup_size_z 95 | workgroup_counter_x := 0.U 96 | workgroup_counter_y := 0.U 97 | workgroup_counter_z := 0.U 98 | } 99 | } 100 | is(s_working) { 101 | when(io.cu_task.fire) { // counters advance per accepted cu_task 102 | when(workgroup_x_acc) { 103 | workgroup_counter_x := workgroup_counter_x + 32.U 104 | }.otherwise { 105 | workgroup_counter_x := 0.U 106 | } 107 | 108 | when(workgroup_y_acc) { 109 | workgroup_counter_y := workgroup_counter_y + 1.U 110 | }.otherwise { 111 | workgroup_counter_y := 0.U 112 | } 113 | 114 | when(workgroup_z_acc) { 115 | workgroup_counter_z := workgroup_counter_z + 1.U 116 | }.otherwise { 117 | workgroup_counter_z := 0.U 118 | } 119 | } 120 | } 121 | } 122 | 123 | val s_rec_idle :: s_rec_working :: s_rec_finish :: Nil = Enum(3) 124 | val state_rec = RegInit(s_rec_idle) 125 | 126 | io.cu_task_resp.ready := state_rec === s_rec_working 127 | 128 | switch(state_rec) { 129 | is(s_rec_idle) { 130 | when(io.cu_task.fire) { 131 | state_rec := s_rec_working 132 | } 133 | } 134 | is(s_rec_working) { 135 | when(recDone & io.cu_task_resp.fire) { 136 | state_rec := s_rec_finish 137 | } 138 | } 139 | is(s_rec_finish) { 140 | when(io.task_resp.fire) { 141 | state_rec := s_rec_idle 142 | } 143 | } 144 | } 145 | 146 | io.task_resp.valid := state_rec === s_rec_finish 147 | switch(state_rec) { 148 | is(s_rec_idle) { 149 | workgroup_rcounter_x := 0.U 150 | workgroup_rcounter_y := 0.U 151 | workgroup_rcounter_z := 0.U 152 | }
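// response counters mirror the dispatch counters: one cu_task_resp is collected per issued slice, with x stepping in units of 32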
153 | is(s_rec_working) { 154 | when(io.cu_task_resp.fire) { 155 | when(workgroup_x_racc) { 156 | workgroup_rcounter_x := workgroup_rcounter_x + 32.U 157 | }.otherwise { 158 | workgroup_rcounter_x := 0.U 159 | } 160 | 161 | when(workgroup_y_racc) { 162 | workgroup_rcounter_y := workgroup_rcounter_y + 1.U 163 | }.otherwise { 164 | workgroup_rcounter_y := 0.U 165 | } 166 | 167 | when(workgroup_z_racc) { 168 | workgroup_rcounter_z := workgroup_rcounter_z + 1.U 169 | }.otherwise { 170 | workgroup_rcounter_z := 0.U 171 | } 172 | } 173 | } 174 | } 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/main/scala/util/AddrBits.scala: -------------------------------------------------------------------------------- 1 | package ogpu.util 2 | 3 | import chisel3.util._ 4 | 5 | object VaddrHelper { 6 | 7 | def vaddrBits(xlen: Int, pglevel: Int, pgsize: Int, hvbits: Int): Int = { 8 | val pgLevelBits = 10 - log2Ceil(xlen / 32) 9 | val maxVAddrBits = pgLevelBits * pglevel + log2Ceil(pgsize) 10 | maxVAddrBits + hvbits 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/util/PipelineReg.scala: -------------------------------------------------------------------------------- 1 | package ogpu.util 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | object AddPipelineReg { 7 | 8 | class PipelineRegModule[T <: Data](gen: T) extends Module { 9 | 10 | val io = IO(new Bundle() { 11 | val in = Flipped(DecoupledIO(gen.cloneType)) 12 | val out = DecoupledIO(gen.cloneType) 13 | val isFlush = Input(Bool()) 14 | }) 15 | 16 | val valid = RegInit(false.B) 17 | valid.suggestName("pipeline_reg_valid") 18 | when(io.out.fire) { valid := false.B } 19 | when(io.in.fire) { valid := true.B } 20 | when(io.isFlush) { valid := false.B } 21 | 22 | io.in.ready := !valid || io.out.ready 23 | io.out.bits := RegEnable(io.in.bits, io.in.fire) 24 | io.out.valid := valid // && !isFlush 25 | } 26 | 27 | def apply[T <: Data]( 28 | left: DecoupledIO[T], 29 | right: DecoupledIO[T], 30 | isFlush: Bool, 31 | moduleName: Option[String] = None 32 | ): Unit = { 33 | val pipelineReg = Module(new PipelineRegModule[T](left.bits.cloneType)) 34 | if (moduleName.nonEmpty) pipelineReg.suggestName(moduleName.get) 35 | pipelineReg.io.in <> left 36 | right <> pipelineReg.io.out 37 | pipelineReg.io.isFlush := isFlush 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/test/data/test1/add.asm: -------------------------------------------------------------------------------- 1 | addi x3, x0, 32 2 | add x3, x3, x1 3 | lw x4, (x3) 4 | addi x4, x4, 2 5 | addi x5, x3, 256 6 | sw x4, (x5) 7 | wfi 8 | 02000193 9 | 001181b3 10 | 0001a203 11 | 00220213 12 | 10018293 13 | 0042a023 14 | 10500073 15 | -------------------------------------------------------------------------------- /src/test/data/test1/add.hex: -------------------------------------------------------------------------------- 1 | 02000193 2 | 001181b3 3 | 0001a203 4 | 00220213 5 | 10018293 6 | 0042a023 7 | 10500073 8 | -------------------------------------------------------------------------------- /src/test/data/test1/add_0.txt: -------------------------------------------------------------------------------- 1 | 13 2 | 13 3 | 33 4 | 83 5 | 93 6 | 13 7 | 23 8 | 73 9 | -------------------------------------------------------------------------------- /src/test/data/test1/add_1.txt:
-------------------------------------------------------------------------------- 1 | 01 2 | 11 3 | 81 4 | 21 5 | 81 6 | 02 7 | 20 8 | 00 9 | -------------------------------------------------------------------------------- /src/test/data/test1/add_2.txt: -------------------------------------------------------------------------------- 1 | 00 2 | 91 3 | 20 4 | 01 5 | 11 6 | 01 7 | 32 8 | 50 9 | -------------------------------------------------------------------------------- /src/test/data/test1/add_3.txt: -------------------------------------------------------------------------------- 1 | 40 2 | 01 3 | 00 4 | 00 5 | 00 6 | 10 7 | 00 8 | 10 9 | -------------------------------------------------------------------------------- /src/test/scala/AXI4RamTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import chiseltest._ 3 | 4 | import freechips.rocketchip.amba.axi4._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import freechips.rocketchip.diplomacy._ 7 | import freechips.rocketchip.system._ 8 | import chiseltest.simulator.WriteVcdAnnotation 9 | 10 | import org.scalatest.flatspec.AnyFlatSpec 11 | 12 | class AXI4SlaveRAM( 13 | implicit p: Parameters) 14 | extends LazyModule { 15 | val ram = LazyModule(new AXI4RAM(AddressSet(0x0, 0x3ff))) 16 | val axi_m_param = AXI4MasterParameters("myaximaster") 17 | val axi_m_port = AXI4MasterPortParameters(Seq(axi_m_param)) 18 | val axi_master = AXI4MasterNode(Seq(axi_m_port)) 19 | val ios = InModuleBody(axi_master.makeIOs()) 20 | 21 | ram.node := AXI4Buffer() := axi_master 22 | 23 | lazy val module = new Impl 24 | 25 | class Impl extends LazyModuleImp(this) { 26 | val io = ios.head 27 | } 28 | 29 | } 30 | 31 | // (implicit p: Parameters) 32 | class AXI4RAMTest extends AnyFlatSpec with ChiselScalatestTester { 33 | behavior.of("AXI4RAM") 34 | 35 | it should "perform axi rw operations correctly" in { 36 | implicit val p = new BaseConfig 37 | val axiram = LazyModule(new AXI4SlaveRAM()) 38 | // val mymod = Module(axiram.module) 39 | test(axiram.module).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 40 | // Write data to the axi4ram module 41 | dut.io.aw.valid.poke(true.B) 42 | dut.io.aw.bits.addr.poke(0x00000000L.U) 43 | dut.io.aw.bits.len.poke(0.U) 44 | dut.io.aw.bits.size.poke(2.U) 45 | dut.io.w.valid.poke(true.B) 46 | dut.io.w.bits.data.poke(0xabcd.U) 47 | dut.io.w.bits.strb.poke("b1111".U) 48 | dut.clock.step() 49 | println(dut.io.aw.bits.id.getClass) // .getSimpleName) 50 | println(dut.io.aw.bits.id.getWidth) 51 | // Wait for write transaction to finish 52 | while (dut.io.b.valid.peek().litToBoolean == false) { 53 | dut.clock.step() 54 | } 55 | 56 | dut.io.aw.valid.poke(false.B) 57 | dut.io.w.valid.poke(false.B) 58 | dut.io.ar.valid.poke(true.B) 59 | dut.io.ar.bits.addr.poke(0x00000000L.U) 60 | dut.io.ar.bits.len.poke(0.U) 61 | dut.io.ar.bits.size.poke(2.U) 62 | dut.clock.step() 63 | 64 | // Wait for read transaction to finish 65 | while (dut.io.r.valid.peek().litToBoolean == false) { 66 | dut.clock.step() 67 | } 68 | 69 | // Read data from the axi4ram module 70 | dut.io.r.bits.data.expect(0xabcd.U) 71 | } 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/test/scala/DCacheTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import freechips.rocketchip.diplomacy._ 3 | import freechips.rocketchip.tilelink._ 4 | import org.chipsalliance.cde.config.Parameters 5 | 
import chiseltest._ 6 | import org.scalatest.flatspec.AnyFlatSpec 7 | import chiseltest.simulator.WriteVcdAnnotation 8 | 9 | import ogpu.core._ 10 | import ogpu.config._ 11 | 12 | class DCacheTestTop( 13 | )( 14 | implicit p: Parameters) 15 | extends LazyModule { 16 | 17 | val cfg = CacheParameter( 18 | nSets = 64, 19 | nWays = 4, 20 | paddrBits = 48, 21 | vaddrBits = 48, 22 | pgIdxBits = 12, 23 | dataBits = 64, 24 | coreId = 0, 25 | tagECC = None, 26 | dataECC = None 27 | ) 28 | 29 | val ram = LazyModule(new TLRAM(AddressSet(0x80000000L, 0xffffL), beatBytes = 8)) 30 | val dcache = LazyModule(new DCache(cfg)) 31 | ram.node :=* 32 | TLXbar() :=* 33 | TLFragmenter(8, 64) :=* 34 | TLCacheCork() :=* 35 | dcache.node 36 | 37 | lazy val module = new Impl 38 | class Impl extends LazyModuleImp(this) { 39 | val io = IO(new Bundle { 40 | val dcache = new CacheBundle(cfg) 41 | }) 42 | dcache.module.io <> io.dcache 43 | } 44 | } 45 | 46 | class DCacheTest extends AnyFlatSpec with ChiselScalatestTester { 47 | behavior.of("DCacheTest") 48 | 49 | it should "perform dcache test correctly" in { 50 | implicit val p = new OGPUDefaultConfig 51 | val top = LazyModule(new DCacheTestTop()) 52 | test(top.module).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 53 | dut.clock.step(80) 54 | dut.io.dcache.ptw.ptbr.mode.poke(0x8.U) 55 | dut.io.dcache.ptw.req.ready.poke(true.B) 56 | dut.io.dcache.cpu.req.bits.addr.poke(0x1024000.U) 57 | dut.io.dcache.cpu.req.valid.poke(true.B) 58 | dut.clock.step() 59 | dut.io.dcache.cpu.req.valid.poke(false.B) 60 | // while (dut.io.dcache.cpu.resp.valid.peek().litToBoolean == false) { 61 | while (dut.io.dcache.cpu.s2_nack.peek().litToBoolean == false) { 62 | dut.clock.step() 63 | } 64 | dut.clock.step(5) // ptw resp must be delayed 65 | println("read dcache failed because of tlb miss") 66 | dut.io.dcache.ptw.resp.valid.poke(true.B) 67 | dut.io.dcache.ptw.resp.bits.pte.ppn.poke(0x80000.U) 68 | dut.clock.step() 69 | dut.io.dcache.ptw.resp.valid.poke(false.B) 70 | dut.clock.step(5) 71 | dut.io.dcache.cpu.req.bits.addr.poke(0x1024000.U) // request same addr again 72 | dut.io.dcache.cpu.req.valid.poke(true.B) 73 | dut.clock.step() 74 | dut.io.dcache.cpu.req.valid.poke(false.B) 75 | dut.clock.step(20) 76 | dut.io.dcache.cpu.req.bits.addr.poke(0x1024000.U) // request again, cache hit 77 | dut.io.dcache.cpu.req.valid.poke(true.B) 78 | dut.clock.step() 79 | dut.io.dcache.cpu.req.valid.poke(false.B) 80 | while (dut.io.dcache.cpu.resp.valid.peek().litToBoolean == false) { 81 | dut.clock.step() 82 | } 83 | dut.clock.step(5) 84 | 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/test/scala/ICacheTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import freechips.rocketchip.diplomacy._ 3 | import freechips.rocketchip.tilelink._ 4 | import org.chipsalliance.cde.config.Parameters 5 | import chiseltest._ 6 | import org.scalatest.flatspec.AnyFlatSpec 7 | import chiseltest.simulator.WriteVcdAnnotation 8 | 9 | import ogpu.core._ 10 | import ogpu.config._ 11 | 12 | class ICacheTestTop( 13 | )( 14 | implicit p: Parameters) 15 | extends LazyModule { 16 | 17 | val cfg = ICacheParams( 18 | nSets = 64, 19 | nWays = 4, 20 | paddrBits = 48, 21 | vaddrBits = 48, 22 | pgIdxBits = 48, 23 | dataBits = 64, 24 | coreId = 0, 25 | tagECC = None, 26 | dataECC = None 27 | ) 28 | 29 | val ram = LazyModule(new TLRAM(AddressSet(0x80000000L, 0xffffL), beatBytes = 8)) 30 | val icache = LazyModule(new ICache(cfg))
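// adapter chain: TLCacheCork lowers the cache's coherent (TL-C) traffic to plain Get/Put, TLFragmenter(8, 64) splits refill bursts into 8-byte beats, and TLXbar routes them to the RAM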
31 | ram.node :=* 32 | TLXbar() :=* 33 | TLFragmenter(8, 64) :=* 34 | TLCacheCork() :=* 35 | icache.masterNode 36 | 37 | lazy val module = new Impl 38 | class Impl extends LazyModuleImp(this) { 39 | val io = IO(new Bundle { 40 | val icache = new ICacheBundle(cfg) 41 | }) 42 | icache.module.io <> io.icache 43 | } 44 | } 45 | 46 | class ICacheTest extends AnyFlatSpec with ChiselScalatestTester { 47 | behavior.of("ICacheTest") 48 | 49 | it should "perform icache test correctly" in { 50 | implicit val p = new OGPUDefaultConfig 51 | val top = LazyModule(new ICacheTestTop()) 52 | test(top.module).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 53 | dut.clock.step() 54 | dut.io.icache.req.valid.poke(true.B) 55 | dut.io.icache.req.bits.addr.poke(0x1000.U) 56 | dut.clock.step() 57 | dut.io.icache.req.valid.poke(false.B) 58 | dut.io.icache.s1_paddr.poke(0x80002000L.U) 59 | dut.clock.step(10) 60 | // cache miss and request again 61 | dut.io.icache.req.valid.poke(true.B) 62 | dut.io.icache.req.bits.addr.poke(0x1000.U) 63 | dut.clock.step() 64 | dut.io.icache.req.valid.poke(false.B) 65 | dut.io.icache.s1_paddr.poke(0x80002000L.U) 66 | while (dut.io.icache.resp.valid.peek().litToBoolean == false) { 67 | dut.clock.step() 68 | } 69 | dut.clock.step(5) 70 | // request offset 71 | dut.io.icache.req.valid.poke(true.B) 72 | dut.io.icache.req.bits.addr.poke(0x1008.U) 73 | dut.clock.step() 74 | dut.io.icache.req.valid.poke(false.B) 75 | dut.io.icache.s1_paddr.poke(0x80002008L.U) 76 | // hit again 77 | while (dut.io.icache.resp.valid.peek().litToBoolean == false) { 78 | dut.clock.step() 79 | } 80 | dut.clock.step(5) 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/IFetchTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import chiseltest._ 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest.simulator.WriteVcdAnnotation 5 | 6 | import ogpu.core._ 7 | import ogpu.config._ 8 | 9 | class IFetchTest extends AnyFlatSpec with ChiselScalatestTester { 10 | behavior.of("IFetch") 11 | 12 | it should "perform ifetch operations correctly" in { 13 | 14 | val cfg = ICacheParams( 15 | nSets = 64, 16 | nWays = 4, 17 | paddrBits = 48, 18 | vaddrBits = 48, 19 | pgIdxBits = 48, 20 | dataBits = 64, 21 | coreId = 0, 22 | tagECC = None, 23 | dataECC = None 24 | ) 25 | 26 | implicit val p = new OGPUDefaultConfig 27 | test(new InstFetch(cfg)).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 28 | dut.io.to_icache.req.ready.poke(true.B) 29 | dut.io.inst_out.ready.poke(true.B) 30 | dut.io.to_ptw.ptbr.mode.poke(0x8.U) 31 | dut.io.to_ptw.req.ready.poke(true.B) 32 | dut.clock.step(5) 33 | dut.io.inst_fetch.valid.poke(true.B) 34 | dut.io.inst_fetch.bits.pc.poke(0x1024.U) 35 | dut.clock.step() 36 | dut.io.inst_fetch.valid.poke(false.B) 37 | dut.clock.step(5) 38 | dut.io.to_ptw.resp.valid.poke(true.B) 39 | dut.clock.step(1) 40 | dut.io.to_ptw.resp.valid.poke(false.B) 41 | dut.clock.step(1) 42 | dut.io.to_icache.resp.valid.poke(true.B) 43 | dut.clock.step(1) 44 | dut.io.to_icache.resp.valid.poke(true.B) 45 | dut.clock.step(1) 46 | dut.io.to_icache.resp.valid.poke(false.B) 47 | 48 | dut.clock.step(20) 49 | 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/test/scala/PTWTest.scala: -------------------------------------------------------------------------------- 1 | import chiseltest._ 2 | import
chisel3._ 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest.simulator.WriteVcdAnnotation 5 | 6 | import ogpu.core._ 7 | 8 | class PTWTest extends AnyFlatSpec with ChiselScalatestTester { 9 | behavior.of("PTW") 10 | 11 | it should "perform ptw operations correctly" in { 12 | 13 | val ptw_param = PTWParameter(paddrBits = 48, vaddrBits = 48) 14 | val cache_param = CacheParameter(paddrBits = 48, vaddrBits = 48) 15 | test(new PTW(1, ptw_param, cache_param)).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 16 | println("ptw test start") 17 | dut.io.mem.req.ready.poke(true.B) 18 | dut.clock.step(5) 19 | dut.io.requestor(0).req.valid.poke(true.B) 20 | dut.io.requestor(0).req.bits.valid.poke(true.B) 21 | dut.io.requestor(0).req.bits.bits.addr.poke("x0012345678".U) 22 | dut.clock.step(1) 23 | dut.io.requestor(0).req.valid.poke(false.B) 24 | dut.clock.step(5) 25 | dut.io.mem.resp.valid.poke(true.B) 26 | dut.io.mem.resp.bits.data.poke("x8000ffc01".U) // level 0 ppn 0x2003ff 27 | dut.clock.step(1) 28 | dut.io.mem.resp.valid.poke(false.B) 29 | dut.clock.step(4) 30 | dut.io.mem.resp.valid.poke(true.B) 31 | dut.io.mem.resp.bits.data.poke("x1".U) // level 1 32 | dut.clock.step(1) 33 | dut.io.mem.resp.valid.poke(false.B) 34 | dut.clock.step(3) 35 | dut.io.mem.resp.valid.poke(true.B) 36 | dut.io.mem.resp.bits.data.poke("x40001".U) // level 2 ppn 0x100 37 | dut.clock.step(1) 38 | dut.io.mem.resp.valid.poke(false.B) 39 | while (dut.io.requestor(0).resp.valid.peek().litToBoolean == false) { 40 | dut.clock.step(1) 41 | } 42 | dut.io.requestor(0).resp.bits.pte.ppn.expect(0x100.U) 43 | dut.clock.step(5) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/TLBTest.scala: -------------------------------------------------------------------------------- 1 | import chiseltest._ 2 | import chisel3._ 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest.simulator.WriteVcdAnnotation 5 | 6 | import ogpu.core._ 7 | 8 | class TLBTest extends AnyFlatSpec with ChiselScalatestTester { 9 | behavior.of("TLB") 10 | 11 | it should "perform tlb operations correctly" in { 12 | 13 | val tlb_param = TLBParameter(nSets = 32, nWays = 4, paddrBits = 48, vaddrBits = 48) 14 | test(new TLB(true, tlb_param)).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 15 | println("tlb test start") 16 | dut.clock.step(5) 17 | dut.io.ptw.ptbr.mode.poke(0x8.U) 18 | dut.io.ptw.req.ready.poke(1.B) 19 | dut.io.req.bits.vaddr.poke(0x1024.U) 20 | dut.io.req.bits.passthrough.poke(true.B) 21 | dut.io.req.bits.size.poke(2.U) 22 | dut.io.req.valid.poke(true.B) 23 | while (dut.io.req.ready.peek().litToBoolean == false) { 24 | dut.clock.step(1) 25 | } 26 | dut.clock.step(1) 27 | dut.io.req.valid.poke(false.B) 28 | dut.io.resp.paddr.expect(0x1024.U) 29 | println(s" tlb return miss? 
${dut.io.resp.miss.peek().litToBoolean}") 30 | println(s" tlb return paddr ${dut.io.resp.paddr.peek()}") 31 | dut.clock.step(5) 32 | dut.io.req.bits.vaddr.poke(0x80000.U) 33 | dut.io.req.bits.passthrough.poke(false.B) 34 | dut.io.req.valid.poke(true.B) 35 | while (dut.io.req.ready.peek().litToBoolean == false) { 36 | dut.clock.step(1) 37 | } 38 | dut.clock.step(1) 39 | dut.io.req.valid.poke(false.B) 40 | dut.clock.step(10) 41 | dut.io.ptw.resp.valid.poke(true.B) 42 | dut.clock.step(1) 43 | dut.io.ptw.resp.valid.poke(false.B) 44 | dut.clock.step(5) 45 | dut.io.req.bits.vaddr.poke(0x80008.U) 46 | dut.io.req.valid.poke(true.B) 47 | while (dut.io.req.ready.peek().litToBoolean == false) { 48 | dut.clock.step(1) 49 | } 50 | dut.clock.step(1) 51 | dut.io.req.valid.poke(false.B) 52 | dut.io.resp.paddr.expect(0x8.U) 53 | dut.io.resp.miss.expect(false.B) 54 | dut.clock.step(5) 55 | dut.io.req.bits.vaddr.poke(0x180008.U) // cache conflict 56 | dut.io.req.valid.poke(true.B) 57 | while (dut.io.req.ready.peek().litToBoolean == false) { 58 | dut.clock.step(1) 59 | } 60 | dut.clock.step(1) 61 | dut.io.req.valid.poke(false.B) 62 | dut.clock.step(10) 63 | dut.io.ptw.resp.valid.poke(true.B) 64 | dut.io.ptw.resp.bits.pte.ppn.poke(0x3030.U) 65 | dut.clock.step(1) 66 | dut.io.ptw.resp.valid.poke(false.B) 67 | dut.clock.step(10) 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/test/scala/WarpSchedulerTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import chiseltest._ 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest.simulator.WriteVcdAnnotation 5 | 6 | import ogpu.config._ 7 | import ogpu.core._ 8 | 9 | class WarpSchedulerTest extends AnyFlatSpec with ChiselScalatestTester { 10 | behavior.of("WarpScheduler") 11 | 12 | it should "perform warp scheduler operations correctly" in { 13 | implicit val p = new OGPUDefaultConfig 14 | test(new WarpScheduler()).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 15 | println("warp sched test start") 16 | dut.io.warp_cmd.valid.poke(1.B) 17 | dut.io.warp_cmd.bits.mask(0).poke(1.B) 18 | dut.io.warp_cmd.bits.mask(3).poke(1.B) 19 | dut.io.warp_cmd.bits.vgpr_num.poke(2.U) 20 | dut.io.warp_cmd.bits.pc.poke(0x800000000L) 21 | dut.io.vgpr_commit.ready.poke(1.B) 22 | if (dut.io.warp_cmd.ready.peek().litToBoolean == false) 23 | println("warp cmd ready is false") 24 | while (dut.io.warp_cmd.ready.peek().litToBoolean == false) { 25 | dut.clock.step(1) 26 | } 27 | dut.clock.step(1) 28 | dut.io.warp_cmd.valid.poke(0.B) 29 | dut.clock.step(5) 30 | } 31 | } 32 | } 33 | --------------------------------------------------------------------------------