├── .github └── workflows │ └── scala.yml ├── .gitmodules ├── .scalafmt.conf ├── LICENSE ├── Makefile ├── README.md ├── build.sc ├── common.sc └── src ├── main └── scala │ ├── Consts.scala │ ├── config │ ├── Parameters.scala │ ├── SubsystemConfig.scala │ └── SystemConfig.scala │ ├── core │ ├── ALU.scala │ ├── AMOALU.scala │ ├── BranchUnit.scala │ ├── Cache.scala │ ├── DataStruct.scala │ ├── Dispatch.scala │ ├── ICache.scala │ ├── IDecodeUnit.scala │ ├── IFetch.scala │ ├── Instructions.scala │ ├── Issue.scala │ ├── PTW.scala │ ├── SGPR.scala │ ├── SIMTStack.scala │ ├── Scoreboard.scala │ ├── TLB.scala │ ├── VGPR.scala │ ├── VectorALU.scala │ ├── WarpScheduler.scala │ └── Writeback.scala │ ├── dispatcher │ ├── DispatcherBundle.scala │ ├── JobDispatcher.scala │ └── TLQM.scala │ ├── lib │ └── Sram.scala │ ├── package.scala │ ├── smmu │ └── SMMU.scala │ ├── system │ └── SoC.scala │ ├── tile │ ├── CuTaskBundle.scala │ └── WorkGroupScheduler.scala │ └── util │ ├── AddrBits.scala │ └── PipelineReg.scala └── test ├── data └── test1 │ ├── add.asm │ ├── add.hex │ ├── add_0.txt │ ├── add_1.txt │ ├── add_2.txt │ └── add_3.txt └── scala ├── AXI4RamTest.scala ├── DCacheTest.scala ├── ICacheTest.scala ├── IFetchTest.scala ├── PTWTest.scala ├── TLBTest.scala └── WarpSchedulerTest.scala /.github/workflows/scala.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | name: Scala CI 7 | 8 | on: 9 | push: 10 | branches: [ "main" ] 11 | pull_request: 12 | branches: [ "main" ] 13 | 14 | permissions: 15 | contents: read 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Set up JDK 11 25 | uses: actions/setup-java@v3 26 | with: 27 | java-version: '11' 28 | distribution: 'temurin' 29 | - name: Download mill 30 | run: 31 | curl -L https://github.com/com-lihaoyi/mill/releases/download/0.10.10/0.10.10 > mill.0.10 && chmod +x mill.0.10 32 | - name: Run tests 33 | run: 34 | export PATH=`pwd`:${PATH} && make test 35 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "rocket-chip"] 2 | path = rocket-chip 3 | url = https://github.com/chipsalliance/rocket-chip.git 4 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.7.11 2 | maxColumn = 120 3 | runner.dialect=scala213 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OpenGPGPU 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 
in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | init:
2 | 	git submodule update --init
3 | 	cd rocket-chip && git submodule update --init
4 | 
5 | test: init format fix
6 | 	mill -i ogpu\[chisel\].test
7 | 
8 | format:
9 | 	mill -i ogpu\[chisel\].reformat
10 | 	mill -i ogpu\[chisel\].test.reformat
11 | 
12 | fix:
13 | 	mill -i ogpu\[chisel\].fix
14 | 	mill -i ogpu\[chisel\].test.fix
15 | 
16 | count:
17 | 	mill -i ogpu\[chisel\].printLineCount
18 | 
19 | .PHONY: test
20 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OPENGPGPU
2 | 
3 | A simple GPGPU for study purposes.
4 | 
--------------------------------------------------------------------------------
/build.sc:
--------------------------------------------------------------------------------
1 | import $ivy.`com.goyeau::mill-scalafix_mill0.11:0.3.1`
2 | import com.goyeau.mill.scalafix.ScalafixModule
3 | import coursier.maven.MavenRepository
4 | 
5 | import mill._
6 | import scalalib._
7 | import scalafmt._
8 | import $file.`rocket-chip`.common
9 | import $file.`rocket-chip`.dependencies.cde.common
10 | import $file.`rocket-chip`.dependencies.hardfloat.common
11 | import $file.`rocket-chip`.dependencies.diplomacy.common
12 | import $file.common
13 | 
14 | 
15 | val defaultScalaVersion = "2.13.12"
16 | 
17 | 
18 | def defaultVersions(chiselVersion: String) = chiselVersion match {
19 | case "chisel" => Map(
20 | "chisel" -> ivy"org.chipsalliance::chisel:6.1.0",
21 | "chisel-plugin" -> ivy"org.chipsalliance:::chisel-plugin:6.1.0",
22 | "chiseltest" -> ivy"edu.berkeley.cs::chiseltest:6.0.0",
23 | "sourcecode" -> ivy"com.lihaoyi::sourcecode:0.3.1"
24 | )
25 | }
26 | 
27 | trait HasChisel extends ScalaModule with Cross.Module[String] {
28 | 
29 | 
30 | def repositoriesTask = T.task {
31 | super.repositoriesTask() ++ Seq(MavenRepository("https://oss.sonatype.org/content/repositories/snapshots"))
32 | }
33 | 
34 | def chiselModule: Option[ScalaModule] = None
35 | 
36 | def chiselPluginJar: T[Option[PathRef]] = None
37 | 
38 | def chiselIvy: Option[Dep] = Some(defaultVersions(crossValue)("chisel"))
39 | 
40 | def chiselPluginIvy: Option[Dep] = Some(defaultVersions(crossValue)("chisel-plugin"))
41 | 
42 | override def scalaVersion = defaultScalaVersion
43 | 
44 | override def scalacOptions = super.scalacOptions() ++
45 | Agg("-language:reflectiveCalls", "-Ymacro-annotations", "-Ytasty-reader", "-Ywarn-unused")
46 | 
47 | override def ivyDeps = super.ivyDeps() ++ Agg(chiselIvy.get)
48 | 
49 | override def scalacPluginIvyDeps = super.scalacPluginIvyDeps() ++ Agg(chiselPluginIvy.get)
50 | }
51 | 
52 | object macros extends Macros
53 | 
54 | trait Macros
55 | extends millbuild.`rocket-chip`.common.MacrosModule
56 | with SbtModule {
57 | 
58 | def scalaVersion: T[String] =
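// (added usage note, inferred from the Makefile rather than this file) the cross
// value "chisel" selects the Chisel 6 stack from defaultVersions above, so the
// modules are driven from the shell exactly as the Makefile does, e.g.:
//   mill -i 'ogpu[chisel].test'       # run the chiseltest suites
//   mill -i 'ogpu[chisel].reformat'   # scalafmt over the main sources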
T(defaultScalaVersion) 59 | 60 | def scalaReflectIvy = ivy"org.scala-lang:scala-reflect:${defaultScalaVersion}" 61 | } 62 | 63 | object hardfloat extends Cross[Hardfloat]("chisel") 64 | 65 | trait Hardfloat 66 | extends millbuild.`rocket-chip`.dependencies.hardfloat.common.HardfloatModule with HasChisel with SbtModule { 67 | 68 | def scalaVersion: T[String] = T(defaultScalaVersion) 69 | 70 | override def millSourcePath = os.pwd / "rocket-chip" / "dependencies" / "hardfloat" / "hardfloat" 71 | 72 | } 73 | 74 | object cde extends CDE 75 | 76 | trait CDE extends millbuild.`rocket-chip`.dependencies.cde.common.CDEModule with ScalaModule { 77 | 78 | def scalaVersion: T[String] = T(defaultScalaVersion) 79 | 80 | override def millSourcePath = os.pwd / "rocket-chip" / "dependencies" / "cde" / "cde" 81 | } 82 | 83 | object diplomacy extends Cross[Diplomacy]("chisel") 84 | 85 | trait Diplomacy 86 | extends millbuild.`rocket-chip`.dependencies.diplomacy.common.DiplomacyModule 87 | with HasChisel { 88 | 89 | override def scalaVersion: T[String] = T(defaultScalaVersion) 90 | 91 | def cdeModule = cde 92 | 93 | def sourcecodeIvy = defaultVersions(crossValue)("sourcecode") 94 | 95 | override def millSourcePath = os.pwd / "rocket-chip" / "dependencies" / "diplomacy" / "diplomacy" 96 | } 97 | 98 | object rocketchip extends Cross[RocketChip]("chisel") 99 | 100 | trait RocketChip 101 | extends millbuild.`rocket-chip`.common.RocketChipModule 102 | with HasChisel with SbtModule { 103 | 104 | override def millSourcePath = os.pwd / "rocket-chip" 105 | 106 | def macrosModule = macros 107 | 108 | def hardfloatModule = hardfloat(crossValue) 109 | 110 | def cdeModule = cde 111 | 112 | def diplomacyModule = diplomacy(crossValue) 113 | 114 | def mainargsIvy = ivy"com.lihaoyi::mainargs:0.5.4" 115 | 116 | def json4sJacksonIvy = ivy"org.json4s::json4s-jackson:4.0.6" 117 | } 118 | 119 | object ogpu extends Cross[OGPU]("chisel") 120 | 121 | trait OGPU extends millbuild.common.OGPUModule 122 | with HasChisel 123 | with SbtModule 124 | with ScalafixModule 125 | with ScalafmtModule { 126 | 127 | override def millSourcePath = os.pwd 128 | 129 | def rocketModule = rocketchip(crossValue) 130 | 131 | override def forkArgs = Seq("-Xmx8G", "-Xss256m") 132 | 133 | override def sources = T.sources { 134 | super.sources() ++ Seq(PathRef(this.millSourcePath / "src" / crossValue / "main" / "scala")) 135 | } 136 | 137 | def lineCount = T { 138 | this.sources().filter(ref => os.exists(ref.path)).flatMap(ref => os.walk(ref.path)).filter(os.isFile).flatMap(os.read.lines).size 139 | } 140 | 141 | def printLineCount() = T.command { 142 | println(s"Lines of code(LOC): ${lineCount()} !!!") 143 | } 144 | 145 | object test extends SbtModuleTests 146 | with TestModule.ScalaTest with ScalafixModule 147 | with ScalafmtModule { 148 | 149 | override def forkArgs = Seq("-Xmx8G", "-Xss256m") 150 | 151 | override def sources = T.sources { 152 | super.sources() ++ Seq(PathRef(this.millSourcePath / "src" / crossValue / "test" / "scala")) 153 | } 154 | 155 | override def ivyDeps = super.ivyDeps() ++ Agg( 156 | defaultVersions(crossValue)("chiseltest") 157 | ) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /common.sc: -------------------------------------------------------------------------------- 1 | import mill._ 2 | import mill.scalalib._ 3 | 4 | trait OGPUModule extends ScalaModule { 5 | 6 | def rocketModule: ScalaModule 7 | 8 | override def moduleDeps = super.moduleDeps ++ Seq( 9 | rocketModule, 10 
| ) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/Consts.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE.Berkeley for license details. 2 | 3 | package ogpu.constants 4 | 5 | import chisel3._ 6 | import chisel3.util._ 7 | import freechips.rocketchip.util._ 8 | 9 | trait ScalarOpConstants { 10 | val SZ_BR = 3 11 | def BR_X = BitPat("b???") 12 | def BR_EQ = 0.U(3.W) 13 | def BR_NE = 1.U(3.W) 14 | def BR_J = 2.U(3.W) 15 | def BR_N = 3.U(3.W) 16 | def BR_LT = 4.U(3.W) 17 | def BR_GE = 5.U(3.W) 18 | def BR_LTU = 6.U(3.W) 19 | def BR_GEU = 7.U(3.W) 20 | 21 | def A1_X = BitPat("b??") 22 | def A1_ZERO = 0.U(2.W) 23 | def A1_RS1 = 1.U(2.W) 24 | def A1_PC = 2.U(2.W) 25 | 26 | def IMM_X = BitPat("b???") 27 | def IMM_S = 0.U(3.W) 28 | def IMM_SB = 1.U(3.W) 29 | def IMM_U = 2.U(3.W) 30 | def IMM_UJ = 3.U(3.W) 31 | def IMM_I = 4.U(3.W) 32 | def IMM_Z = 5.U(3.W) 33 | 34 | def A2_X = BitPat("b??") 35 | def A2_ZERO = 0.U(2.W) 36 | def A2_SIZE = 1.U(2.W) 37 | def A2_RS2 = 2.U(2.W) 38 | def A2_IMM = 3.U(2.W) 39 | 40 | def X = BitPat("b?") 41 | def N = BitPat("b0") 42 | def Y = BitPat("b1") 43 | 44 | val SZ_DW = 1 45 | def DW_X = X 46 | def DW_32 = false.B 47 | def DW_64 = true.B 48 | def DW_XPR = DW_64 49 | } 50 | 51 | trait MemoryOpConstants { 52 | val NUM_XA_OPS = 9 53 | val M_SZ = 5 54 | def M_X = BitPat("b?????"); 55 | def M_XRD = "b00000".U; // int load 56 | def M_XWR = "b00001".U; // int store 57 | def M_PFR = "b00010".U; // prefetch with intent to read 58 | def M_PFW = "b00011".U; // prefetch with intent to write 59 | def M_XA_SWAP = "b00100".U 60 | def M_FLUSH_ALL = "b00101".U // flush all lines 61 | def M_XLR = "b00110".U 62 | def M_XSC = "b00111".U 63 | def M_XA_ADD = "b01000".U 64 | def M_XA_XOR = "b01001".U 65 | def M_XA_OR = "b01010".U 66 | def M_XA_AND = "b01011".U 67 | def M_XA_MIN = "b01100".U 68 | def M_XA_MAX = "b01101".U 69 | def M_XA_MINU = "b01110".U 70 | def M_XA_MAXU = "b01111".U 71 | def M_FLUSH = "b10000".U // write back dirty data and cede R/W permissions 72 | def M_PWR = "b10001".U // partial (masked) store 73 | def M_PRODUCE = "b10010".U // write back dirty data and cede W permissions 74 | def M_CLEAN = "b10011".U // write back dirty data and retain R/W permissions 75 | def M_SFENCE = "b10100".U // SFENCE.VMA 76 | def M_HFENCEV = "b10101".U // HFENCE.VVMA 77 | def M_HFENCEG = "b10110".U // HFENCE.GVMA 78 | def M_WOK = "b10111".U // check write permissions but don't perform a write 79 | def M_HLVX = "b10000".U // HLVX instruction 80 | 81 | def isAMOLogical(cmd: UInt) = cmd.isOneOf(M_XA_SWAP, M_XA_XOR, M_XA_OR, M_XA_AND) 82 | def isAMOArithmetic(cmd: UInt) = cmd.isOneOf(M_XA_ADD, M_XA_MIN, M_XA_MAX, M_XA_MINU, M_XA_MAXU) 83 | def isAMO(cmd: UInt) = isAMOLogical(cmd) || isAMOArithmetic(cmd) 84 | def isPrefetch(cmd: UInt) = cmd === M_PFR || cmd === M_PFW 85 | def isRead(cmd: UInt) = cmd.isOneOf(M_XRD, M_HLVX, M_XLR, M_XSC) || isAMO(cmd) 86 | def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_PWR || cmd === M_XSC || isAMO(cmd) 87 | def isWriteIntent(cmd: UInt) = isWrite(cmd) || cmd === M_PFW || cmd === M_XLR 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/config/Parameters.scala: -------------------------------------------------------------------------------- 1 | package ogpu.config 2 | 3 | import org.chipsalliance.cde.config._ 4 | 5 | case object XLen extends Field[Int] 6 | case object DTSModel extends 
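// (added sketch with hypothetical class names, composing configs via the CDE API
// imported above) any field in this file can be overridden without editing the
// defaults, e.g.:
//   class With16Threads extends Config((_, _, _) => { case ThreadNum => 16 })
//   class My16ThreadConfig extends Config(new With16Threads ++ new OGPUDefaultConfig())
// where OGPUDefaultConfig comes from SystemConfig.scala below.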
Field[String]
7 | case object DTSCompat extends Field[Seq[String]]
8 | case object ThreadNum extends Field[Int]
9 | case object WarpNum extends Field[Int]
10 | case object RegNum extends Field[Int]
11 | case object RegIDWidth extends Field[Int]
12 | case object WarpIDWidth extends Field[Int]
13 | case object AddrWidth extends Field[Int]
14 | case object StackDepth extends Field[Int]
15 | case object DimWidth extends Field[Int]
16 | 
--------------------------------------------------------------------------------
/src/main/scala/config/SubsystemConfig.scala:
--------------------------------------------------------------------------------
1 | // See LICENSE.SiFive for license details.
2 | // See LICENSE.Berkeley for license details.
3 | 
4 | package ogpu.config
5 | 
6 | import org.chipsalliance.cde.config._
7 | import freechips.rocketchip.diplomacy._
8 | 
9 | class BaseSubsystemConfig extends Config((_, _, _) => { case XLen => 64 })
10 | 
11 | class WithDTS(model: String, compat: Seq[String])
12 | extends Config((_, _, _) => {
13 | case DTSModel => model
14 | case DTSCompat => compat
15 | })
16 | 
--------------------------------------------------------------------------------
/src/main/scala/config/SystemConfig.scala:
--------------------------------------------------------------------------------
1 | // See LICENSE.SiFive for license details.
2 | // See LICENSE.Berkeley for license details.
3 | 
4 | package ogpu.config
5 | 
6 | import chisel3.util._
7 | import org.chipsalliance.cde.config._
8 | 
9 | class OGPUBaseConfig(n: Int)
10 | extends Config((site, _, _) => {
11 | case DTSModel => ""
12 | case ThreadNum => 32
13 | case XLen => 64
14 | case WarpNum => 8
15 | case RegNum => 32
16 | case RegIDWidth => log2Ceil(site(RegNum))
17 | case WarpIDWidth => log2Ceil(site(WarpNum))
18 | case AddrWidth => 64
19 | case StackDepth => 16
20 | case DimWidth => 16
21 | })
22 | 
23 | class OGPUDefaultConfig(n: Int = 1)
24 | extends Config(
25 | new OGPUBaseConfig(n)
26 | )
27 | 
--------------------------------------------------------------------------------
/src/main/scala/core/ALU.scala:
--------------------------------------------------------------------------------
1 | package ogpu.core
2 | 
3 | import chisel3._
4 | import chisel3.util._
5 | import org.chipsalliance.cde.config.Parameters
6 | import ogpu.config._
7 | import freechips.rocketchip.rocket.ALU._
8 | import freechips.rocketchip.rocket.{SZ_DW, DW_64, DW_32}
9 | 
10 | class ScalarALU(
11 | implicit p: Parameters)
12 | extends Module {
13 | val xLen = p(XLen)
14 | val io = IO(new Bundle() {
15 | val dw = Input(UInt(SZ_DW.W))
16 | val fn = Input(UInt(SZ_ALU_FN.W))
17 | val in1 = Input(UInt(xLen.W))
18 | val in2 = Input(UInt(xLen.W))
19 | val out = Output(UInt(xLen.W))
20 | val adder_out = Output(UInt(xLen.W))
21 | val cmp_out = Output(Bool())
22 | })
23 | 
24 | override def desiredName = "CHIPALU"
25 | 
26 | // ADD, SUB
27 | val in2_inv = Mux(isSub(io.fn), ~io.in2, io.in2)
28 | val in1_xor_in2 = io.in1 ^ in2_inv
29 | val in1_and_in2 = io.in1 & in2_inv
30 | io.adder_out := io.in1 + in2_inv + isSub(io.fn)
31 | 
32 | // SLT, SLTU
33 | val slt =
34 | Mux(io.in1(xLen-1) === io.in2(xLen-1), io.adder_out(xLen-1),
35 | Mux(cmpUnsigned(io.fn), io.in2(xLen-1), io.in1(xLen-1)))
36 | io.cmp_out := cmpInverted(io.fn) ^ Mux(cmpEq(io.fn), in1_xor_in2 === 0.U, slt)
37 | 
38 | // SLL, SRL, SRA
39 | val (shamt, shin_r) =
40 | if (xLen == 32) (io.in2(4,0), io.in1)
41 | else {
42 | require(xLen == 64)
43 | val shin_hi_32 = Fill(32, isSub(io.fn) && io.in1(31))
44 | val shin_hi =
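// (added note) the block below serves SLL, SRL and SRA with a single arithmetic
// right shifter: for left shifts the operand is bit-reversed on the way in (shin)
// and the result reversed back (shout_l), so no separate left shifter is elaborated.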
Mux(io.dw === DW_64, io.in1(63,32), shin_hi_32) 45 | val shamt = Cat(io.in2(5) & (io.dw === DW_64), io.in2(4,0)) 46 | (shamt, Cat(shin_hi, io.in1(31,0))) 47 | } 48 | val shin = Mux(shiftReverse(io.fn), Reverse(shin_r), shin_r) 49 | val shout_r = (Cat(isSub(io.fn) & shin(xLen-1), shin).asSInt >> shamt)(xLen-1,0) 50 | val shout_l = Reverse(shout_r) 51 | val shout = Mux(io.fn === FN_SR || io.fn === FN_SRA || io.fn === FN_BEXT, shout_r, 0.U) | 52 | Mux(io.fn === FN_SL, shout_l, 0.U) 53 | 54 | // CZEQZ, CZNEZ 55 | val in2_not_zero = io.in2.orR 56 | val usingConditionalZero = true 57 | val cond_out = Option.when(usingConditionalZero)( 58 | Mux((io.fn === FN_CZEQZ && in2_not_zero) || (io.fn === FN_CZNEZ && !in2_not_zero), io.in1, 0.U) 59 | ) 60 | 61 | // AND, OR, XOR 62 | val logic = Mux(io.fn === FN_XOR || io.fn === FN_OR || io.fn === FN_ORN || io.fn === FN_XNOR, in1_xor_in2, 0.U) | 63 | Mux(io.fn === FN_OR || io.fn === FN_AND || io.fn === FN_ORN || io.fn === FN_ANDN, in1_and_in2, 0.U) 64 | 65 | val useZbs = true 66 | val bext_mask = Mux(useZbs.B && io.fn === FN_BEXT, 1.U, ~(0.U(xLen.W))) 67 | val shift_logic = (isCmp (io.fn) && slt) | logic | (shout & bext_mask) 68 | val shift_logic_cond = cond_out match { 69 | case Some(co) => shift_logic | co 70 | case _ => shift_logic 71 | } 72 | 73 | // CLZ, CTZ, CPOP 74 | val tz_in = MuxLookup((io.dw === DW_32) ## !io.in2(0), 0.U)(Seq( 75 | 0.U -> io.in1, 76 | 1.U -> Reverse(io.in1), 77 | 2.U -> 1.U ## io.in1(31,0), 78 | 3.U -> 1.U ## Reverse(io.in1(31,0)) 79 | )) 80 | val popc_in = Mux(io.in2(1), 81 | Mux(io.dw === DW_32, io.in1(31,0), io.in1), 82 | PriorityEncoderOH(1.U ## tz_in) - 1.U)(xLen-1,0) 83 | val count = PopCount(popc_in) 84 | val in1_bytes = io.in1.asTypeOf(Vec(xLen / 8, UInt(8.W))) 85 | val orcb = VecInit(in1_bytes.map(b => Fill(8, b =/= 0.U))).asUInt 86 | val rev8 = VecInit(in1_bytes.reverse).asUInt 87 | val unary = MuxLookup(io.in2(11,0), count)(Seq( 88 | 0x287.U -> orcb, 89 | (if (xLen == 32) 0x698 else 0x6b8).U -> rev8, 90 | 0x080.U -> io.in1(15,0), 91 | 0x604.U -> Fill(xLen-8, io.in1(7)) ## io.in1(7,0), 92 | 0x605.U -> Fill(xLen-16, io.in1(15)) ## io.in1(15,0) 93 | )) 94 | 95 | // MAX, MIN, MAXU, MINU 96 | val maxmin_out = Mux(io.cmp_out, io.in2, io.in1) 97 | 98 | // ROL, ROR 99 | val rot_shamt = Mux(io.dw === DW_32, 32.U, xLen.U) - shamt 100 | val rotin = Mux(io.fn(0), shin_r, Reverse(shin_r)) 101 | val rotout_r = (rotin >> rot_shamt)(xLen-1,0) 102 | val rotout_l = Reverse(rotout_r) 103 | val rotout = Mux(io.fn(0), rotout_r, rotout_l) | Mux(io.fn(0), shout_l, shout_r) 104 | 105 | val useZbb = true 106 | val out = MuxLookup(io.fn, shift_logic_cond)(Seq( 107 | FN_ADD -> io.adder_out, 108 | FN_SUB -> io.adder_out 109 | ) ++ (if (useZbb) Seq( 110 | FN_UNARY -> unary, 111 | FN_MAX -> maxmin_out, 112 | FN_MIN -> maxmin_out, 113 | FN_MAXU -> maxmin_out, 114 | FN_MINU -> maxmin_out, 115 | FN_ROL -> rotout, 116 | FN_ROR -> rotout, 117 | ) else Nil)) 118 | 119 | 120 | io.out := out 121 | if (xLen > 32) { 122 | require(xLen == 64) 123 | when (io.dw === DW_32) { io.out := Cat(Fill(32, out(31)), out(31,0)) } 124 | } 125 | } 126 | 127 | // object ALURTL extends App { 128 | // implicit val p = new CoreConfig 129 | // emitVerilog(new ScalarALU(), Array("--target-dir", "generated")) 130 | // } 131 | -------------------------------------------------------------------------------- /src/main/scala/core/AMOALU.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE.SiFive for license details. 
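// (added worked example, not in the original file) StoreGen.mask below derives a
// byte-enable mask from (size, addr). With maxSize = 8, a 4-byte store (typ = 2.U)
// at addr = "b100".U walks the loop to b1111_0000, i.e. the upper word lane:
//   val sg = new StoreGen(typ = 2.U, addr = "b100".U, dat = 0.U, maxSize = 8)
//   // sg.mask elaborates to "b11110000".U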
2 | // See LICENSE.Berkeley for license details. 3 | 4 | package ogpu.core 5 | 6 | import chisel3._ 7 | import chisel3.util._ 8 | import org.chipsalliance.cde.config.Parameters 9 | 10 | import ogpu.ogpu._ 11 | 12 | class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) { 13 | val size = Wire(UInt(log2Up(log2Up(maxSize) + 1).W)) 14 | size := typ 15 | 16 | def misaligned: Bool = 17 | (addr & ((1.U << size) - 1.U)(log2Up(maxSize) - 1, 0)).orR 18 | 19 | def mask = { 20 | var res = 1.U 21 | for (i <- 0 until log2Up(maxSize)) { 22 | val upper = 23 | Mux(addr(i), res, 0.U) | Mux(size >= (i + 1).U, ((BigInt(1) << (1 << i)) - 1).U, 0.U) 24 | val lower = Mux(addr(i), 0.U, res) 25 | res = Cat(upper, lower) 26 | } 27 | res 28 | } 29 | 30 | protected def genData(i: Int): UInt = 31 | if (i >= log2Up(maxSize)) dat 32 | else Mux(size === i.U, Fill(1 << (log2Up(maxSize) - i), dat((8 << i) - 1, 0)), genData(i + 1)) 33 | 34 | def data = genData(0) 35 | def wordData = genData(2) 36 | } 37 | 38 | class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSize: Int) { 39 | private val size = new StoreGen(typ, addr, dat, maxSize).size 40 | 41 | private def genData(logMinSize: Int): UInt = { 42 | var res = dat 43 | for (i <- log2Up(maxSize) - 1 to logMinSize by -1) { 44 | val pos = 8 << i 45 | val shifted = Mux(addr(i), res(2 * pos - 1, pos), res(pos - 1, 0)) 46 | val doZero = (i == 0).B && zero 47 | val zeroed = Mux(doZero, 0.U, shifted) 48 | res = Cat( 49 | Mux( 50 | size === i.U || doZero, 51 | Fill(8 * maxSize - pos, signed && zeroed(pos - 1)), 52 | res(8 * maxSize - 1, pos) 53 | ), 54 | zeroed 55 | ) 56 | } 57 | res 58 | } 59 | 60 | def wordData = genData(2) 61 | def data = genData(0) 62 | } 63 | 64 | class AMOALU( 65 | operandBits: Int 66 | )( 67 | implicit p: Parameters) 68 | extends Module { 69 | val minXLen = 32 70 | val widths = (0 to log2Ceil(operandBits / minXLen)).map(minXLen << _) 71 | 72 | val io = IO(new Bundle { 73 | val mask = Input(UInt((operandBits / 8).W)) 74 | val cmd = Input(UInt(M_SZ.W)) 75 | val lhs = Input(UInt(operandBits.W)) 76 | val rhs = Input(UInt(operandBits.W)) 77 | val out = Output(UInt(operandBits.W)) 78 | val out_unmasked = Output(UInt(operandBits.W)) 79 | }) 80 | 81 | val max = io.cmd === M_XA_MAX || io.cmd === M_XA_MAXU 82 | val min = io.cmd === M_XA_MIN || io.cmd === M_XA_MINU 83 | val add = io.cmd === M_XA_ADD 84 | val logic_and = io.cmd === M_XA_OR || io.cmd === M_XA_AND 85 | val logic_xor = io.cmd === M_XA_XOR || io.cmd === M_XA_OR 86 | 87 | val adder_out = { 88 | // partition the carry chain to support sub-xLen addition 89 | val mask = 90 | ~(0.U(operandBits.W) +: widths.init.map(w => !io.mask(w / 8 - 1) << (w - 1))).reduce(_ | _) 91 | (io.lhs & mask) + (io.rhs & mask) 92 | } 93 | 94 | val less = { 95 | // break up the comparator so the lower parts will be CSE'd 96 | def isLessUnsigned(x: UInt, y: UInt, n: Int): Bool = { 97 | if (n == minXLen) x(n - 1, 0) < y(n - 1, 0) 98 | else 99 | x(n - 1, n / 2) < y(n - 1, n / 2) || x(n - 1, n / 2) === y(n - 1, n / 2) && isLessUnsigned( 100 | x, 101 | y, 102 | n / 2 103 | ) 104 | } 105 | 106 | def isLess(x: UInt, y: UInt, n: Int): Bool = { 107 | val signed = { 108 | val mask = M_XA_MIN ^ M_XA_MINU 109 | (io.cmd & mask) === (M_XA_MIN & mask) 110 | } 111 | Mux(x(n - 1) === y(n - 1), isLessUnsigned(x, y, n), Mux(signed, x(n - 1), y(n - 1))) 112 | } 113 | 114 | PriorityMux(widths.reverse.map(w => (io.mask(w / 8 / 2), isLess(io.lhs, io.rhs, w)))) 115 | } 116 | 117 | val minmax = Mux(Mux(less, min, max), io.lhs, 
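// (added) lhs is chosen when it already wins the comparison: less for MIN/MINU and,
// since !less, greater-or-equal for MAX/MAXU; otherwise the result is rhs below.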
io.rhs) 118 | 119 | val logic = 120 | Mux(logic_and, io.lhs & io.rhs, 0.U) | 121 | Mux(logic_xor, io.lhs ^ io.rhs, 0.U) 122 | 123 | val out = 124 | Mux(add, adder_out, Mux(logic_and || logic_xor, logic, minmax)) 125 | 126 | val wmask = FillInterleaved(8, io.mask) 127 | io.out := wmask & out | ~wmask & io.lhs 128 | io.out_unmasked := out 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/core/BranchUnit.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | 8 | class BranchUnit( 9 | implicit p: Parameters) 10 | extends Module { 11 | val numThread = p(ThreadNum) 12 | val io = IO(new Bundle { 13 | val branch_data = Flipped(DecoupledIO(new BranchData())) 14 | val branch_ctl = DecoupledIO(new BranchControlData()) 15 | }) 16 | 17 | val branch_result = Module(new Queue(new BranchControlData(), 1, pipe = true)) 18 | 19 | io.branch_data.ready := branch_result.io.enq.ready 20 | branch_result.io.enq.valid := io.branch_data.valid 21 | 22 | // default 23 | branch_result.io.enq.bits.pc := 0.U 24 | branch_result.io.enq.bits.mask := 0.U.asTypeOf(branch_result.io.enq.bits.mask) 25 | branch_result.io.enq.bits.wid := 0.U 26 | branch_result.io.enq.bits.diverge := 0.U 27 | 28 | branch_result.io.enq.bits.data.mask := 0.U.asTypeOf(branch_result.io.enq.bits.mask) 29 | branch_result.io.enq.bits.data.pc := 0.U 30 | branch_result.io.enq.bits.data.orig_mask := 0.U.asTypeOf(branch_result.io.enq.bits.mask) 31 | 32 | val pc_imm = io.branch_data.bits.pc + io.branch_data.bits.imm 33 | val pc_rs1 = io.branch_data.bits.rs1_data + io.branch_data.bits.imm 34 | val pc_next = io.branch_data.bits.pc + 4.U 35 | 36 | val taken_all = io.branch_data.bits.mask === io.branch_data.bits.orig_mask 37 | val taken_none = io.branch_data.bits.mask === 0.U.asTypeOf(io.branch_data.bits.orig_mask) 38 | val diverge = !(taken_all | taken_none) 39 | 40 | when(io.branch_data.bits.branch.jal) { 41 | branch_result.io.enq.bits.pc := pc_imm 42 | branch_result.io.enq.bits.mask := io.branch_data.bits.orig_mask 43 | branch_result.io.enq.bits.wid := io.branch_data.bits.wid 44 | branch_result.io.enq.bits.diverge := false.B 45 | 46 | }.elsewhen(io.branch_data.bits.branch.jalr) { 47 | branch_result.io.enq.bits.pc := pc_rs1 48 | branch_result.io.enq.bits.mask := io.branch_data.bits.orig_mask 49 | branch_result.io.enq.bits.wid := io.branch_data.bits.wid 50 | branch_result.io.enq.bits.diverge := false.B 51 | }.elsewhen(io.branch_data.bits.branch.branch) { 52 | branch_result.io.enq.bits.pc := Mux(taken_none, pc_next, pc_imm) 53 | branch_result.io.enq.bits.mask := Mux(diverge, io.branch_data.bits.mask, io.branch_data.bits.orig_mask) 54 | branch_result.io.enq.bits.wid := io.branch_data.bits.wid 55 | branch_result.io.enq.bits.diverge := diverge 56 | 57 | branch_result.io.enq.bits.data.mask := io.branch_data.bits.orig_mask.zip(io.branch_data.bits.mask).map { 58 | case (a, b) => a & !b 59 | } 60 | branch_result.io.enq.bits.data.pc := pc_next 61 | branch_result.io.enq.bits.data.orig_mask := io.branch_data.bits.orig_mask 62 | } 63 | 64 | io.branch_ctl <> branch_result.io.deq 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/core/DataStruct.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import 
chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import freechips.rocketchip.rocket._ 7 | import ogpu.config._ 8 | import freechips.rocketchip.rocket.ALU._ 9 | 10 | class BranchSignal( 11 | implicit p: Parameters) 12 | extends Bundle { 13 | val jal = Bool() 14 | val jalr = Bool() 15 | val branch = Bool() 16 | } 17 | 18 | class VALUData( 19 | implicit p: Parameters) 20 | extends Bundle { 21 | val numThreads = p(ThreadNum) 22 | val numWarps = p(WarpNum) 23 | val xLen = p(XLen) 24 | val addrWidth = p(AddrWidth) 25 | val regIDWidth = p(RegIDWidth) 26 | 27 | val op1 = Vec(numThreads, UInt(xLen.W)) 28 | val op2 = Vec(numThreads, UInt(xLen.W)) 29 | val func = UInt(SZ_ALU_FN.W) 30 | val mask = Vec(numThreads, Bool()) 31 | val wid = UInt(log2Ceil(numWarps).W) 32 | val pc = UInt(addrWidth.W) 33 | val rd = UInt(regIDWidth.W) 34 | val branch = new BranchSignal() 35 | val imm = UInt(xLen.W) 36 | val rs1_data = UInt(xLen.W) 37 | } 38 | 39 | class SALUData( 40 | implicit p: Parameters) 41 | extends Bundle { 42 | val numThreads = p(ThreadNum) 43 | val numWarps = p(WarpNum) 44 | val xLen = p(XLen) 45 | val addrWidth = p(AddrWidth) 46 | val regIDWidth = p(RegIDWidth) 47 | 48 | val op1 = UInt(xLen.W) 49 | val op2 = UInt(xLen.W) 50 | val func = UInt(SZ_ALU_FN.W) 51 | val wid = UInt(log2Ceil(numWarps).W) 52 | val pc = UInt(addrWidth.W) 53 | val rd = UInt(regIDWidth.W) 54 | val branch = new BranchSignal() 55 | val imm = UInt(xLen.W) 56 | val rs1_data = UInt(xLen.W) 57 | } 58 | 59 | class BranchData( 60 | implicit p: Parameters) 61 | extends Bundle { 62 | val numThreads = p(ThreadNum) 63 | val numWarps = p(WarpNum) 64 | val addrWidth = p(AddrWidth) 65 | val xLen = p(XLen) 66 | 67 | val branch = new BranchSignal() 68 | val mask = Vec(numThreads, Bool()) 69 | val orig_mask = Vec(numThreads, Bool()) 70 | val wid = UInt(log2Ceil(numWarps).W) 71 | val pc = UInt(addrWidth.W) 72 | val imm = UInt(xLen.W) 73 | val rs1_data = UInt(xLen.W) 74 | } 75 | 76 | class LSUData( 77 | implicit p: Parameters) 78 | extends Bundle { 79 | val numThreads = p(ThreadNum) 80 | val xLen = p(XLen) 81 | val addrWidth = p(AddrWidth) 82 | val numWarps = p(WarpNum) 83 | val regIDWidth = p(RegIDWidth) 84 | 85 | val addr = Vec(numThreads, UInt(addrWidth.W)) 86 | val data = Vec(numThreads, UInt(xLen.W)) 87 | val mask = Vec(numThreads, Bool()) 88 | val func = UInt(1.W) 89 | val wid = UInt(log2Ceil(numWarps).W) 90 | 91 | val pc = UInt(addrWidth.W) 92 | // val fence = Bool() 93 | val offset = UInt(xLen.W) 94 | val rd = UInt(regIDWidth.W) 95 | } 96 | 97 | class CommitVData( 98 | implicit p: Parameters) 99 | extends Bundle { 100 | val numThreads = p(ThreadNum) 101 | val xLen = p(XLen) 102 | val addrWidth = p(AddrWidth) 103 | val numWarps = p(WarpNum) 104 | val regIDWidth = p(RegIDWidth) 105 | 106 | val wid = UInt(log2Ceil(numWarps).W) 107 | val mask = Vec(numThreads, Bool()) 108 | val pc = UInt(addrWidth.W) 109 | val eop = Bool() 110 | val rd = UInt(regIDWidth.W) 111 | val data = Vec(numThreads, UInt(xLen.W)) 112 | } 113 | 114 | class CommitSData( 115 | implicit p: Parameters) 116 | extends Bundle { 117 | val xLen = p(XLen) 118 | val addrWidth = p(AddrWidth) 119 | val numWarps = p(WarpNum) 120 | val regIDWidth = p(RegIDWidth) 121 | 122 | val wid = UInt(log2Ceil(numWarps).W) 123 | val mask = Bool() 124 | val pc = UInt(addrWidth.W) 125 | val eop = Bool() 126 | val rd = UInt(regIDWidth.W) 127 | val data = UInt(xLen.W) 128 | } 129 | 130 | class StackData( 131 | implicit p: Parameters) 132 | extends Bundle { 133 | 
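// (added note) StackData is the SIMT reconvergence record: on a divergent branch,
// BranchUnit pushes the not-taken lanes (orig_mask & ~taken mask) together with the
// fall-through pc and the pre-branch mask, to be restored at the matching JOIN.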
val numThreads = p(ThreadNum) 134 | val addrWidth = p(AddrWidth) 135 | 136 | val mask = Vec(numThreads, Bool()) 137 | val pc = UInt(addrWidth.W) 138 | val orig_mask = Vec(numThreads, Bool()) 139 | } 140 | 141 | class InstData( 142 | implicit p: Parameters) 143 | extends Bundle { 144 | val numThreads = p(ThreadNum) 145 | val addrWidth = p(AddrWidth) 146 | val numWarps = p(WarpNum) 147 | 148 | val mask = Vec(numThreads, Bool()) 149 | val wid = UInt(log2Ceil(numWarps).W) 150 | val pc = UInt(addrWidth.W) 151 | val data = UInt(32.W) 152 | } 153 | 154 | class ExType( 155 | implicit p: Parameters) 156 | extends Bundle { 157 | val lsu = Bool() 158 | val alu = Bool() 159 | } 160 | 161 | class DecodeData( 162 | implicit p: Parameters) 163 | extends Bundle { 164 | val numThreads = p(ThreadNum) 165 | val xLen = p(XLen) 166 | val addrWidth = p(AddrWidth) 167 | val numWarps = p(WarpNum) 168 | val regIDWidth = p(RegIDWidth) 169 | 170 | val wid = UInt(log2Ceil(numWarps).W) 171 | val mask = Vec(numThreads, Bool()) 172 | val wb = Bool() 173 | val imm = UInt(xLen.W) 174 | val sel_alu1 = UInt(A1_X.getWidth.W) 175 | val sel_alu2 = UInt(A2_X.getWidth.W) 176 | val ex_type = new ExType() 177 | val func = UInt(SZ_ALU_FN.W) 178 | val mem_cmd = UInt(1.W) 179 | val branch = new BranchSignal() 180 | val pc = UInt(addrWidth.W) 181 | val rd = UInt(regIDWidth.W) 182 | val rs1 = UInt(regIDWidth.W) 183 | val rs2 = UInt(regIDWidth.W) 184 | } 185 | 186 | class WarpControlData( 187 | implicit p: Parameters) 188 | extends Bundle { 189 | val numWarps = p(WarpNum) 190 | 191 | val wid = UInt(log2Ceil(numWarps).W) 192 | val active = Bool() 193 | val join = Bool() 194 | val end = Bool() 195 | } 196 | 197 | class InstFetchData( 198 | implicit p: Parameters) 199 | extends Bundle { 200 | val numThreads = p(ThreadNum) 201 | val addrWidth = p(AddrWidth) 202 | val numWarps = p(WarpNum) 203 | 204 | val mask = Vec(numThreads, Bool()) 205 | val wid = UInt(log2Ceil(numWarps).W) 206 | val pc = UInt(addrWidth.W) 207 | } 208 | 209 | class WarpCommandData( 210 | implicit p: Parameters) 211 | extends Bundle { 212 | val numThreads = p(ThreadNum) 213 | val addrWidth = p(AddrWidth) 214 | val numWarps = p(WarpNum) 215 | val dimWidth = p(DimWidth) 216 | val xLen = p(XLen) 217 | 218 | val mask = Vec(numThreads, Bool()) 219 | // max threads num in a workgroup 220 | val thread_dims = Vec(3, UInt(dimWidth.W)) 221 | val vgpr_num = UInt(2.W) 222 | val sgprs = Vec(16, UInt(xLen.W)) 223 | val sgpr_num = UInt(4.W) 224 | val reg_index = UInt(p(RegIDWidth).W) 225 | val pc = UInt(addrWidth.W) 226 | } 227 | 228 | class WarpEndData( 229 | implicit p: Parameters) 230 | extends Bundle { 231 | val numThreads = p(ThreadNum) 232 | val addrWidth = p(AddrWidth) 233 | val numWarps = p(WarpNum) 234 | 235 | val wid = UInt(log2Ceil(numWarps).W) 236 | } 237 | 238 | class BranchControlData( 239 | implicit p: Parameters) 240 | extends Bundle { 241 | val numThreads = p(ThreadNum) 242 | val addrWidth = p(AddrWidth) 243 | val numWarps = p(WarpNum) 244 | 245 | val mask = Vec(numThreads, Bool()) 246 | val wid = UInt(log2Ceil(numWarps).W) 247 | val pc = UInt(addrWidth.W) 248 | val data = new StackData() 249 | val diverge = Bool() 250 | } 251 | 252 | class WritebackData( 253 | implicit p: Parameters) 254 | extends Bundle { 255 | val numThreads = p(ThreadNum) 256 | val xLen = p(XLen) 257 | val addrWidth = p(AddrWidth) 258 | val numWarps = p(WarpNum) 259 | val regIDWidth = p(RegIDWidth) 260 | 261 | val wid = UInt(log2Ceil(numWarps).W) 262 | val mask = Vec(numThreads, Bool()) 263 | val pc 
= UInt(addrWidth.W) 264 | val eop = Bool() 265 | val rd = UInt(regIDWidth.W) 266 | val data = Vec(numThreads, UInt(xLen.W)) 267 | } 268 | 269 | class ReadGPRReq( 270 | implicit p: Parameters) 271 | extends Bundle { 272 | val numWarps = p(WarpNum) 273 | val regIDWidth = p(RegIDWidth) 274 | 275 | val wid = UInt(log2Ceil(numWarps).W) 276 | val rs1 = UInt(regIDWidth.W) 277 | val rs2 = UInt(regIDWidth.W) 278 | } 279 | 280 | class ReadSGPRRsp( 281 | implicit p: Parameters) 282 | extends Bundle { 283 | val xLen = p(XLen) 284 | 285 | val rs1_data = UInt(xLen.W) 286 | val rs2_data = UInt(xLen.W) 287 | } 288 | 289 | class ReadVGPRRsp( 290 | implicit p: Parameters) 291 | extends Bundle { 292 | val numThreads = p(ThreadNum) 293 | val xLen = p(XLen) 294 | 295 | val rs1_data = Vec(numThreads, UInt(xLen.W)) 296 | val rs2_data = Vec(numThreads, UInt(xLen.W)) 297 | } 298 | 299 | class ThreadMask( 300 | implicit p: Parameters) 301 | extends Bundle { 302 | val numThreads = p(ThreadNum) 303 | 304 | val mask = Vec(numThreads, Bool()) 305 | } 306 | -------------------------------------------------------------------------------- /src/main/scala/core/Dispatch.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | import freechips.rocketchip.rocket._ 8 | 9 | class Dispatch( 10 | implicit p: Parameters) 11 | extends Module { 12 | val numThread = p(ThreadNum) 13 | val xLen = p(XLen) 14 | val io = IO(new Bundle { 15 | val ibuffer = Flipped(DecoupledIO(new DecodeData())) 16 | val vgpr_rsp = Flipped(new ReadVGPRRsp()) 17 | 18 | val alu = DecoupledIO(new VALUData()) 19 | val lsu = DecoupledIO(new LSUData()) 20 | }) 21 | 22 | val buffer = Module( 23 | new Queue( 24 | new Bundle { 25 | val decode = new DecodeData 26 | val vgpr_rsp = new ReadVGPRRsp() 27 | }, 28 | 1, 29 | pipe = true 30 | ) 31 | ) 32 | 33 | val buffer_deq = buffer.io.deq.bits 34 | buffer.io.enq.valid := io.ibuffer.valid 35 | buffer.io.enq.bits.decode := io.ibuffer.bits 36 | buffer.io.enq.bits.vgpr_rsp := io.vgpr_rsp 37 | io.ibuffer.ready := buffer.io.enq.ready 38 | 39 | val ex_op1 = Wire(Vec(numThread, UInt(xLen.W))) 40 | val ex_op2 = Wire(Vec(numThread, UInt(xLen.W))) 41 | val pc_vec = Wire(Vec(numThread, UInt(xLen.W))) 42 | val imm_vec = Wire(Vec(numThread, UInt(xLen.W))) 43 | val const_vec = Wire(Vec(numThread, UInt(xLen.W))) 44 | 45 | pc_vec := VecInit.tabulate(numThread) { _ => buffer_deq.decode.pc } 46 | imm_vec := VecInit.tabulate(numThread) { _ => buffer_deq.decode.imm } 47 | const_vec := VecInit.tabulate(numThread) { _ => 4.U } 48 | 49 | ex_op1 := MuxLookup(buffer_deq.decode.sel_alu1.asUInt, 0.U.asTypeOf(ex_op1))( 50 | Seq(A1_RS1.asUInt -> buffer_deq.vgpr_rsp.rs1_data, A1_PC.asUInt -> pc_vec) 51 | ) 52 | ex_op2 := MuxLookup(buffer_deq.decode.sel_alu2.asUInt, 0.U.asTypeOf(ex_op1))( 53 | Seq(A2_RS2.asUInt -> buffer_deq.vgpr_rsp.rs2_data, A2_IMM.asUInt -> imm_vec, A2_SIZE.asUInt -> const_vec) 54 | ) 55 | 56 | io.alu.valid := buffer.io.deq.valid && buffer_deq.decode.ex_type.alu 57 | io.alu.bits.op1 := ex_op1 58 | io.alu.bits.op2 := ex_op2 59 | io.alu.bits.func := buffer_deq.decode.func 60 | io.alu.bits.mask := buffer_deq.decode.mask 61 | io.alu.bits.wid := buffer_deq.decode.wid 62 | io.alu.bits.pc := buffer_deq.decode.pc 63 | io.alu.bits.rd := buffer_deq.decode.rd 64 | io.alu.bits.imm := buffer_deq.decode.imm 65 | io.alu.bits.rs1_data := 
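// (added note) the scalar base used by JALR in BranchUnit is read from the first
// active lane, hence the PriorityEncoder over the thread mask: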
buffer_deq.vgpr_rsp.rs1_data(PriorityEncoder(buffer_deq.decode.mask)) 66 | io.alu.bits.branch := buffer_deq.decode.branch 67 | 68 | io.lsu.valid := buffer.io.deq.valid && buffer_deq.decode.ex_type.lsu 69 | io.lsu.bits.func := buffer_deq.decode.mem_cmd 70 | io.lsu.bits.wid := buffer_deq.decode.wid 71 | io.lsu.bits.mask := buffer_deq.decode.mask 72 | io.lsu.bits.addr := buffer_deq.vgpr_rsp.rs1_data 73 | io.lsu.bits.rd := buffer_deq.decode.rd 74 | io.lsu.bits.data := buffer_deq.vgpr_rsp.rs2_data 75 | io.lsu.bits.offset := buffer_deq.decode.imm 76 | io.lsu.bits.pc := buffer_deq.decode.pc 77 | 78 | val mapping = Seq((1.U, io.alu.ready), (2.U, io.lsu.ready)) 79 | buffer.io.deq.ready := MuxLookup(Cat(buffer_deq.decode.ex_type.lsu, buffer_deq.decode.ex_type.alu), 1.B)(mapping) 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/core/ICache.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import freechips.rocketchip.util._ 6 | import freechips.rocketchip.diplomacy._ 7 | import freechips.rocketchip.tilelink._ 8 | import freechips.rocketchip.amba._ 9 | import org.chipsalliance.cde.config.Parameters 10 | import chisel3.util.random.LFSR 11 | 12 | case class ICacheParams( 13 | nSets: Int = 64, 14 | nWays: Int = 4, 15 | rowBits: Int = 128, 16 | paddrBits: Int = 48, 17 | vaddrBits: Int = 48, 18 | pgIdxBits: Int = 12, 19 | dataBits: Int = 64, 20 | nTLBSets: Int = 32, 21 | nTLBWays: Int = 4, 22 | coreId: Int = 0, 23 | tagECC: Option[String] = None, 24 | dataECC: Option[String] = None, 25 | prefetch: Boolean = false, 26 | pgLevels: Int = 3, 27 | blockBytes: Int = 64, 28 | latency: Int = 2) { 29 | def tagCode: Code = Code.fromString(tagECC) 30 | def dataCode: Code = Code.fromString(dataECC) 31 | def replacement = new RandomReplacement(nWays) 32 | def blockOffBits: Int = log2Ceil(blockBytes) 33 | def lgCacheBlockBytes = blockOffBits 34 | def untagBits: Int = log2Ceil(nSets) + blockOffBits 35 | def tagBits: Int = vaddrBits - untagBits 36 | def pgUntagBits: Int = untagBits 37 | def idxBits = log2Up(nSets) 38 | def isDM = nWays == 1 39 | def cacheDataBeats = (blockBytes * 8) / dataBits 40 | def fetchBytes = dataBits / 8 41 | def refillCycles = cacheDataBeats 42 | def vpnBits: Int = vaddrBits - pgIdxBits 43 | } 44 | 45 | class ICacheReq(vaddrBits: Int) extends Bundle { 46 | val addr = UInt(vaddrBits.W) 47 | } 48 | 49 | class ICacheResp(dataBits: Int) extends Bundle { 50 | val data = UInt(dataBits.W) 51 | val replay = Bool() 52 | val ae = Bool() 53 | } 54 | 55 | class ICacheBundle(cfg: ICacheParams) extends Bundle { 56 | val req = Flipped(Decoupled(new ICacheReq(cfg.vaddrBits))) 57 | val s1_paddr = Input(UInt(cfg.paddrBits.W)) 58 | val s1_kill = Input(Bool()) 59 | val s2_kill = Input(Bool()) 60 | val s2_cacheable = Input(Bool()) 61 | val s2_prefetch = Input(Bool()) 62 | val resp = Valid(new ICacheResp(cfg.dataBits)) 63 | val invalidate = Input(Bool()) 64 | } 65 | 66 | class ICache( 67 | val cfg: ICacheParams 68 | )( 69 | implicit p: Parameters) 70 | extends LazyModule { 71 | lazy val module = new ICacheModule(this) 72 | val masterNode = TLClientNode( 73 | Seq( 74 | TLMasterPortParameters.v1( 75 | clients = Seq( 76 | TLMasterParameters.v1( 77 | sourceId = IdRange(0, 2), // 0=refill, 1=hint 78 | name = s"Core ${cfg.coreId} ICache" 79 | ) 80 | ), 81 | requestFields = Seq() 82 | ) 83 | ) 84 | ) 85 | 86 | val size = cfg.nSets * cfg.nWays * 
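// (added) with the default ICacheParams this elaborates to 64 sets * 4 ways * 64 B = 16 KiB: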
cfg.blockBytes 87 | 88 | } 89 | 90 | class ICacheModule(outer: ICache) extends LazyModuleImp(outer) { 91 | val cfg = outer.cfg 92 | val io = IO(new ICacheBundle(cfg)) 93 | val (tl_out, edge_out) = outer.masterNode.out(0) 94 | 95 | val tECC = cfg.tagCode 96 | val dECC = cfg.dataCode 97 | require(isPow2(cfg.nSets) && isPow2(cfg.nWays)) 98 | 99 | /** valid signal for CPU accessing cache in stage 0. */ 100 | val s0_valid = io.req.fire 101 | 102 | /** virtual address from CPU in stage 0. */ 103 | val s0_vaddr = io.req.bits.addr 104 | 105 | /** valid signal for stage 1, drived by s0_valid. */ 106 | val s1_valid = RegInit(false.B) 107 | 108 | /** virtual address from CPU in stage 1. */ 109 | val s1_vaddr = RegEnable(s0_vaddr, s0_valid) 110 | 111 | /** tag hit vector to indicate hit which way. */ 112 | val s1_tag_hit = Wire(Vec(cfg.nWays, Bool())) 113 | 114 | val s1_hit = s1_tag_hit.reduce(_ || _) 115 | dontTouch(s1_hit) 116 | val s2_valid = RegNext(s1_valid && !io.s1_kill, false.B) 117 | val s2_hit = RegNext(s1_hit) 118 | 119 | /** status register to indicate a cache flush. */ 120 | val invalidated = Reg(Bool()) 121 | val refill_valid = RegInit(false.B) 122 | 123 | /** register to indicate [[tl_out]] is performing a hint. prefetch only happens after refilling 124 | */ 125 | val send_hint = RegInit(false.B) 126 | 127 | /** indicate [[tl_out]] is performing a refill. */ 128 | val refill_fire = tl_out.a.fire && !send_hint 129 | 130 | /** register to indicate there is a outstanding hint. */ 131 | val hint_outstanding = RegInit(false.B) 132 | 133 | /** [[io]] access L1 I$ miss. */ 134 | val s2_miss = s2_valid && !s2_hit && !io.s2_kill 135 | 136 | /** forward signal to stage 1, permit stage 1 refill. */ 137 | val s1_can_request_refill = !(s2_miss || refill_valid) 138 | 139 | /** real refill signal, stage 2 miss, and was permit to refill in stage 1. Since a miss will trigger burst. miss under 140 | * miss won't trigger another burst. 141 | */ 142 | val s2_request_refill = s2_miss && RegNext(s1_can_request_refill) 143 | val refill_paddr = RegEnable(io.s1_paddr, s1_valid && s1_can_request_refill) 144 | val refill_vaddr = RegEnable(s1_vaddr, s1_valid && s1_can_request_refill) 145 | val refill_tag = refill_paddr >> cfg.pgUntagBits 146 | val refill_idx = index(refill_vaddr, refill_paddr) 147 | 148 | /** AccessAckData, is refilling I$, it will block request from CPU. */ 149 | val refill_one_beat = tl_out.d.fire && edge_out.hasData(tl_out.d.bits) 150 | 151 | /** block request from CPU when refill or scratch pad access. */ 152 | io.req.ready := !(refill_one_beat) 153 | s1_valid := s0_valid 154 | 155 | val (_, _, d_done, refill_cnt) = edge_out.count(tl_out.d) 156 | 157 | /** at last beat of `tl_out.d.fire`, finish refill. */ 158 | val refill_done = refill_one_beat && d_done 159 | 160 | /** scratchpad is writing data. block refill. 
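* (added note) the sentence above is inherited from rocket-chip's ITIM/scratchpad
* support; that path is removed in this port, so the D channel is accepted
* unconditionally below.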
*/ 161 | tl_out.d.ready := true.B 162 | 163 | require(edge_out.manager.minLatency > 0) 164 | 165 | /** way to be replaced, implemented with a hardcoded random replacement algorithm */ 166 | val repl_way = 167 | if (cfg.isDM) 0.U 168 | else { 169 | // pick a way that is not used by the scratchpad 170 | val v0 = LFSR(16, refill_fire)(log2Up(cfg.nWays) - 1, 0) 171 | v0 172 | } 173 | 174 | /** Tag SRAM, indexed with virtual memory, content with `refillError ## tag[19:0]` after ECC 175 | */ 176 | val tag_array = DescribedSRAM( 177 | name = "tag_array", 178 | desc = "ICache Tag Array", 179 | size = cfg.nSets, 180 | data = Vec(cfg.nWays, UInt(tECC.width(1 + cfg.tagBits).W)) 181 | ) 182 | val tag_rdata = tag_array.read(s0_vaddr(cfg.untagBits - 1, cfg.blockOffBits), !refill_done && s0_valid) 183 | 184 | /** register indicates the ongoing GetAckData transaction is corrupted. */ 185 | val accruedRefillError = Reg(Bool()) 186 | 187 | /** wire indicates the ongoing GetAckData transaction is corrupted. */ 188 | val refillError = tl_out.d.bits.corrupt || (refill_cnt > 0.U && accruedRefillError) 189 | when(refill_done) { 190 | // For AccessAckData, denied => corrupt 191 | /** data written to [[tag_array]]. ECC encoded `refillError ## refill_tag` 192 | */ 193 | val enc_tag = tECC.encode(Cat(refillError, refill_tag)) 194 | tag_array.write(refill_idx, VecInit(Seq.fill(cfg.nWays) { enc_tag }), Seq.tabulate(cfg.nWays)(repl_way === _.U)) 195 | 196 | } 197 | // notify CPU, I$ has corrupt. 198 | // io.errors.bus.valid := tl_out.d.fire && (tl_out.d.bits.denied || tl_out.d.bits.corrupt) 199 | // io.errors.bus.bits := (refill_paddr >> blockOffBits) << blockOffBits 200 | 201 | /** true indicate this cacheline is valid, indexed by (wayIndex ## setIndex) after refill_done and not FENCE.I, 202 | * (repl_way ## refill_idx) set to true. 203 | */ 204 | val vb_array = RegInit(0.U((cfg.nSets * cfg.nWays).W)) 205 | when(refill_one_beat) { 206 | accruedRefillError := refillError 207 | // clear bit when refill starts so hit-under-miss doesn't fetch bad data 208 | vb_array := vb_array.bitSet(Cat(repl_way, refill_idx), refill_done && !invalidated) 209 | } 210 | 211 | /** flush cache when invalidate is true. */ 212 | val invalidate = WireDefault(io.invalidate) 213 | when(invalidate) { 214 | vb_array := 0.U 215 | invalidated := true.B 216 | } 217 | 218 | /** wire indicates that tag is correctable or uncorrectable. will trigger CPU to replay and I$ invalidating, if 219 | * correctable. 220 | */ 221 | val s1_tag_disparity = Wire(Vec(cfg.nWays, Bool())) 222 | 223 | /** wire indicates that bus has an uncorrectable error. respond to CPU [[io.resp.bits.ae]], cause 224 | * [[Causes.fetch_access]]. 225 | */ 226 | val s1_tl_error = Wire(Vec(cfg.nWays, Bool())) 227 | 228 | /** how many bits will be fetched by CPU for each fetch. */ 229 | val wordBits = cfg.fetchBytes * 8 230 | 231 | /** a set of raw data read from [[data_arrays]]. */ 232 | val s1_dout = Wire(Vec(cfg.nWays, UInt(dECC.width(wordBits).W))) 233 | s1_dout := DontCare 234 | 235 | // /** address accessed by [[tl_in]] for ITIM. */ 236 | // val s0_slaveAddr = tl_in.map(_.a.bits.address).getOrElse(0.U) 237 | // /** address used at stage 1 and 3. 238 | // * {{{ 239 | // * In stage 1, it caches TileLink data, store in stage 2 if ECC passed. 240 | // * In stage 3, it caches corrected data from stage 2, and store in stage 4.}}} 241 | // */ 242 | // val s1s3_slaveAddr = Reg(UInt(log2Ceil(outer.size).W)) 243 | // /** data used at stage 1 and 3. 
244 | // * {{{ 245 | // * In stage 1, it caches TileLink data, store in stage 2. 246 | // * In stage 3, it caches corrected data from data ram, and return to d channel.}}} 247 | // */ 248 | // val s1s3_slaveData = Reg(UInt(wordBits.W)) 249 | 250 | for (i <- 0 until cfg.nWays) { 251 | val s1_idx = index(s1_vaddr, io.s1_paddr) 252 | val s1_tag = io.s1_paddr >> cfg.pgUntagBits 253 | 254 | /** this way is used by scratchpad. [[tag_array]] corrupted. 255 | */ 256 | // val scratchpadHit = scratchpadWayValid(i.U) && 257 | // Mux(s1_slaveValid, 258 | // // scratchpad accessing form [[tl_in]]. 259 | // // @todo I think XBar will guarantee there won't be an illegal access on the bus? 260 | // // so why did have this check `lineInScratchpad(scratchpadLine(s1s3_slaveAddr))`? 261 | // // I think it will always be true. 262 | // lineInScratchpad(scratchpadLine(s1s3_slaveAddr)) && scratchpadWay(s1s3_slaveAddr) === i.U, 263 | // // scratchpad accessing from [[io]]. 264 | // // @todo Accessing ITIM correspond address will be able to read cacheline? 265 | // // is this desired behavior? 266 | // addrInScratchpad(io.s1_paddr) && scratchpadWay(io.s1_paddr) === i.U) 267 | val s1_vb = vb_array(Cat(i.U, s1_idx)) 268 | // printf(cf"width is variable because of i.U ${Cat(i.U, s1_idx).getWidth}\n") 269 | val enc_tag = tECC.decode(tag_rdata(i)) 270 | 271 | /** [[tl_error]] ECC error bit. [[tag]] of [[tag_array]] access. 272 | */ 273 | val (tl_error, tag) = Split(enc_tag.uncorrected, cfg.tagBits) 274 | val tagMatch = s1_vb && tag === s1_tag 275 | 276 | /** tag error happens. */ 277 | s1_tag_disparity(i) := s1_vb && enc_tag.error 278 | 279 | /** if tag matched but ecc checking failed, this access will trigger [[Causes.fetch_access]] exception. */ 280 | s1_tl_error(i) := tagMatch && tl_error.asBool 281 | s1_tag_hit(i) := tagMatch 282 | } 283 | assert(!(s1_valid) || PopCount(s1_tag_hit.zip(s1_tag_disparity).map { case (h, d) => h && !d }) <= 1.U) 284 | 285 | println(s"tl width ${tl_out.d.bits.data.getWidth}") 286 | println(s"tl mask width ${tl_out.a.bits.mask.getWidth}") 287 | require(tl_out.d.bits.data.getWidth % wordBits == 0) 288 | 289 | /** Data SRAM 290 | * 291 | * banked with TileLink beat bytes / CPU fetch bytes, indexed with [[index]] and multi-beats cycle, content with 292 | * `eccError ## wordBits` after ECC. 293 | * {{{ 294 | * │ │xx│xxxxxx│xxx│x│xx│ 295 | * ↑word 296 | * ↑bank 297 | * ↑way 298 | * └─set──┴─offset─┘ 299 | * └────row───┘ 300 | * }}} 301 | * Note: Data SRAM is indexed with virtual memory(vaddr[11:2]), 302 | * - vaddr[11:3]->row, 303 | * - vaddr[2]->bank=i 304 | * - Cache line size = refillCycels(8) * bank(2) * datasize(4 bytes) = 64 bytes 305 | * - data width = 32 306 | * 307 | * read: read happens in stage 0 308 | * 309 | * write: It takes 8 beats to refill 16 instruction in each refilling cycle. 
Data_array receives data[63:0](2 310 | * instructions) at once,they will be allocated in deferent bank according to vaddr[2] 311 | */ 312 | val data_arrays = Seq.tabulate(tl_out.d.bits.data.getWidth / wordBits) { i => 313 | DescribedSRAM( 314 | name = s"data_arrays_${i}", 315 | desc = "ICache Data Array", 316 | size = cfg.nSets * cfg.refillCycles, 317 | data = Vec(cfg.nWays, UInt(dECC.width(wordBits).W)) 318 | ) 319 | } 320 | 321 | for ((data_array, i) <- data_arrays.zipWithIndex) { 322 | 323 | /** bank match (vaddr[2]) */ 324 | def wordMatch(addr: UInt) = 325 | addr.extract(log2Ceil(tl_out.d.bits.data.getWidth / 8) - 1, log2Ceil(wordBits / 8)) === i.U 326 | def row(addr: UInt) = addr(cfg.untagBits - 1, cfg.blockOffBits - log2Ceil(cfg.refillCycles)) 327 | 328 | /** read_enable signal */ 329 | val s0_ren = (s0_valid && wordMatch(s0_vaddr)) 330 | 331 | /** write_enable signal refill from [[tl_out]] or ITIM write. 332 | */ 333 | val wen = (refill_one_beat && !invalidated) 334 | 335 | /** index to access [[data_array]]. */ 336 | val mem_idx = 337 | // I$ refill. refill_idx[2:0] is the beats 338 | Mux( 339 | refill_one_beat, 340 | (refill_idx << log2Ceil(cfg.refillCycles)) | refill_cnt, 341 | // CPU read. 342 | row(s0_vaddr) 343 | ) 344 | when(wen) { 345 | // wr_data 346 | val data = tl_out.d.bits.data(wordBits * (i + 1) - 1, wordBits * i) 347 | // the way to be replaced/written 348 | val way = repl_way 349 | data_array.write( 350 | mem_idx, 351 | VecInit(Seq.fill(cfg.nWays) { dECC.encode(data) }), 352 | (0 until cfg.nWays).map(way === _.U) 353 | ) 354 | } 355 | // write access 356 | /** data read from [[data_array]]. */ 357 | val dout = data_array.read(mem_idx, !wen && s0_ren) 358 | // Mux to select a way to [[s1_dout]] 359 | when(wordMatch(io.s1_paddr)) { 360 | s1_dout := dout 361 | } 362 | } 363 | 364 | /** When writing full words to ITIM, ECC errors are correctable. When writing a full scratchpad word, suppress the 365 | * read so Xs don't leak out 366 | */ 367 | // val s1s2_full_word_write = WireDefault(false.B) 368 | // val s1_dont_read = s1_slaveValid && s1s2_full_word_write 369 | 370 | /** clock gate signal for [[s2_tag_hit]], [[s2_dout]], [[s2_tag_disparity]], [[s2_tl_error]], [[s2_scratchpad_hit]]. 371 | */ 372 | val s1_clk_en = s1_valid 373 | val s2_tag_hit = RegEnable(s1_tag_hit, s1_clk_en) 374 | 375 | /** way index to access [[data_arrays]]. */ 376 | val s2_hit_way = OHToUInt(s2_tag_hit) 377 | 378 | /** ITIM index to access [[data_arrays]]. replace tag with way, word set to 0. 379 | */ 380 | val s2_dout = RegEnable(s1_dout, s1_clk_en) 381 | val s2_way_mux = Mux1H(s2_tag_hit, s2_dout) 382 | val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_clk_en).asUInt.orR 383 | val s2_tl_error = RegEnable(s1_tl_error.asUInt.orR, s1_clk_en) 384 | 385 | /** ECC decode result for [[data_arrays]]. */ 386 | val s2_data_decoded = dECC.decode(s2_way_mux) 387 | 388 | /** ECC error happened, correctable or uncorrectable, ask CPU to replay. */ 389 | val s2_disparity = s2_tag_disparity || s2_data_decoded.error 390 | 391 | /** access hit in ITIM, if [[s1_slaveValid]], this access is from [[tl_in]], else from CPU [[io]]. */ 392 | // val s1_scratchpad_hit = Mux(s1_slaveValid, lineInScratchpad(scratchpadLine(s1s3_slaveAddr)), addrInScratchpad(io.s1_paddr)) 393 | /** stage 2 of [[s1_scratchpad_hit]]. */ 394 | // val s2_scratchpad_hit = RegEnable(s1_scratchpad_hit, s1_clk_en) 395 | /** ITIM uncorrectable read. 
`s2_scratchpad_hit`: processing a scratchpad read(from [[tl_in]] or [[io]]) 396 | * `s2_data_decoded.uncorrectable`: read a uncorrectable data. `s2_valid`: [[io]] non-canceled read. `(s2_slaveValid 397 | * && !s2_full_word_write)`: [[tl_in]] read or write a word with wormhole. if write a full word, even stage 2 read 398 | * uncorrectable. stage 3 full word write will recovery this. 399 | */ 400 | // val s2_report_uncorrectable_error = s2_scratchpad_hit && s2_data_decoded.uncorrectable && (s2_valid || (s2_slaveValid && !s1s2_full_word_write)) 401 | /** ECC uncorrectable address, send to Bus Error Unit. */ 402 | // val s2_error_addr = scratchpadBase.map(base => Mux(s2_scratchpad_hit, base + s2_scratchpad_word_addr, 0.U)).getOrElse(0.U) 403 | 404 | // output signals 405 | outer.cfg.latency match { 406 | // if I$ latency is 1, no ITIM, no ECC. 407 | case 1 => 408 | require(tECC.isInstanceOf[IdentityCode]) 409 | require(dECC.isInstanceOf[IdentityCode]) 410 | // reply data to CPU at stage 2. no replay. 411 | io.resp.bits.data := Mux1H(s1_tag_hit, s1_dout) 412 | io.resp.bits.ae := s1_tl_error.asUInt.orR 413 | io.resp.valid := s1_valid && s1_hit 414 | io.resp.bits.replay := false.B 415 | 416 | // if I$ latency is 2, can have ITIM and ECC. 417 | case 2 => 418 | // when some sort of memory bit error have occurred 419 | // @todo why so aggressive to invalidate all when ecc corrupted. 420 | when(s2_valid && s2_disparity) { invalidate := true.B } 421 | 422 | // reply data to CPU at stage 2. 423 | io.resp.bits.data := s2_data_decoded.uncorrected 424 | io.resp.bits.ae := s2_tl_error 425 | io.resp.bits.replay := s2_disparity 426 | io.resp.valid := s2_valid && s2_hit 427 | 428 | // // report correctable error to BEU at stage 2. 429 | // io.errors.correctable.foreach { c => 430 | // c.valid := (s2_valid || s2_slaveValid) && s2_disparity 431 | // c.bits := s2_error_addr 432 | // } 433 | // // report uncorrectable error to BEU at stage 2. 434 | // io.errors.uncorrectable.foreach { u => 435 | // u.valid := false.B 436 | // u.bits := s2_error_addr 437 | // } 438 | 439 | } 440 | 441 | println(s"edge out bundle ${edge_out.bundle}") 442 | tl_out.a.valid := s2_request_refill 443 | tl_out.a.bits := edge_out 444 | .Get( 445 | fromSource = 0.U, 446 | toAddress = (refill_paddr >> cfg.blockOffBits) << cfg.blockOffBits, 447 | lgSize = cfg.lgCacheBlockBytes.U 448 | ) 449 | ._2 450 | 451 | // // prefetch when next-line access does not cross a page 452 | // if (cacheParams.prefetch) { 453 | // /** [[crosses_page]] indicate if there is a crosses page access 454 | // * [[next_block]] : the address to be prefetched. 455 | // */ 456 | // val (crosses_page, next_block) = Split(refill_paddr(pgIdxBits-1, blockOffBits) +& 1.U, pgIdxBits-blockOffBits) 457 | 458 | // when (tl_out.a.fire) { 459 | // send_hint := !hint_outstanding && io.s2_prefetch && !crosses_page 460 | // when (send_hint) { 461 | // send_hint := false.B 462 | // hint_outstanding := true.B 463 | // } 464 | // } 465 | 466 | // // @todo why refill_done will kill hint at this cycle? 467 | // when (refill_done) { 468 | // send_hint := false.B 469 | // } 470 | 471 | // // D channel reply with HintAck. 
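// (added note) this commented-out prefetch logic is gated by ICacheParams.prefetch
// (default false); when enabled, the hint transaction uses sourceId 1, which the
// masterNode above already reserves ("0=refill, 1=hint").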
472 | // when (tl_out.d.fire && !refill_one_beat) { 473 | // hint_outstanding := false.B 474 | // } 475 | 476 | // when (send_hint) { 477 | // tl_out.a.valid := true.B 478 | // tl_out.a.bits := edge_out.Hint( 479 | // fromSource = 1.U, 480 | // toAddress = Cat(refill_paddr >> pgIdxBits, next_block) << blockOffBits, 481 | // lgSize = lgCacheBlockBytes.U, 482 | // param = TLHints.PREFETCH_READ)._2 483 | // } 484 | 485 | // ccover(send_hint && !tl_out.a.ready, "PREFETCH_A_STALL", "I$ prefetch blocked by A-channel") 486 | // ccover(refill_valid && (tl_out.d.fire && !refill_one_beat), "PREFETCH_D_BEFORE_MISS_D", "I$ prefetch resolves before miss") 487 | // ccover(!refill_valid && (tl_out.d.fire && !refill_one_beat), "PREFETCH_D_AFTER_MISS_D", "I$ prefetch resolves after miss") 488 | // ccover(tl_out.a.fire && hint_outstanding, "PREFETCH_D_AFTER_MISS_A", "I$ prefetch resolves after second miss") 489 | // } 490 | 491 | // Drive APROT information 492 | tl_out.a.bits.user.lift(AMBAProt).foreach { x => 493 | // Rocket caches all fetch requests, and it's difficult to differentiate privileged/unprivileged on 494 | // cached data, so mark as privileged 495 | x.fetch := true.B 496 | x.secure := true.B 497 | x.privileged := true.B 498 | x.bufferable := true.B 499 | x.modifiable := true.B 500 | x.readalloc := io.s2_cacheable 501 | x.writealloc := io.s2_cacheable 502 | } 503 | tl_out.b.ready := true.B 504 | tl_out.c.valid := false.B 505 | tl_out.e.valid := false.B 506 | 507 | // if there is an outstanding refill, cannot flush I$. 508 | when(!refill_valid) { invalidated := false.B } 509 | when(refill_fire) { refill_valid := true.B } 510 | when(refill_done) { refill_valid := false.B } 511 | 512 | // io.perf.acquire := refill_fire 513 | // don't gate I$ clock since there are outstanding transcations. 514 | // io.keep_clock_enabled := 515 | // tl_in 516 | // .map(tl => tl.a.valid || tl.d.valid || s1_slaveValid || s2_slaveValid || s3_slaveValid) 517 | // .getOrElse(false.B) || // ITIM 518 | // s1_valid || s2_valid || refill_valid || send_hint || hint_outstanding // I$ 519 | 520 | /** index to access [[data_arrays]] and [[tag_array]]. 521 | * @note 522 | * if [[untagBits]] > [[pgIdxBits]] in 523 | * {{{ 524 | * ┌──idxBits──┐ 525 | * ↓ ↓ 526 | * │ tag │ set │offset│ 527 | * │ pageTag │ pageIndex│ 528 | * ↑ ↑ ↑ │ 529 | * untagBits│ blockOffBits│ 530 | * pgIdxBits │ 531 | * └msb┴──lsb──┘ 532 | * vaddr paddr 533 | * }}} 534 | * 535 | * else use paddr directly. Note: if [[untagBits]] > [[pgIdxBits]], there will be a alias issue which isn't 536 | * addressend by the icache yet. 537 | */ 538 | def index(vaddr: UInt, paddr: UInt) = { 539 | 540 | /** [[paddr]] as LSB to be used for VIPT. */ 541 | val lsbs = paddr(cfg.pgUntagBits - 1, cfg.blockOffBits) 542 | 543 | /** if [[untagBits]] > [[pgIdxBits]], append [[vaddr]] to higher bits of index as [[msbs]]. 
*/ 544 | val msbs = (cfg.idxBits + cfg.blockOffBits > cfg.pgUntagBits) 545 | .option(vaddr(cfg.idxBits + cfg.blockOffBits - 1, cfg.pgUntagBits)) 546 | msbs ## lsbs 547 | } 548 | } 549 | -------------------------------------------------------------------------------- /src/main/scala/core/IDecodeUnit.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | import freechips.rocketchip.rocket._ 8 | import freechips.rocketchip.util._ 9 | import freechips.rocketchip.rocket.ALU._ 10 | 11 | // add join inst based on jal, use wxd to distinguish 12 | /* Automatically generated by parse_opcodes */ 13 | object GPUInstructions { 14 | def JOIN = BitPat("b?????????????????000?????1101011") 15 | 16 | } 17 | 18 | import GPUInstructions._ 19 | class GPUDecode( 20 | implicit val p: Parameters) 21 | extends DecodeConstants { 22 | val aluFn = ALU 23 | val table: Array[(BitPat, List[BitPat])] = Array( 24 | // format: off 25 | JOIN -> List( 26 | Y,N,N,N,Y,N,N,N,N,N,N,A2_SIZE,A1_PC,IMM_UJ,DW_XPR,aluFn.FN_ADD,N,M_X,N,N,N,N,N,N,N,CSR.N,N,N,N,N)) 27 | // format: on 28 | } 29 | 30 | // reuse rocketchip decode 31 | class IDecodeUnit( 32 | implicit p: Parameters) 33 | extends Module() { 34 | val io = IO(new Bundle { 35 | val inst = Flipped(DecoupledIO(new InstData())) 36 | val decode = DecoupledIO(new DecodeData()) 37 | val wcontrol = DecoupledIO(new WarpControlData()) 38 | }) 39 | 40 | val aluFn = ALU 41 | val decode_table = { 42 | Seq(new GPUDecode()) ++: 43 | Seq(new IDecode()) ++: 44 | Seq(new I64Decode()) 45 | }.flatMap(_.table) 46 | val id_ctrl = Wire(new IntCtrlSigs()).decode(io.inst.bits.data, decode_table) 47 | val exec_ctrl = RegInit(0.U.asTypeOf(new IntCtrlSigs())) 48 | val ctrl_valid = RegInit(0.B) 49 | val decode_valid = RegInit(0.B) 50 | val inst_reg = RegInit(0.U.asTypeOf(io.inst.bits)) 51 | when(io.inst.fire) { 52 | exec_ctrl := id_ctrl 53 | ctrl_valid := 1.B 54 | decode_valid := 1.B 55 | inst_reg := io.inst.bits 56 | }.otherwise { 57 | ctrl_valid := 0.B 58 | } 59 | 60 | when(io.decode.fire & !io.inst.fire) { 61 | decode_valid := 0.B 62 | } 63 | io.inst.ready := !io.decode.valid || io.decode.fire 64 | io.decode.valid := decode_valid 65 | io.wcontrol.valid := ctrl_valid 66 | 67 | val ctrl = exec_ctrl 68 | val is_alu = ctrl.wxd && !(ctrl.mem || ctrl.fp || ctrl.mul || ctrl.div || ctrl.csr =/= CSR.N) 69 | val is_lsu = ctrl.mem 70 | val is_csr = ctrl.csr =/= CSR.N 71 | 72 | // branch inst break warp schedule 73 | val is_jal = ctrl.wxd && ctrl.jal 74 | val is_jalr = ctrl.jalr 75 | val is_join = !ctrl.wxd && ctrl.jal 76 | val is_branch = ctrl.branch 77 | val is_end = inst_reg.data === 0x10500073.U 78 | val imm = ImmGen(ctrl.sel_imm, inst_reg.data) 79 | 80 | // output 81 | io.decode.bits.wid := inst_reg.wid 82 | io.decode.bits.mask := inst_reg.mask 83 | io.decode.bits.pc := inst_reg.pc 84 | io.decode.bits.ex_type.lsu := is_lsu 85 | io.decode.bits.ex_type.alu := is_alu 86 | 87 | io.decode.bits.func := ctrl.alu_fn 88 | io.decode.bits.mem_cmd := ctrl.mem_cmd(0) 89 | io.decode.bits.wb := ctrl.wxd 90 | io.decode.bits.sel_alu2 := ctrl.sel_alu2.asUInt 91 | io.decode.bits.sel_alu1 := ctrl.sel_alu1.asUInt 92 | io.decode.bits.imm := imm.asUInt 93 | io.decode.bits.branch.jal := is_jal 94 | io.decode.bits.branch.jalr := is_jalr 95 | io.decode.bits.branch.branch := is_branch 96 | io.decode.bits.rd := inst_reg.data(11, 7) 97 | 
io.decode.bits.rs1 := inst_reg.data(19, 15) 98 | io.decode.bits.rs2 := inst_reg.data(24, 20) 99 | 100 | io.wcontrol.bits.wid := inst_reg.wid 101 | io.wcontrol.bits.join := is_join 102 | io.wcontrol.bits.active := !(is_branch || is_jal || is_jalr || is_end) 103 | // wfi inst as end inst 104 | io.wcontrol.bits.end := is_end 105 | } 106 | -------------------------------------------------------------------------------- /src/main/scala/core/IFetch.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | 7 | class InstFetch( 8 | cfg: ICacheParams 9 | )( 10 | implicit p: Parameters) 11 | extends Module { 12 | 13 | val io = IO(new Bundle { 14 | val inst_fetch = Flipped(Decoupled(new InstFetchData())) 15 | val to_icache = Flipped(new ICacheBundle(cfg)) 16 | val to_ptw = new TLBPTWIO(cfg.vpnBits, cfg.vaddrBits, cfg.pgLevels) 17 | val inst_out = Decoupled(new InstData()) 18 | }) 19 | 20 | val tlb_param = 21 | TLBParameter(nSets = cfg.nTLBSets, nWays = cfg.nTLBWays, paddrBits = cfg.paddrBits, vaddrBits = cfg.vaddrBits) 22 | val tlb = Module(new TLB(true, tlb_param)) 23 | 24 | // fetch inst fetch 25 | val fetch_idle :: fetch_req :: fetch_wait1 :: fetch_wait2 :: dispatch_wait :: Nil = Enum(5) 26 | val s_state = RegInit(fetch_idle) 27 | 28 | val ifetch_data = RegInit(0.U.asTypeOf(new InstFetchData)) 29 | 30 | switch(s_state) { 31 | is(fetch_idle) { 32 | when(io.inst_fetch.fire) { 33 | s_state := fetch_req 34 | ifetch_data := io.inst_fetch.bits 35 | } 36 | } 37 | is(fetch_req) { 38 | when(io.to_icache.req.fire) { 39 | s_state := fetch_wait1 40 | } 41 | } 42 | is(fetch_wait1) { 43 | when(io.to_icache.resp.valid) { 44 | s_state := dispatch_wait 45 | }.otherwise { 46 | s_state := fetch_wait2 47 | } 48 | } 49 | is(fetch_wait2) { 50 | when(io.to_icache.resp.valid) { 51 | s_state := dispatch_wait 52 | }.otherwise { 53 | s_state := fetch_req 54 | } 55 | } 56 | is(dispatch_wait) { 57 | when(io.inst_out.fire) { 58 | s_state := fetch_idle 59 | } 60 | } 61 | } 62 | 63 | io.to_icache.req.valid := s_state === fetch_req 64 | io.to_icache.req.bits.addr := ifetch_data.pc 65 | io.to_icache.s1_paddr := tlb.io.resp.paddr 66 | io.to_icache.s1_kill := tlb.io.resp.miss 67 | io.to_icache.s2_kill := false.B 68 | io.to_icache.s2_cacheable := true.B 69 | io.to_icache.s2_prefetch := false.B 70 | io.to_icache.invalidate := false.B 71 | 72 | tlb.io.req.valid := (s_state === fetch_req) 73 | tlb.io.req.bits.vaddr := ifetch_data.pc 74 | tlb.io.req.bits.passthrough := false.B 75 | tlb.io.req.bits.size := 2.U 76 | tlb.io.req.bits.cmd := 0.U 77 | tlb.io.req.bits.prv := 0.U 78 | 79 | tlb.io.sfence := 0.U.asTypeOf(tlb.io.sfence) 80 | tlb.io.kill := false.B 81 | io.to_ptw <> tlb.io.ptw 82 | 83 | io.inst_fetch.ready := (s_state === fetch_idle) 84 | 85 | val cache_data = RegInit(0.U.asTypeOf(new ICacheResp(cfg.dataBits))) 86 | 87 | when(io.to_icache.resp.valid) { 88 | cache_data := io.to_icache.resp.bits 89 | } 90 | 91 | io.inst_out.valid := s_state === dispatch_wait 92 | io.inst_out.bits.pc := ifetch_data.pc 93 | io.inst_out.bits.mask := ifetch_data.mask 94 | io.inst_out.bits.wid := ifetch_data.wid 95 | io.inst_out.bits.data := (cache_data.data >> (ifetch_data.pc(5, 2) * 32.U))(31, 0) 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/core/Issue.scala: 
-------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | 8 | class Issue( 9 | implicit p: Parameters) 10 | extends Module { 11 | val numThreads = p(ThreadNum) 12 | val addrWidth = p(AddrWidth) 13 | 14 | val io = IO(new Bundle { 15 | val writeback = Flipped(DecoupledIO(new CommitVData())) 16 | val writeback_cmd = Flipped(DecoupledIO(new CommitVData())) 17 | val decode = Flipped(DecoupledIO(new DecodeData())) 18 | 19 | val alu = DecoupledIO(new VALUData()) 20 | val lsu = DecoupledIO(new LSUData()) 21 | 22 | }) 23 | 24 | val decode_n = Reg(new DecodeData()) 25 | val decode_valid_n = RegInit(0.B) 26 | when(io.decode.fire) { 27 | decode_n := io.decode.bits 28 | decode_valid_n := 1.B 29 | }.otherwise { 30 | decode_valid_n := 0.B 31 | } 32 | val vgpr = Module(new VGPR()) 33 | val score_board = Module(new ScoreBoard()) 34 | val dispatch = Module(new Dispatch()) 35 | 36 | io.decode.ready := score_board.io.ibuffer.ready && dispatch.io.ibuffer.ready && !decode_valid_n 37 | 38 | vgpr.io.writeback.bits := io.writeback.bits 39 | vgpr.io.writeback.valid := io.writeback.valid 40 | vgpr.io.read_req.wid := io.decode.bits.wid 41 | vgpr.io.read_req.rs1 := io.decode.bits.rs1 42 | vgpr.io.read_req.rs2 := io.decode.bits.rs2 43 | vgpr.io.writeback_cmd <> io.writeback_cmd 44 | 45 | score_board.io.writeback <> io.writeback 46 | score_board.io.ibuffer.bits := io.decode.bits 47 | score_board.io.ibuffer.valid := io.decode.valid 48 | 49 | dispatch.io.ibuffer.valid := decode_valid_n 50 | dispatch.io.ibuffer.bits := decode_n 51 | dispatch.io.vgpr_rsp := vgpr.io.read_rsp 52 | 53 | io.alu <> dispatch.io.alu 54 | io.lsu <> dispatch.io.lsu 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/core/PTW.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import freechips.rocketchip.rocket.MStatus 6 | import freechips.rocketchip.util._ 7 | import freechips.rocketchip.rocket.{M_XRD, PRV} 8 | 9 | class PTE() extends Bundle { 10 | val reserved_for_future = UInt(10.W) 11 | val ppn = UInt(44.W) 12 | val reserved_for_software = Bits(2.W) 13 | 14 | /** dirty bit */ 15 | val d = Bool() 16 | 17 | /** access bit */ 18 | val a = Bool() 19 | 20 | /** global mapping */ 21 | val g = Bool() 22 | 23 | /** user mode accessible */ 24 | val u = Bool() 25 | 26 | /** whether the page is executable */ 27 | val x = Bool() 28 | 29 | /** whether the page is writable */ 30 | val w = Bool() 31 | 32 | /** whether the page is readable */ 33 | val r = Bool() 34 | 35 | /** valid bit */ 36 | val v = Bool() 37 | 38 | /** return true if find a pointer to next level page table */ 39 | def table(dummy: Int = 0) = v && !r && !w && !x && !d && !a && !u && reserved_for_future === 0.U 40 | 41 | /** return true if find a leaf PTE */ 42 | def leaf(dummy: Int = 0) = v && (r || (x && !w)) && a 43 | 44 | /** user read */ 45 | def ur(dummy: Int = 0) = sr() && u 46 | 47 | /** user write */ 48 | def uw(dummy: Int = 0) = sw() && u 49 | 50 | /** user execute */ 51 | def ux(dummy: Int = 0) = sx() && u 52 | 53 | /** supervisor read */ 54 | def sr(dummy: Int = 0) = leaf() && r 55 | 56 | /** supervisor write */ 57 | def sw(dummy: Int = 0) = leaf() && w && d 58 | 59 | /** supervisor execute */ 60 | def sx(dummy: Int = 0) = leaf() && x 61 | 62 
| /** full permission: writable and executable in user mode */ 63 | def isFullPerm(dummy: Int = 0) = uw() && ux() 64 | } 65 | 66 | class PTWReq(vpnBits: Int) extends Bundle { 67 | val addr = UInt(vpnBits.W) 68 | val vstage1 = Bool() 69 | val stage2 = Bool() 70 | } 71 | 72 | class PTWResp(vaddrBits: Int, pgLevels: Int) extends Bundle { 73 | 74 | /** ptw access exception */ 75 | val ae_ptw = Bool() 76 | 77 | /** final access exception */ 78 | val ae_final = Bool() 79 | 80 | /** page fault */ 81 | val pf = Bool() 82 | 83 | // /** guest page fault */ 84 | // val gf = Bool() 85 | 86 | // /** hypervisor read */ 87 | // val hr = Bool() 88 | 89 | // /** hypervisor write */ 90 | // val hw = Bool() 91 | 92 | // /** hypervisor execute */ 93 | // val hx = Bool() 94 | 95 | /** PTE to refill L1TLB 96 | * 97 | * source: L2TLB 98 | */ 99 | val pte = new PTE 100 | 101 | /** pte pglevel */ 102 | val level = UInt(log2Ceil(pgLevels).W) 103 | 104 | /** fragmented_superpage support */ 105 | // val fragmented_superpage = Bool() 106 | 107 | /** homogeneous for both pma and pmp */ 108 | val homogeneous = Bool() 109 | // val gpa = Valid(UInt(vaddrBits.W)) 110 | // val gpa_is_pte = Bool() 111 | } 112 | 113 | class PTBR() extends Bundle { 114 | val mode = UInt(4.W) 115 | val asid = UInt(16.W) 116 | val ppn = UInt(44.W) 117 | } 118 | 119 | /** IO between TLB and PTW 120 | * 121 | * PTW receives : 122 | * - PTE request 123 | * - CSRs info 124 | * - pmp results from PMP(in TLB) 125 | */ 126 | class TLBPTWIO(vpnBits: Int, vaddrBits: Int, pgLevels: Int) extends Bundle { 127 | val req = Decoupled(Valid(new PTWReq(vpnBits))) 128 | val resp = Flipped(Valid(new PTWResp(vaddrBits, pgLevels))) 129 | val ptbr = Input(new PTBR()) 130 | 131 | // val hgatp = Input(new PTBR()) 132 | // val vsatp = Input(new PTBR()) 133 | val status = Input(new MStatus()) 134 | // val hstatus = Input(new HStatus()) 135 | // val gstatus = Input(new MStatus()) 136 | // val customCSRs = Flipped(coreParams.customCSRs) 137 | } 138 | 139 | class DatapathPTWIO(vaddrBits: Int) extends Bundle { 140 | val ptbr = Input(new PTBR()) 141 | val sfence = Flipped(Valid(new SFenceReq(vaddrBits))) 142 | val status = Input(new MStatus()) 143 | val clock_enabled = Output(Bool()) 144 | } 145 | 146 | case class PTWParameter( 147 | paddrBits: Int, 148 | vaddrBits: Int, 149 | pgIdxBits: Int = 12, 150 | pgLevelBits: Int = 9, 151 | nSectors: Int = 4, 152 | xLen: Int = 64, 153 | pgLevels: Int = 3) { 154 | 155 | def ppnBits: Int = paddrBits - pgIdxBits 156 | def vpnBits: Int = vaddrBits - pgIdxBits 157 | } 158 | 159 | /** PTW contains L2TLB, and performs page table walk for high level TLB, and cache queries from L1 TLBs(I$, D$, RoCC) 160 | * 161 | * It performs hierarchy page table query to mem for the desired leaf PTE and cache them in l2tlb. Besides leaf PTEs, 162 | * it also caches non-leaf PTEs in pte_cache to accerlerate the process. 
163 | * 164 | * ==Structure== 165 | * - l2tlb : for leaf PTEs 166 | * - set-associative (configurable with [[CoreParams.nL2TLBEntries]]and [[CoreParams.nL2TLBWays]])) 167 | * - PLRU 168 | * - pte_cache: for non-leaf PTEs 169 | * - set-associative 170 | * - LRU 171 | * - s2_pte_cache: for non-leaf PTEs in 2-stage translation 172 | * - set-associative 173 | * - PLRU 174 | * 175 | * l2tlb Pipeline: 3 stage 176 | * {{{ 177 | * stage 0 : read 178 | * stage 1 : decode 179 | * stage 2 : hit check 180 | * }}} 181 | * ==State Machine== 182 | * s_ready: ready to reveive request from TLB s_req: request mem; pte_cache hit judge s_wait1: deal with l2tlb error 183 | * s_wait2: final hit judge s_wait3: receive mem response s_fragment_superpage: for superpage PTE 184 | * 185 | * @note 186 | * l2tlb hit happens in s_req or s_wait1 187 | * @see 188 | * RV-priv spec 4.3-4.6 for Virtual-Memory System 189 | * @see 190 | * RV-priv spec 8.5 for Two-Stage Address Translation 191 | * @todo 192 | * details in two-stage translation 193 | */ 194 | class PTW(n: Int, cfg: PTWParameter, cache_cfg: CacheParameter) extends Module { 195 | val io = IO(new Bundle { 196 | 197 | /** to n TLB */ 198 | val requestor = Flipped(Vec(n, new TLBPTWIO(cfg.vpnBits, cfg.vaddrBits, cfg.pgLevels))) 199 | 200 | /** to Cache */ 201 | val mem = new CacheIO(cache_cfg) 202 | 203 | /** to Core 204 | * 205 | * contains CSRs info and performance statistics 206 | */ 207 | val dpath = new DatapathPTWIO(cfg.vaddrBits) 208 | }) 209 | 210 | val s_ready :: s_req :: s_wait1 :: s_dummy1 :: s_wait2 :: s_wait3 :: s_dummy2 :: s_fragment_superpage :: Nil = Enum(8) 211 | val state = RegInit(s_ready) 212 | val l2_refill_wire = Wire(Bool()) 213 | 214 | /** Arbiter to arbite request from nTLB */ 215 | val arb = Module(new Arbiter(Valid(new PTWReq(cfg.vpnBits)), n)) 216 | // use TLB req as arbitor's input 217 | arb.io.in <> io.requestor.map(_.req) 218 | // receive req only when s_ready and not in refill 219 | arb.io.out.ready := (state === s_ready) && !l2_refill_wire 220 | 221 | val resp_valid = RegNext(VecInit(Seq.fill(io.requestor.size)(false.B))) 222 | 223 | val clock_en = 224 | state =/= s_ready || l2_refill_wire || arb.io.out.valid || io.dpath.sfence.valid 225 | io.dpath.clock_enabled := clock_en 226 | 227 | val invalidated = Reg(Bool()) 228 | 229 | /** current PTE level 230 | * {{{ 231 | * 0 <= count <= pgLevel-1 232 | * count = pgLevel - 1 : leaf PTE 233 | * count < pgLevel - 1 : non-leaf PTE 234 | * }}} 235 | */ 236 | val count = Reg(UInt(log2Ceil(cfg.pgLevels).W)) 237 | val resp_ae_ptw = Reg(Bool()) 238 | val resp_ae_final = Reg(Bool()) 239 | val resp_pf = Reg(Bool()) 240 | 241 | /** tlb request */ 242 | val r_req = Reg(new PTWReq(cfg.vpnBits)) 243 | 244 | /** current selected way in arbitor */ 245 | val r_req_dest = Reg(Bits()) 246 | // to construct mem.req.addr 247 | val r_pte = Reg(new PTE) 248 | 249 | val aux_pte = Reg(new PTE) 250 | 251 | val satp = io.dpath.ptbr 252 | val vpn = r_req.addr 253 | 254 | val mem_resp_valid = RegNext(io.mem.resp.valid) 255 | val mem_resp_data = RegNext(io.mem.resp.bits.data) 256 | // io.mem.uncached_resp.map { resp => 257 | // assert(!(resp.valid && io.mem.resp.valid)) 258 | // resp.ready := true.B 259 | // when(resp.valid) { 260 | // mem_resp_valid := true.B 261 | // mem_resp_data := resp.bits.data 262 | // } 263 | // } 264 | // construct pte from mem.resp 265 | val (pte, invalid_paddr) = { 266 | val tmp = mem_resp_data.asTypeOf(new PTE()) 267 | val res = WireDefault(tmp) 268 | res.ppn := tmp.ppn(cfg.ppnBits - 1, 0) 
269 | when(tmp.r || tmp.w || tmp.x) { 270 | // for superpage mappings, make sure PPN LSBs are zero 271 | for (i <- 0 until cfg.pgLevels - 1) 272 | when( 273 | count <= i.U && tmp.ppn( 274 | (cfg.pgLevels - 1 - i) * cfg.pgLevelBits - 1, 275 | (cfg.pgLevels - 2 - i) * cfg.pgLevelBits 276 | ) =/= 0.U 277 | ) { res.v := false.B } 278 | } 279 | (res, (tmp.ppn >> cfg.ppnBits) =/= 0.U) 280 | } 281 | // find non-leaf PTE, need traverse 282 | val traverse = pte.table() && !invalid_paddr && count < (cfg.pgLevels - 1).U 283 | 284 | /** address send to mem for enquerry */ 285 | val pte_addr = { 286 | val vpn_idxs = (0 until cfg.pgLevels).map { i => 287 | val width = cfg.pgLevelBits 288 | (vpn >> (cfg.pgLevels - i - 1) * cfg.pgLevelBits)(width - 1, 0) 289 | } 290 | val mask = ((1 << cfg.pgLevelBits) - 1).U 291 | val vpn_idx = vpn_idxs(count) & mask 292 | val raw_pte_addr = ((r_pte.ppn << cfg.pgLevelBits) | vpn_idx) << log2Ceil(cfg.xLen / 8) 293 | val size = cfg.paddrBits 294 | // use r_pte.ppn as page table base address 295 | // use vpn slice as offset 296 | raw_pte_addr.apply(size.min(raw_pte_addr.getWidth) - 1, 0) 297 | } 298 | 299 | /** pte_cache input addr */ 300 | val pte_cache_addr = pte_addr 301 | 302 | /** PTECache caches non-leaf PTE 303 | * @param s2 304 | * true: 2-stage address translation 305 | */ 306 | def makePTECache(s2: Boolean): (Bool, UInt) = (false.B, 0.U) 307 | // generate pte_cache 308 | val (pte_cache_hit, pte_cache_data) = makePTECache(false) 309 | // pte_cache hit or 2-stage pte_cache hit 310 | val pte_hit = RegNext(false.B) 311 | // l2_refill happens when find the leaf pte 312 | val l2_refill = RegNext(false.B) 313 | l2_refill_wire := l2_refill 314 | // l2tlb 315 | val (l2_hit, l2_error, l2_pte, l2_tlb_ram) = (false.B, false.B, WireDefault(0.U.asTypeOf(new PTE)), None) 316 | 317 | // if SFENCE occurs during walk, don't refill PTE cache or L2 TLB until next walk 318 | invalidated := io.dpath.sfence.valid || (invalidated && state =/= s_ready) 319 | // mem request 320 | // io.mem.keep_clock_enabled := false.B 321 | 322 | io.mem.req.valid := state === s_req 323 | io.mem.req.bits.tag := 0.U 324 | io.mem.req.bits.phys := true.B 325 | io.mem.req.bits.cmd := M_XRD 326 | io.mem.req.bits.size := log2Ceil(cfg.xLen / 8).U 327 | io.mem.req.bits.signed := false.B 328 | io.mem.req.bits.addr := pte_addr 329 | // io.mem.req.bits.idx.foreach(_ := pte_addr) 330 | io.mem.req.bits.dprv := PRV.S.U // PTW accesses are S-mode by definition 331 | io.mem.req.bits.dv := false.B 332 | // io.mem.req.bits.tag := DontCare 333 | io.mem.req.bits.no_alloc := DontCare 334 | io.mem.req.bits.no_xcpt := DontCare 335 | io.mem.req.bits.data := DontCare 336 | io.mem.req.bits.mask := DontCare 337 | 338 | io.mem.s1_kill := l2_hit || state =/= s_wait1 339 | io.mem.s1_data := DontCare 340 | io.mem.s2_kill := false.B 341 | 342 | val homogeneous = true.B 343 | // response to tlb 344 | for (i <- 0 until io.requestor.size) { 345 | io.requestor(i).resp.valid := resp_valid(i) 346 | io.requestor(i).resp.bits.ae_ptw := resp_ae_ptw 347 | io.requestor(i).resp.bits.ae_final := resp_ae_final 348 | io.requestor(i).resp.bits.pf := resp_pf 349 | io.requestor(i).resp.bits.pte := r_pte 350 | io.requestor(i).resp.bits.level := count 351 | io.requestor(i).resp.bits.homogeneous := homogeneous 352 | io.requestor(i).ptbr := io.dpath.ptbr 353 | // io.requestor(i).customCSRs <> io.dpath.customCSRs 354 | io.requestor(i).status := io.dpath.status 355 | // io.requestor(i).pmp := io.dpath.pmp 356 | } 357 | 358 | // control state machine 359 | val 
next_state = WireDefault(state) 360 | state := OptimizationBarrier(next_state) 361 | 362 | switch(state) { 363 | is(s_ready) { 364 | when(arb.io.out.fire) { 365 | val aux_ppn = arb.io.out.bits.bits.addr 366 | 367 | r_req := arb.io.out.bits.bits 368 | r_req_dest := arb.io.chosen 369 | next_state := Mux(arb.io.out.bits.valid, s_req, s_ready) 370 | count := 0.U 371 | aux_pte.ppn := aux_ppn 372 | aux_pte.reserved_for_future := 0.U 373 | resp_ae_ptw := false.B 374 | resp_ae_final := false.B 375 | } 376 | } 377 | is(s_req) { 378 | // pte_cache hit 379 | when(pte_cache_hit) { 380 | count := count + 1.U 381 | pte_hit := true.B 382 | }.otherwise { 383 | next_state := Mux(io.mem.req.ready, s_wait1, s_req) 384 | } 385 | } 386 | is(s_wait1) { 387 | // This Mux is for the l2_error case; the l2_hit && !l2_error case is overriden below 388 | next_state := Mux(l2_hit, s_req, s_wait2) 389 | } 390 | is(s_wait2) { 391 | next_state := s_wait3 392 | when(io.mem.s2_xcpt.ae.ld) { 393 | resp_ae_ptw := true.B 394 | next_state := s_ready 395 | resp_valid(r_req_dest) := true.B 396 | } 397 | } 398 | } 399 | 400 | r_pte := OptimizationBarrier( 401 | // l2tlb hit->find a leaf PTE(l2_pte), respond to L1TLB 402 | Mux( 403 | l2_hit && !l2_error, 404 | l2_pte, 405 | // pte cache hit->find a non-leaf PTE(pte_cache),continue to request mem 406 | Mux( 407 | state === s_req && pte_cache_hit, 408 | makePTE(pte_cache_data, l2_pte), 409 | // when mem respond, store mem.resp.pte 410 | Mux( 411 | mem_resp_valid, 412 | pte, 413 | // when tlb request come->request mem, use root address in satp(or vsatp,hgatp) 414 | Mux(arb.io.out.fire, makePTE(satp.ppn, r_pte), r_pte) 415 | ) 416 | ) 417 | ) 418 | ) 419 | 420 | when(l2_hit && !l2_error) { 421 | assert(state === s_req || state === s_wait1) 422 | next_state := s_ready 423 | resp_valid(r_req_dest) := true.B 424 | count := (cfg.pgLevels - 1).U 425 | } 426 | when(mem_resp_valid) { 427 | assert(state === s_wait3) 428 | next_state := s_req 429 | when(traverse) { 430 | count := count + 1.U 431 | }.otherwise { 432 | val ae = pte.v && invalid_paddr 433 | val pf = pte.v && pte.reserved_for_future =/= 0.U 434 | val success = pte.v && !ae && !pf 435 | 436 | // find a leaf pte, start l2 refill 437 | l2_refill := success && count === (cfg.pgLevels - 1).U 438 | count := 0.U 439 | 440 | next_state := s_ready 441 | resp_valid(r_req_dest) := true.B 442 | 443 | resp_ae_ptw := ae && count < (cfg.pgLevels - 1).U && pte.table() 444 | resp_ae_final := ae 445 | resp_pf := pf 446 | } 447 | } 448 | when(io.mem.s2_nack) { 449 | assert(state === s_wait2) 450 | next_state := s_req 451 | } 452 | 453 | /** Relace PTE.ppn with ppn */ 454 | private def makePTE(ppn: UInt, default: PTE) = { 455 | val pte = WireDefault(default) 456 | pte.ppn := ppn 457 | pte 458 | } 459 | } 460 | -------------------------------------------------------------------------------- /src/main/scala/core/SGPR.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | import chisel3._ 3 | import chisel3.util._ 4 | import org.chipsalliance.cde.config.Parameters 5 | import ogpu.lib._ 6 | import ogpu.config._ 7 | 8 | class SGPR( 9 | implicit p: Parameters) 10 | extends Module() { 11 | val numWarps = p(WarpNum) 12 | val numRegs = p(RegNum) 13 | val xLen = p(XLen) 14 | 15 | val io = IO(new Bundle { 16 | val writeback = Flipped(DecoupledIO(new CommitSData())) 17 | val writeback_cmd = Flipped(DecoupledIO(new CommitSData())) 18 | val read_req = Flipped(new ReadGPRReq()) 19 | val read_rsp = new 
ReadSGPRRsp() 20 | }) 21 | 22 | val gpr_ram = VecInit(Seq.fill(numWarps)((Module(new MaskedSmem_2R1W(xLen, numRegs, 1)).io))) 23 | val raddr_reg = RegInit(0.U) 24 | val raddr2_reg = RegInit(0.U) 25 | val rwid_reg = RegInit(0.U(log2Ceil(numWarps).W)) 26 | val ready_reg = RegInit(0.B) 27 | val cmd_ready_reg = RegInit(0.B) 28 | 29 | raddr_reg := io.read_req.rs1 30 | raddr2_reg := io.read_req.rs2 31 | rwid_reg := io.read_req.wid 32 | 33 | io.writeback.ready := ready_reg 34 | io.writeback_cmd.ready := cmd_ready_reg 35 | 36 | for (i <- 0 until numWarps) { 37 | // init 38 | gpr_ram(i).write_en := 0.B 39 | gpr_ram(i).waddr := 0.U 40 | gpr_ram(i).raddr := io.read_req.rs1 41 | gpr_ram(i).raddr2 := io.read_req.rs2 42 | gpr_ram(i).mask := 0.U.asTypeOf(io.writeback.bits.mask) 43 | gpr_ram(i).dataIn := 0.U.asTypeOf(io.writeback.bits.data) 44 | 45 | when(io.writeback_cmd.valid && i.U === io.writeback_cmd.bits.wid) { 46 | gpr_ram(i).write_en := io.writeback_cmd.valid 47 | gpr_ram(i).waddr := io.writeback_cmd.bits.rd 48 | gpr_ram(i).mask := io.writeback_cmd.bits.mask 49 | gpr_ram(i).dataIn := io.writeback_cmd.bits.data 50 | }.elsewhen(io.writeback.valid && i.U === io.writeback.bits.wid) { 51 | gpr_ram(i).write_en := io.writeback.valid 52 | gpr_ram(i).waddr := io.writeback.bits.rd 53 | gpr_ram(i).dataIn := io.writeback.bits.data 54 | } 55 | 56 | } 57 | 58 | ready_reg := 0.B 59 | cmd_ready_reg := 0.B 60 | when(io.writeback_cmd.valid) { 61 | cmd_ready_reg := 1.B 62 | }.elsewhen(io.writeback.valid) { 63 | ready_reg := 1.B 64 | } 65 | 66 | io.read_rsp.rs1_data := Mux(raddr_reg === 0.U, 0.U.asTypeOf(gpr_ram(0).dataOut), gpr_ram(rwid_reg).dataOut) 67 | io.read_rsp.rs2_data := Mux(raddr2_reg === 0.U, 0.U.asTypeOf(gpr_ram(0).dataOut), gpr_ram(rwid_reg).dataOut2) 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/core/SIMTStack.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | import ogpu.lib._ 8 | 9 | class SIMTStack( 10 | implicit p: Parameters) 11 | extends Module { 12 | val numThreads = p(ThreadNum) 13 | val addrWidth = p(AddrWidth) 14 | val stackDepth = p(StackDepth) 15 | 16 | val io = IO(new Bundle { 17 | val in_diverge = Input(Bool()) 18 | val in_data = Input(new StackData()) 19 | val out_data = Output(new StackData()) 20 | val push = Input(Bool()) 21 | val pop = Input(Bool()) 22 | val out_diverge = Output(Bool()) 23 | val empty = Output(Bool()) 24 | val full = Output(Bool()) 25 | }) 26 | 27 | val stack_addr = RegInit(0.U(log2Ceil(stackDepth + 1).W)) 28 | val stack_pop_addr = RegInit(0.U(log2Ceil(stackDepth + 1).W)) 29 | val out_diverge = RegInit(0.B) 30 | val out_data = Wire(new StackData()) 31 | val diverge_status = RegInit(VecInit(Seq.fill(stackDepth)(false.B))) 32 | val stack_sram = Module(new ReadWriteSmem(io.in_data.getWidth, stackDepth)) 33 | 34 | stack_pop_addr := stack_addr - 1.U 35 | stack_sram.io.enable := io.push || io.pop 36 | stack_sram.io.write := io.push 37 | stack_sram.io.addr := Mux(io.push, stack_addr, stack_pop_addr) 38 | stack_sram.io.dataIn := io.in_data.asUInt 39 | out_data := stack_sram.io.dataOut.asTypeOf(new StackData()) 40 | 41 | when(io.push) { 42 | stack_addr := stack_addr + 1.U 43 | stack_pop_addr := stack_addr 44 | }.elsewhen(io.pop && ~diverge_status(stack_pop_addr)) { 45 | stack_addr := stack_addr - 1.U 46 | stack_pop_addr := 
stack_pop_addr - 1.U 47 | } 48 | 49 | when(io.push) { 50 | diverge_status(stack_addr) := io.in_diverge 51 | }.elsewhen(io.pop) { 52 | diverge_status(stack_pop_addr) := 0.B 53 | out_diverge := diverge_status(stack_pop_addr) 54 | } 55 | 56 | io.empty := stack_addr === 0.U 57 | io.full := stack_addr === stackDepth.U 58 | io.out_diverge := out_diverge 59 | io.out_data := out_data 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/core/Scoreboard.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | 8 | class ScoreBoard( 9 | implicit p: Parameters) 10 | extends Module { 11 | val numWarps = p(WarpNum) 12 | val numRegs = p(RegNum) 13 | val io = IO(new Bundle { 14 | val ibuffer = Flipped(DecoupledIO(new DecodeData())) 15 | val writeback = Flipped(DecoupledIO(new WritebackData())) 16 | }) 17 | 18 | // Registers to hold the state of inuse_regs 19 | val inuseRegs = RegInit(VecInit(Seq.fill(numWarps)(VecInit(Seq.fill(numRegs)(false.B))))) 20 | 21 | // Wires to get the state immediately 22 | val inuseRegsCurrent = Wire(Vec(numWarps, Vec(numRegs, Bool()))) 23 | 24 | // Reserve a register when instruction is valid, ready, and writeback is enabled 25 | val reserveReg = io.ibuffer.valid && io.ibuffer.ready && io.ibuffer.bits.wb 26 | 27 | // Release a register when writeback to a register is complete (and it is the last instruction of a packet) 28 | val releaseReg = io.writeback.valid && io.writeback.ready && io.writeback.bits.eop 29 | 30 | // Update `inuseRegsCurrent` with `reserveReg` and `releaseReg` 31 | for (i <- 0 until numWarps) { 32 | for (j <- 0 until numRegs) { 33 | inuseRegsCurrent(i)(j) := inuseRegs(i)(j) 34 | when(reserveReg && io.ibuffer.bits.wid === i.U && io.ibuffer.bits.rd === j.U) { 35 | inuseRegsCurrent(i)(j) := true.B 36 | } 37 | when(releaseReg && io.writeback.bits.wid === i.U && io.writeback.bits.rd === j.U) { 38 | inuseRegsCurrent(i)(j) := false.B 39 | } 40 | } 41 | } 42 | 43 | // Update `inuseRegs` with `inuseRegsCurrent` on rising edge of clock 44 | inuseRegs := inuseRegsCurrent 45 | 46 | // Check if the requested registers are free 47 | val deqInuseRd = RegInit(false.B) 48 | val deqInuseRs1 = RegInit(false.B) 49 | val deqInuseRs2 = RegInit(false.B) 50 | 51 | deqInuseRd := inuseRegsCurrent(io.ibuffer.bits.wid)(io.ibuffer.bits.rd) 52 | deqInuseRs1 := inuseRegsCurrent(io.ibuffer.bits.wid)(io.ibuffer.bits.rs1) 53 | deqInuseRs2 := inuseRegsCurrent(io.ibuffer.bits.wid)(io.ibuffer.bits.rs2) 54 | 55 | io.writeback.ready := true.B 56 | io.ibuffer.ready := !(deqInuseRd || deqInuseRs1 || deqInuseRs2) 57 | 58 | // Check and assert if any deadlock is detected 59 | val deadlockCtr = RegInit(0.U(32.W)) 60 | val deadlockTimeout = 100000.U 61 | 62 | when(io.ibuffer.valid && !io.ibuffer.ready) { 63 | deadlockCtr := deadlockCtr + 1.U 64 | assert( 65 | deadlockCtr < deadlockTimeout, 66 | cf"Deadlock detected - PC: 0x${Hexadecimal(io.ibuffer.bits.pc)}, wid: ${io.ibuffer.bits.wid}, rd: ${io.ibuffer.bits.rd}" 67 | ) 68 | }.elsewhen(io.ibuffer.valid && io.ibuffer.ready) { 69 | deadlockCtr := 0.U 70 | }.elsewhen(io.writeback.valid && io.writeback.ready && io.writeback.bits.eop) { 71 | assert( 72 | inuseRegs(io.writeback.bits.wid)(io.writeback.bits.rd), 73 | cf"Invalid writeback register - PC: 0x${Hexadecimal(io.writeback.bits.pc)}, wid: 
${io.writeback.bits.wid}, rd: ${io.writeback.bits.rd}" 74 | ) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/core/TLB.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import chisel3.experimental.SourceInfo 6 | import freechips.rocketchip.rocket.{ 7 | isAMOArithmetic, 8 | isAMOLogical, 9 | isRead, 10 | isWrite, 11 | M_FLUSH_ALL, 12 | M_PWR, 13 | M_SZ, 14 | M_WOK, 15 | M_XLR, 16 | M_XSC, 17 | PRV 18 | } 19 | import freechips.rocketchip.util._ 20 | 21 | case class TLBParameter( 22 | nSets: Int, 23 | nWays: Int, 24 | paddrBits: Int, 25 | vaddrBits: Int, 26 | xLen: Int = 32, 27 | pgIdxBits: Int = 12, 28 | minPgLevels: Int = 2, 29 | pgLevelBits: Int = 9, 30 | nSectors: Int = 4, 31 | pgLevels: Int = 3) { 32 | 33 | def ppnBits: Int = paddrBits - pgIdxBits 34 | def vpnBits: Int = vaddrBits - pgIdxBits 35 | 36 | } 37 | 38 | class SFenceReq(vaddrBits: Int) extends Bundle { 39 | val rs1 = Bool() 40 | val rs2 = Bool() 41 | val addr = UInt(vaddrBits.W) 42 | val asid = UInt(16.W) 43 | } 44 | 45 | class TLBReq(lgMaxSize: Int, vaddrBits: Int) extends Bundle { 46 | 47 | /** request address from CPU. */ 48 | val vaddr = UInt(vaddrBits.W) 49 | 50 | /** don't lookup TLB, bypass vaddr as paddr */ 51 | val passthrough = Bool() 52 | 53 | /** granularity */ 54 | val size = UInt(log2Ceil(lgMaxSize + 1).W) 55 | 56 | /** memory command. */ 57 | val cmd = Bits(M_SZ.W) 58 | val prv = UInt(PRV.SZ.W) 59 | 60 | /** virtualization mode */ 61 | // val v = Bool() 62 | 63 | } 64 | 65 | class TLBExceptions extends Bundle { 66 | val ld = Bool() 67 | val st = Bool() 68 | val inst = Bool() 69 | } 70 | 71 | class TLBResp(paddrBits: Int, vaddrBits: Int) extends Bundle { 72 | // lookup responses 73 | val miss = Bool() 74 | 75 | /** physical address */ 76 | val paddr = UInt(paddrBits.W) 77 | 78 | /** page fault exception */ 79 | val pf = new TLBExceptions 80 | 81 | /** access exception */ 82 | val ae = new TLBExceptions 83 | 84 | /** misaligned access exception */ 85 | val ma = new TLBExceptions 86 | } 87 | 88 | class TLBEntryData(tlbParam: TLBParameter) extends Bundle { 89 | val ppn = UInt(tlbParam.ppnBits.W) 90 | 91 | /** pte.u user */ 92 | val u = Bool() 93 | 94 | /** access exception. D$ -> PTW -> TLB AE Alignment failed. 
95 | */ 96 | val ae_ptw = Bool() 97 | val ae_final = Bool() 98 | 99 | /** page fault */ 100 | val pf = Bool() 101 | 102 | /** prot_w */ 103 | val pw = Bool() 104 | 105 | /** prot_x */ 106 | val px = Bool() 107 | 108 | /** prot_r */ 109 | val pr = Bool() 110 | 111 | } 112 | 113 | /** basic cell for TLB data */ 114 | class TLBEntry(val tlbParam: TLBParameter) extends Bundle { 115 | 116 | val level = UInt(log2Ceil(tlbParam.pgLevels).W) 117 | 118 | /** use vpn as tag */ 119 | val tag_vpn = UInt(tlbParam.vpnBits.W) 120 | 121 | val tag_asid = Vec(tlbParam.nSectors, UInt(16.W)) 122 | 123 | /** entry data */ 124 | val data = Vec(tlbParam.nSectors, UInt(new TLBEntryData(tlbParam).getWidth.W)) 125 | 126 | /** valid bit */ 127 | val valid = Vec(tlbParam.nSectors, Bool()) 128 | 129 | /** returns all entry data in this entry */ 130 | def entry_data = data.map(_.asTypeOf(new TLBEntryData(tlbParam))) 131 | 132 | /** returns the index of sector */ 133 | private def sectorIdx(vpn: UInt) = vpn.extract(tlbParam.nSectors.log2 - 1, 0) 134 | 135 | /** returns the entry data matched with this vpn */ 136 | def getData(vpn: UInt) = OptimizationBarrier(data(sectorIdx(vpn)).asTypeOf(new TLBEntryData(tlbParam))) 137 | 138 | /** returns whether a sector hits */ 139 | def sectorHit(vpn: UInt) = valid.orR && sectorTagMatch(vpn) 140 | 141 | /** returns whether tag matches vpn */ 142 | def sectorTagMatch(vpn: UInt) = 143 | (((tag_vpn ^ vpn) >> tlbParam.nSectors.log2) === 0.U) 144 | 145 | /** returns hit signal */ 146 | def hit(vpn: UInt, asid: UInt): Bool = { 147 | val idx = sectorIdx(vpn) 148 | (tag_asid(idx) === asid) && valid(idx) && sectorTagMatch(vpn) 149 | } 150 | 151 | /** returns the ppn of the input TLBEntryData */ 152 | def ppn(data: TLBEntryData) = { 153 | data.ppn 154 | } 155 | 156 | /** does the refill 157 | * 158 | * find the target entry with vpn tag and replace the target entry with the input entry data 159 | */ 160 | def insert(vpn: UInt, asid: UInt, level: UInt, entry: TLBEntryData): Unit = { 161 | this.tag_vpn := vpn 162 | this.level := level.extract(log2Ceil(tlbParam.pgLevels) - 1, 0) 163 | 164 | val idx = sectorIdx(vpn) 165 | valid(idx) := true.B 166 | data(idx) := entry.asUInt 167 | tag_asid(idx) := asid 168 | } 169 | 170 | def invalidate(): Unit = { valid.foreach(_ := false.B) } 171 | def invalidate(asid: UInt): Unit = { 172 | for (((v, id), e) <- valid.zip(tag_asid).zip(entry_data)) 173 | when(id === asid) { v := false.B } 174 | } 175 | def invalidateVPN(vpn: UInt, asid: UInt): Unit = { 176 | when(sectorTagMatch(vpn)) { 177 | for ((((v, id), e), i) <- (valid.zip(tag_asid).zip(entry_data)).zipWithIndex) 178 | when(id === asid && i.U === sectorIdx(vpn)) { v := false.B } 179 | } 180 | } 181 | def invalidateNonGlobal(asid: UInt): Unit = { 182 | for (((v, id), e) <- valid.zip(tag_asid).zip(entry_data)) 183 | when(id === asid) { v := false.B } 184 | } 185 | } 186 | 187 | /** =Overview= 188 | * [[TLB]] is a TLB template. 189 | * 190 | * TLB caches PTE and accelerates the address translation process. When tlb miss happens, ask PTW(L2TLB) for Page Table 191 | * Walk. 
192 | * 193 | * ==Cache Structure== 194 | * - Sectored Entry (PTE) 195 | * - set-associative or direct-mapped 196 | * - nsets = [[nSets]] 197 | * - nways = [[nWays]] / [[nSectors]] 198 | * - PTEEntry( sectors = [[nSectors]] ) 199 | * - LRU(if set-associative) 200 | * 201 | * ==Address structure== 202 | * {{{ 203 | * |vaddr | 204 | * |ppn/vpn | pgIndex | 205 | * | | | 206 | * | |nSets |nSector | | 207 | * }}} 208 | * 209 | * ==State Machine== 210 | * {{{ 211 | * s_ready: ready to accept request from EXE. 212 | * s_request: when L1TLB(this) miss, send request to PTW(L2TLB), . 213 | * s_wait: wait for PTW to refill L1TLB. 214 | * s_wait_invalidate: L1TLB is waiting for respond from PTW, but L1TLB will invalidate respond from PTW. 215 | * }}} 216 | * 217 | * ==Note== 218 | * Boom use Rocket ITLB, and its own DTLB. 219 | * 220 | * Accelerators:{{{ sha3: DTLB gemmini: DTLB hwacha: DTLB*2+ITLB}}} 221 | * @param instruction 222 | * true for ITLB, false for DTLB 223 | * @param lgMaxSize 224 | * \@todo seems granularity 225 | * @param cfg 226 | * [[TLBConfig]] 227 | * @param edge 228 | * collect SoC metadata. 229 | */ 230 | class TLB( 231 | instruction: Boolean, 232 | cfg: TLBParameter) //( 233 | // implicit edge: TLEdgeOut, 234 | // p: Parameters) 235 | extends Module { 236 | val io = IO(new Bundle { 237 | 238 | /** request from Core */ 239 | val req = Flipped(Decoupled(new TLBReq(cfg.xLen / 8, cfg.vaddrBits))) 240 | 241 | /** response to Core */ 242 | val resp = Output(new TLBResp(cfg.paddrBits, cfg.vaddrBits)) 243 | 244 | /** SFence Input */ 245 | val sfence = Flipped(Valid(new SFenceReq(cfg.vaddrBits))) 246 | 247 | /** IO to PTW */ 248 | val ptw = new TLBPTWIO(cfg.vpnBits, cfg.vaddrBits, cfg.pgLevels) 249 | 250 | /** suppress a TLB refill, one cycle after a miss */ 251 | val kill = Input(Bool()) 252 | }) 253 | 254 | val usingAtomicsInCache = true 255 | val usingAtomics = true 256 | val vpn = io.req.bits.vaddr(cfg.vaddrBits - 1, cfg.pgIdxBits) 257 | 258 | /** index for sectored_Entry */ 259 | val memIdx = vpn.extract(cfg.nSectors.log2 + cfg.nSets.log2 - 1, cfg.nSectors.log2) 260 | 261 | /** TLB Entry */ 262 | val sectored_entries = Reg(Vec(cfg.nSets, Vec(cfg.nWays / cfg.nSectors, new TLBEntry(cfg)))) 263 | def ordinary_entries = sectored_entries(memIdx) 264 | def all_entries = ordinary_entries 265 | def all_real_entries = sectored_entries.flatten 266 | 267 | val s_ready :: s_request :: s_wait :: s_wait_invalidate :: Nil = Enum(4) 268 | val state = RegInit(s_ready) 269 | // use vpn as refill_tag 270 | val r_refill_tag = Reg(UInt(cfg.vpnBits.W)) 271 | val r_sectored_repl_addr = Reg(UInt(log2Ceil(sectored_entries.head.size).W)) 272 | val r_sectored_hit = Reg(Valid(UInt(log2Ceil(sectored_entries.head.size).W))) 273 | 274 | /** privilege mode */ 275 | val priv = io.req.bits.prv 276 | val priv_v = false.B 277 | val priv_s = priv(0) 278 | // user mode and supervisor mode 279 | val priv_uses_vm = priv <= PRV.S.U 280 | val satp = io.ptw.ptbr 281 | val asid = satp.asid 282 | val stage1_en = satp.mode(satp.mode.getWidth - 1) 283 | 284 | /** Enable Virtual Memory when: 285 | * 1. statically configured 286 | * 1. satp highest bits enabled 287 | * i. RV32: 288 | * - 0 -> Bare 289 | * - 1 -> SV32 290 | * i. RV64: 291 | * - 0000 -> Bare 292 | * - 1000 -> SV39 293 | * - 1001 -> SV48 294 | * - 1010 -> SV57 295 | * - 1011 -> SV64 296 | * 1. In virtualization mode, vsatp highest bits enabled 297 | * 1. priv mode in U and S. 298 | * 1. in H & M mode, disable VM. 299 | * 1. no passthrough(micro-arch defined.) 
300 | * 301 | * @see 302 | * RV-priv spec 4.1.11 Supervisor Address Translation and Protection (satp) Register 303 | * @see 304 | * RV-priv spec 8.2.18 Virtual Supervisor Address Translation and Protection Register (vsatp) 305 | */ 306 | val vm_enabled = stage1_en && priv_uses_vm && !io.req.bits.passthrough 307 | 308 | // share a single physical memory attribute checker (unshare if critical path) 309 | val refill_ppn = io.ptw.resp.bits.pte.ppn(cfg.ppnBits - 1, 0) 310 | 311 | /** refill signal */ 312 | val do_refill = io.ptw.resp.valid 313 | 314 | /** sfence invalidate refill */ 315 | val invalidate_refill = state.isOneOf(s_request /* don't care */, s_wait_invalidate) || io.sfence.valid 316 | 317 | val mpu_ppn = refill_ppn 318 | val mpu_physaddr = Cat(mpu_ppn, io.req.bits.vaddr(cfg.pgIdxBits - 1, 0)) 319 | // PMA 320 | // check exist a slave can consume this address. 321 | // val legal_address = edge.manager.findSafe(mpu_physaddr).reduce(_ || _) 322 | // check utility to help check SoC property. 323 | // def fastCheck(member: TLManagerParameters => Boolean) = 324 | // legal_address && edge.manager.fastProperty(mpu_physaddr, member, (b: Boolean) => b.B) 325 | 326 | // val cacheable = fastCheck(_.supportsAcquireB) && (instruction).B 327 | // val cacheable = (instruction).B 328 | 329 | val homogeneous = false.B 330 | // TLBPageLookup(edge.manager.managers, cfg.xLen, cfg.CacheBlockBytes, BigInt(1) << cfg.pgIdxBits)( 331 | // mpu_physaddr 332 | // ).homogeneous 333 | val prot_r = true.B // fastCheck(_.supportsGet) 334 | val prot_w = true.B // fastCheck(_.supportsPutFull) 335 | val prot_pp = true.B // fastCheck(_.supportsPutPartial) 336 | val prot_al = true.B // fastCheck(_.supportsLogical) 337 | val prot_aa = true.B // fastCheck(_.supportsArithmetic) 338 | val prot_x = true.B // fastCheck(_.executable) 339 | val prot_eff = true.B // fastCheck(Seq(RegionType.PUT_EFFECTS, RegionType.GET_EFFECTS) contains _.regionType) 340 | 341 | // hit check 342 | val sector_hits = sectored_entries(memIdx).map(_.sectorHit(vpn)) 343 | val hitsVec = all_entries.map(vm_enabled && _.hit(vpn, asid)) 344 | val real_hits = hitsVec.asUInt 345 | val hits = Cat(!vm_enabled, real_hits) 346 | 347 | // use ptw response to refill 348 | // permission bit arrays 349 | when(do_refill) { 350 | val pte = io.ptw.resp.bits.pte 351 | // val refill_v = r_vstage1_en || r_stage2_en 352 | // val asid 353 | val newEntry = Wire(new TLBEntryData(cfg)) 354 | newEntry.ppn := pte.ppn 355 | // newEntry.c := cacheable 356 | newEntry.u := pte.u 357 | // newEntry.g := pte.g && pte.v 358 | newEntry.ae_ptw := io.ptw.resp.bits.ae_ptw 359 | newEntry.ae_final := io.ptw.resp.bits.ae_final 360 | newEntry.pf := io.ptw.resp.bits.pf 361 | newEntry.pr := prot_r 362 | newEntry.pw := prot_w 363 | newEntry.px := prot_x 364 | // newEntry.ppp := prot_pp 365 | // newEntry.pal := prot_al 366 | // newEntry.paa := prot_aa 367 | // newEntry.eff := prot_eff 368 | // refill sectored_hit 369 | val r_memIdx = r_refill_tag.extract(cfg.nSectors.log2 + cfg.nSets.log2 - 1, cfg.nSectors.log2) 370 | val waddr = Mux(r_sectored_hit.valid, r_sectored_hit.bits, r_sectored_repl_addr) 371 | for ((e, i) <- sectored_entries(r_memIdx).zipWithIndex) when(waddr === i.U) { 372 | when(!r_sectored_hit.valid) { e.invalidate() } 373 | e.insert(r_refill_tag, asid, 0.U, newEntry) 374 | when(invalidate_refill) { e.invalidate() } 375 | } 376 | } 377 | 378 | // get all entries data. 
379 | val entries = all_entries.map(_.getData(vpn)) 380 | val normal_entries = entries.take(ordinary_entries.size) 381 | // parallel query PPN from [[all_entries]], if VM not enabled return VPN instead 382 | val ppn = Mux1H( 383 | hitsVec :+ !vm_enabled, 384 | (all_entries.zip(entries)).map { case (entry, data) => entry.ppn(data) } :+ vpn(cfg.ppnBits - 1, 0) 385 | ) 386 | 387 | val nPhysicalEntries = 1 388 | // generally PTW misaligned load exception. 389 | val ptw_ae_array = Cat(false.B, entries.map(_.ae_ptw).asUInt) 390 | val final_ae_array = Cat(false.B, entries.map(_.ae_final).asUInt) 391 | val ptw_pf_array = Cat(false.B, entries.map(_.pf).asUInt) 392 | val sum = io.ptw.status.sum 393 | // if in hypervisor/machine mode, cannot read/write user entries. 394 | // if in superviosr/user mode, "If the SUM bit in the sstatus register is set, supervisor mode software may also access pages with U=1.(from spec)" 395 | val priv_rw_ok = entries.map(_.u).asUInt 396 | // if in hypervisor/machine mode, other than user pages, all pages are executable. 397 | // if in superviosr/user mode, only user page can execute. 398 | val priv_x_ok = entries.map(_.u).asUInt 399 | val mxr = io.ptw.status.mxr 400 | // "The vsstatus field MXR, which makes execute-only pages readable, only overrides VS-stage page protection.(from spec)" 401 | // val r_array = 402 | // Cat(true.B, (priv_rw_ok & (entries.map(_.sr).asUInt | Mux(mxr, entries.map(_.sx).asUInt, 0.U)))) 403 | // These array is for each TLB entries. 404 | // user mode can read: PMA OK, TLB OK, AE OK 405 | val pr_array = Cat(Fill(nPhysicalEntries, prot_r), normal_entries.map(_.pr).asUInt) & ~(ptw_ae_array | final_ae_array) 406 | // user mode can write: PMA OK, TLB OK, AE OK 407 | val pw_array = Cat(Fill(nPhysicalEntries, prot_w), normal_entries.map(_.pw).asUInt) & ~(ptw_ae_array | final_ae_array) 408 | // user mode can write: PMA OK, TLB OK, AE OK 409 | val px_array = Cat(Fill(nPhysicalEntries, prot_x), normal_entries.map(_.px).asUInt) & ~(ptw_ae_array | final_ae_array) 410 | // put effect 411 | // val eff_array = Cat(Fill(nPhysicalEntries, prot_eff), normal_entries.map(_.eff).asUInt) 412 | // cacheable 413 | // val c_array = Cat(Fill(nPhysicalEntries, cacheable), normal_entries.map(_.c).asUInt) 414 | // put partial 415 | // val ppp_array = Cat(Fill(nPhysicalEntries, prot_pp), normal_entries.map(_.ppp).asUInt) 416 | // // atomic arithmetic 417 | // val paa_array = Cat(Fill(nPhysicalEntries, prot_aa), normal_entries.map(_.paa).asUInt) 418 | // // atomic logic 419 | // val pal_array = Cat(Fill(nPhysicalEntries, prot_al), normal_entries.map(_.pal).asUInt) 420 | // val ppp_array_if_cached = ppp_array // | c_array 421 | // val paa_array_if_cached = paa_array // | (if (usingAtomicsInCache) c_array else 0.U) 422 | // val pal_array_if_cached = pal_array // | (if (usingAtomicsInCache) c_array else 0.U) 423 | 424 | // vaddr misaligned: vaddr[1:0]=b00 425 | val misaligned = (io.req.bits.vaddr & (UIntToOH(io.req.bits.size) - 1.U)).orR 426 | // def badVA(): Bool = { 427 | // val additionalPgLevels = satp.additionalPgLevels 428 | // val signed = 1 429 | // val nPgLevelChoices = cfg.pgLevels - cfg.minPgLevels + 1 430 | // val minVAddrBits = cfg.pgIdxBits + cfg.minPgLevels * cfg.pgLevelBits 431 | // (for (i <- 0 until nPgLevelChoices) yield { 432 | // val mask = 433 | // ((BigInt(1) << cfg.vaddrBits) - (BigInt(1) << (minVAddrBits + i * cfg.pgLevelBits - signed.toInt))).U 434 | // val maskedVAddr = io.req.bits.vaddr & mask 435 | // additionalPgLevels === i.U && !(maskedVAddr 
=== 0.U || signed.B && maskedVAddr === mask) 436 | // }).orR 437 | // } 438 | val bad_gpa = false.B 439 | val bad_va = false.B 440 | 441 | val cmd_lrsc = usingAtomics.B && io.req.bits.cmd.isOneOf(M_XLR, M_XSC) 442 | val cmd_amo_logical = usingAtomics.B && isAMOLogical(io.req.bits.cmd) 443 | val cmd_amo_arithmetic = usingAtomics.B && isAMOArithmetic(io.req.bits.cmd) 444 | val cmd_put_partial = io.req.bits.cmd === M_PWR 445 | val cmd_read = isRead(io.req.bits.cmd) 446 | val cmd_readx = false.B 447 | val cmd_write = isWrite(io.req.bits.cmd) 448 | val cmd_write_perms = cmd_write || 449 | io.req.bits.cmd.isOneOf(M_FLUSH_ALL, M_WOK) // not a write, but needs write permissions 450 | 451 | // val lrscAllowed = Mux((usingDataScratchpad || usingAtomicsOnlyForIO).B, 0.U, c_array) 452 | val lrscAllowed = 0.U 453 | val ae_array = 454 | // Mux(misaligned, eff_array, 0.U) | 455 | Mux(cmd_lrsc, ~lrscAllowed, 0.U) 456 | 457 | // access exception needs SoC information from PMA 458 | val ae_ld_array = Mux(cmd_read, ae_array | ~pr_array, 0.U) 459 | val ae_st_array = 460 | Mux(cmd_write_perms, ae_array | ~pw_array, 0.U) // | 461 | // Mux(cmd_put_partial, ~ppp_array_if_cached, 0.U) | 462 | // Mux(cmd_amo_logical, ~pal_array_if_cached, 0.U) | 463 | // Mux(cmd_amo_arithmetic, ~paa_array_if_cached, 0.U) 464 | // val must_alloc_array = 465 | // Mux(cmd_put_partial, ~ppp_array, 0.U) | 466 | // Mux(cmd_amo_logical, ~pal_array, 0.U) | 467 | // Mux(cmd_amo_arithmetic, ~paa_array, 0.U) | 468 | // Mux(cmd_lrsc, ~0.U(pal_array.getWidth.W), 0.U) 469 | val pf_ld_array = 470 | Mux(cmd_read, (ptw_ae_array | ptw_pf_array), 0.U) 471 | val pf_st_array = Mux(cmd_write_perms, (ptw_ae_array | ptw_pf_array), 0.U) 472 | val pf_inst_array = (ptw_ae_array | ptw_pf_array) 473 | 474 | val tlb_hit_if_not_gpa_miss = real_hits.orR 475 | val tlb_hit = real_hits.orR 476 | // leads to s_request 477 | val tlb_miss = vm_enabled && !tlb_hit 478 | 479 | val sectored_plru = new SetAssocLRU(cfg.nSets, sectored_entries.head.size, "plru") 480 | when(io.req.valid && vm_enabled) { 481 | // replace 482 | when(sector_hits.orR) { sectored_plru.access(memIdx, OHToUInt(sector_hits)) } 483 | } 484 | 485 | // Superpages create the possibility that two entries in the TLB may match. 486 | // This corresponds to a software bug, but we can't return complete garbage; 487 | // we must return either the old translation or the new translation. This 488 | // isn't compatible with the Mux1H approach. So, flush the TLB and report 489 | // a miss on duplicate entries. 490 | val multipleHits = PopCountAtLeast(real_hits, 2) 491 | 492 | // only pull up req.ready when this is s_ready state. 
493 | io.req.ready := state === s_ready 494 | // page fault 495 | io.resp.pf.ld := (bad_va && cmd_read) || (pf_ld_array & hits).orR 496 | io.resp.pf.st := (bad_va && cmd_write_perms) || (pf_st_array & hits).orR 497 | io.resp.pf.inst := bad_va || (pf_inst_array & hits).orR 498 | // access exception 499 | io.resp.ae.ld := (ae_ld_array & hits).orR 500 | io.resp.ae.st := (ae_st_array & hits).orR 501 | io.resp.ae.inst := (~px_array & hits).orR 502 | // misaligned 503 | io.resp.ma.ld := misaligned && cmd_read 504 | io.resp.ma.st := misaligned && cmd_write 505 | io.resp.ma.inst := false.B // this is up to the pipeline to figure out 506 | // io.resp.cacheable := (c_array & hits).orR 507 | // io.resp.must_alloc := (must_alloc_array & hits).orR 508 | // io.resp.prefetchable := (prefetchable_array & hits).orR // && edge.manager.managers 509 | // .forall(m => !m.supportsAcquireB || m.supportsHint) 510 | // .B 511 | io.resp.miss := do_refill || tlb_miss || multipleHits 512 | io.resp.paddr := Cat(ppn, io.req.bits.vaddr(cfg.pgIdxBits - 1, 0)) 513 | 514 | io.ptw.req.valid := state === s_request 515 | io.ptw.req.bits.valid := !io.kill 516 | io.ptw.req.bits.bits.addr := r_refill_tag 517 | io.ptw.req.bits.bits.vstage1 := false.B 518 | io.ptw.req.bits.bits.stage2 := false.B 519 | 520 | val sfence = io.sfence.valid 521 | // this is [[s_ready]] 522 | // handle miss/hit at the first cycle. 523 | // if miss, request PTW(L2TLB). 524 | when(io.req.fire && tlb_miss) { 525 | state := s_request 526 | r_refill_tag := vpn 527 | r_sectored_repl_addr := replacementEntry(sectored_entries(memIdx), sectored_plru.way(memIdx)) 528 | r_sectored_hit.valid := sector_hits.orR 529 | r_sectored_hit.bits := OHToUInt(sector_hits) 530 | } 531 | // Handle SFENCE.VMA when send request to PTW. 532 | // SFENCE.VMA io.ptw.req.ready kill 533 | // ? ? 1 534 | // 0 0 0 535 | // 0 1 0 -> s_wait 536 | // 1 0 0 -> s_wait_invalidate 537 | // 1 0 0 -> s_ready 538 | when(state === s_request) { 539 | // SFENCE.VMA will kill TLB entries based on rs1 and rs2. It will take 1 cycle. 540 | when(sfence) { state := s_ready } 541 | // here should be io.ptw.req.fire, but assert(io.ptw.req.ready === true.B) 542 | // fire -> s_wait 543 | when(io.ptw.req.ready) { state := Mux(sfence, s_wait_invalidate, s_wait) } 544 | // If CPU kills request(frontend.s2_redirect) 545 | when(io.kill) { state := s_ready } 546 | } 547 | // sfence in refill will results in invalidate 548 | when(state === s_wait && sfence) { 549 | state := s_wait_invalidate 550 | } 551 | // after CPU acquire response, go back to s_ready. 552 | when(io.ptw.resp.valid) { 553 | state := s_ready 554 | } 555 | 556 | // SFENCE processing logic. 
557 | when(sfence) { 558 | assert(!io.sfence.bits.rs1 || (io.sfence.bits.addr >> cfg.pgIdxBits) === vpn) 559 | for (e <- all_real_entries) { 560 | when(io.sfence.bits.rs1) { e.invalidateVPN(vpn, asid) } 561 | .elsewhen(io.sfence.bits.rs2) { e.invalidateNonGlobal(asid) } 562 | .otherwise { e.invalidate(asid) } 563 | } 564 | } 565 | 566 | when(multipleHits || reset.asBool) { 567 | all_real_entries.foreach(_.invalidate()) 568 | } 569 | 570 | ccover(io.ptw.req.fire, "MISS", "TLB miss") 571 | ccover(io.ptw.req.valid && !io.ptw.req.ready, "PTW_STALL", "TLB miss, but PTW busy") 572 | ccover(state === s_wait_invalidate, "SFENCE_DURING_REFILL", "flush TLB during TLB refill") 573 | ccover(sfence && !io.sfence.bits.rs1 && !io.sfence.bits.rs2, "SFENCE_ALL", "flush TLB") 574 | ccover(sfence && !io.sfence.bits.rs1 && io.sfence.bits.rs2, "SFENCE_ASID", "flush TLB ASID") 575 | ccover(sfence && io.sfence.bits.rs1 && !io.sfence.bits.rs2, "SFENCE_LINE", "flush TLB line") 576 | ccover(sfence && io.sfence.bits.rs1 && io.sfence.bits.rs2, "SFENCE_LINE_ASID", "flush TLB line/ASID") 577 | ccover(multipleHits, "MULTIPLE_HITS", "Two matching translations in TLB") 578 | 579 | def ccover( 580 | cond: Bool, 581 | label: String, 582 | desc: String 583 | )( 584 | implicit sourceInfo: SourceInfo 585 | ) = 586 | property.cover(cond, s"${if (instruction) "I" else "D"}TLB_$label", "MemorySystem;;" + desc) 587 | 588 | /** Decides which entry to be replaced 589 | * 590 | * If there is a invalid entry, replace it with priorityencoder; if not, replace the alt entry 591 | * 592 | * @return 593 | * mask for TLBEntry replacement 594 | */ 595 | def replacementEntry(set: Seq[TLBEntry], alt: UInt) = { 596 | val valids = set.map(_.valid.orR).asUInt 597 | Mux(valids.andR, alt, PriorityEncoder(~valids)) 598 | } 599 | } 600 | -------------------------------------------------------------------------------- /src/main/scala/core/VGPR.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.lib._ 7 | import ogpu.config._ 8 | 9 | class VGPR( 10 | implicit p: Parameters) 11 | extends Module() { 12 | val numWarps = p(WarpNum) 13 | val numRegs = p(RegNum) 14 | val numThreads = p(ThreadNum) 15 | val xLen = p(XLen) 16 | 17 | val io = IO(new Bundle { 18 | val writeback = Flipped(DecoupledIO(new CommitVData())) 19 | val writeback_cmd = Flipped(DecoupledIO(new CommitVData())) 20 | val read_req = Flipped(new ReadGPRReq()) 21 | val read_rsp = new ReadVGPRRsp() 22 | }) 23 | 24 | val gpr_ram = VecInit(Seq.fill(numWarps)((Module(new MaskedSmem_2R1W(xLen, numRegs, numThreads)).io))) 25 | val raddr_reg = RegInit(0.U) 26 | val raddr2_reg = RegInit(0.U) 27 | val rwid_reg = RegInit(0.U(log2Ceil(numWarps).W)) 28 | val ready_reg = RegInit(0.B) 29 | val cmd_ready_reg = RegInit(0.B) 30 | 31 | val need_forward1 = RegInit(0.B) 32 | val need_forward2 = RegInit(0.B) 33 | val forward_data = RegInit(0.U.asTypeOf(Vec(numThreads, UInt(xLen.W)))) 34 | need_forward1 := io.writeback.bits.rd === io.read_req.rs1 && io.writeback.bits.wid === io.read_req.wid 35 | need_forward2 := io.writeback.bits.rd === io.read_req.rs2 && io.writeback.bits.wid === io.read_req.wid 36 | forward_data := io.writeback_cmd.bits.data 37 | raddr_reg := io.read_req.rs1 38 | raddr2_reg := io.read_req.rs2 39 | rwid_reg := io.read_req.wid 40 | 41 | io.writeback.ready := ready_reg 42 | io.writeback_cmd.ready := cmd_ready_reg 43 | 44 | for 
(i <- 0 until numWarps) { 45 | // init 46 | gpr_ram(i).write_en := 0.B 47 | gpr_ram(i).waddr := 0.U 48 | gpr_ram(i).raddr := io.read_req.rs1 49 | gpr_ram(i).raddr2 := io.read_req.rs2 50 | gpr_ram(i).mask := 0.U.asTypeOf(io.writeback.bits.mask) 51 | gpr_ram(i).dataIn := 0.U.asTypeOf(io.writeback.bits.data) 52 | 53 | when(io.writeback_cmd.valid && i.U === io.writeback_cmd.bits.wid) { 54 | gpr_ram(i).write_en := io.writeback_cmd.valid 55 | gpr_ram(i).waddr := io.writeback_cmd.bits.rd 56 | gpr_ram(i).mask := io.writeback_cmd.bits.mask 57 | gpr_ram(i).dataIn := io.writeback_cmd.bits.data 58 | }.elsewhen(io.writeback.valid && i.U === io.writeback.bits.wid) { 59 | gpr_ram(i).write_en := io.writeback.valid 60 | gpr_ram(i).waddr := io.writeback.bits.rd 61 | gpr_ram(i).mask := io.writeback.bits.mask 62 | gpr_ram(i).dataIn := io.writeback.bits.data 63 | } 64 | 65 | } 66 | 67 | ready_reg := 0.B 68 | cmd_ready_reg := 0.B 69 | when(io.writeback_cmd.valid) { 70 | cmd_ready_reg := 1.B 71 | }.elsewhen(io.writeback.valid) { 72 | ready_reg := 1.B 73 | } 74 | 75 | io.read_rsp.rs1_data := Mux( 76 | raddr_reg === 0.U, 77 | 0.U.asTypeOf(gpr_ram(0).dataOut), 78 | Mux(need_forward1, forward_data, gpr_ram(rwid_reg).dataOut) 79 | ) 80 | io.read_rsp.rs2_data := Mux( 81 | raddr2_reg === 0.U, 82 | 0.U.asTypeOf(gpr_ram(0).dataOut), 83 | Mux(need_forward2, forward_data, gpr_ram(rwid_reg).dataOut2) 84 | ) 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/core/VectorALU.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | 8 | class VectorALU( 9 | implicit p: Parameters) 10 | extends Module { 11 | val numThread = p(ThreadNum) 12 | val io = IO(new Bundle { 13 | val in = Flipped(DecoupledIO(new VALUData())) 14 | val out = DecoupledIO(new CommitVData()) 15 | val branch_data = DecoupledIO(new BranchData()) 16 | }) 17 | 18 | val alu = VecInit(Seq.fill(numThread)((Module(new ScalarALU()).io))) 19 | 20 | val result = Module(new Queue(new CommitVData(), 1, pipe = true)) 21 | val branch_result = Module(new Queue(new BranchData(), 1, pipe = true)) 22 | 23 | for (x <- 0 until numThread) { 24 | alu(x).in1 := io.in.bits.op1(x) 25 | alu(x).in2 := io.in.bits.op2(x) 26 | alu(x).fn := io.in.bits.func 27 | result.io.enq.bits.data(x) := alu(x).out 28 | result.io.enq.bits.mask(x) := io.in.bits.mask(x) 29 | branch_result.io.enq.bits.mask(x) := alu(x).cmp_out & io.in.bits.mask(x) 30 | } 31 | 32 | branch_result.io.enq.bits.branch := io.in.bits.branch 33 | branch_result.io.enq.bits.wid := io.in.bits.wid 34 | branch_result.io.enq.bits.pc := io.in.bits.pc 35 | branch_result.io.enq.bits.orig_mask := io.in.bits.mask 36 | branch_result.io.enq.bits.imm := io.in.bits.imm 37 | branch_result.io.enq.bits.rs1_data := io.in.bits.rs1_data 38 | 39 | io.in.ready := result.io.enq.ready && branch_result.io.enq.ready 40 | 41 | result.io.enq.valid := io.in.valid 42 | result.io.enq.bits.wid := io.in.bits.wid 43 | result.io.enq.bits.pc := io.in.bits.pc 44 | result.io.enq.bits.rd := io.in.bits.rd 45 | result.io.enq.bits.eop := 1.B 46 | 47 | val is_branch = io.in.bits.branch.jal | io.in.bits.branch.jalr | io.in.bits.branch.branch 48 | branch_result.io.enq.valid := io.in.valid && is_branch 49 | 50 | io.out <> result.io.deq 51 | io.branch_data <> branch_result.io.deq 52 | } 53 | 54 | // object VectorALURTL extends App { 55 | // 
implicit val p = new CoreConfig 56 | // emitVerilog(new VectorALU(), Array("--target-dir", "generated")) 57 | // } 58 | // 59 | // object VectorALUFIR extends App { 60 | // // ChiselStage.emitFirrtl(new VectorALU()) 61 | // implicit val p = new CoreConfig 62 | // ChiselStage.emitCHIRRTL(new VectorALU()) 63 | // } 64 | 65 | // object VectorALUGraph extends App { 66 | // (new ChiselStage).emitGraphML(new VectorALU() , Array("--target-dir", "graphs")) 67 | // } 68 | -------------------------------------------------------------------------------- /src/main/scala/core/WarpScheduler.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import ogpu.config._ 7 | import ogpu.tile._ 8 | 9 | class VGPRWriter( 10 | implicit p: Parameters) 11 | extends Module { 12 | val numWarps = p(WarpNum) 13 | val numRegs = p(RegNum) 14 | val numThreads = p(ThreadNum) 15 | val xlen = p(XLen) 16 | val addrWidth = p(AddrWidth) 17 | 18 | val io = IO(new Bundle { 19 | val warp_cmd = Input(Valid(new CuTaskBundle())) 20 | val wid = Input(UInt(log2Ceil(numWarps).W)) 21 | val commit_data = DecoupledIO(new CommitVData()) 22 | val finish = DecoupledIO(new Bool()) 23 | val idle = Output(Bool()) 24 | }) 25 | 26 | val s_idle :: s_working :: s_finish :: Nil = Enum(3) 27 | val state = RegInit(s_idle) 28 | 29 | val commit_counter = RegInit(0.U(2.W)) 30 | io.idle := state === s_idle 31 | 32 | val counter_add1 = commit_counter + 1.U 33 | 34 | val tid_data = Wire(Vec(3, Vec(numThreads, UInt(xlen.W)))) 35 | tid_data(0) := VecInit.tabulate(numThreads) { i => io.warp_cmd.bits.thread_dims(0) | i.U } 36 | tid_data(1) := VecInit.tabulate(numThreads) { _ => io.warp_cmd.bits.thread_dims(1) } 37 | tid_data(2) := VecInit.tabulate(numThreads) { _ => io.warp_cmd.bits.thread_dims(2) } 38 | 39 | switch(state) { 40 | is(s_idle) { 41 | when(io.warp_cmd.valid) { 42 | state := s_working 43 | } 44 | } 45 | is(s_working) { 46 | when(((counter_add1 === io.warp_cmd.bits.vgpr_num) & io.commit_data.fire) | io.warp_cmd.bits.vgpr_num === 0.U) { 47 | state := s_finish 48 | } 49 | } 50 | is(s_finish) { 51 | when(io.finish.fire) { 52 | state := s_idle 53 | } 54 | } 55 | } 56 | 57 | io.commit_data.bits.wid := io.wid 58 | io.commit_data.bits.mask := io.warp_cmd.bits.mask 59 | io.commit_data.bits.rd := counter_add1 60 | io.commit_data.bits.eop := true.B 61 | io.commit_data.bits.pc := 0.U 62 | io.commit_data.valid := false.B 63 | io.commit_data.bits.data := tid_data(commit_counter) 64 | io.finish.bits := 0.U 65 | io.commit_data.valid := state === s_working 66 | io.finish.valid := state === s_finish 67 | switch(state) { 68 | is(s_idle) { 69 | commit_counter := 0.U 70 | } 71 | is(s_working) { 72 | when(io.commit_data.fire & counter_add1 =/= io.warp_cmd.bits.vgpr_num) { 73 | commit_counter := counter_add1 74 | } 75 | } 76 | } 77 | } 78 | 79 | class SGPRWriter( 80 | implicit p: Parameters) 81 | extends Module { 82 | val numWarps = p(WarpNum) 83 | val numRegs = p(RegNum) 84 | val addrWidth = p(AddrWidth) 85 | 86 | val io = IO(new Bundle { 87 | val warp_cmd = Input(Valid(new CuTaskBundle())) 88 | val wid = Input(UInt(log2Ceil(numWarps).W)) 89 | val commit_data = DecoupledIO(new CommitSData()) 90 | val finish = DecoupledIO(Bool()) 91 | val idle = Output(Bool()) 92 | }) 93 | 94 | val s_idle :: s_working :: s_finish :: Nil = Enum(3) 95 | 96 | val commit_counter = RegInit(0.U(5.W)) 97 | val state = RegInit(s_idle) 98 
| 99 | val counter_add1 = commit_counter + 1.U 100 | val commit_data = io.warp_cmd.bits.sgprs(commit_counter) 101 | 102 | io.idle := state === s_idle 103 | switch(state) { 104 | is(s_idle) { 105 | when(io.warp_cmd.valid) { 106 | state := s_working 107 | } 108 | } 109 | is(s_working) { 110 | when(((commit_counter === io.warp_cmd.bits.sgpr_num) & io.commit_data.fire) | io.warp_cmd.bits.sgpr_num === 0.U) { 111 | state := s_finish 112 | } 113 | } 114 | is(s_finish) { 115 | when(io.finish.fire) { 116 | state := s_idle 117 | } 118 | } 119 | } 120 | 121 | io.commit_data.bits.wid := io.wid 122 | io.commit_data.bits.rd := commit_counter 123 | io.commit_data.bits.eop := true.B 124 | io.commit_data.bits.pc := 0.U 125 | io.commit_data.bits.data := commit_data 126 | io.commit_data.bits.mask := io.warp_cmd.bits.mask(0) 127 | io.finish.valid := state === s_finish 128 | io.commit_data.valid := state === s_working 129 | io.finish.bits := 0.U 130 | switch(state) { 131 | is(s_idle) { 132 | io.commit_data.valid := false.B 133 | commit_counter := 0.U 134 | } 135 | is(s_working) { 136 | when(io.commit_data.fire & commit_counter =/= io.warp_cmd.bits.sgpr_num) { 137 | commit_counter := counter_add1 138 | } 139 | } 140 | } 141 | } 142 | 143 | class WarpScheduler( 144 | implicit p: Parameters) 145 | extends Module { 146 | val numWarps = p(WarpNum) 147 | val numRegs = p(RegNum) 148 | val numThreads = p(ThreadNum) 149 | val addrWidth = p(AddrWidth) 150 | 151 | val xLen = p(XLen) 152 | 153 | val io = IO(new Bundle { 154 | val warp_cmd = Flipped(DecoupledIO(new CuTaskBundle())) 155 | val warp_ctl = Flipped(DecoupledIO(new WarpControlData())) 156 | val branch_ctl = Flipped(DecoupledIO(new BranchControlData())) 157 | val inst_fetch = DecoupledIO(new InstFetchData()) 158 | val warp_end = DecoupledIO(new WarpEndData()) 159 | val sgpr_commit = DecoupledIO(new CommitSData()) 160 | val vgpr_commit = DecoupledIO(new CommitVData()) 161 | }) 162 | 163 | val warp_idle = RegInit(VecInit(Seq.fill(numWarps)(1.B))) 164 | val warp_active = RegInit(VecInit(Seq.fill(numWarps)(0.B))) 165 | val warp_pc = RegInit(VecInit(Seq.fill(numWarps)(0.U(addrWidth.W)))) 166 | val warp_tmask = RegInit(VecInit(Seq.fill(numWarps)(VecInit(Seq.fill(numThreads)(0.B))))) 167 | val pop_valid = RegInit(0.B) 168 | 169 | val pop_wid = RegInit(0.U(log2Ceil(numWarps).W)) 170 | io.warp_ctl.ready := true.B 171 | io.branch_ctl.ready := true.B 172 | 173 | val has_idle = warp_idle.asUInt.orR 174 | val has_active = warp_active.asUInt.orR 175 | val idle_id = PriorityEncoder(warp_idle) 176 | val active_id = PriorityEncoder(warp_active) 177 | 178 | val simt_stack = VecInit(Seq.fill(numWarps)(Module(new SIMTStack()).io)) 179 | 180 | val pop_diverge = Wire(Bool()) 181 | val pop_data = Wire(new StackData()) 182 | 183 | val vgpr_writer = Module(new VGPRWriter()) 184 | val sgpr_writer = Module(new SGPRWriter()) 185 | 186 | val lock_warp = RegInit(0.U(log2Ceil(numWarps).W)) 187 | 188 | vgpr_writer.io.warp_cmd.bits := io.warp_cmd.bits 189 | sgpr_writer.io.warp_cmd.bits := io.warp_cmd.bits 190 | vgpr_writer.io.wid := lock_warp 191 | sgpr_writer.io.wid := lock_warp 192 | 193 | val writer_finish = RegInit(false.B) 194 | writer_finish := sgpr_writer.io.finish.valid & vgpr_writer.io.finish.valid 195 | 196 | io.warp_cmd.ready := writer_finish 197 | vgpr_writer.io.finish.ready := writer_finish 198 | sgpr_writer.io.finish.ready := writer_finish 199 | 200 | sgpr_writer.io.wid := lock_warp 201 | vgpr_writer.io.wid := lock_warp 202 | io.sgpr_commit <> sgpr_writer.io.commit_data 203 | 
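/* the writer FSMs stream kernel arguments (SGPR) and per-thread indices (VGPR) out through these commit ports to preload the register files before a warp is released */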
io.vgpr_commit <> vgpr_writer.io.commit_data 204 | 205 | val s_idle :: s_waiting :: Nil = Enum(2) 206 | val state = RegInit(s_idle) 207 | 208 | switch(state) { 209 | is(s_idle) { 210 | when(io.warp_cmd.valid & sgpr_writer.io.idle & vgpr_writer.io.idle) { 211 | state := s_waiting 212 | } 213 | } 214 | is(s_waiting) { 215 | when(writer_finish) { 216 | state := s_idle 217 | } 218 | } 219 | } 220 | 221 | vgpr_writer.io.warp_cmd.valid := false.B 222 | sgpr_writer.io.warp_cmd.valid := false.B 223 | 224 | switch(state) { 225 | is(s_idle) { 226 | when(io.warp_cmd.valid & sgpr_writer.io.idle & vgpr_writer.io.idle & has_idle) { 227 | lock_warp := idle_id 228 | vgpr_writer.io.warp_cmd.valid := true.B 229 | sgpr_writer.io.warp_cmd.valid := true.B 230 | } 231 | } 232 | is(s_waiting) { 233 | vgpr_writer.io.warp_cmd.valid := false.B 234 | sgpr_writer.io.warp_cmd.valid := false.B 235 | } 236 | } 237 | 238 | for (i <- 0 until numWarps) { 239 | simt_stack(i).in_diverge := io.branch_ctl.bits.wid === i.U && io.branch_ctl.valid && io.branch_ctl.bits.diverge 240 | simt_stack(i).in_data := io.branch_ctl.bits.data 241 | simt_stack(i).push := io.branch_ctl.bits.wid === i.U && io.branch_ctl.valid 242 | simt_stack(i).pop := io.warp_ctl.bits.wid === i.U && io.warp_ctl.valid && io.warp_ctl.bits.join 243 | } 244 | 245 | io.warp_end.bits.wid := io.warp_ctl.bits.wid 246 | io.warp_end.valid := io.warp_ctl.valid & io.warp_ctl.bits.end 247 | 248 | pop_valid := io.warp_ctl.valid & io.warp_ctl.bits.join 249 | pop_wid := io.warp_ctl.bits.wid 250 | 251 | pop_diverge := simt_stack(pop_wid).out_diverge 252 | pop_data := simt_stack(pop_wid).out_data 253 | 254 | for (i <- 0 until numWarps) { 255 | when(io.warp_cmd.fire && i.U === lock_warp) { 256 | warp_idle(i) := false.B 257 | warp_active(i) := true.B 258 | warp_pc(i) := io.warp_cmd.bits.pc 259 | warp_tmask(i) := io.warp_cmd.bits.mask 260 | } 261 | 262 | when(io.warp_ctl.fire && io.warp_ctl.bits.end && i.U === io.warp_ctl.bits.wid) { 263 | warp_idle(i) := true.B 264 | warp_active(i) := false.B 265 | } 266 | 267 | when(io.warp_ctl.valid && i.U === io.warp_ctl.bits.wid) { 268 | warp_active(i) := io.warp_ctl.bits.active 269 | } 270 | 271 | when(io.branch_ctl.valid && i.U === io.branch_ctl.bits.wid) { 272 | warp_pc(i) := io.branch_ctl.bits.pc 273 | warp_active(i) := 1.B 274 | warp_tmask(i) := io.branch_ctl.bits.mask 275 | } 276 | 277 | when(pop_valid && i.U === pop_wid) { 278 | warp_active(i) := 1.B 279 | when(pop_diverge) { 280 | warp_pc(i) := pop_data.pc 281 | warp_tmask(i) := pop_data.mask 282 | }.otherwise { 283 | warp_tmask(i) := pop_data.orig_mask 284 | } 285 | } 286 | 287 | when(io.inst_fetch.fire && i.U === active_id) { 288 | warp_active(i) := 0.B 289 | warp_pc(i) := warp_pc(i) + 4.U 290 | } 291 | } // loop num warps 292 | 293 | io.inst_fetch.bits.pc := 0.U 294 | io.inst_fetch.bits.mask := VecInit(Seq.fill(numThreads)(0.B)) 295 | io.inst_fetch.bits.wid := 0.U 296 | when(has_active) { 297 | io.inst_fetch.valid := !warp_idle(active_id) 298 | io.inst_fetch.bits.pc := warp_pc(active_id) 299 | io.inst_fetch.bits.mask := warp_tmask(active_id) 300 | io.inst_fetch.bits.wid := active_id 301 | }.otherwise { 302 | io.inst_fetch.valid := 0.B 303 | } 304 | } 305 | -------------------------------------------------------------------------------- /src/main/scala/core/Writeback.scala: -------------------------------------------------------------------------------- 1 | package ogpu.core 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | 7 | 
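/** Writeback merges the ALU and LSU commit streams into the single register-file write port: a two-input round-robin arbiter picks a winner each cycle, and a one-entry pipe queue registers the result. */ /* A minimal elaboration sketch in the style of the commented-out driver objects in VectorALU.scala; the object name is hypothetical, and OGPUDefaultConfig comes from ogpu.config as used by the tests: object WritebackRTL extends App { implicit val p = new ogpu.config.OGPUDefaultConfig; emitVerilog(new Writeback(), Array("--target-dir", "generated")) } */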
class Writeback( 8 | implicit p: Parameters) 9 | extends Module { 10 | val io = IO(new Bundle { 11 | val alu_commit = Flipped(DecoupledIO(new CommitVData())) 12 | val lsu_commit = Flipped(DecoupledIO(new CommitVData())) 13 | val writeback = DecoupledIO(new CommitVData()) 14 | }) 15 | 16 | val rsp_data = VecInit( 17 | Seq( 18 | io.alu_commit, 19 | io.lsu_commit 20 | ) 21 | ) 22 | 23 | val rsp_arbiter = Module(new RRArbiter(new CommitVData(), 2)) 24 | rsp_arbiter.io.in <> rsp_data 25 | 26 | val outQue = Module(new Queue(new CommitVData(), 1, pipe = true)) 27 | outQue.io.enq <> rsp_arbiter.io.out 28 | 29 | io.writeback <> outQue.io.deq 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/dispatcher/DispatcherBundle.scala: -------------------------------------------------------------------------------- 1 | package ogpu.dispatcher 2 | 3 | import chisel3._ 4 | 5 | class AQLBundle() extends Bundle { 6 | val header = UInt(16.W) 7 | val dimensions = UInt(2.W) 8 | val reserved1 = UInt(14.W) 9 | val workgroup_size_x = UInt(16.W) 10 | val workgroup_size_y = UInt(16.W) 11 | val workgroup_size_z = UInt(16.W) 12 | val reserved2 = UInt(16.W) 13 | val grid_size_x = UInt(32.W) 14 | val grid_size_y = UInt(32.W) 15 | val grid_size_z = UInt(32.W) 16 | val private_segment_size = UInt(32.W) 17 | val group_segment_size = UInt(32.W) 18 | val kernel_object = UInt(64.W) 19 | val kernargs_address = UInt(64.W) 20 | val completion_signal = UInt(64.W) 21 | } 22 | 23 | class WorkGroupTaskBundle() extends Bundle { 24 | val workgroup_size_x = UInt(16.W) 25 | val workgroup_size_y = UInt(16.W) 26 | val workgroup_size_z = UInt(16.W) 27 | // val grid_size_x = UInt(32.W) 28 | // val grid_size_y = UInt(32.W) 29 | // val grid_size_z = UInt(32.W) 30 | val grid_id_x = UInt(32.W) 31 | val grid_id_y = UInt(32.W) 32 | val grid_id_z = UInt(32.W) 33 | val private_segment_size = UInt(32.W) 34 | val group_segment_size = UInt(32.W) 35 | val kernel_object = UInt(64.W) 36 | 37 | val kernargs_address = UInt(64.W) 38 | } 39 | 40 | class WorkGroupTaskRespBundle() extends Bundle { 41 | val finish = Bool() 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/dispatcher/JobDispatcher.scala: -------------------------------------------------------------------------------- 1 | package ogpu.dispatcher 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} 7 | 8 | case class JobDispatchParams() { 9 | def buffer_num = 1 10 | } 11 | 12 | class JobDispatcher( 13 | params: JobDispatchParams 14 | )( 15 | implicit p: Parameters) 16 | extends LazyModule { 17 | 18 | lazy val module = new Impl(this) 19 | 20 | class Impl( 21 | outer: JobDispatcher 22 | )( 23 | implicit p: Parameters) 24 | extends LazyModuleImp(outer) { 25 | val io = IO(new Bundle { 26 | val aql = Flipped(DecoupledIO(new AQLBundle)) 27 | val task = DecoupledIO(new WorkGroupTaskBundle) 28 | val task_resp = Flipped(DecoupledIO(new WorkGroupTaskRespBundle)) 29 | }) 30 | 31 | val s_idle :: s_working :: s_finish :: Nil = Enum(3) 32 | val state = RegInit(s_idle) 33 | 34 | io.aql.ready := state === s_idle 35 | io.task.valid := state === s_working; io.task.bits := DontCare // default drives; the state machine below fills in the fields it produces (the kernel-object fields are not forwarded yet) 36 | val grid_x = RegInit(0.U(32.W)) 37 | val grid_y = RegInit(0.U(32.W)) 38 | val grid_z = RegInit(0.U(32.W)) 39 | val workgroup_x = RegInit(0.U(16.W)) 40 | val workgroup_y = RegInit(0.U(16.W)) 41 | val workgroup_z = RegInit(0.U(16.W))
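// grid_counter_{x,y,z} walk the grid in x-major order: x advances on every dispatched task, y when x wraps, z when x and y both wrap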
42 | val grid_counter_x = RegInit(0.U(32.W)) 43 | val grid_counter_y = RegInit(0.U(32.W)) 44 | val grid_counter_z = RegInit(0.U(32.W)) 45 | 46 | val taskDone = grid_counter_x === (grid_x - 1.U) & 47 | grid_counter_y === (grid_y - 1.U) & 48 | grid_counter_z === (grid_z - 1.U) 49 | 50 | val grid_x_acc = (grid_counter_x =/= (grid_x - 1.U)) 51 | val grid_y_acc = (grid_counter_x === (grid_x - 1.U)) & (grid_counter_y =/= (grid_y - 1.U)) 52 | val grid_z_acc = (grid_counter_x === (grid_x - 1.U)) & (grid_counter_y === (grid_y - 1.U)) 53 | 54 | val grid_rcounter_x = RegInit(0.U(32.W)) 55 | val grid_rcounter_y = RegInit(0.U(32.W)) 56 | val grid_rcounter_z = RegInit(0.U(32.W)) 57 | 58 | val grid_x_racc = (grid_rcounter_x =/= (grid_x - 1.U)) 59 | val grid_y_racc = (grid_rcounter_x === (grid_x - 1.U)) & (grid_rcounter_y =/= (grid_y - 1.U)) 60 | val grid_z_racc = (grid_rcounter_x === (grid_x - 1.U)) & (grid_rcounter_y === (grid_y - 1.U)) 61 | 62 | val recDone = grid_rcounter_x === (grid_x - 1.U) & 63 | grid_rcounter_y === (grid_y - 1.U) & 64 | grid_rcounter_z === (grid_z - 1.U) 65 | val s_rec_idle :: s_rec_working :: s_rec_finish :: Nil = Enum(3); val state_rec = RegInit(s_rec_idle) // receive-side state, declared before its first use in the s_finish arm below 66 | // state transition 67 | switch(state) { 68 | is(s_idle) { 69 | when(io.aql.fire) { 70 | state := s_working 71 | } 72 | } 73 | is(s_working) { 74 | when(taskDone & io.task.fire) { 75 | state := s_finish 76 | } 77 | } 78 | is(s_finish) { 79 | when(state_rec === s_rec_finish) { 80 | state := s_idle 81 | } 82 | } 83 | } 84 | 85 | // state action 86 | switch(state) { 87 | is(s_idle) { 88 | when(io.aql.fire) { 89 | grid_x := io.aql.bits.grid_size_x 90 | grid_y := io.aql.bits.grid_size_y 91 | grid_z := io.aql.bits.grid_size_z 92 | workgroup_x := io.aql.bits.workgroup_size_x 93 | workgroup_y := io.aql.bits.workgroup_size_y 94 | workgroup_z := io.aql.bits.workgroup_size_z 95 | grid_counter_x := 0.U 96 | grid_counter_y := 0.U 97 | grid_counter_z := 0.U 98 | // io.task.valid is high throughout s_working via the default drive above 99 | io.task.bits.workgroup_size_x := io.aql.bits.workgroup_size_x 100 | io.task.bits.workgroup_size_y := io.aql.bits.workgroup_size_y 101 | io.task.bits.workgroup_size_z := io.aql.bits.workgroup_size_z 102 | io.task.bits.grid_id_x := 0.U 103 | io.task.bits.grid_id_y := 0.U 104 | io.task.bits.grid_id_z := 0.U 105 | } 106 | } 107 | is(s_working) { 108 | io.task.bits.workgroup_size_x := workgroup_x 109 | io.task.bits.workgroup_size_y := workgroup_y 110 | io.task.bits.workgroup_size_z := workgroup_z 111 | io.task.bits.grid_id_x := grid_counter_x 112 | io.task.bits.grid_id_y := grid_counter_y 113 | io.task.bits.grid_id_z := grid_counter_z 114 | when(io.task.fire) { 115 | when(grid_x_acc) { 116 | grid_counter_x := grid_counter_x + 1.U 117 | }.otherwise { 118 | grid_counter_x := 0.U 119 | } 120 | 121 | when(grid_y_acc) { 122 | grid_counter_y := grid_counter_y + 1.U 123 | }.otherwise { 124 | grid_counter_y := 0.U 125 | } 126 | 127 | when(grid_z_acc) { 128 | grid_counter_z := grid_counter_z + 1.U 129 | }.otherwise { 130 | grid_counter_z := 0.U 131 | } 132 | } 133 | 134 | // io.task.valid deasserts via the default drive once the FSM leaves s_working 135 | 136 | 137 | } 138 | } 139 | 140 | // (the receive-side s_rec_* states and state_rec are declared above, before their first use) 141 | 142 | 143 | io.task_resp.ready := state_rec === s_rec_working 144 | 145 | switch(state_rec) { 146 | is(s_rec_idle) { 147 | when(io.task.fire) { 148 | state_rec := s_rec_working 149 | } 150 | } 151 | is(s_rec_working) { 152 | when(recDone & io.task_resp.fire) { 153 | state_rec := s_rec_finish 154 | } 155 | } 156 | is(s_rec_finish) { 157 | state_rec := s_rec_idle 158 | } 159 | } 160 | 161 | switch(state_rec) { 162
| is(s_rec_idle) { 163 | grid_rcounter_x := 0.U 164 | grid_rcounter_y := 0.U 165 | grid_rcounter_z := 0.U 166 | } 167 | is(s_rec_working) { 168 | when(io.task_resp.fire) { 169 | when(grid_x_racc) { 170 | grid_rcounter_x := grid_rcounter_x + 1.U 171 | }.otherwise { 172 | grid_rcounter_x := 0.U 173 | } 174 | 175 | when(grid_y_racc) { 176 | grid_rcounter_y := grid_rcounter_y + 1.U 177 | }.otherwise { 178 | grid_rcounter_y := 0.U 179 | } 180 | 181 | when(grid_z_racc) { 182 | grid_rcounter_z := grid_rcounter_z + 1.U 183 | }.otherwise { 184 | grid_rcounter_z := 0.U 185 | } 186 | } 187 | when(recDone & io.task_resp.fire) { 188 | // io.intr.valid := true.B 189 | } 190 | } 191 | is(s_rec_finish) { 192 | // io.intr.valid := false.B 193 | } 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/main/scala/dispatcher/TLQM.scala: -------------------------------------------------------------------------------- 1 | package ogpu.dispatcher 2 | 3 | import chisel3._ 4 | import org.chipsalliance.cde.config.Parameters 5 | import freechips.rocketchip.diplomacy._ 6 | import freechips.rocketchip.tilelink._ 7 | import freechips.rocketchip.regmapper._ 8 | import chisel3.util.{is, switch, Enum} 9 | 10 | case class QMParams(baseAddress: BigInt = 0x03000000) { 11 | def address = AddressSet(baseAddress, 0xff) 12 | } 13 | 14 | class TLQM( 15 | params: QMParams 16 | )( 17 | implicit p: Parameters) 18 | extends LazyModule { 19 | 20 | val device = new SimpleDevice("qm", Seq("ogpu, qm")) { 21 | override val alwaysExtended = true 22 | } 23 | 24 | val node = TLRegisterNode(address = Seq(params.address), device = device, beatBytes = 8) 25 | 26 | val clientParameters = TLMasterPortParameters.v1( 27 | clients = Seq( 28 | TLMasterParameters.v1( 29 | "tlqm master", 30 | sourceId = IdRange(0, 16) 31 | ) 32 | ) 33 | ) 34 | val clientNode = TLClientNode(Seq(clientParameters)) 35 | 36 | lazy val module = new Impl(this) 37 | class Impl( 38 | outer: TLQM 39 | )( 40 | implicit p: Parameters) 41 | extends LazyModuleImp(outer) { 42 | val io = IO(new Bundle { 43 | // val tlb = new TlbRequestIO(1) 44 | }) 45 | 46 | // io.tlb.req_kill := false.B 47 | 48 | // val (tl_out, edge_out) = outer.clientNode.out(0) 49 | // val base_addr = RegInit(0.U(64.W)) 50 | // val rptr = RegInit(0.U(64.W)) 51 | // val wptr = RegInit(0.U(64.W)) 52 | // val size = RegInit(0.U(64.W)) 53 | // val enable = RegInit(0.B) 54 | // val data = RegInit(0.U(512.W)) 55 | 56 | // val pending = WireInit(rptr =/= wptr) 57 | 58 | // // step1 issue tlb request, update rptr 59 | // val s1_idle :: s1_req :: s1_ack :: Nil = Enum(3) 60 | 61 | // val s1_state = RegInit(s1_idle) 62 | // val s1_rptr = RegInit(0.U(64.W)) 63 | 64 | // // s1 state transition 65 | // switch(s1_state) { 66 | // is(s1_idle) { 67 | // when(pending & enable) { 68 | // s1_state := s1_req 69 | // } 70 | // } 71 | // is(s1_req) { 72 | // when(io.tlb.req.fire) { 73 | // s1_state := s1_ack 74 | // } 75 | // } 76 | // is(s1_ack) { 77 | // when(io.tlb.resp.fire) { 78 | // s1_state := s1_idle 79 | // } 80 | // } 81 | // } 82 | 83 | // // s1 state action 84 | // switch(s1_state) { 85 | // is(s1_idle) { 86 | // when(pending & enable) { 87 | // io.tlb.req.valid := true.B 88 | // io.tlb.req.bits.vaddr := base_addr + (rptr % size) 89 | // }.otherwise { 90 | // io.tlb.req.valid := false.B 91 | // } 92 | // } 93 | // is(s1_req) { 94 | // when(io.tlb.req.fire) { 95 | // io.tlb.req.valid := false.B 96 | // } 97 | // } 98 | // is(s1_ack) { 99 | // when(io.tlb.resp.fire) 
{ 100 | // rptr := rptr + 8.U 101 | // } 102 | // } 103 | // } 104 | 105 | // // s2 get paddr and read aql package 106 | // val s2_idle :: s2_req :: s2_ack :: Nil = Enum(3) 107 | // io.tlb.resp.ready := s2_state === s2_idle 108 | // val s2_state = RegInit(s2_idle) 109 | 110 | // // s2 state transition 111 | // switch(s2_state) { 112 | // is(s2_idle) { 113 | // when(io.tlb.resp.fire & enable) { 114 | // s2_state := s2_req 115 | // } 116 | // } 117 | // is(s2_req) { 118 | // when(tl_out.a.fire) { 119 | // s2_state := s2_ack 120 | // } 121 | // } 122 | // is(s2_ack) { 123 | // when(tl_out.d.fire) { 124 | // s2_state := s2_idle 125 | // } 126 | // } 127 | // } 128 | 129 | // // s2 state action 130 | // switch(s2_state) { 131 | // is(s2_idle) { 132 | // when(io.tlb.resp.fire) { 133 | // tl_out.a.valid := 1.B 134 | // tl_out.a.bits.address := 0.U 135 | // tl_out.a.bits.opcode := 0.U 136 | // tl_out.a.bits.size := 0.U 137 | // tl_out.a.bits.data := 0.U 138 | // tl_out.a.bits.mask := 0.U 139 | // } 140 | // } 141 | // is(s2_req) { 142 | // when(tl_out.a.fire) { 143 | // tl_out.a.valid := false.B 144 | // } 145 | // } 146 | // is(s2_ack) { 147 | // when(tl_out.d.fire) { 148 | // data := tl_out.d.bits.data 149 | // } 150 | // } 151 | // } 152 | 153 | // // s3 disptach aql data 154 | // val s3_idle :: s3_req :: s3_ack :: Nil = Enum(3) 155 | // val s3_state = RegInit(s3_idle) 156 | // tl_out.d.ready := s3_state === s3_idle 157 | 158 | // // ringbuffer base address 159 | // // ringbuffer rptr 160 | // // ringbuffer wptr, doorbell register 161 | // // ringbuffer size 162 | // // queue enable 163 | // node.regmap( 164 | // 0 -> Seq(RegField(64, base_addr, RegFieldDesc("base", "queue ring buffer base address", reset = Some(0)))), 165 | // 8 -> Seq(RegField(64, rptr, RegFieldDesc("rptr", "queue ring buffer read offset address", reset = Some(0)))), 166 | // 16 -> Seq( 167 | // RegField(64, wptr, RegFieldDesc("wptr", "queue ring buffer write offset", reset = Some(0))) 168 | // ), 169 | // 24 -> Seq(RegField(64, size, RegFieldDesc("size", "queue ring buffer size address", reset = Some(0)))), 170 | // 32 -> Seq(RegField(1, enable, RegFieldDesc("enable", "queue enable", reset = Some(0)))) 171 | // ) 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/main/scala/lib/Sram.scala: -------------------------------------------------------------------------------- 1 | package ogpu.lib 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | class ReadWriteSmem(width: Int = 32, depth: Int = 1024) extends Module { 7 | val io = IO(new Bundle { 8 | val enable = Input(Bool()) 9 | val write = Input(Bool()) 10 | val addr = Input(UInt(log2Ceil(depth).W)) 11 | val dataIn = Input(UInt(width.W)) 12 | val dataOut = Output(UInt(width.W)) 13 | }) 14 | 15 | val mem = SyncReadMem(depth, UInt(width.W)) 16 | when(io.enable && io.write) { 17 | mem.write(io.addr, io.dataIn) 18 | } 19 | io.dataOut := mem.read(io.addr, io.enable) 20 | } 21 | 22 | class MaskedSmem_2R1W(width: Int = 32, depth: Int = 1024, vecLen: Int = 32) extends Module { 23 | val io = IO(new Bundle { 24 | val write_en = Input(Bool()) 25 | val waddr = Input(UInt(log2Ceil(depth).W)) 26 | val raddr = Input(UInt(log2Ceil(depth).W)) 27 | val raddr2 = Input(UInt(log2Ceil(depth).W)) 28 | val mask = Input(Vec(vecLen, Bool())) 29 | val dataIn = Input(Vec(vecLen, UInt(width.W))) 30 | val dataOut = Output(Vec(vecLen, UInt(width.W))) 31 | val dataOut2 = Output(Vec(vecLen, UInt(width.W))) 32 | }) 33 | 34 | val mem = 
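/* one synchronous-read memory array shared by two read ports and a per-element masked write port; reads have one cycle of latency */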
SyncReadMem(depth, Vec(vecLen, UInt(width.W))) 35 | when(io.write_en) { 36 | mem.write(io.waddr, io.dataIn, io.mask) 37 | } 38 | io.dataOut := mem.read(io.raddr, 1.B) 39 | io.dataOut2 := mem.read(io.raddr2, 1.B) 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/package.scala: -------------------------------------------------------------------------------- 1 | // See LICENSE.Berkeley for license details. 2 | 3 | package ogpu 4 | 5 | package object ogpu extends constants.ScalarOpConstants with constants.MemoryOpConstants 6 | -------------------------------------------------------------------------------- /src/main/scala/smmu/SMMU.scala: -------------------------------------------------------------------------------- 1 | package ogpu.smmu 2 | 3 | import chisel3._ 4 | 5 | import org.chipsalliance.cde.config.Parameters 6 | import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp, SimpleDevice} 7 | import freechips.rocketchip.tilelink.TLRegisterNode 8 | import freechips.rocketchip.regmapper.{RegField, RegFieldDesc} 9 | 10 | case class SMMUParams(baseAddress: BigInt = 0x02000000) { 11 | def address = AddressSet(baseAddress, 0xff) 12 | } 13 | 14 | class TLSMMU( 15 | params: SMMUParams 16 | )( 17 | implicit p: Parameters) 18 | extends LazyModule { 19 | 20 | val device = new SimpleDevice("smmu", Seq("ogpu, smmu0")) { 21 | override val alwaysExtended = true 22 | } 23 | 24 | val node = TLRegisterNode(address = Seq(params.address), device = device, beatBytes = 8) 25 | // val ptwlm = LazyModule(new L2TLBWrapper()) 26 | 27 | lazy val module = new Impl(this) 28 | class Impl( 29 | outer: TLSMMU 30 | )( 31 | implicit p: Parameters) 32 | extends LazyModuleImp(outer) { 33 | val io = IO(new Bundle { 34 | // val tlb = Flipped(new TlbRequestIO(1)) 35 | }) 36 | 37 | // val (mem, edge) = outer.ptwlm.node.out.head 38 | // val satp = RegInit(0.U(64.W)) 39 | 40 | // val ptwm = ptwlm.module 41 | // val tlbm = Module(new TLB(1, nRespDups = 1, Seq(true), new TLBParameters)) 42 | 43 | // val tlb_ptw = Wire(new VectorTlbPtwIO(1)) 44 | // tlb_ptw.connect(tlbm.io.ptw) 45 | 46 | // val sfence = WireInit(0.U.asTypeOf(new SfenceBundle)) 47 | // val tlbCsr = WireInit(0.U.asTypeOf(new TlbCsrBundle)) 48 | // tlbCsr.satp.apply(satp) 49 | 50 | // tlbm.io.requestor(0) <> io.tlb 51 | // tlbm.io.csr := tlbCsr 52 | // tlbm.io.sfence := sfence 53 | // tlbm.io.hartId := 0.U 54 | // tlbm.io.flushPipe := 0.U.asTypeOf(tlbm.io.flushPipe) 55 | 56 | // val tlbRepeater1 = PTWFilter(16, tlb_ptw, sfence, tlbCsr, 8) 57 | // val tlbRepeater2 = PTWRepeaterNB(passReady = false, 16, tlbRepeater1.io.ptw, ptwm.io.tlb(0), sfence, tlbCsr) 58 | 59 | // ptwm.io.csr.tlb.satp.apply(satp) 60 | // ptwm.io.csr.tlb.priv := 0.U.asTypeOf(ptwm.io.csr.tlb.priv) 61 | 62 | // ptwm.io.sfence := sfence 63 | // ptwm.io.tlb(1) <> 0.U.asTypeOf(ptwm.io.tlb(1)) 64 | 65 | // ptwm.io.hartId := 0.U 66 | 67 | // // CSR has been written by csr inst, copies of csr should be updated 68 | // // for pmp, we dont use it 69 | // ptwm.io.csr.distribute_csr := 0.U.asTypeOf(ptwm.io.csr.distribute_csr) 70 | 71 | // tlbRepeater1.io.debugTopDown := DontCare 72 | 73 | // // 0 satp.ppn sv39 and sv48 74 | // // bits 63:60 mode 75 | // // bits 59:44 asid 76 | // // bits 43:0 ppn 77 | // node.regmap( 78 | // 0 -> Seq(RegField(64, satp, RegFieldDesc("satp", "satp: SMMU satp rw register.", reset = Some(0)))) 79 | // ) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- 
/src/main/scala/system/SoC.scala: -------------------------------------------------------------------------------- 1 | package ogpu.system 2 | 3 | import org.chipsalliance.cde.config.{Field, Parameters} 4 | import freechips.rocketchip.diplomacy._ 5 | 6 | case object SoCParamsKey extends Field[SoCParameters] 7 | 8 | /** Global cache coherence granularity, which applies to all caches, for now. */ 9 | // case object CacheBlockBytes extends Field[Int](64) 10 | 11 | case class SoCParameters( 12 | EnableILA: Boolean = false, 13 | PAddrBits: Int = 36, 14 | extIntrs: Int = 64) { 15 | // L3 configurations 16 | val L3InnerBusWidth = 256 17 | val L3BlockSize = 64 18 | // on chip network configurations 19 | val L3OuterBusWidth = 256 20 | } 21 | 22 | trait HasSoCParameter { 23 | implicit val p: Parameters 24 | 25 | val soc = p(SoCParamsKey) 26 | // val debugOpts = p(DebugOptionsKey) 27 | // val tiles = p(XSTileKey) 28 | 29 | // val NumCores = tiles.size 30 | val EnableILA = soc.EnableILA 31 | 32 | // L3 configurations 33 | val L3InnerBusWidth = soc.L3InnerBusWidth 34 | val L3BlockSize = soc.L3BlockSize 35 | 36 | // on chip network configurations 37 | val L3OuterBusWidth = soc.L3OuterBusWidth 38 | 39 | val NrExtIntr = soc.extIntrs 40 | } 41 | 42 | abstract class OGPUSystem( 43 | implicit p: Parameters) 44 | extends LazyModule {} 45 | -------------------------------------------------------------------------------- /src/main/scala/tile/CuTaskBundle.scala: -------------------------------------------------------------------------------- 1 | package ogpu.tile 2 | 3 | import chisel3._ 4 | import org.chipsalliance.cde.config.Parameters 5 | import ogpu.config._ 6 | 7 | class CuTaskBundle( 8 | implicit p: Parameters) 9 | extends Bundle { 10 | val numThreads = p(ThreadNum) 11 | val addrWidth = p(AddrWidth) 12 | val numWarps = p(WarpNum) 13 | val dimWidth = p(DimWidth) 14 | val xLen = p(XLen) 15 | 16 | val mask = Vec(numThreads, Bool()) 17 | // base (x, y, z) thread indices of this slice within the workgroup 18 | val thread_dims = Vec(3, UInt(dimWidth.W)) 19 | val vgpr_num = UInt(2.W) 20 | val sgprs = Vec(16, UInt(xLen.W)) 21 | val sgpr_num = UInt(4.W) 22 | val reg_index = UInt(p(RegIDWidth).W) 23 | val pc = UInt(addrWidth.W) 24 | } 25 | 26 | class CuTaskRespBundle( 27 | implicit p: Parameters) 28 | extends Bundle { 29 | val finish = Bool() 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/tile/WorkGroupScheduler.scala: -------------------------------------------------------------------------------- 1 | package ogpu.tile 2 | 3 | import org.chipsalliance.cde.config.Parameters 4 | import freechips.rocketchip.diplomacy._ 5 | import chisel3._ 6 | import chisel3.util._ 7 | 8 | import ogpu.dispatcher._ 9 | 10 | // initializes registers and sends warp tasks to the warp scheduler in the CU 11 | 12 | case class WgSchedParams() { 13 | def haha = 1 14 | } 15 | 16 | class WorkGroupScheduler( 17 | params: WgSchedParams 18 | )( 19 | implicit p: Parameters) 20 | extends LazyModule { 21 | lazy val module = new Impl(this) 22 | 23 | class Impl( 24 | outer: WorkGroupScheduler 25 | )( 26 | implicit p: Parameters) 27 | extends LazyModuleImp(outer) { 28 | val io = IO(new Bundle { 29 | val task = Flipped(DecoupledIO(new WorkGroupTaskBundle)) 30 | val task_resp = DecoupledIO(new WorkGroupTaskRespBundle) 31 | val cu_task = DecoupledIO(new CuTaskBundle) 32 | val cu_task_resp = Flipped(DecoupledIO(new CuTaskRespBundle)) 33 | }) 34 | 35 | val workgroup_x = RegInit(0.U(16.W)) 36 | val workgroup_y = RegInit(0.U(16.W))
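// the x dimension is dispatched to the CU in warp-sized (32-thread) slices; y and z advance one index at a time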
37 | val workgroup_z = RegInit(0.U(16.W)) 38 | val workgroup_counter_x = RegInit(0.U(16.W)) 39 | val workgroup_counter_y = RegInit(0.U(16.W)) 40 | val workgroup_counter_z = RegInit(0.U(16.W)) 41 | 42 | val workgroup_rcounter_x = RegInit(0.U(16.W)) 43 | val workgroup_rcounter_y = RegInit(0.U(16.W)) 44 | val workgroup_rcounter_z = RegInit(0.U(16.W)) 45 | 46 | val s_idle :: s_working :: s_finish :: Nil = Enum(3) 47 | val state = RegInit(s_idle) 48 | io.task.ready := state === s_idle // accept a new workgroup task only when idle 49 | val taskDone = workgroup_counter_x === (workgroup_x - 32.U) & 50 | workgroup_counter_y === (workgroup_y - 1.U) & 51 | workgroup_counter_z === (workgroup_z - 1.U) 52 | 53 | val recDone = workgroup_rcounter_x === (workgroup_x - 32.U) & 54 | workgroup_rcounter_y === (workgroup_y - 1.U) & 55 | workgroup_rcounter_z === (workgroup_z - 1.U) 56 | 57 | val workgroup_x_acc = (workgroup_counter_x =/= (workgroup_x - 32.U)) 58 | val workgroup_y_acc = (workgroup_counter_x === (workgroup_x - 32.U)) & (workgroup_counter_y =/= (workgroup_y - 1.U)) 59 | val workgroup_z_acc = (workgroup_counter_x === (workgroup_x - 32.U)) & (workgroup_counter_y === (workgroup_y - 1.U)) 60 | 61 | val workgroup_x_racc = (workgroup_rcounter_x =/= (workgroup_x - 32.U)) 62 | val workgroup_y_racc = 63 | (workgroup_rcounter_x === (workgroup_x - 32.U)) & (workgroup_rcounter_y =/= (workgroup_y - 1.U)) 64 | val workgroup_z_racc = 65 | (workgroup_rcounter_x === (workgroup_x - 32.U)) & (workgroup_rcounter_y === (workgroup_y - 1.U)) 66 | 67 | // state transition 68 | switch(state) { 69 | is(s_idle) { 70 | when(io.task.fire) { 71 | state := s_working 72 | } 73 | } 74 | is(s_working) { 75 | when(taskDone & io.cu_task.fire) { 76 | state := s_finish 77 | } 78 | } 79 | is(s_finish) { 80 | when(io.task_resp.fire) { 81 | state := s_idle 82 | } 83 | } 84 | } 85 | 86 | io.cu_task.bits := DontCare; io.cu_task.bits.thread_dims := VecInit(Seq(workgroup_counter_x, workgroup_counter_y, workgroup_counter_z)) // only thread_dims is produced here; the remaining CuTaskBundle fields are not wired up yet 87 | io.cu_task.valid := state === s_working 88 | // state action 89 | switch(state) { 90 | is(s_idle) { 91 | when(io.task.fire) { 92 | workgroup_x := io.task.bits.workgroup_size_x 93 | workgroup_y := io.task.bits.workgroup_size_y 94 | workgroup_z := io.task.bits.workgroup_size_z 95 | workgroup_counter_x := 0.U 96 | workgroup_counter_y := 0.U 97 | workgroup_counter_z := 0.U 98 | } 99 | } 100 | is(s_working) { 101 | when(io.cu_task.fire) { // counters advance per accepted cu_task 102 | when(workgroup_x_acc) { 103 | workgroup_counter_x := workgroup_counter_x + 32.U 104 | }.otherwise { 105 | workgroup_counter_x := 0.U 106 | } 107 | 108 | when(workgroup_y_acc) { 109 | workgroup_counter_y := workgroup_counter_y + 1.U 110 | }.otherwise { 111 | workgroup_counter_y := 0.U 112 | } 113 | 114 | when(workgroup_z_acc) { 115 | workgroup_counter_z := workgroup_counter_z + 1.U 116 | }.otherwise { 117 | workgroup_counter_z := 0.U 118 | } 119 | } 120 | } 121 | } 122 | 123 | val s_rec_idle :: s_rec_working :: s_rec_finish :: Nil = Enum(3) 124 | val state_rec = RegInit(s_rec_idle) 125 | 126 | io.cu_task_resp.ready := state_rec === s_rec_working 127 | 128 | switch(state_rec) { 129 | is(s_rec_idle) { 130 | when(io.cu_task.fire) { 131 | state_rec := s_rec_working 132 | } 133 | } 134 | is(s_rec_working) { 135 | when(recDone & io.cu_task_resp.fire) { 136 | state_rec := s_rec_finish 137 | } 138 | } 139 | is(s_rec_finish) { 140 | when(io.task_resp.fire) { 141 | state_rec := s_rec_idle 142 | } 143 | } 144 | } 145 | 146 | io.task_resp.valid := state_rec === s_rec_finish 147 | switch(state_rec) { 148 | is(s_rec_idle) { 149 | workgroup_rcounter_x := 0.U 150 | workgroup_rcounter_y := 0.U 151 | workgroup_rcounter_z := 0.U 152 | }
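// response counters mirror the dispatch counters: one cu_task_resp is collected per issued slice, with x stepping in units of 32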
153 | is(s_rec_working) { 154 | when(io.cu_task_resp.fire) { 155 | when(workgroup_x_racc) { 156 | workgroup_rcounter_x := workgroup_rcounter_x + 32.U 157 | }.otherwise { 158 | workgroup_rcounter_x := 0.U 159 | } 160 | 161 | when(workgroup_y_racc) { 162 | workgroup_rcounter_y := workgroup_rcounter_y + 1.U 163 | }.otherwise { 164 | workgroup_rcounter_y := 0.U 165 | } 166 | 167 | when(workgroup_z_racc) { 168 | workgroup_rcounter_z := workgroup_rcounter_z + 1.U 169 | }.otherwise { 170 | workgroup_rcounter_z := 0.U 171 | } 172 | } 173 | } 174 | } 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/main/scala/util/AddrBits.scala: -------------------------------------------------------------------------------- 1 | package ogpu.util 2 | 3 | import chisel3.util._ 4 | 5 | object VaddrHelper { 6 | 7 | def vaddrBits(xlen: Int, pglevel: Int, pgsize: Int, hvbits: Int): Int = { 8 | val pgLevelBits = 10 - log2Ceil(xlen / 32) 9 | val maxVAddrBits = pgLevelBits * pglevel + log2Ceil(pgsize) 10 | maxVAddrBits + hvbits 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/util/PipelineReg.scala: -------------------------------------------------------------------------------- 1 | package ogpu.util 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | object AddPipelineReg { 7 | 8 | class PipelineRegModule[T <: Data](gen: T) extends Module { 9 | 10 | val io = IO(new Bundle() { 11 | val in = Flipped(DecoupledIO(gen.cloneType)) 12 | val out = DecoupledIO(gen.cloneType) 13 | val isFlush = Input(Bool()) 14 | }) 15 | 16 | val valid = RegInit(false.B) 17 | valid.suggestName("pipeline_reg_valid") 18 | when(io.out.fire) { valid := false.B } 19 | when(io.in.fire) { valid := true.B } 20 | when(io.isFlush) { valid := false.B } 21 | 22 | io.in.ready := !valid || io.out.ready 23 | io.out.bits := RegEnable(io.in.bits, io.in.fire) 24 | io.out.valid := valid // && !isFlush 25 | } 26 | 27 | def apply[T <: Data]( 28 | left: DecoupledIO[T], 29 | right: DecoupledIO[T], 30 | isFlush: Bool, 31 | moduleName: Option[String] = None 32 | ): Unit = { 33 | val pipelineReg = Module(new PipelineRegModule[T](left.bits.cloneType)) 34 | if (moduleName.nonEmpty) pipelineReg.suggestName(moduleName.get) 35 | pipelineReg.io.in <> left 36 | right <> pipelineReg.io.out 37 | pipelineReg.io.isFlush := isFlush 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/test/data/test1/add.asm: -------------------------------------------------------------------------------- 1 | addi x3, x0, 32 2 | add x3, x3, x1 3 | lw x4, (x3) 4 | addi x4, x4, 2 5 | addi x5, x3, 256 6 | sw x4, (x5) 7 | wfi 8 | 02000193 9 | 001181b3 10 | 0001a203 11 | 00220213 12 | 10018293 13 | 0042a023 14 | 10500073 15 | -------------------------------------------------------------------------------- /src/test/data/test1/add.hex: -------------------------------------------------------------------------------- 1 | 02000193 2 | 001181b3 3 | 0001a203 4 | 00220213 5 | 10018293 6 | 0042a023 7 | 10500073 8 | -------------------------------------------------------------------------------- /src/test/data/test1/add_0.txt: -------------------------------------------------------------------------------- 1 | 13 2 | 13 3 | 33 4 | 83 5 | 93 6 | 13 7 | 23 8 | 73 9 | -------------------------------------------------------------------------------- /src/test/data/test1/add_1.txt:
-------------------------------------------------------------------------------- 1 | 01 2 | 11 3 | 81 4 | 21 5 | 81 6 | 02 7 | 20 8 | 00 9 | -------------------------------------------------------------------------------- /src/test/data/test1/add_2.txt: -------------------------------------------------------------------------------- 1 | 00 2 | 91 3 | 20 4 | 01 5 | 11 6 | 01 7 | 32 8 | 50 9 | -------------------------------------------------------------------------------- /src/test/data/test1/add_3.txt: -------------------------------------------------------------------------------- 1 | 40 2 | 01 3 | 00 4 | 00 5 | 00 6 | 10 7 | 00 8 | 10 9 | -------------------------------------------------------------------------------- /src/test/scala/AXI4RamTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import chiseltest._ 3 | 4 | import freechips.rocketchip.amba.axi4._ 5 | import org.chipsalliance.cde.config.Parameters 6 | import freechips.rocketchip.diplomacy._ 7 | import freechips.rocketchip.system._ 8 | import chiseltest.simulator.WriteVcdAnnotation 9 | 10 | import org.scalatest.flatspec.AnyFlatSpec 11 | 12 | class AXI4SlaveRAM( 13 | implicit p: Parameters) 14 | extends LazyModule { 15 | val ram = LazyModule(new AXI4RAM(AddressSet(0x0, 0x3ff))) 16 | val axi_m_param = AXI4MasterParameters("myaximaster") 17 | val axi_m_port = AXI4MasterPortParameters(Seq(axi_m_param)) 18 | val axi_master = AXI4MasterNode(Seq(axi_m_port)) 19 | val ios = InModuleBody(axi_master.makeIOs()) 20 | 21 | ram.node := AXI4Buffer() := axi_master 22 | 23 | lazy val module = new Impl 24 | 25 | class Impl extends LazyModuleImp(this) { 26 | val io = ios.head 27 | } 28 | 29 | } 30 | 31 | // (implicit p: Parameters) 32 | class AXI4RAMTest extends AnyFlatSpec with ChiselScalatestTester { 33 | behavior.of("AXI4RAM") 34 | 35 | it should "perform axi rw operations correctly" in { 36 | implicit val p = new BaseConfig 37 | val axiram = LazyModule(new AXI4SlaveRAM()) 38 | // val mymod = Module(axiram.module) 39 | test(axiram.module).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 40 | // Write data to the axi4ram module 41 | dut.io.aw.valid.poke(true.B) 42 | dut.io.aw.bits.addr.poke(0x00000000L.U) 43 | dut.io.aw.bits.len.poke(0.U) 44 | dut.io.aw.bits.size.poke(2.U) 45 | dut.io.w.valid.poke(true.B) 46 | dut.io.w.bits.data.poke(0xabcd.U) 47 | dut.io.w.bits.strb.poke("b1111".U) 48 | dut.clock.step() 49 | println(dut.io.aw.bits.id.getClass) // .getSimpleName) 50 | println(dut.io.aw.bits.id.getWidth) 51 | // Wait for write transaction to finish 52 | while (dut.io.b.valid.peek().litToBoolean == false) { 53 | dut.clock.step() 54 | } 55 | 56 | dut.io.aw.valid.poke(false.B) 57 | dut.io.w.valid.poke(false.B) 58 | dut.io.ar.valid.poke(true.B) 59 | dut.io.ar.bits.addr.poke(0x00000000L.U) 60 | dut.io.ar.bits.len.poke(0.U) 61 | dut.io.ar.bits.size.poke(2.U) 62 | dut.clock.step() 63 | 64 | // Wait for read transaction to finish 65 | while (dut.io.r.valid.peek().litToBoolean == false) { 66 | dut.clock.step() 67 | } 68 | 69 | // Read data from the axi4ram module 70 | dut.io.r.bits.data.expect(0xabcd.U) 71 | } 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/test/scala/DCacheTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import freechips.rocketchip.diplomacy._ 3 | import freechips.rocketchip.tilelink._ 4 | import org.chipsalliance.cde.config.Parameters 5 | 
import chiseltest._ 6 | import org.scalatest.flatspec.AnyFlatSpec 7 | import chiseltest.simulator.WriteVcdAnnotation 8 | 9 | import ogpu.core._ 10 | import ogpu.config._ 11 | 12 | class DCacheTestTop( 13 | )( 14 | implicit p: Parameters) 15 | extends LazyModule { 16 | 17 | val cfg = CacheParameter( 18 | nSets = 64, 19 | nWays = 4, 20 | paddrBits = 48, 21 | vaddrBits = 48, 22 | pgIdxBits = 12, 23 | dataBits = 64, 24 | coreId = 0, 25 | tagECC = None, 26 | dataECC = None 27 | ) 28 | 29 | val ram = LazyModule(new TLRAM(AddressSet(0x80000000L, 0xffffL), beatBytes = 8)) 30 | val dcache = LazyModule(new DCache(cfg)) 31 | ram.node :=* 32 | TLXbar() :=* 33 | TLFragmenter(8, 64) :=* 34 | TLCacheCork() :=* 35 | dcache.node 36 | 37 | lazy val module = new Impl 38 | class Impl extends LazyModuleImp(this) { 39 | val io = IO(new Bundle { 40 | val dcache = new CacheBundle(cfg) 41 | }) 42 | dcache.module.io <> io.dcache 43 | } 44 | } 45 | 46 | class DCacheTest extends AnyFlatSpec with ChiselScalatestTester { 47 | behavior.of("DCacheTest") 48 | 49 | it should "perform dcache test correctly" in { 50 | implicit val p = new OGPUDefaultConfig 51 | val top = LazyModule(new DCacheTestTop()) 52 | test(top.module).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 53 | dut.clock.step(80) 54 | dut.io.dcache.ptw.ptbr.mode.poke(0x8.U) 55 | dut.io.dcache.ptw.req.ready.poke(true.B) 56 | dut.io.dcache.cpu.req.bits.addr.poke(0x1024000.U) 57 | dut.io.dcache.cpu.req.valid.poke(true.B) 58 | dut.clock.step() 59 | dut.io.dcache.cpu.req.valid.poke(false.B) 60 | // while (dut.io.dcache.cpu.resp.valid.peek().litToBoolean == false) { 61 | while (dut.io.dcache.cpu.s2_nack.peek().litToBoolean == false) { 62 | dut.clock.step() 63 | } 64 | dut.clock.step(5) // ptw resp must be delayed 65 | println("read dcache failed because of tlb miss") 66 | dut.io.dcache.ptw.resp.valid.poke(true.B) 67 | dut.io.dcache.ptw.resp.bits.pte.ppn.poke(0x80000.U) 68 | dut.clock.step() 69 | dut.io.dcache.ptw.resp.valid.poke(false.B) 70 | dut.clock.step(5) 71 | dut.io.dcache.cpu.req.bits.addr.poke(0x1024000.U) // request same addr again 72 | dut.io.dcache.cpu.req.valid.poke(true.B) 73 | dut.clock.step() 74 | dut.io.dcache.cpu.req.valid.poke(false.B) 75 | dut.clock.step(20) 76 | dut.io.dcache.cpu.req.bits.addr.poke(0x1024000.U) // request again, cache hit 77 | dut.io.dcache.cpu.req.valid.poke(true.B) 78 | dut.clock.step() 79 | dut.io.dcache.cpu.req.valid.poke(false.B) 80 | while (dut.io.dcache.cpu.resp.valid.peek().litToBoolean == false) { 81 | dut.clock.step() 82 | } 83 | dut.clock.step(5) 84 | 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/test/scala/ICacheTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import freechips.rocketchip.diplomacy._ 3 | import freechips.rocketchip.tilelink._ 4 | import org.chipsalliance.cde.config.Parameters 5 | import chiseltest._ 6 | import org.scalatest.flatspec.AnyFlatSpec 7 | import chiseltest.simulator.WriteVcdAnnotation 8 | 9 | import ogpu.core._ 10 | import ogpu.config._ 11 | 12 | class ICacheTestTop( 13 | )( 14 | implicit p: Parameters) 15 | extends LazyModule { 16 | 17 | val cfg = ICacheParams( 18 | nSets = 64, 19 | nWays = 4, 20 | paddrBits = 48, 21 | vaddrBits = 48, 22 | pgIdxBits = 48, 23 | dataBits = 64, 24 | coreId = 0, 25 | tagECC = None, 26 | dataECC = None 27 | ) 28 | 29 | val ram = LazyModule(new TLRAM(AddressSet(0x80000000L, 0xffffL), beatBytes = 8)) 30 | val icache = LazyModule(new ICache(cfg))
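// adapter chain: TLCacheCork lowers the cache's coherent (TL-C) traffic to plain Get/Put, TLFragmenter(8, 64) splits refill bursts into 8-byte beats, and TLXbar routes them to the RAM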
31 | ram.node :=* 32 | TLXbar() :=* 33 | TLFragmenter(8, 64) :=* 34 | TLCacheCork() :=* 35 | icache.masterNode 36 | 37 | lazy val module = new Impl 38 | class Impl extends LazyModuleImp(this) { 39 | val io = IO(new Bundle { 40 | val icache = new ICacheBundle(cfg) 41 | }) 42 | icache.module.io <> io.icache 43 | } 44 | } 45 | 46 | class ICacheTest extends AnyFlatSpec with ChiselScalatestTester { 47 | behavior.of("ICacheTest") 48 | 49 | it should "perform icache test correctly" in { 50 | implicit val p = new OGPUDefaultConfig 51 | val top = LazyModule(new ICacheTestTop()) 52 | test(top.module).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 53 | dut.clock.step() 54 | dut.io.icache.req.valid.poke(true.B) 55 | dut.io.icache.req.bits.addr.poke(0x1000.U) 56 | dut.clock.step() 57 | dut.io.icache.req.valid.poke(false.B) 58 | dut.io.icache.s1_paddr.poke(0x80002000L.U) 59 | dut.clock.step(10) 60 | // cache miss and request again 61 | dut.io.icache.req.valid.poke(true.B) 62 | dut.io.icache.req.bits.addr.poke(0x1000.U) 63 | dut.clock.step() 64 | dut.io.icache.req.valid.poke(false.B) 65 | dut.io.icache.s1_paddr.poke(0x80002000L.U) 66 | while (dut.io.icache.resp.valid.peek().litToBoolean == false) { 67 | dut.clock.step() 68 | } 69 | dut.clock.step(5) 70 | // request offset 71 | dut.io.icache.req.valid.poke(true.B) 72 | dut.io.icache.req.bits.addr.poke(0x1008.U) 73 | dut.clock.step() 74 | dut.io.icache.req.valid.poke(false.B) 75 | dut.io.icache.s1_paddr.poke(0x80002008L.U) 76 | // hit again 77 | while (dut.io.icache.resp.valid.peek().litToBoolean == false) { 78 | dut.clock.step() 79 | } 80 | dut.clock.step(5) 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/IFetchTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import chiseltest._ 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest.simulator.WriteVcdAnnotation 5 | 6 | import ogpu.core._ 7 | import ogpu.config._ 8 | 9 | class IFetchTest extends AnyFlatSpec with ChiselScalatestTester { 10 | behavior.of("IFetch") 11 | 12 | it should "perform ifetch operations correctly" in { 13 | 14 | val cfg = ICacheParams( 15 | nSets = 64, 16 | nWays = 4, 17 | paddrBits = 48, 18 | vaddrBits = 48, 19 | pgIdxBits = 48, 20 | dataBits = 64, 21 | coreId = 0, 22 | tagECC = None, 23 | dataECC = None 24 | ) 25 | 26 | implicit val p = new OGPUDefaultConfig 27 | test(new InstFetch(cfg)).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 28 | dut.io.to_icache.req.ready.poke(true.B) 29 | dut.io.inst_out.ready.poke(true.B) 30 | dut.io.to_ptw.ptbr.mode.poke(0x8.U) 31 | dut.io.to_ptw.req.ready.poke(true.B) 32 | dut.clock.step(5) 33 | dut.io.inst_fetch.valid.poke(true.B) 34 | dut.io.inst_fetch.bits.pc.poke(0x1024.U) 35 | dut.clock.step() 36 | dut.io.inst_fetch.valid.poke(false.B) 37 | dut.clock.step(5) 38 | dut.io.to_ptw.resp.valid.poke(true.B) 39 | dut.clock.step(1) 40 | dut.io.to_ptw.resp.valid.poke(false.B) 41 | dut.clock.step(1) 42 | dut.io.to_icache.resp.valid.poke(true.B) 43 | dut.clock.step(1) 44 | dut.io.to_icache.resp.valid.poke(true.B) 45 | dut.clock.step(1) 46 | dut.io.to_icache.resp.valid.poke(false.B) 47 | 48 | dut.clock.step(20) 49 | 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/test/scala/PTWTest.scala: -------------------------------------------------------------------------------- 1 | import chiseltest._ 2 | import
chisel3._ 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest.simulator.WriteVcdAnnotation 5 | 6 | import ogpu.core._ 7 | 8 | class PTWTest extends AnyFlatSpec with ChiselScalatestTester { 9 | behavior.of("PTW") 10 | 11 | it should "perform ptw operations correctly" in { 12 | 13 | val ptw_param = PTWParameter(paddrBits = 48, vaddrBits = 48) 14 | val cache_param = CacheParameter(paddrBits = 48, vaddrBits = 48) 15 | test(new PTW(1, ptw_param, cache_param)).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 16 | println("ptw test start") 17 | dut.io.mem.req.ready.poke(true.B) 18 | dut.clock.step(5) 19 | dut.io.requestor(0).req.valid.poke(true.B) 20 | dut.io.requestor(0).req.bits.valid.poke(true.B) 21 | dut.io.requestor(0).req.bits.bits.addr.poke("x0012345678".U) 22 | dut.clock.step(1) 23 | dut.io.requestor(0).req.valid.poke(false.B) 24 | dut.clock.step(5) 25 | dut.io.mem.resp.valid.poke(true.B) 26 | dut.io.mem.resp.bits.data.poke("x8000ffc01".U) // level 0 ppn 0x2003ff 27 | dut.clock.step(1) 28 | dut.io.mem.resp.valid.poke(false.B) 29 | dut.clock.step(4) 30 | dut.io.mem.resp.valid.poke(true.B) 31 | dut.io.mem.resp.bits.data.poke("x1".U) // level 1 32 | dut.clock.step(1) 33 | dut.io.mem.resp.valid.poke(false.B) 34 | dut.clock.step(3) 35 | dut.io.mem.resp.valid.poke(true.B) 36 | dut.io.mem.resp.bits.data.poke("x40001".U) // level 2 ppn 0x100 37 | dut.clock.step(1) 38 | dut.io.mem.resp.valid.poke(false.B) 39 | while (dut.io.requestor(0).resp.valid.peek().litToBoolean == false) { 40 | dut.clock.step(1) 41 | } 42 | dut.io.requestor(0).resp.bits.pte.ppn.expect(0x100.U) 43 | dut.clock.step(5) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/TLBTest.scala: -------------------------------------------------------------------------------- 1 | import chiseltest._ 2 | import chisel3._ 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest.simulator.WriteVcdAnnotation 5 | 6 | import ogpu.core._ 7 | 8 | class TLBTest extends AnyFlatSpec with ChiselScalatestTester { 9 | behavior.of("TLB") 10 | 11 | it should "perform tlb operations correctly" in { 12 | 13 | val tlb_param = TLBParameter(nSets = 32, nWays = 4, paddrBits = 48, vaddrBits = 48) 14 | test(new TLB(true, tlb_param)).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 15 | println("tlb test start") 16 | dut.clock.step(5) 17 | dut.io.ptw.ptbr.mode.poke(0x8.U) 18 | dut.io.ptw.req.ready.poke(1.B) 19 | dut.io.req.bits.vaddr.poke(0x1024.U) 20 | dut.io.req.bits.passthrough.poke(true.B) 21 | dut.io.req.bits.size.poke(2.U) 22 | dut.io.req.valid.poke(true.B) 23 | while (dut.io.req.ready.peek().litToBoolean == false) { 24 | dut.clock.step(1) 25 | } 26 | dut.clock.step(1) 27 | dut.io.req.valid.poke(false.B) 28 | dut.io.resp.paddr.expect(0x1024.U) 29 | println(s" tlb return miss? 
${dut.io.resp.miss.peek().litToBoolean}") 30 | println(s" tlb return paddr ${dut.io.resp.paddr.peek()}") 31 | dut.clock.step(5) 32 | dut.io.req.bits.vaddr.poke(0x80000.U) 33 | dut.io.req.bits.passthrough.poke(false.B) 34 | dut.io.req.valid.poke(true.B) 35 | while (dut.io.req.ready.peek().litToBoolean == false) { 36 | dut.clock.step(1) 37 | } 38 | dut.clock.step(1) 39 | dut.io.req.valid.poke(false.B) 40 | dut.clock.step(10) 41 | dut.io.ptw.resp.valid.poke(true.B) 42 | dut.clock.step(1) 43 | dut.io.ptw.resp.valid.poke(false.B) 44 | dut.clock.step(5) 45 | dut.io.req.bits.vaddr.poke(0x80008.U) 46 | dut.io.req.valid.poke(true.B) 47 | while (dut.io.req.ready.peek().litToBoolean == false) { 48 | dut.clock.step(1) 49 | } 50 | dut.clock.step(1) 51 | dut.io.req.valid.poke(false.B) 52 | dut.io.resp.paddr.expect(0x8.U) 53 | dut.io.resp.miss.expect(false.B) 54 | dut.clock.step(5) 55 | dut.io.req.bits.vaddr.poke(0x180008.U) // cache conflict 56 | dut.io.req.valid.poke(true.B) 57 | while (dut.io.req.ready.peek().litToBoolean == false) { 58 | dut.clock.step(1) 59 | } 60 | dut.clock.step(1) 61 | dut.io.req.valid.poke(false.B) 62 | dut.clock.step(10) 63 | dut.io.ptw.resp.valid.poke(true.B) 64 | dut.io.ptw.resp.bits.pte.ppn.poke(0x3030.U) 65 | dut.clock.step(1) 66 | dut.io.ptw.resp.valid.poke(false.B) 67 | dut.clock.step(10) 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/test/scala/WarpSchedulerTest.scala: -------------------------------------------------------------------------------- 1 | import chisel3._ 2 | import chiseltest._ 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest.simulator.WriteVcdAnnotation 5 | 6 | import ogpu.config._ 7 | import ogpu.core._ 8 | 9 | class WarpSchedulerTest extends AnyFlatSpec with ChiselScalatestTester { 10 | behavior.of("WarpScheduler") 11 | 12 | it should "perform warp scheduler operations correctly" in { 13 | implicit val p = new OGPUDefaultConfig 14 | test(new WarpScheduler()).withAnnotations(Seq(WriteVcdAnnotation)) { dut => 15 | println("warp sched test start") 16 | dut.io.warp_cmd.valid.poke(1.B) 17 | dut.io.warp_cmd.bits.mask(0).poke(1.B) 18 | dut.io.warp_cmd.bits.mask(3).poke(1.B) 19 | dut.io.warp_cmd.bits.vgpr_num.poke(2.U) 20 | dut.io.warp_cmd.bits.pc.poke(0x800000000L) 21 | dut.io.vgpr_commit.ready.poke(1.B) 22 | if (dut.io.warp_cmd.ready.peek().litToBoolean == false) 23 | println("warp cmd ready is false") 24 | while (dut.io.warp_cmd.ready.peek().litToBoolean == false) { 25 | dut.clock.step(1) 26 | } 27 | dut.clock.step(1) 28 | dut.io.warp_cmd.valid.poke(0.B) 29 | dut.clock.step(5) 30 | } 31 | } 32 | } 33 | --------------------------------------------------------------------------------