├── sw
│   ├── Makefile
│   └── BandwidthBenchmark.cpp
├── src
│   ├── TestAXIRouter.scala
│   ├── Elaborate.scala
│   ├── Interfaces.scala
│   ├── BandwidthProbe.scala
│   ├── Util.scala
│   ├── deprecated
│   │   ├── Interfaces.scala
│   │   ├── NVMeBlackBox.scala
│   │   └── NVMe.scala
│   ├── AXI2NVMeRam.scala
│   ├── NVMeLatencyBenchmarkTop.scala
│   ├── NVMeBandwidthBenchmarkTop.scala
│   ├── NVMeLatencyBenchmark.scala
│   ├── NVMeBandwidthBenchmark.scala
│   └── NVMeCore.scala
├── tb
│   ├── tb_TestLatencyBucket.sv
│   └── tb_TestAXIRouter.sv
├── sv
│   └── NVMeBenchmarkTop.xdc
└── README.md
/sw/Makefile:
--------------------------------------------------------------------------------
# Build the NVMe benchmark host program (links against libqdma).
# NOTE: the source is C++, so use CXX rather than CC, and keep the
# linker library in LDLIBS instead of (mis)naming it a compiler flag.
CXX    = g++
LDLIBS = -lqdma

# These targets produce no files of their own.
.PHONY: default_target all clean

default_target: all

all: NVMeBenchmark

NVMeBenchmark: NVMeBenchmark.cpp
	$(CXX) NVMeBenchmark.cpp $(LDLIBS) -o NVMeBenchmark

clean:
	rm -f NVMeBenchmark
--------------------------------------------------------------------------------
/src/TestAXIRouter.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.storage._
7 | import common.axi._
8 | import qdma._
9 |
/** Test harness that routes an incoming AXI slave-bridge interface to one of
 *  three NVMeRamIO ports, selected by address bits 27:26 of the AW/AR channels.
 *  Each routed AXI stream is converted to the RAM-style interface via
 *  AXI2NVMeRam.
 */
class TestAXIRouter extends Module {
  val io = IO(new Bundle {
    val axibIn = Flipped(new AXIB)     // AXI slave bridge input to be routed
    val ramOut = Vec(3, new NVMeRamIO) // One RAM-style port per routing target
  })

  val axibRt = AXIRouter(3, io.axibIn)
  // NOTE(review): io.axibIn is already passed to AXIRouter.apply above; this
  // explicit connection may be redundant — confirm against AXIRouter's code.
  axibRt.io.in <> io.axibIn
  // Write routing: addr(27) set -> port 2; else addr(26) set -> port 1; else port 0.
  axibRt.io.wrIdx := Mux(
    axibRt.io.in.aw.bits.addr(27),
    2.U,
    Mux(axibRt.io.in.aw.bits.addr(26), 1.U, 0.U)
  )
  // Read routing uses the same address decode on the AR channel.
  axibRt.io.rdIdx := Mux(
    axibRt.io.in.ar.bits.addr(27),
    2.U,
    Mux(axibRt.io.in.ar.bits.addr(26), 1.U, 0.U)
  )
  for (idx <- 0 until 3) {
    io.ramOut(idx) <> AXI2NVMeRam(axibRt.io.out(idx))
  }
}
--------------------------------------------------------------------------------
/src/Elaborate.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 | import chisel3._
3 | import chisel3.util._
4 | import common._
5 | import common.storage._
6 | import qdma._
7 | import chisel3.stage.{ChiselGeneratorAnnotation, ChiselStage}
8 | import firrtl.options.TargetDirAnnotation
9 |
/** Command-line entry point: elaborates the top-level module named by the
 *  first argument into SystemVerilog under the "Verilog" directory.
 */
object elaborate extends App {
  // Fail with a clear message instead of ArrayIndexOutOfBoundsException.
  require(args.nonEmpty, "Usage: elaborate <ModuleName>")
  println("Generating a %s class".format(args(0)))
  val stage = new ChiselStage // ChiselStage is already imported; no need to re-qualify
  val arr = Array("-X", "sverilog", "--full-stacktrace")
  val dir = TargetDirAnnotation("Verilog")

  args(0) match {
    case "NVMeBandwidthBenchmarkTop" => stage.execute(arr, Seq(ChiselGeneratorAnnotation(() => new NVMeBandwidthBenchmarkTop()), dir))
    case "NVMeLatencyBenchmarkTop"   => stage.execute(arr, Seq(ChiselGeneratorAnnotation(() => new NVMeLatencyBenchmarkTop()), dir))
    case "TestAXIRouter"             => stage.execute(arr, Seq(ChiselGeneratorAnnotation(() => new TestAXIRouter()), dir))
    case "BandwidthProbe"            => stage.execute(arr, Seq(ChiselGeneratorAnnotation(() => new BandwidthProbe(100, 4096)), dir))
    case "LatencyBucket"             => stage.execute(arr, Seq(ChiselGeneratorAnnotation(() => new LatencyBucket(32, 1)), dir))
    case other =>
      // Exit non-zero so build scripts notice a typo instead of silently succeeding.
      System.err.println(s"Module match failed: unknown module '$other'")
      sys.exit(1)
  }
}
--------------------------------------------------------------------------------
/tb/tb_TestLatencyBucket.sv:
--------------------------------------------------------------------------------
// Simulation timescale so the fractional clock delay (#0.5) resolves to
// 500 ps. Without a `timescale the time unit is tool-dependent and #0.5
// may be rounded (possibly to zero), freezing the clock.
`timescale 1ns / 1ps

// Testbench for LatencyBucket: pulses io_start/io_end pairs to record a few
// latency samples, sweeps io_bucketRdId over all 32 buckets to read them
// back, then pulses io_resetBucket.
module testbench_LatencyBucket(

);

reg         clock          = 0;
reg         reset          = 0;
reg         io_enable      = 0;
reg         io_start       = 0;
reg         io_end         = 0;
reg  [4:0]  io_bucketRdId  = 0;
wire [31:0] io_bucketValue;
reg         io_resetBucket = 0;
wire        io_resetDone;


LatencyBucket LatencyBucket_inst(
    .*
);


initial begin
    reset <= 1;
    clock = 1;
    #100;
    reset <= 0;
    io_enable <= 1;
    #6;
    // First latency sample: start, then end 4 time units later.
    io_start <= 1;
    #2;
    io_start <= 0;
    #2;
    io_end <= 1;
    #1; // was "#1" with no ';' — equivalent, made explicit for consistency
    io_end <= 0;
    #5;
    // Second sample: end and a new start in the same region.
    io_end <= 1;
    #1;
    io_end <= 0;
    io_start <= 1;
    #1;
    io_start <= 0;
    #10;
    io_end <= 1;
    #1;
    io_end <= 0;
    #10;
    io_enable <= 0;
    // Read back every bucket (addresses 0..31).
    io_bucketRdId <= 0;
    while (io_bucketRdId < 31) begin
        #5;
        io_bucketRdId <= io_bucketRdId + 'd1;
    end
    #5;
    io_resetBucket <= 1;
    #1;
    io_resetBucket <= 0;
    #50;
    $stop();
end

// 1 ns clock period with the timescale above.
always #0.5 clock = ~clock;

endmodule
--------------------------------------------------------------------------------
/src/Interfaces.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.storage._
7 | import common.axi._
8 | import common.ToZero
9 |
/** RAM-style request interface used by the NVMe core for its SQ/CQ storage:
 *  one read port and one byte-masked write port, 512 bits wide.
 *  Addresses appear 64-byte aligned in practice (see AXI2NVMeRam) —
 *  confirm with NVMeCore before relying on sub-word addressing.
 */
class NVMeRamIO extends Bundle{
	val readEnable = Output(UInt(1.W))   // Read strobe for this cycle
	val readAddr = Output(UInt(64.W))    // Address of the 512-bit word to read
	val readData = Input(UInt(512.W))    // Data returned by the RAM
	val writeMask = Output(UInt(64.W))   // Per-byte write enable for writeData
	val writeAddr = Output(UInt(64.W))   // Address of the 512-bit word to write
	val writeData = Output(UInt(512.W))  // Write data
}
18 |
/** Per-SSD initialization record delivered through NVMeCoreControl.ssdSetup. */
class SSDSetup extends Bundle {
	val ssdId = Output(UInt(32.W))      // Index of the SSD being initialized
	val ssdBarAddr = Output(UInt(64.W)) // Physical address of the SSD's BAR 0 (from the host)
}
23 |
/** Control inputs for the NVMe core. */
class NVMeCoreControl extends Bundle {
	// High: the core accepts and processes new commands. Low: it drains
	// in-flight commands but accepts no new ones.
	val enable = Output(Bool())
	// One-shot setup record per SSD (valid-qualified).
	val ssdSetup = Valid(new SSDSetup)
}
28 |
/** Static configuration the host reads back so it can create I/O queues with
 *  matching geometry and addressing. Bit-position fields presumably locate
 *  queue/SSD indices inside RAM addresses — confirm against NVMeCore.
 */
class NVMeParameters extends Bundle {
	val ssdNum = Output(UInt(32.W))      // Number of SSDs managed
	val queueNum = Output(UInt(32.W))    // I/O queues per SSD
	val queueLowBit = Output(UInt(32.W)) // Low bit of the queue index field (assumed — verify)
	val ssdLowBit = Output(UInt(32.W))   // Low bit of the SSD index field (assumed — verify)
	val queueDepth = Output(UInt(32.W))  // Depth of each I/O queue
	val ramTypeBit = Output(UInt(32.W))  // Bit selecting RAM region type (assumed — verify)
}
37 |
/** Benchmark counters accumulated while the core is enabled. */
class NVMeStat extends Bundle {
	val executeTime = Output(UInt(64.W))  // Total execution time
	val successfulOp = Output(UInt(32.W)) // Commands completed successfully
	val failedOp = Output(UInt(32.W))     // Commands the SSDs failed to process
	val totalLatency = Output(UInt(64.W)) // Summed per-command latency, in cycles
}
44 |
/** Status outputs of the NVMe core: liveness flag, queue parameters for the
 *  host, and benchmark statistics.
 */
class NVMeCoreStatus extends Bundle {
	val running = Output(Bool()) // High while commands are being accepted or processed
	val params = new NVMeParameters
	val stat = new NVMeStat
}
50 |
/** Completion notification for one SSD command. */
class SSDCompletion extends Bundle {
	val cmdId = Output(UInt(16.W)) // Command identifier being completed
	val status = Output(UInt(8.W)) // Completion status (presumably NVMe CQE status — verify)
}
--------------------------------------------------------------------------------
/sv/NVMeBenchmarkTop.xdc:
--------------------------------------------------------------------------------
# Timing/placement constraints for the NVMe benchmark top-level design.

# 100 MHz PCIe reference clock on the differential input pair.
create_clock -name sys_clk -period 10 [get_ports qdma_pin_sys_clk_p]

# PCIe reset is asynchronous to the design clocks.
set_false_path -from [get_ports qdma_pin_sys_rst_n]
set_property PULLUP true [get_ports qdma_pin_sys_rst_n]
set_property IOSTANDARD LVCMOS18 [get_ports qdma_pin_sys_rst_n]
set_property PACKAGE_PIN AW27 [get_ports qdma_pin_sys_rst_n]
set_property CONFIG_VOLTAGE 1.8 [current_design]

# Derive the reference-clock package pins from the GT quad containing channel X1Y7.
set_property LOC [get_package_pins -of_objects [get_bels [get_sites -filter {NAME =~ *COMMON*} -of_objects [get_iobanks -of_objects [get_sites GTYE4_CHANNEL_X1Y7]]]/REFCLK0P]] [get_ports qdma_pin_sys_clk_p]
set_property LOC [get_package_pins -of_objects [get_bels [get_sites -filter {NAME =~ *COMMON*} -of_objects [get_iobanks -of_objects [get_sites GTYE4_CHANNEL_X1Y7]]]/REFCLK0N]] [get_ports qdma_pin_sys_clk_n]

# Status LED.
set_property PACKAGE_PIN J18 [get_ports led]
set_property IOSTANDARD LVCMOS18 [get_ports led]

# create_clock -name sys_100M_clock_0 -period 10 -add [get_ports sys_100M_0_p]

# set_property PACKAGE_PIN BJ43 [get_ports sys_100M_0_p]
# set_property PACKAGE_PIN BJ44 [get_ports sys_100M_0_n]
# set_property IOSTANDARD DIFF_SSTL12 [get_ports sys_100M_0_p]
# set_property IOSTANDARD DIFF_SSTL12 [get_ports sys_100M_0_n]

# Control/status registers cross clock domains through the QDMA register file;
# exempt them from timing analysis.
set_false_path -from [get_cells -regexp {qdma/axil2reg/reg_control_[0-9]*_reg\[.*]}]
set_false_path -to [get_cells -regexp {qdma/axil2reg/reg_status_[0-9]*_reg\[.*]}]
#reg_control_0_reg[0]
#set_false_path -from [get_cells qdma/axil2reg/reg_control_[*]]
#set_false_path -to [get_cells qdma/axil_reg_0/reg_status_[*]]

### First stage of every two-flop synchronizer is exempt from timing.
set_false_path -to [get_pins -hier *sync_reg[0]/D]
### Debug hub configuration for ILA access over the QDMA scan chain.
set_property C_USER_SCAN_CHAIN 1 [get_debug_cores dbg_hub]
connect_debug_port dbg_hub/clk [get_nets dbg_clk_pad_O]
--------------------------------------------------------------------------------
/src/BandwidthProbe.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 | import chisel3._
3 | import chisel3.util._
4 | import chisel3.experimental.ChiselEnum
5 | import common.axi._
6 | import common.storage._
7 | import common._
8 | import math.ceil
9 |
/** Counts io.fire events over consecutive windows of CYCLE clock cycles while
 *  io.enable is high, pushing one 32-bit sample per window into a FIFO that
 *  is drained through io.count.
 *
 *  @param CYCLE Window length in clock cycles.
 *  @param DEPTH Number of samples the FIFO buffering can hold.
 */
class BandwidthProbe (
	CYCLE : Int = 25000000,
	DEPTH : Int = 4096
) extends Module {
	val io = IO(new Bundle{
		val enable = Input(Bool())                 // Counting runs only while high
		val fire = Input(Bool())                   // One transfer event this cycle
		val count = Decoupled(Output(UInt(32.W)))  // Stream of per-window counts
	})

	val time_cnt = RegInit(UInt(log2Ceil(CYCLE).W), 0.U)  // Position in the current window
	val record_valid = time_cnt === (CYCLE-1).U           // Last cycle of the window
	val band_cnt = RegInit(UInt(32.W), 0.U)               // Events seen in the current window

	when (~io.enable) {
		// Disabled: hold both counters cleared.
		time_cnt := 0.U
		band_cnt := 0.U
	}.otherwise {
		// Wrap the window timer at CYCLE cycles.
		when (record_valid) {
			time_cnt := 0.U
		}.otherwise {
			time_cnt := time_cnt + 1.U
		}

		// On the last cycle of a window restart the event counter, counting a
		// same-cycle fire as the first event of the next window; otherwise
		// just accumulate.
		when (record_valid) {
			when (io.fire) {
				band_cnt := 1.U
			}.otherwise {
				band_cnt := 0.U
			}
		}.elsewhen (io.fire) {
			band_cnt := band_cnt + 1.U
		}
	}

	// Number of 4096-deep FIFO stages needed to buffer DEPTH samples.
	// BUGFIX: the previous ceil(DEPTH / 4096) divided two Ints, so the
	// division truncated before ceil ran — e.g. DEPTH = 6000 produced a
	// single 4096-deep FIFO, short of the requested depth. Divide as Double
	// so the chain always covers DEPTH.
	val FIFO_CNT = ceil(DEPTH.toDouble / 4096).toInt

	if (DEPTH < 4096) {
		// Shallow case (previously FIFO_CNT == 0): one queue of exactly DEPTH
		// entries. NOTE: samples are silently dropped when the queue is full
		// (q.io.in.ready is not checked) — unchanged from the original.
		val q = XQueue(UInt(32.W), DEPTH)
		q.io.out <> io.count
		q.io.in.valid := record_valid
		q.io.in.bits := band_cnt
	} else {
		// Deep case: a chain of 4096-deep queues feeding io.count.
		val q = XQueue(FIFO_CNT)(UInt(32.W), 4096)
		q(0).io.in.valid := record_valid
		q(0).io.in.bits := band_cnt
		q(FIFO_CNT-1).io.out <> io.count
		for (i <- 1 until FIFO_CNT) {
			q(i-1).io.out <> q(i).io.in
		}
	}

}
--------------------------------------------------------------------------------
/src/Util.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 |
3 | import chisel3._
4 | import chisel3.util._
5 |
/** Builders for 512-bit NVMe I/O submission-queue entries (16 dwords).
 *  nvmWrite and nvmRead differ only in the DW0 opcode, so both delegate to a
 *  shared private builder.
 */
object NVMeCommandSet {

	/** Assembles one NVMe I/O command.
	 *
	 *  @param opcode NVMe opcode for DW0 (0x01 = write, 0x02 = read).
	 *  @param id     16-bit command identifier (DW0).
	 *  @param prp1   64-bit PRP entry 1 (DW7:6).
	 *  @param prp2   64-bit PRP entry 2 (DW9:8).
	 *  @param slba   64-bit starting logical block address (DW11:10).
	 *  @param nlb    16-bit number of logical blocks, 0's based (DW12).
	 */
	private def nvmIoCommand (opcode : UInt, id : UInt, prp1 : UInt, prp2 : UInt, slba : UInt, nlb : UInt) : UInt = {

		// Width-normalize the inputs so Cat always yields a 512-bit result.

		val opSig = Wire(UInt(8.W))
		val idSig = Wire(UInt(16.W))
		val prp1Sig = Wire(UInt(64.W))
		val prp2Sig = Wire(UInt(64.W))
		val slbaSig = Wire(UInt(64.W))
		val nlbSig = Wire(UInt(16.W))

		opSig := opcode
		idSig := id
		prp1Sig := prp1
		prp2Sig := prp2
		slbaSig := slba
		nlbSig := nlb

		// Generate the NVMe-format command, most-significant dword first.

		Cat(
			// DW 15-14:
			0.U(64.W),  // End to end protection, not used
			// DW 13:
			0.U(24.W),  // Rsvd
			0.U(8.W),   // Dataset, not used generally
			// DW 12:
			1.U(1.W),   // Limited retry
			0.U(1.W),   // Forced unit access
			0.U(4.W),   // Protection information
			0.U(10.W),  // Rsvd
			nlbSig,     // Number of logical blocks, 0's based
			// DW 11-10:
			slbaSig,    // Starting LB address
			// DW 9-8:
			prp2Sig,    // PRP 2
			// DW 7-6:
			prp1Sig,    // PRP 1
			// DW 5-4:
			0.U(64.W),  // Metadata ptr, not used here
			// DW 3-2:
			0.U(64.W),  // Rsvd
			// DW 1:
			1.U(32.W),  // Namespace, typically 1 for most cases
			// DW 0:
			idSig,      // Command ID
			0.U(2.W),   // Use PRP
			0.U(4.W),   // Rsvd
			0.U(2.W),   // Fuse command
			opSig       // Opcode
		)
	}

	/** NVM write command (opcode 0x01). See [[nvmIoCommand]] for the layout. */
	def nvmWrite (id : UInt, prp1 : UInt, prp2 : UInt, slba : UInt, nlb : UInt) =
		nvmIoCommand(0x01.U(8.W), id, prp1, prp2, slba, nlb)

	/** NVM read command (opcode 0x02). See [[nvmIoCommand]] for the layout. */
	def nvmRead (id : UInt, prp1 : UInt, prp2 : UInt, slba : UInt, nlb : UInt) =
		nvmIoCommand(0x02.U(8.W), id, prp1, prp2, slba, nlb)
}
--------------------------------------------------------------------------------
/src/deprecated/Interfaces.scala:
--------------------------------------------------------------------------------
1 | package nvme.deprecated
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.storage._
7 | import common.axi._
8 | import common.ToZero
9 |
/** Simplified command descriptor used by the deprecated NVMe wrapper. */
class NVMeCommand extends Bundle{
	val op = Output(UInt(1.W))       // Operation select (presumably read vs. write — verify)
	val numLb = Output(UInt(16.W))   // Number of logical blocks
	val ssdAddr = Output(UInt(64.W)) // Address on the SSD side
	val memAddr = Output(UInt(64.W)) // Address on the memory side
}
16 |
/** Control word for the deprecated NVMe black box. Field order must match the
 *  Cat packing in Reg2NVMeControl. Register indices follow the map in
 *  NVMeBlackBox's port comments.
 */
class NVMeControl extends Bundle {
	val init_start = Input(UInt(1.W))        // Reg(32)
	val init_nsid = Input(UInt(32.W))        // Reg(33)
	val init_dma_addr = Input(UInt(64.W))    // Reg(35:34)
	val init_byp_addr = Input(UInt(64.W))    // Reg(37:36)
	val init_ssd_addr = Input(UInt(64.W))    // Reg(39:38)
	val init_ssdid = Input(UInt(32.W))       // Reg(40)
	val p2pdma_read = Input(UInt(1.W))       // Reg(64)
	val p2pdma_write = Input(UInt(1.W))      // Reg(65)
	val p2pdma_cmd_addr = Input(UInt(64.W))  // Reg(67:66)
	val p2pdma_cmd_len = Input(UInt(16.W))   // Reg(68)
	val p2pdma_c2h_data = Input(UInt(512.W)) // Reg(84:69)
	val ssd_init = Input(UInt(1.W))          // Reg(128)
	val exec_start = Input(UInt(1.W))        // Reg(160)
	val exec_time = Input(UInt(32.W))        // Reg(163)
	val band_tr_en = Input(UInt(1.W))        // Reg(165)
	val band_tr_read = Input(UInt(1.W))      // Reg(166)
}
35 |
/** Packs selected QDMA control registers into an NVMeControl bundle.
 *
 *  The Cat arguments are listed most-significant-first and must stay in the
 *  exact declaration order of NVMeControl's fields, because the concatenation
 *  is reinterpreted with asTypeOf. Multi-register values are assembled
 *  high-register-first (e.g. Cat(reg(35), reg(34)) forms the 64-bit
 *  init_dma_addr).
 */
object Reg2NVMeControl {
	def apply(controlReg : Vec[UInt]) : NVMeControl = {
		val target = Wire(new NVMeControl)
		val source = Cat(
			controlReg(32)(0),                   // init_start
			controlReg(33),                      // init_nsid
			Cat(controlReg(35), controlReg(34)), // init_dma_addr
			Cat(controlReg(37), controlReg(36)), // init_byp_addr
			Cat(controlReg(39), controlReg(38)), // init_ssd_addr
			controlReg(40),                      // init_ssdid
			controlReg(64)(0),                   // p2pdma_read
			controlReg(65)(0),                   // p2pdma_write
			Cat(controlReg(67), controlReg(66)), // p2pdma_cmd_addr
			controlReg(68)(15, 0),               // p2pdma_cmd_len
			// p2pdma_c2h_data: 512 bits over registers 84:69, high first.
			Cat(controlReg(84), controlReg(83), controlReg(82), controlReg(81), controlReg(80), controlReg(79), controlReg(78), controlReg(77), controlReg(76), controlReg(75), controlReg(74), controlReg(73), controlReg(72), controlReg(71), controlReg(70), controlReg(69)),
			controlReg(128)(0),                  // ssd_init
			controlReg(160)(0),                  // exec_start
			controlReg(163),                     // exec_time
			controlReg(165)(0),                  // band_tr_en
			controlReg(166)(0),                  // band_tr_read
		)
		target := source.asTypeOf(new NVMeControl)
		target
	}
}
61 |
/** Status word of the deprecated NVMe black box. Register offsets are the
 *  ones written by NVMeStatus2Reg.
 */
class NVMeStatus extends Bundle {
	val p2pdma_h2c_data = Output(UInt(512.W)) // Reg(111:96)
	val p2pdma_h2c_done = Output(UInt(1.W))   // Reg(128)
	val p2pdma_c2h_done = Output(UInt(1.W))   // Reg(129)
	val nvme_init_done = Output(UInt(1.W))    // Reg(160)
	val nvme_exec_done = Output(UInt(1.W))    // Reg(192)
	val stat_op_succ = Output(UInt(32.W))     // Reg(193)
	val stat_op_fail = Output(UInt(32.W))     // Reg(194)
	val stat_exec_time = Output(UInt(64.W))   // Reg(197:196)
	val stat_io_ssd0 = Output(UInt(32.W))     // Reg(200)
	val stat_io_ssd1 = Output(UInt(32.W))     // Reg(201)
	val stat_io_ssd2 = Output(UInt(32.W))     // Reg(202)
	val stat_io_ssd3 = Output(UInt(32.W))     // Reg(203)
	val band_tr_rd = Output(UInt(32.W))       // Reg(216)
	val band_tr_wr = Output(UInt(32.W))       // Reg(217)
}
78 |
/** Scatters the fields of an NVMeStatus bundle into the QDMA status register
 *  file. Offsets mirror the register map commented in NVMeBlackBox.
 */
object NVMeStatus2Reg {
	def apply(source : NVMeStatus, statusReg : Vec[UInt])= {
		// (Removed an unused `var i = 0;` — the for-comprehensions below bind
		// their own generator variables.)
		// 512-bit p2pdma data spans 16 consecutive registers (111:96).
		for (i <- 0 until 16) {
			statusReg(96+i) := source.p2pdma_h2c_data(i*32+31, i*32)
		}
		// 64-bit execution time spans two registers (197:196).
		for (i <- 0 until 2) {
			statusReg(196+i) := source.stat_exec_time(i*32+31, i*32)
		}
		statusReg(128) := source.p2pdma_h2c_done
		statusReg(129) := source.p2pdma_c2h_done
		statusReg(160) := source.nvme_init_done
		statusReg(192) := source.nvme_exec_done
		statusReg(193) := source.stat_op_succ
		statusReg(194) := source.stat_op_fail
		statusReg(200) := source.stat_io_ssd0
		statusReg(201) := source.stat_io_ssd1
		statusReg(202) := source.stat_io_ssd2
		statusReg(203) := source.stat_io_ssd3
		statusReg(216) := source.band_tr_rd
		statusReg(217) := source.band_tr_wr
	}
}
--------------------------------------------------------------------------------
/src/AXI2NVMeRam.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.axi._
7 | import common.storage._
8 | import common.connection._
9 |
/** Converts a full AXI4 slave interface into the RAM-style NVMeRamIO bundle
 *  consumed by the NVMe core. Addresses are truncated to 64-byte (512-bit
 *  word) alignment on both channels.
 */
object AXI2NVMeRam {
	/** Instantiates the converter for `in`'s widths and returns the resulting
	 *  NVMeRamIO.
	 */
	def apply(in : AXI) = {
		val inst = Module(new AXI2NVMeRam(
			in.ar.bits.addr.getWidth,
			in.r.bits.data.getWidth,
			in.ar.bits.id.getWidth,
			in.ar.bits.user.getWidth,
			in.ar.bits.len.getWidth
		))
		val out = Wire(new NVMeRamIO)
		inst.io.in <> in
		inst.io.out <> out

		out
	}

	class AXI2NVMeRam(ADDR_WIDTH:Int, DATA_WIDTH:Int, ID_WIDTH:Int, USER_WIDTH:Int, LEN_WIDTH:Int) extends Module {
		val io = IO(new Bundle{
			val in = Flipped(new AXI(ADDR_WIDTH, DATA_WIDTH, ID_WIDTH, USER_WIDTH, LEN_WIDTH))
			val out = new NVMeRamIO
		})

		// W channels
		//
		// Write path: AW is accepted in sWrReq; the first W beat may land in
		// the same cycle as AW (wrFirstBeat). Subsequent beats advance a
		// registered 64-byte-stepped address until WLAST, then a B response
		// is issued with the buffered AWID.

		val rWid = RegInit(UInt(ID_WIDTH.W), 0.U)           // AWID saved for the B response
		val rNextWrAddr = RegInit(UInt(ADDR_WIDTH.W), 0.U)  // Address for the next write beat

		val sWrReq :: sWrData :: Nil = Enum(2)
		val wrSt = RegInit(sWrReq)

		val wrFirstBeat = (wrSt === sWrReq) && io.in.aw.fire && io.in.w.fire
		val wrRemainBeat = (wrSt === sWrData) && io.in.w.fire

		// Queue of pending B-response IDs; RegSlice decouples timing on the input.
		val backFifo = XQueue(UInt(ID_WIDTH.W), 32)
		val backFifoIn = Wire(Decoupled(UInt(ID_WIDTH.W)))

		backFifo.io.in <> RegSlice(2)(backFifoIn)

		// Drive the RAM write port; mask is zero on non-fire cycles.
		io.out.writeMask := Mux(io.in.w.fire, io.in.w.bits.strb, 0.U)
		io.out.writeAddr := Mux(wrFirstBeat, Cat(io.in.aw.bits.addr(63, 6), 0.U(6.W)), rNextWrAddr)
		io.out.writeData := io.in.w.bits.data

		io.in.aw.ready := (wrSt === sWrReq)
		io.in.w.ready := backFifoIn.ready
		io.in.b.bits.id := backFifo.io.out.bits
		io.in.b.bits.resp := 0.U  // Always OKAY
		io.in.b.bits.user := 0.U
		io.in.b.valid := backFifo.io.out.valid
		backFifo.io.out.ready := io.in.b.ready

		// Enqueue one B response per burst, on the last W beat.
		backFifoIn.valid := io.in.w.fire && io.in.w.bits.last.asBool
		backFifoIn.bits := rWid

		switch (wrSt) {
			is (sWrReq) {
				when (io.in.aw.fire) { // Received a request
					when (io.in.w.fire && io.in.w.bits.last.asBool) { // 1-beat data, already handled.
						wrSt := sWrReq
					}.otherwise {
						wrSt := sWrData
					}
				}.otherwise {
					wrSt := sWrReq
				}
			}
			is (sWrData) {
				when (io.in.w.fire && io.in.w.bits.last.asBool) { // Last beat ends
					wrSt := sWrReq
				}.otherwise {
					wrSt := sWrData
				}
			}
		}

		// Track the next write address: start at the aligned AW address
		// (plus 0x40 if the first beat was consumed this cycle), then step
		// by 0x40 (64 bytes) per remaining beat.
		when (io.in.aw.fire) {
			rWid := io.in.aw.bits.id
			rNextWrAddr := Mux(
				wrFirstBeat,
				Cat(io.in.aw.bits.addr(63, 6), 0.U(6.W)) + "h40".U,
				Cat(io.in.aw.bits.addr(63, 6), 0.U(6.W))
			)
		}.elsewhen (wrRemainBeat) {
			when (!io.in.w.bits.last.asBool){
				rNextWrAddr := rNextWrAddr + "h40".U
			}
		}

		// R channels
		//
		// Read path: AR is accepted in sRdReq; the RAM read port is strobed
		// once per beat and data is returned on R. rLen counts remaining
		// beats; RLAST is asserted when it reaches zero.

		val rRid = RegInit(UInt(ID_WIDTH.W), 0.U)           // ARID echoed on R
		val rLen = RegInit(UInt(LEN_WIDTH.W), 0.U)          // Remaining beats in the burst
		val rNextRdAddr = RegInit(UInt(ADDR_WIDTH.W), 0.U)  // Address for the next read beat

		val sRdReq :: sRdData :: Nil = Enum(2)
		val rdSt = RegInit(sRdReq)

		val rdFirstBeat = (rdSt === sRdReq) && io.in.ar.fire
		val rdRemainBeat = (rdSt === sRdData) && io.in.r.fire && (rLen =/= 0.U)

		io.out.readAddr := Mux(rdFirstBeat, Cat(io.in.ar.bits.addr(63, 6), 0.U(6.W)), rNextRdAddr)
		io.out.readEnable := rdFirstBeat || rdRemainBeat

		io.in.ar.ready := (rdSt === sRdReq)
		io.in.r.valid := (rdSt === sRdData)
		io.in.r.bits.id := rRid
		io.in.r.bits.user := 0.U
		io.in.r.bits.last := (rdSt === sRdData && rLen === 0.U)
		io.in.r.bits.data := io.out.readData
		io.in.r.bits.resp := 0.U  // Always OKAY

		switch (rdSt) {
			is (sRdReq) {
				when (io.in.ar.fire) {
					rdSt := sRdData
				}.otherwise {
					rdSt := sRdReq
				}
			}
			is (sRdData) {
				when (io.in.r.fire && io.in.r.bits.last.asBool) {
					rdSt := sRdReq
				}.otherwise {
					rdSt := sRdData
				}
			}
		}

		// Latch burst parameters on AR, then step the address and decrement
		// the beat count as data is accepted.
		when (rdFirstBeat) {
			rLen := io.in.ar.bits.len
			rNextRdAddr := Cat(io.in.ar.bits.addr(63, 6), 0.U(6.W)) + "h40".U
			rRid := io.in.ar.bits.id
		}.elsewhen(rdRemainBeat) {
			when (rLen =/= 1.U) {
				rNextRdAddr := rNextRdAddr + "h40".U
			}
			rLen := rLen - 1.U
		}
	}
}
--------------------------------------------------------------------------------
/src/deprecated/NVMeBlackBox.scala:
--------------------------------------------------------------------------------
1 | package nvme.deprecated
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.storage._
7 | import common.axi._
8 | import common.ToZero
9 | import qdma._
10 |
/** Deprecated Verilog black box for the NVMe controller. Parameters are
 *  forwarded to the Verilog module; note that Chisel's SSD_MAX_ID /
 *  QUEUE_MAX_ID are 0-based maxima, mapped onto the Verilog SSD_NUM /
 *  USED_QUEUE_MAX_ID parameters. The `// Reg(n)` comments give the QDMA
 *  control/status register index backing each port.
 */
class NVMeBlackBox (
	SSD_MAX_ID : Int = 0,
	BUFFER_DATA_SHIFT : Int = 27,
	SSD_NUM_SHIFT : Int = 2,
	QUEUE_DEPTH_SHIFT : Int = 8,
	QUEUE_MAX_ID : Int = 3,
	QUEUE_COUNT_SHIFT : Int = 2,
	MAX_SQ_INTERVAL : Int = 30
) extends BlackBox(Map(
	"SSD_NUM" -> SSD_MAX_ID,
	"BRIDGE_DATA_SHIFT" -> BUFFER_DATA_SHIFT,
	"SSD_NUM_SHIFT" -> SSD_NUM_SHIFT,
	"QUEUE_DEPTH_SHIFT" -> QUEUE_DEPTH_SHIFT,
	"QUEUE_COUNT_SHIFT" -> QUEUE_COUNT_SHIFT,
	"MAX_SQ_INTERVAL" -> MAX_SQ_INTERVAL,
	"USED_QUEUE_MAX_ID" -> QUEUE_MAX_ID
)) {
	// Total I/O queues across all SSDs; sizes the flattened command vectors below.
	val QUEUE_NUM = (SSD_MAX_ID+1) * (QUEUE_MAX_ID+1)

	val io = IO(new Bundle{
		val clk_core = Input(Clock())
		val sys_reset = Input(Bool())

		// Status outputs, mirrored into the QDMA status register file.
		val status_p2pdma_h2c_data = Output(UInt(512.W)) // Reg(111:96)
		val status_p2pdma_h2c_done = Output(UInt(1.W)) // Reg(128)
		val status_p2pdma_c2h_done = Output(UInt(1.W)) // Reg(129)
		val status_nvme_init_done = Output(UInt(1.W)) // Reg(160)
		val status_nvme_exec_done = Output(UInt(1.W)) // Reg(192)
		val status_stat_op_succ = Output(UInt(32.W)) // Reg(193)
		val status_stat_op_fail = Output(UInt(32.W)) // Reg(194)
		val status_stat_exec_time = Output(UInt(64.W)) // Reg(197:196)
		val status_stat_io_ssd0 = Output(UInt(32.W)) // Reg(200)
		val status_stat_io_ssd1 = Output(UInt(32.W)) // Reg(201)
		val status_stat_io_ssd2 = Output(UInt(32.W)) // Reg(202)
		val status_stat_io_ssd3 = Output(UInt(32.W)) // Reg(203)
		val status_band_tr_rd = Output(UInt(32.W)) // Reg(216)
		val status_band_tr_wr = Output(UInt(32.W)) // Reg(217)

		// Control inputs, driven from the QDMA control register file.
		val control_init_start = Input(UInt(1.W)) // Reg(32)
		val control_init_nsid = Input(UInt(32.W)) // Reg(33)
		val control_init_dma_addr = Input(UInt(64.W)) // Reg(35:34)
		val control_init_byp_addr = Input(UInt(64.W)) // Reg(37:36)
		val control_init_ssd_addr = Input(UInt(64.W)) // Reg(39:38)
		val control_init_ssdid = Input(UInt(32.W)) // Reg(40)
		val control_p2pdma_read = Input(UInt(1.W)) // Reg(64)
		val control_p2pdma_write = Input(UInt(1.W)) // Reg(65)
		val control_p2pdma_cmd_addr = Input(UInt(64.W)) // Reg(67:66)
		val control_p2pdma_cmd_len = Input(UInt(16.W)) // Reg(68)
		val control_p2pdma_c2h_data = Input(UInt(512.W)) // Reg(84:69)
		val control_ssd_init = Input(UInt(1.W)) // Reg(128)
		val control_exec_start = Input(UInt(1.W)) // Reg(160)
		val control_exec_time = Input(UInt(32.W)) // Reg(163)
		val control_band_tr_en = Input(UInt(1.W)) // Reg(165)
		val control_band_tr_read = Input(UInt(1.W)) // Reg(166)

		// AXI slave-bridge master port (flattened to individual signals for
		// the Verilog black box).
		val s_axib_awid = Output(UInt(4.W))
		val s_axib_awaddr = Output(UInt(64.W))
		val s_axib_awlen = Output(UInt(8.W))
		val s_axib_awsize = Output(UInt(3.W))
		val s_axib_awburst = Output(UInt(2.W))
		val s_axib_awuser = Output(UInt(12.W))
		val s_axib_awregion = Output(UInt(4.W))
		val s_axib_awvalid = Output(UInt(1.W))
		val s_axib_awready = Input(UInt(1.W))
		val s_axib_wdata = Output(UInt(512.W))
		val s_axib_wstrb = Output(UInt(64.W))
		val s_axib_wlast = Output(UInt(1.W))
		val s_axib_wuser = Output(UInt(64.W))
		val s_axib_wvalid = Output(UInt(1.W))
		val s_axib_wready = Input(UInt(1.W))
		val s_axib_bid = Input(UInt(4.W))
		val s_axib_bresp = Input(UInt(2.W))
		val s_axib_bvalid = Input(UInt(1.W))
		val s_axib_bready = Output(UInt(1.W))
		val s_axib_arid = Output(UInt(4.W))
		val s_axib_araddr = Output(UInt(64.W))
		val s_axib_aruser = Output(UInt(12.W))
		val s_axib_arlen = Output(UInt(8.W))
		val s_axib_arsize = Output(UInt(3.W))
		val s_axib_arburst = Output(UInt(2.W))
		val s_axib_arregion = Output(UInt(4.W))
		val s_axib_arvalid = Output(UInt(1.W))
		val s_axib_arready = Input(UInt(1.W))
		val s_axib_rid = Input(UInt(4.W))
		val s_axib_rdata = Input(UInt(512.W))
		val s_axib_rresp = Input(UInt(2.W))
		val s_axib_ruser = Input(UInt(64.W))
		val s_axib_rlast = Input(UInt(1.W))
		val s_axib_rvalid = Input(UInt(1.W))
		val s_axib_rready = Output(UInt(1.W))

		// RAM-style access into the black box's internal buffer.
		val axib_read_enable = Input(UInt(1.W))
		val axib_read_addr = Input(UInt(64.W))
		val axib_read_data = Output(UInt(512.W))
		val axib_write_mask = Input(UInt(64.W))
		val axib_write_addr = Input(UInt(64.W))
		val axib_write_data = Input(UInt(512.W))

		// Per-queue command interface, flattened: bit/slice i belongs to
		// queue i of QUEUE_NUM total queues.
		val ssd_cmd_op = Input(UInt(QUEUE_NUM.W))
		val ssd_cmd_nlb = Input(UInt((QUEUE_NUM*16).W))
		val ssd_cmd_lba = Input(UInt((QUEUE_NUM*32).W))
		val ssd_cmd_offset = Input(UInt((QUEUE_NUM*32).W))
		val ssd_cmd_valid = Input(UInt(QUEUE_NUM.W))
		val ssd_cmd_ready = Output(UInt(QUEUE_NUM.W))

		val pcie_hbm_write_transfer = Input(UInt(2.W))
	})
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Chisel NVMe Host Controller
2 | NVMe host controller written in Chisel.
3 |
4 | ## Table of contents
5 | - [Chisel NVMe Host Controller](#chisel-nvme-host-controller)
6 | - [Table of contents](#table-of-contents)
7 | - [How to Add this Module in Your Chisel Project:](#how-to-add-this-module-in-your-chisel-project)
8 | - [NVMeCore Module](#nvmecore-module)
9 | - [Parameter Description](#parameter-description)
10 | - [Port Description](#port-description)
11 | - [NVMeCoreControl](#nvmecorecontrol)
12 | - [NVMeCoreStatus](#nvmecorestatus)
13 | - [Other Modules and APIs](#other-modules-and-apis)
14 | - [AXI2NVMeRam](#axi2nvmeram)
15 | - [NVMe Command Builders](#nvme-command-builders)
16 | - [BandwidthProbe](#bandwidthprobe)
17 | - [Example Design](#example-design)
18 |
19 | ## How to Add this Module in Your Chisel Project:
[QDMA](https://github.com/carlzhang4/qdma) is this module's dependency. Before using this module, make sure the QDMA module is installed.
21 | To install this module, use the command below:
22 | ```bash
23 | $ git submodule add git@github.com:JewdrunAleph/fpga-nvme-controller nvme
24 | $ git clone git@github.com:JewdrunAleph/fpga-nvme-controller nvme
25 | ```
26 |
27 | ## NVMeCore Module
28 | `NVMeCore` can offload I/O queue management of SSDs from CPU to FPGA. It takes NVMe commands as inputs, maintains I/O queues and rings doorbell signal via QDMA.
29 |
30 | ### Parameter Description
31 | |Parameter |Type |Range |Description |
32 | |:--- |:--- |:--- |:--- |
33 | |SSD_NUM |Int |Larger than 0. |Number of SSDs used. |
34 | |QUEUE_NUM |Int |Larger than 0. |Number of I/O queues FPGA taken care of for each SSD.|
|QUEUE_DEPTH |Int |Larger than 4, power of 2. |Queue depth of each I/O queue. All queues share the same depth.|
|MAX_SQ_INTERVAL |Int |Larger than 0. |Time window, in cycles, during which this module waits for a new command. To reduce doorbell traffic, when a new command is inserted, the queue management module does not ring the doorbell immediately; instead, it waits for a period to see whether another command arrives.|
|QDMA_INTERFACE |String |"DMA" or "SAXIB" |Interface of QDMA used to ring the doorbell. You can choose either the DMA C2H engine or the AXI Slave Bridge.|
38 |
39 |
40 | ### Port Description
41 | |Port |Type |Direction |Description |
42 | |:--- |:--- |:--- |:--- |
43 | |ssdCmd |Vec[Vec[DecoupledIO[UInt]]]|Input |Command of each queue of each SSD. Each command should follow NVMe format.|
44 | |control |NVMeCoreControl |Input |Control signals for this module. Refer to [here](#NVMeCoreControl)|
45 | |status |NVMeCoreStatus |Output |Status signals for this module. Refer to [here](#NVMeCoreStatus)|
46 | |ramIO |NVMeRamIO | |SQ/CQ RAM I/O request from the host. This port can be converted from an AXI slave interface, see [here](#AXI2NVMeRam).|
47 | |sAxib |Option[AXIB_SLAVE] |Output |Used when `QDMA_INTERFACE == "SAXIB"`. Connect to QDMA's AXI Slave Bridge|
48 | |c2hCmd |Option[DecoupledIO[C2H_CMD]]|Output |Used when `QDMA_INTERFACE == "DMA"`. Connect to QDMA's C2H command port.|
49 | |c2hData |Option[DecoupledIO[C2H_DATA]]|Output |Used when `QDMA_INTERFACE == "DMA"`. Connect to QDMA's C2H data port.|
50 |
51 | #### NVMeCoreControl
52 | NVMe control signals are listed here.
53 | **enable**
54 | Only when this signal is high will this module work and accept new commands. When this signal is low, it still processes existing commands, but won't accept new commands anymore. Designed for benchmarking.
55 | **ssdSetup**
56 | Initialize an SSD with data required by this module. It has two signals:
57 | - `ssdId`: Index of SSD to be initialized.
58 | - `ssdBarAddr`: **Physical** address of BAR 0 of this SSD. It should be got from the host.
59 |
60 | #### NVMeCoreStatus
61 | NVMe status signals are listed here. This interface includes signals either needed by host or helpful for benchmarking.
62 | - `running`: Whether this module is processing or accepting commands. When `enable` signal is set to low and this module finishes processing all existing commands, this signal will turn to low.
63 | - `params`: Parameters required by the host. With these parameters, host can create I/O queue for all SSDs and assign correct address for these queues.
- `stat`: Statistical information **since the module was enabled**, including:
65 | - `executeTime`: Total execution time.
66 | - `successfulOp`: Number of commands SSD processed successfully.
  - `failedOp`: Number of commands the SSDs failed to process.
  - `totalLatency`: Total latency of all commands processed, **in cycles**. To get the average latency per command, divide this by the number of commands processed.
69 |
70 | ## Other Modules and APIs
71 |
72 | ### AXI2NVMeRam
73 | For simplicity, NVMe core module takes `NVMeRamIO` as input, which is similar to simple dual port BRAM I/O signals. However, in real applications requests are from AXI interface. Therefore, this repo provides an `AXI2NVMeRam` module for interface conversion.
74 |
75 | ### NVMe Command Builders
76 | `NVMeCommandSet` object provides a set of functions which helps to fill in NVMe-formatted commands with some simple and basic items. Check Util.scala for more details.
77 |
78 | ### BandwidthProbe
79 | `BandwidthProbe` helps to record actual data transfer bandwidth of certain interface.
80 |
81 | ## Example Design
This repo provides an example benchmark design that includes a hardware design and the corresponding software. The example design is tested on an Alveo U50 card. For a U280 board, please use your own xdc file.
83 | To test this design:
84 | 1. Install [QDMA driver and LibQDMA](https://github.com/carlzhang4/qdma_improve).
85 | 2. Generate bitstream file, and program to FPGA.
86 | 3. Change list of NVMe devices in NVMeBenchmark.cpp
87 | 4. Use Makefile to generate executable.
88 | 5. Reboot your computer and run.
--------------------------------------------------------------------------------
/src/NVMeLatencyBenchmarkTop.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.storage._
7 | import common.axi._
8 | import qdma._
9 |
10 | class NVMeLatencyBenchmarkTop extends RawModule{
11 | val qdma_pin = IO(new QDMAPin(PCIE_WIDTH=8))
12 | val led = IO(Output(UInt(1.W)))
13 |
14 | led := 0.U
15 |
16 | // Global parameters
17 | val DATA_BUFFER_SHIFT = 27 // Upper 128 MiB is used for data buffer.
18 | val SSD_NUM = 4
19 | val QUEUE_DEPTH = 64
20 | val QDMA_INTERFACE = "SAXIB"
21 |
22 | val qdma = Module(new QDMA(
23 | VIVADO_VERSION = "202101",
24 | PCIE_WIDTH = 8,
25 | SLAVE_BRIDGE = (QDMA_INTERFACE == "SAXIB"),
26 | TLB_TYPE = new BypassTLB, // TLB is not used here.
27 | BRIDGE_BAR_SCALE= "Megabytes",
28 | BRIDGE_BAR_SIZE = 256
29 | ))
30 | qdma.getTCL()
31 |
32 | val user_clk = qdma.io.pcie_clk
33 | val user_rstn = qdma.io.pcie_arstn
34 |
35 | ToZero(qdma.io.reg_status)
36 | qdma.io.pin <> qdma_pin
37 |
38 | // TODO: Notify Carl Zhang move AXI clock convertor into QDMA module :/
39 | qdma.io.user_clk := user_clk
40 | qdma.io.user_arstn := user_rstn
41 | qdma.io.soft_rstn := 1.U
42 |
43 | Collector.connect_to_status_reg(qdma.io.reg_status, 400)
44 |
45 | withClockAndReset(qdma.io.user_clk,!qdma.io.user_arstn) {
46 | val nvmeCore = Module(new NVMeController(
47 | SSD_NUM = SSD_NUM,
48 | QUEUE_NUM = 1,
49 | QUEUE_DEPTH = QUEUE_DEPTH,
50 | QDMA_INTERFACE = QDMA_INTERFACE
51 | ))
52 |
53 | if (QDMA_INTERFACE == "DMA") {
54 | nvmeCore.io.c2hCmd.get <> qdma.io.c2h_cmd
55 | nvmeCore.io.c2hData.get <> qdma.io.c2h_data
56 | } else if (QDMA_INTERFACE == "SAXIB") {
57 | qdma.io.s_axib.get <> AXIRegSlice(3)(nvmeCore.io.sAxib.get)
58 | qdma.io.c2h_cmd <> DontCare
59 | qdma.io.c2h_data <> DontCare
60 | }
61 |
62 | val controlReg = qdma.io.reg_control
63 | val statusReg = qdma.io.reg_status
64 |
65 | statusReg(65) := nvmeCore.io.status.params.ssdNum
66 | statusReg(67) := nvmeCore.io.status.params.queueLowBit
67 | statusReg(64) := nvmeCore.io.status.params.ssdLowBit
68 | statusReg(66) := nvmeCore.io.status.params.queueDepth
69 | statusReg(68) := nvmeCore.io.status.params.queueNum
70 | statusReg(69) := nvmeCore.io.status.params.ramTypeBit
71 |
72 | // In such scenario, whole BAR space of QDMA is often used by different modules.
73 | // Thus, we must split AXI Bridge into different ways.
74 | // The 2nd input of AXIRouter is to automatically decide data widths, it will NOT
75 | // be connected to the router.
76 |
77 | // In this example design, AXIB space is splited into 3 ways:
78 | // 0 - Lower space is used to store SQ / CQ RAMs
79 | // 1 - Middle space is used to store PRP lists
80 | // 2 - Upper space is used for data buffer.
81 |
82 | val axibRt = AXIRouter(3, qdma.io.axib)
83 | axibRt.io.in <> AXIRegSlice(2)(qdma.io.axib)
84 | axibRt.io.wrIdx := Mux(
85 | axibRt.io.in.aw.bits.addr(63, nvmeCore.RAM_TYPE_BIT+1) === 0.U,
86 | 0.U, Mux(axibRt.io.in.aw.bits.addr(63, DATA_BUFFER_SHIFT) === 0.U, 1.U, 2.U)
87 | )
88 | axibRt.io.rdIdx := Mux(
89 | axibRt.io.in.ar.bits.addr(63, nvmeCore.RAM_TYPE_BIT+1) === 0.U,
90 | 0.U, Mux(axibRt.io.in.ar.bits.addr(63, DATA_BUFFER_SHIFT) === 0.U, 1.U, 2.U)
91 | )
92 |
93 | nvmeCore.io.ramIO <> AXI2NVMeRam(AXIRegSlice(5)(axibRt.io.out(0)))
94 |
95 | qdma.io.h2c_cmd <> DontCare
96 | qdma.io.h2c_data <> DontCare
97 |
98 | nvmeCore.io.control.enable := controlReg(160)
99 | nvmeCore.io.control.ssdSetup.valid := controlReg(32) & ~RegNext(controlReg(32))
100 | nvmeCore.io.control.ssdSetup.bits.ssdId := controlReg(40)
101 | nvmeCore.io.control.ssdSetup.bits.ssdBarAddr := Cat(controlReg(39), controlReg(38))
102 |
103 | // Benchmark module
104 |
105 | val benchmark = Module(new NVMeLatencyBenchmark(
106 | SSD_NUM = SSD_NUM,
107 | DATA_BUFFER_SHIFT = DATA_BUFFER_SHIFT
108 | ))
109 |
110 | for (i <- 0 until SSD_NUM) {
111 | nvmeCore.io.ssdCmd(i)(0)<> RegSlice(3)(benchmark.io.ssdCmd(i))
112 | benchmark.io.ssdCmpt(i) <> Pipe(nvmeCore.io.ssdCmpt(i)(0), 3)
113 | }
114 | benchmark.io.prpInput <> AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(1)))
115 |
116 | benchmark.io.ctlRunning := controlReg(160)
117 | benchmark.io.ctlEnd := statusReg(192) & ~RegNext(statusReg(192))
118 | benchmark.io.ctlFpgaBar := Cat(controlReg(37), controlReg(36))
119 | benchmark.io.ctlTimeTarget := Cat(controlReg(163), 0.U(6.W))
120 | benchmark.io.ctlNumNlb := controlReg(162)(15, 0)
121 | benchmark.io.ctlMaxParallel := controlReg(170)
122 | benchmark.io.ctlModeWrite := controlReg(161)(0)
123 | benchmark.io.ctlModeRandom := controlReg(161)(1)
124 | benchmark.io.ctlReadLatency.valid := controlReg(167)(0) & ~RegNext(controlReg(167)(0))
125 | benchmark.io.ctlReadLatency.bits := controlReg(168)
126 | benchmark.io.ctlModeMixed := 0.U
127 |
128 | statusReg(192) := !(nvmeCore.io.status.running || nvmeCore.io.control.enable)
129 | statusReg(193) := nvmeCore.io.status.stat.successfulOp
130 | statusReg(194) := nvmeCore.io.status.stat.failedOp
131 | statusReg(196) := nvmeCore.io.status.stat.executeTime(31, 0)
132 | statusReg(197) := nvmeCore.io.status.stat.executeTime(63, 32)
133 | for (ssdId <- 0 until SSD_NUM) {
134 | statusReg(200+ssdId) := benchmark.io.statSsdIo(ssdId)
135 | }
136 | statusReg(218) := nvmeCore.io.status.stat.totalLatency(31, 0)
137 | statusReg(219) := nvmeCore.io.status.stat.totalLatency(63, 32)
138 | statusReg(220) := benchmark.io.statLatency
139 |
140 | // Bandwidth probe
141 | val dataBufferIo = AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(2)))
142 | dataBufferIo.readData := 0.U
143 |
144 | val readProbe = Module(new BandwidthProbe)
145 | readProbe.io.enable := ~statusReg(192)(0) && controlReg(165)(0)
146 | readProbe.io.fire := dataBufferIo.readEnable
147 | readProbe.io.count.ready := (controlReg(166)(0) === 1.U && RegNext(controlReg(166)(0)) =/= 1.U)
148 | statusReg(216) := Mux(readProbe.io.count.valid, readProbe.io.count.bits, -1.S(32.W).asUInt)
149 |
150 | val writeProbe = Module(new BandwidthProbe)
151 | writeProbe.io.enable := ~statusReg(192)(0) && controlReg(165)(0)
152 | writeProbe.io.fire := (dataBufferIo.writeMask =/= 0.U)
153 | writeProbe.io.count.ready := (controlReg(166)(0) === 1.U && RegNext(controlReg(166)(0)) =/= 1.U)
154 | statusReg(217) := Mux(writeProbe.io.count.valid, writeProbe.io.count.bits, -1.S(32.W).asUInt)
155 |
156 | // AXIB Debug
157 | val aw_cnt = RegInit(UInt(32.W), 0.U)
158 | val w_cnt = RegInit(UInt(32.W), 0.U)
159 |
160 | when (qdma.io.axib.aw.fire) {
161 | aw_cnt := aw_cnt + qdma.io.axib.aw.bits.len + 1.U
162 | }
163 |
164 | when (qdma.io.axib.w.fire) {
165 | w_cnt := w_cnt + 1.U
166 | }
167 |
168 | val diff_cnt = aw_cnt - w_cnt
169 | val diff_time = RegInit(UInt(32.W), 0.U)
170 | when (diff_cnt === 0.U) {
171 | diff_time := 0.U
172 | }.otherwise {
173 | diff_time := diff_time + 1.U
174 | }
175 | }
176 | }
--------------------------------------------------------------------------------
/src/NVMeBandwidthBenchmarkTop.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.storage._
7 | import common.axi._
8 | import qdma._
9 |
10 | class NVMeBandwidthBenchmarkTop extends RawModule{
11 | val qdma_pin = IO(new QDMAPin(PCIE_WIDTH=8))
12 | val led = IO(Output(UInt(1.W)))
13 |
14 | led := 0.U
15 |
16 | // Global parameters
17 | val DATA_BUFFER_SHIFT = 27 // Upper 128 MiB is used for data buffer.
18 | val SSD_NUM = 1
19 | val QUEUE_NUM = 2
20 | val QUEUE_DEPTH = 64
21 | val QDMA_INTERFACE = "SAXIB"
22 |
23 | val qdma = Module(new QDMA(
24 | VIVADO_VERSION = "202101",
25 | PCIE_WIDTH = 8,
26 | SLAVE_BRIDGE = (QDMA_INTERFACE == "SAXIB"),
27 | TLB_TYPE = new BypassTLB, // TLB is not used here.
28 | BRIDGE_BAR_SCALE= "Megabytes",
29 | BRIDGE_BAR_SIZE = 256
30 | ))
31 | qdma.getTCL()
32 |
33 | val user_clk = qdma.io.pcie_clk
34 | val user_rstn = qdma.io.pcie_arstn
35 |
36 | ToZero(qdma.io.reg_status)
37 | qdma.io.pin <> qdma_pin
38 |
39 | // TODO: Notify Carl Zhang move AXI clock convertor into QDMA module :/
40 | qdma.io.user_clk := user_clk
41 | qdma.io.user_arstn := user_rstn
42 | qdma.io.soft_rstn := 1.U
43 |
44 | Collector.connect_to_status_reg(qdma.io.reg_status, 400)
45 |
46 | withClockAndReset(qdma.io.user_clk,!qdma.io.user_arstn) {
47 | val nvmeCore = Module(new NVMeController(
48 | SSD_NUM = SSD_NUM,
49 | QUEUE_NUM = QUEUE_NUM,
50 | QUEUE_DEPTH = QUEUE_DEPTH,
51 | QDMA_INTERFACE = QDMA_INTERFACE
52 | ))
53 |
54 | if (QDMA_INTERFACE == "DMA") {
55 | nvmeCore.io.c2hCmd.get <> qdma.io.c2h_cmd
56 | nvmeCore.io.c2hData.get <> qdma.io.c2h_data
57 | } else if (QDMA_INTERFACE == "SAXIB") {
58 | qdma.io.s_axib.get <> AXIRegSlice(3)(nvmeCore.io.sAxib.get)
59 | qdma.io.c2h_cmd <> DontCare
60 | qdma.io.c2h_data <> DontCare
61 | }
62 |
63 | val controlReg = qdma.io.reg_control
64 | val statusReg = qdma.io.reg_status
65 |
66 | statusReg(65) := nvmeCore.io.status.params.ssdNum
67 | statusReg(67) := nvmeCore.io.status.params.queueLowBit
68 | statusReg(64) := nvmeCore.io.status.params.ssdLowBit
69 | statusReg(66) := nvmeCore.io.status.params.queueDepth
70 | statusReg(68) := nvmeCore.io.status.params.queueNum
71 | statusReg(69) := nvmeCore.io.status.params.ramTypeBit
72 |
73 | // In such scenario, whole BAR space of QDMA is often used by different modules.
74 | // Thus, we must split AXI Bridge into different ways.
75 | // The 2nd input of AXIRouter is to automatically decide data widths, it will NOT
76 | // be connected to the router.
77 |
78 | // In this example design, AXIB space is splited into 3 ways:
79 | // 0 - Lower space is used to store SQ / CQ RAMs
80 | // 1 - Middle space is used to store PRP lists
81 | // 2 - Upper space is used for data buffer.
82 |
83 | val axibRt = AXIRouter(3, qdma.io.axib)
84 | axibRt.io.in <> AXIRegSlice(2)(qdma.io.axib)
85 | axibRt.io.wrIdx := Mux(
86 | axibRt.io.in.aw.bits.addr(63, nvmeCore.RAM_TYPE_BIT+1) === 0.U,
87 | 0.U, Mux(axibRt.io.in.aw.bits.addr(63, DATA_BUFFER_SHIFT) === 0.U, 1.U, 2.U)
88 | )
89 | axibRt.io.rdIdx := Mux(
90 | axibRt.io.in.ar.bits.addr(63, nvmeCore.RAM_TYPE_BIT+1) === 0.U,
91 | 0.U, Mux(axibRt.io.in.ar.bits.addr(63, DATA_BUFFER_SHIFT) === 0.U, 1.U, 2.U)
92 | )
93 |
94 | nvmeCore.io.ramIO <> AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(0)))
95 |
96 | qdma.io.h2c_cmd <> DontCare
97 | qdma.io.h2c_data <> DontCare
98 |
99 | nvmeCore.io.control.enable := controlReg(160)
100 | nvmeCore.io.control.ssdSetup.valid := controlReg(32) & ~RegNext(controlReg(32))
101 | nvmeCore.io.control.ssdSetup.bits.ssdId := controlReg(40)
102 | nvmeCore.io.control.ssdSetup.bits.ssdBarAddr := Cat(controlReg(39), controlReg(38))
103 |
104 | // Benchmark module
105 |
106 | val benchmark = Module(new NVMeBandwidthBenchmark(
107 | SSD_NUM = SSD_NUM,
108 | QUEUE_NUM = QUEUE_NUM,
109 | DATA_BUFFER_SHIFT = DATA_BUFFER_SHIFT
110 | ))
111 |
112 | benchmark.io.ssdCmd <> nvmeCore.io.ssdCmd
113 | benchmark.io.prpInput <> AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(1)))
114 |
115 | benchmark.io.ctlRunning := controlReg(160)
116 | benchmark.io.ctlEnd := statusReg(192) & ~RegNext(statusReg(192))
117 | benchmark.io.ctlFpgaBar := Cat(controlReg(37), controlReg(36))
118 | benchmark.io.ctlTimeTarget := Cat(controlReg(163), 0.U(6.W))
119 | benchmark.io.ctlNumNlb := controlReg(162)(15, 0)
120 | benchmark.io.ctlMaxParallel := controlReg(170)
121 | benchmark.io.ctlModeWrite := controlReg(161)(0)
122 | benchmark.io.ctlModeRandom := controlReg(161)(1)
123 | benchmark.io.ctlModeMixed := 0.U
124 | benchmark.io.ctlRdBlkSize := controlReg(167)
125 | benchmark.io.ctlWrBlkSize := controlReg(168)
126 | benchmark.io.ctlRdBlkAhead := controlReg(169)
127 |
128 | statusReg(192) := !(nvmeCore.io.status.running || nvmeCore.io.control.enable)
129 | statusReg(193) := nvmeCore.io.status.stat.successfulOp
130 | statusReg(194) := nvmeCore.io.status.stat.failedOp
131 | statusReg(196) := nvmeCore.io.status.stat.executeTime(31, 0)
132 | statusReg(197) := nvmeCore.io.status.stat.executeTime(63, 32)
133 | for (ssdId <- 0 until SSD_NUM) {
134 | statusReg(200+ssdId) := benchmark.io.statSsdIo(ssdId)
135 | }
136 | statusReg(218) := nvmeCore.io.status.stat.totalLatency(31, 0)
137 | statusReg(219) := nvmeCore.io.status.stat.totalLatency(63, 32)
138 |
139 | // Bandwidth probe
140 | val dataBufferIo = AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(2)))
141 | dataBufferIo.readData := 0.U
142 |
143 | val readProbe = Module(new BandwidthProbe)
144 | readProbe.io.enable := ~statusReg(192)(0) && controlReg(165)(0)
145 | readProbe.io.fire := dataBufferIo.readEnable
146 | readProbe.io.count.ready := (controlReg(166)(0) === 1.U && RegNext(controlReg(166)(0)) =/= 1.U)
147 | statusReg(216) := Mux(readProbe.io.count.valid, readProbe.io.count.bits, -1.S(32.W).asUInt)
148 |
149 | val writeProbe = Module(new BandwidthProbe)
150 | writeProbe.io.enable := ~statusReg(192)(0) && controlReg(165)(0)
151 | writeProbe.io.fire := (dataBufferIo.writeMask =/= 0.U)
152 | writeProbe.io.count.ready := (controlReg(166)(0) === 1.U && RegNext(controlReg(166)(0)) =/= 1.U)
153 | statusReg(217) := Mux(writeProbe.io.count.valid, writeProbe.io.count.bits, -1.S(32.W).asUInt)
154 |
155 | // AXIB Debug
156 | val aw_cnt = RegInit(UInt(32.W), 0.U)
157 | val w_cnt = RegInit(UInt(32.W), 0.U)
158 |
159 | when (qdma.io.axib.aw.fire) {
160 | aw_cnt := aw_cnt + qdma.io.axib.aw.bits.len + 1.U
161 | }
162 |
163 | when (qdma.io.axib.w.fire) {
164 | w_cnt := w_cnt + 1.U
165 | }
166 |
167 | val diff_cnt = aw_cnt - w_cnt
168 | val diff_time = RegInit(UInt(32.W), 0.U)
169 | when (diff_cnt === 0.U) {
170 | diff_time := 0.U
171 | }.otherwise {
172 | diff_time := diff_time + 1.U
173 | }
174 | }
175 | }
176 |
177 | class BypassTLB extends Module with BaseTLB {
178 | io.h2c_in <> io.h2c_out
179 | io.c2h_in <> io.c2h_out
180 | io.tlb_miss_count := 0.U
181 | io.wr_tlb.ready := 1.U
182 | }
--------------------------------------------------------------------------------
/src/deprecated/NVMe.scala:
--------------------------------------------------------------------------------
1 | package nvme.deprecated
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import nvme.NVMeRamIO
6 |
7 | import qdma._
8 |
9 | class NVMe (
10 | DEBUG : Boolean = true,
11 | SSD_MAX_ID : Int = 0,
12 | BUFFER_DATA_SHIFT : Int = 27,
13 | SSD_NUM_SHIFT : Int = 2,
14 | QUEUE_DEPTH_SHIFT : Int = 8,
15 | QUEUE_MAX_ID : Int = 3,
16 | QUEUE_COUNT_SHIFT : Int = 2,
17 | MAX_SQ_INTERVAL : Int = 30
18 | ) extends Module {
19 | val io = IO(new Bundle{
20 | val ssdCmd = Flipped(Vec(SSD_MAX_ID+1, Vec(QUEUE_MAX_ID+1, Decoupled(new NVMeCommand))))
21 |
22 | val regControl = new NVMeControl
23 | val regStatus = new NVMeStatus
24 |
25 | val bramReq = Flipped(new NVMeRamIO)
26 |
27 | val s_axib = new AXIB_SLAVE
28 |
29 | val pcie_hbm_write_transfer = if (DEBUG) {Some(Input(UInt(2.W)))} else None
30 | })
31 |
32 | val QUEUE_NUM = (SSD_MAX_ID+1) * (QUEUE_MAX_ID+1)
33 |
34 | val nvmeCore = Module(new NVMeBlackBox(
35 | SSD_MAX_ID = SSD_MAX_ID,
36 | BUFFER_DATA_SHIFT = BUFFER_DATA_SHIFT,
37 | SSD_NUM_SHIFT = SSD_NUM_SHIFT,
38 | QUEUE_DEPTH_SHIFT = QUEUE_DEPTH_SHIFT,
39 | QUEUE_MAX_ID = QUEUE_MAX_ID,
40 | QUEUE_COUNT_SHIFT = QUEUE_COUNT_SHIFT,
41 | MAX_SQ_INTERVAL = MAX_SQ_INTERVAL
42 | ))
43 |
44 | nvmeCore.io.clk_core := clock
45 | nvmeCore.io.sys_reset := reset
46 |
47 | io.regStatus.p2pdma_h2c_data := nvmeCore.io.status_p2pdma_h2c_data
48 | io.regStatus.p2pdma_h2c_done := nvmeCore.io.status_p2pdma_h2c_done
49 | io.regStatus.p2pdma_c2h_done := nvmeCore.io.status_p2pdma_c2h_done
50 | io.regStatus.nvme_init_done := nvmeCore.io.status_nvme_init_done
51 | io.regStatus.nvme_exec_done := nvmeCore.io.status_nvme_exec_done
52 | io.regStatus.stat_op_succ := nvmeCore.io.status_stat_op_succ
53 | io.regStatus.stat_op_fail := nvmeCore.io.status_stat_op_fail
54 | io.regStatus.stat_exec_time := nvmeCore.io.status_stat_exec_time
55 | io.regStatus.stat_io_ssd0 := nvmeCore.io.status_stat_io_ssd0
56 | io.regStatus.stat_io_ssd1 := nvmeCore.io.status_stat_io_ssd1
57 | io.regStatus.stat_io_ssd2 := nvmeCore.io.status_stat_io_ssd2
58 | io.regStatus.stat_io_ssd3 := nvmeCore.io.status_stat_io_ssd3
59 | io.regStatus.band_tr_rd := nvmeCore.io.status_band_tr_rd
60 | io.regStatus.band_tr_wr := nvmeCore.io.status_band_tr_wr
61 |
62 | nvmeCore.io.control_init_start := io.regControl.init_start
63 | nvmeCore.io.control_init_nsid := io.regControl.init_nsid
64 | nvmeCore.io.control_init_dma_addr := io.regControl.init_dma_addr
65 | nvmeCore.io.control_init_byp_addr := io.regControl.init_byp_addr
66 | nvmeCore.io.control_init_ssd_addr := io.regControl.init_ssd_addr
67 | nvmeCore.io.control_init_ssdid := io.regControl.init_ssdid
68 | nvmeCore.io.control_p2pdma_read := io.regControl.p2pdma_read
69 | nvmeCore.io.control_p2pdma_write := io.regControl.p2pdma_write
70 | nvmeCore.io.control_p2pdma_cmd_addr := io.regControl.p2pdma_cmd_addr
71 | nvmeCore.io.control_p2pdma_cmd_len := io.regControl.p2pdma_cmd_len
72 | nvmeCore.io.control_p2pdma_c2h_data := io.regControl.p2pdma_c2h_data
73 | nvmeCore.io.control_ssd_init := io.regControl.ssd_init
74 | nvmeCore.io.control_exec_start := io.regControl.exec_start
75 | nvmeCore.io.control_exec_time := io.regControl.exec_time
76 | nvmeCore.io.control_band_tr_en := io.regControl.band_tr_en
77 | nvmeCore.io.control_band_tr_read := io.regControl.band_tr_read
78 |
79 | nvmeCore.io.axib_read_enable := io.bramReq.readEnable
80 | nvmeCore.io.axib_read_addr := io.bramReq.readAddr
81 | io.bramReq.readData := nvmeCore.io.axib_read_data
82 | nvmeCore.io.axib_write_mask := io.bramReq.writeMask
83 | nvmeCore.io.axib_write_addr := io.bramReq.writeAddr
84 | nvmeCore.io.axib_write_data := io.bramReq.writeData
85 |
86 | io.s_axib.qdma_init()
87 |
88 | nvmeCore.io.s_axib_awid <> io.s_axib.aw.bits.id
89 | nvmeCore.io.s_axib_awaddr <> io.s_axib.aw.bits.addr
90 | nvmeCore.io.s_axib_awlen <> io.s_axib.aw.bits.len
91 | nvmeCore.io.s_axib_awsize <> io.s_axib.aw.bits.size
92 | nvmeCore.io.s_axib_awuser <> io.s_axib.aw.bits.user
93 | nvmeCore.io.s_axib_awburst <> io.s_axib.aw.bits.burst
94 | nvmeCore.io.s_axib_awregion <> io.s_axib.aw.bits.region
95 | nvmeCore.io.s_axib_awvalid <> io.s_axib.aw.valid
96 | nvmeCore.io.s_axib_awready <> io.s_axib.aw.ready
97 | nvmeCore.io.s_axib_wdata <> io.s_axib.w.bits.data
98 | nvmeCore.io.s_axib_wstrb <> io.s_axib.w.bits.strb
99 | nvmeCore.io.s_axib_wlast <> io.s_axib.w.bits.last
100 | nvmeCore.io.s_axib_wuser <> io.s_axib.w.bits.user
101 | nvmeCore.io.s_axib_wvalid <> io.s_axib.w.valid
102 | nvmeCore.io.s_axib_wready <> io.s_axib.w.ready
103 | nvmeCore.io.s_axib_bid <> io.s_axib.b.bits.id
104 | nvmeCore.io.s_axib_bresp <> io.s_axib.b.bits.resp
105 | nvmeCore.io.s_axib_bvalid <> io.s_axib.b.valid
106 | nvmeCore.io.s_axib_bready <> io.s_axib.b.ready
107 | nvmeCore.io.s_axib_arid <> io.s_axib.ar.bits.id
108 | nvmeCore.io.s_axib_araddr <> io.s_axib.ar.bits.addr
109 | nvmeCore.io.s_axib_arlen <> io.s_axib.ar.bits.len
110 | nvmeCore.io.s_axib_arsize <> io.s_axib.ar.bits.size
111 | nvmeCore.io.s_axib_aruser <> io.s_axib.ar.bits.user
112 | nvmeCore.io.s_axib_arburst <> io.s_axib.ar.bits.burst
113 | nvmeCore.io.s_axib_arregion <> io.s_axib.ar.bits.region
114 | nvmeCore.io.s_axib_arvalid <> io.s_axib.ar.valid
115 | nvmeCore.io.s_axib_arready <> io.s_axib.ar.ready
116 | nvmeCore.io.s_axib_rid <> io.s_axib.r.bits.id
117 | nvmeCore.io.s_axib_rdata <> io.s_axib.r.bits.data
118 | nvmeCore.io.s_axib_ruser <> io.s_axib.r.bits.user
119 | nvmeCore.io.s_axib_rresp <> io.s_axib.r.bits.resp
120 | nvmeCore.io.s_axib_rlast <> io.s_axib.r.bits.last
121 | nvmeCore.io.s_axib_rvalid <> io.s_axib.r.valid
122 | nvmeCore.io.s_axib_rready <> io.s_axib.r.ready
123 |
124 | if (DEBUG) {
125 | nvmeCore.io.pcie_hbm_write_transfer := io.pcie_hbm_write_transfer.get
126 | } else {
127 | nvmeCore.io.pcie_hbm_write_transfer := 0.U
128 | }
129 |
130 | // Handle NVMe command, where I need to combine each wire in bundle to a vector.
131 |
132 | val ssdCmdOpVec = Wire(Vec(QUEUE_NUM, UInt(1.W)))
133 | val ssdCmdNlbVec = Wire(Vec(QUEUE_NUM, UInt(16.W)))
134 | val ssdCmdLbaVec = Wire(Vec(QUEUE_NUM, UInt(32.W)))
135 | val ssdCmdOffsetVec = Wire(Vec(QUEUE_NUM, UInt(32.W)))
136 | val ssdCmdValidVec = Wire(Vec(QUEUE_NUM, UInt(1.W)))
137 | val ssdCmdReadyVec = Wire(Vec(QUEUE_NUM, UInt(1.W)))
138 |
139 | var i, j = 0
140 |
141 | for (i <- 0 to SSD_MAX_ID) {
142 | for (j <- 0 to QUEUE_MAX_ID) {
143 | val idx = i*(QUEUE_MAX_ID+1) + j
144 | ssdCmdOpVec(idx) := io.ssdCmd(i)(j).bits.op
145 | ssdCmdNlbVec(idx) := io.ssdCmd(i)(j).bits.numLb
146 | ssdCmdLbaVec(idx) := io.ssdCmd(i)(j).bits.ssdAddr
147 | ssdCmdOffsetVec(idx) := io.ssdCmd(i)(j).bits.memAddr
148 | ssdCmdValidVec(idx) := io.ssdCmd(i)(j).valid
149 | io.ssdCmd(i)(j).ready := ssdCmdReadyVec(idx)
150 | }
151 | }
152 |
153 | nvmeCore.io.ssd_cmd_op := ssdCmdOpVec.asTypeOf(UInt(QUEUE_NUM.W))
154 | nvmeCore.io.ssd_cmd_nlb := ssdCmdNlbVec.asTypeOf(UInt((QUEUE_NUM*16).W))
155 | nvmeCore.io.ssd_cmd_lba := ssdCmdLbaVec.asTypeOf(UInt((QUEUE_NUM*32).W))
156 | nvmeCore.io.ssd_cmd_offset := ssdCmdOffsetVec.asTypeOf(UInt((QUEUE_NUM*32).W))
157 | nvmeCore.io.ssd_cmd_valid := ssdCmdValidVec.asTypeOf(UInt(QUEUE_NUM.W))
158 | ssdCmdReadyVec := nvmeCore.io.ssd_cmd_ready.asTypeOf(Vec(QUEUE_NUM, UInt(1.W)))
159 | }
--------------------------------------------------------------------------------
/src/NVMeLatencyBenchmark.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.storage._
7 | import common.axi._
8 | import math.max
9 |
10 | // Runs a simple NVMe benchmark.
11 |
12 | class NVMeLatencyBenchmark (
13 | SSD_NUM : Int,
14 | DATA_BUFFER_SHIFT : Int
15 | ) extends Module {
16 | val io = IO(new Bundle {
17 | // Interfaces
18 | val prpInput = Flipped(new NVMeRamIO)
19 | val ssdCmd = Vec(SSD_NUM, Decoupled(UInt(512.W)))
20 | val ssdCmpt = Vec(SSD_NUM, Flipped(Valid(new SSDCompletion)))
21 |
22 | // Control
23 | val ctlRunning = Input(Bool())
24 | val ctlEnd = Input(Bool())
25 | val ctlFpgaBar = Input(UInt(64.W))
26 | val ctlTimeTarget = Input(UInt(38.W))
27 | val ctlNumNlb = Input(UInt(16.W))
28 | val ctlMaxParallel = Input(UInt(32.W))
29 | val ctlModeWrite = Input(UInt(1.W))
30 | val ctlModeRandom = Input(UInt(1.W))
31 | val ctlModeMixed = Input(UInt(1.W))
32 | val ctlReadLatency = Flipped(Valid(UInt(13.W)))
33 | // Status
34 | val statSsdIo = Output(Vec(SSD_NUM, UInt(32.W)))
35 | val statLatency = Output(UInt(32.W))
36 | })
37 |
38 | // Control and status signals.
39 |
40 | val ctlTimeTarget = RegInit(UInt(38.W), 0.U)
41 | val ctlNumNlb = RegInit(UInt(16.W), 0.U)
42 | val ctlMaxParallel = RegInit(UInt(32.W), 0.U)
43 | val ctlModeWrite = RegInit(UInt(1.W), 0.U)
44 | val ctlModeRandom = RegInit(UInt(1.W), 0.U)
45 | val ctlModeMixed = RegInit(UInt(1.W), 0.U)
46 |
47 | val statSsdIo = RegInit(VecInit(Seq.fill(SSD_NUM)(0.U(32.W))))
48 | for (ssdId <- 0 until SSD_NUM) {
49 | io.statSsdIo(ssdId) := statSsdIo(ssdId)
50 | }
51 |
52 | // Initiate signals
53 |
54 | when (io.ctlRunning && ~RegNext(io.ctlRunning)) {
55 | ctlTimeTarget := io.ctlTimeTarget
56 | ctlNumNlb := io.ctlNumNlb
57 | ctlMaxParallel := io.ctlMaxParallel
58 | ctlModeWrite := io.ctlModeWrite
59 | ctlModeRandom := io.ctlModeRandom
60 | ctlModeMixed := io.ctlModeMixed
61 | for (ssdId <- 0 until SSD_NUM) {
62 | statSsdIo(ssdId) := 0.U
63 | }
64 | }
65 |
66 | // In this example, data transfer is between FPGA and SSDs.
67 | // I give each queue a fixed buffer.
68 |
69 | val SSD_BIT_LOW = DATA_BUFFER_SHIFT - log2Ceil(SSD_NUM)
70 | val QUEUE_BIT_LOW = SSD_BIT_LOW - 1
71 | val PRP_ADDR_MSB = DATA_BUFFER_SHIFT - 10
72 |
73 | // Global timer
74 | val gblTimer = RegInit(UInt(64.W), 0.U)
75 | gblTimer := gblTimer + 1.U(64.W)
76 |
77 | val startTimeRam = XRam(SSD_NUM)(UInt(64.W), 256, latency=2)
78 |
79 | // Generate commands
80 |
81 | for (ssdId <- 0 until SSD_NUM) {
82 | val cmdOutStd = RegInit(UInt(32.W), 0.U)
83 |
84 | val cmdRdCond = (cmdOutStd =/= ctlMaxParallel)
85 | val cmdWrCond = (cmdOutStd =/= ctlMaxParallel)
86 |
87 | val cmdLba = RegInit(0.U(64.W))
88 | val cmdPrp1 = Wire(UInt(64.W))
89 | val cmdPrp2 = Wire(UInt(64.W))
90 | val cmdNlb = RegInit(0.U(16.W))
91 | val cmdId = RegInit(0.U(16.W))
92 |
93 | cmdPrp1 := (io.ctlFpgaBar
94 | + (1.U(64.W) << DATA_BUFFER_SHIFT)
95 | + (ssdId.U(64.W) << SSD_BIT_LOW)
96 | )
97 | cmdPrp2 := Mux(
98 | cmdNlb < 16.U,
99 | cmdPrp1 + 0x1000.U(64.W),
100 | (
101 | io.ctlFpgaBar
102 | + (1.U(64.W) << DATA_BUFFER_SHIFT)
103 | - (1.U(64.W) << (DATA_BUFFER_SHIFT-9))
104 | + (ssdId.U(64.W) << (SSD_BIT_LOW-9))
105 | )
106 | )
107 |
108 | io.ssdCmd(ssdId).valid := Mux(ctlModeWrite.asBool,
109 | ( // Write
110 | io.ctlRunning && RegNext(io.ctlRunning)
111 | && cmdWrCond && ctlModeWrite.asBool
112 | ),
113 | ( // Read
114 | io.ctlRunning && RegNext(io.ctlRunning)
115 | && cmdRdCond && ~ctlModeWrite.asBool
116 | )
117 | )
118 | io.ssdCmd(ssdId).bits := Mux(ctlModeWrite.asBool,
119 | NVMeCommandSet.nvmWrite(
120 | cmdId, cmdPrp1, cmdPrp2, cmdLba, cmdNlb
121 | ),
122 | NVMeCommandSet.nvmRead(
123 | cmdId, cmdPrp1, cmdPrp2, cmdLba, cmdNlb
124 | )
125 | )
126 |
127 | when (io.ctlRunning && ~RegNext(io.ctlRunning)) {
128 | cmdId := 0.U
129 | when (io.ctlModeRandom.asBool) {
130 | cmdLba := Cat("h92918".U(20.W), 0.U(10.W))
131 | }.otherwise {
132 | cmdLba := 0.U
133 | }
134 | cmdNlb := io.ctlNumNlb
135 | cmdOutStd := 0.U
136 | }.elsewhen (io.ctlRunning) {
137 | when (io.ssdCmd(ssdId).fire) {
138 | when (ctlModeRandom.asBool) {
139 | val nextRndPart = Wire(UInt(20.W))
140 | nextRndPart := (cmdLba(29, 10) << 5) + (cmdLba(29, 10) >> 5)
141 | cmdLba := Cat(nextRndPart, 0.U(10.W))
142 | }.otherwise {
143 | val nextSeqPart = Wire(UInt(30.W))
144 | nextSeqPart := cmdLba + 1.U(30.W) + ctlNumNlb
145 | cmdLba := nextSeqPart
146 | }
147 |
148 | cmdId := cmdId + 1.U
149 | statSsdIo(ssdId) := statSsdIo(ssdId) + 1.U
150 | cmdOutStd := cmdOutStd + 1.U
151 | }
152 |
153 | when (io.ssdCmpt(ssdId).valid) {
154 | cmdOutStd := cmdOutStd - 1.U
155 | }
156 | } // elsewhen (io.ctlRunning)
157 |
158 | // Start time RAM. We track latency for each command.
159 |
160 | startTimeRam(ssdId).io.addr_a := cmdId(7, 0)
161 | startTimeRam(ssdId).io.data_in_a := gblTimer
162 | startTimeRam(ssdId).io.wr_en_a := io.ssdCmd(ssdId).fire
163 | startTimeRam(ssdId).io.addr_b := io.ssdCmpt(ssdId).bits.cmdId(7, 0)
164 |
165 | } // for (ssdId <- 0 until SSD_NUM)
166 |
167 | val cmptId = Wire(UInt(max(1, log2Ceil(SSD_NUM)).W))
168 |
169 | cmptId := 0.U
170 | for (ssdId <- 0 until SSD_NUM) {
171 | when (io.ssdCmpt(ssdId).valid) {
172 | cmptId := ssdId.U
173 | }
174 | }
175 |
176 | val latencyBits = Wire(UInt(64.W))
177 | latencyBits := 0.U
178 | for (ssdId <- 0 until SSD_NUM) {
179 | when (RegNext(RegNext(cmptId)) === ssdId.U) {
180 | latencyBits := gblTimer - startTimeRam(ssdId).io.data_out_b
181 | }
182 | }
183 |
184 | // Latency RAM.
185 |
186 | val latencyValid = RegNext(RegNext(io.ssdCmpt(cmptId).valid))
187 |
188 | val latencyRam = XRam(UInt(32.W), 8192, latency=2, memory_type="ultra")
189 | latencyRam.io.addr_a := Mux(io.ctlRunning, RegNext(RegNext(latencyRam.io.addr_b)) ,io.ctlReadLatency.bits)
190 | latencyRam.io.wr_en_a := io.ctlRunning && latencyValid
191 | latencyRam.io.addr_b := Mux(latencyBits(63, 16) === 0.U, latencyBits(15, 3), -1.S(13.W).asUInt)
192 | latencyRam.io.data_in_a := latencyRam.io.data_out_b + 1.U
193 | io.statLatency := latencyRam.io.data_out_a
194 |
195 | // Generate PRP list
196 |
197 | io.prpInput.readData := Cat(
198 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6) + 1.U((PRP_ADDR_MSB-5).W), 0x0000.U(15.W),
199 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x7000.U(15.W),
200 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x6000.U(15.W),
201 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x5000.U(15.W),
202 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x4000.U(15.W),
203 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x3000.U(15.W),
204 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x2000.U(15.W),
205 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x1000.U(15.W),
206 | )
207 | }
--------------------------------------------------------------------------------
/src/NVMeBandwidthBenchmark.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 |
3 | import chisel3._
4 | import chisel3.util._
5 | import common._
6 | import common.storage._
7 | import common.axi._
8 | import math.max
9 |
10 | // Runs a simple NVMe benchmark.
11 |
12 | class NVMeBandwidthBenchmark (
13 | SSD_NUM : Int,
14 | QUEUE_NUM : Int,
15 | DATA_BUFFER_SHIFT : Int
16 | ) extends Module {
17 | val io = IO(new Bundle {
18 | // Interfaces
19 | val prpInput = Flipped(new NVMeRamIO)
20 | val ssdCmd = Vec(SSD_NUM, Vec(QUEUE_NUM, Decoupled(UInt(512.W))))
21 | // Control
22 | val ctlRunning = Input(Bool())
23 | val ctlEnd = Input(Bool())
24 | val ctlFpgaBar = Input(UInt(64.W))
25 | val ctlTimeTarget = Input(UInt(38.W))
26 | val ctlNumNlb = Input(UInt(16.W))
27 | val ctlMaxParallel = Input(UInt(32.W))
28 | val ctlModeWrite = Input(UInt(1.W))
29 | val ctlModeRandom = Input(UInt(1.W))
30 | val ctlModeMixed = Input(UInt(1.W))
31 | val ctlRdBlkSize = Input(UInt(32.W))
32 | val ctlWrBlkSize = Input(UInt(32.W))
33 | val ctlRdBlkAhead = Input(UInt(32.W))
34 | // Status
35 | val statSsdIo = Output(Vec(SSD_NUM, UInt(32.W)))
36 | })
37 |
38 | assert (QUEUE_NUM >= 2, "At least 2 queues must be used for benchmarking")
39 |
40 | // Control and status signals.
41 |
42 | val ctlTimeTarget = RegInit(UInt(38.W), 0.U)
43 | val ctlNumNlb = RegInit(UInt(16.W), 0.U)
44 | val ctlMaxParallel = RegInit(UInt(32.W), 0.U)
45 | val ctlModeWrite = RegInit(UInt(1.W), 0.U)
46 | val ctlModeRandom = RegInit(UInt(1.W), 0.U)
47 | val ctlModeMixed = RegInit(UInt(1.W), 0.U)
48 | val ctlRdBlkSize = RegInit(UInt(32.W), 0.U)
49 | val ctlWrBlkSize = RegInit(UInt(32.W), 0.U)
50 | val ctlRdBlkAhead = RegInit(UInt(32.W), 0.U)
51 |
52 | val statSsdIo = RegInit(VecInit(Seq.fill(SSD_NUM)(0.U(32.W))))
53 | for (ssdId <- 0 until SSD_NUM) {
54 | io.statSsdIo(ssdId) := statSsdIo(ssdId)
55 | }
56 |
57 | // Initiate signals
58 |
59 | when (io.ctlRunning && ~RegNext(io.ctlRunning)) {
60 | ctlTimeTarget := io.ctlTimeTarget
61 | ctlNumNlb := io.ctlNumNlb
62 | ctlMaxParallel := io.ctlMaxParallel
63 | ctlModeWrite := io.ctlModeWrite
64 | ctlModeRandom := io.ctlModeRandom
65 | ctlModeMixed := io.ctlModeMixed
66 | ctlRdBlkSize := io.ctlRdBlkSize
67 | ctlWrBlkSize := io.ctlWrBlkSize
68 | ctlRdBlkAhead := io.ctlRdBlkAhead
69 | for (ssdId <- 0 until SSD_NUM) {
70 | statSsdIo(ssdId) := 0.U
71 | }
72 | }
73 |
74 | // In this example, data transfer is between FPGA and SSDs.
75 | // I give each queue a fixed buffer.
76 |
77 | val SSD_BIT_LOW = DATA_BUFFER_SHIFT - log2Ceil(SSD_NUM)
78 | val QUEUE_BIT_LOW = SSD_BIT_LOW - 1
79 | val PRP_ADDR_MSB = DATA_BUFFER_SHIFT - 10
80 |
81 | // Generate commands
82 |
83 | for (ssdId <- 0 until SSD_NUM) {
84 | val cmdRdCnt = RegInit(UInt(32.W), 0.U)
85 | val cmdWrCnt = RegInit(UInt(32.W), 0.U)
86 | val cmdRdBlk = RegInit(UInt(32.W), 0.U)
87 | val cmdWrBlk = RegInit(UInt(32.W), 0.U)
88 |
89 | val cmdRdCond = (cmdWrBlk + ctlRdBlkAhead =/= cmdRdBlk)
90 | val cmdWrCond = (cmdWrBlk =/= cmdRdBlk)
91 |
92 | val cmdLba = RegInit(VecInit(Seq.fill(2)(0.U(64.W))))
93 | val cmdPrp1 = Wire(Vec(2, UInt(64.W)))
94 | val cmdPrp2 = Wire(Vec(2, UInt(64.W)))
95 | val cmdNlb = RegInit(VecInit(Seq.fill(2)(0.U(16.W))))
96 | val cmdId = RegInit(VecInit(Seq.fill(2)(0.U(16.W))))
97 |
98 | cmdPrp1(0) := (io.ctlFpgaBar
99 | + (1.U(64.W) << DATA_BUFFER_SHIFT)
100 | + (ssdId.U(64.W) << SSD_BIT_LOW)
101 | )
102 | cmdPrp1(1) := (io.ctlFpgaBar
103 | + (1.U(64.W) << DATA_BUFFER_SHIFT)
104 | + (ssdId.U(64.W) << SSD_BIT_LOW)
105 | + (1.U(64.W) << QUEUE_BIT_LOW)
106 | )
107 | cmdPrp2(0) := Mux(
108 | cmdNlb(0) < 16.U,
109 | cmdPrp1(0) + 0x1000.U(64.W),
110 | (
111 | io.ctlFpgaBar
112 | + (1.U(64.W) << DATA_BUFFER_SHIFT)
113 | - (1.U(64.W) << (DATA_BUFFER_SHIFT-9))
114 | + (ssdId.U(64.W) << (SSD_BIT_LOW-9))
115 | )
116 | )
117 | cmdPrp2(1) := Mux(
118 | cmdNlb(0) < 16.U,
119 | cmdPrp2(0) + 0x1000.U(64.W),
120 | (
121 | io.ctlFpgaBar
122 | + (1.U(64.W) << DATA_BUFFER_SHIFT)
123 | - (1.U(64.W) << (DATA_BUFFER_SHIFT-9))
124 | + (ssdId.U(64.W) << (SSD_BIT_LOW-9))
125 | + (1.U(64.W) << (QUEUE_BIT_LOW-9))
126 | )
127 | )
128 |
129 | io.ssdCmd(ssdId)(0).valid := (
130 | io.ctlRunning && RegNext(io.ctlRunning)
131 | && Mux(ctlModeMixed.asBool, cmdRdCond, ~ctlModeWrite.asBool)
132 | )
133 | io.ssdCmd(ssdId)(0).bits := NVMeCommandSet.nvmRead(
134 | cmdId(0), cmdPrp1(0), cmdPrp2(0), cmdLba(0), cmdNlb(0)
135 | )
136 | io.ssdCmd(ssdId)(1).valid := (
137 | io.ctlRunning && RegNext(io.ctlRunning)
138 | && Mux(ctlModeMixed.asBool, cmdWrCond, ctlModeWrite.asBool)
139 | )
140 | io.ssdCmd(ssdId)(1).bits := NVMeCommandSet.nvmWrite(
141 | cmdId(1), cmdPrp1(1), cmdPrp2(1), cmdLba(1), cmdNlb(1)
142 | )
143 | for (queueId <- 2 until QUEUE_NUM) {
144 | io.ssdCmd(ssdId)(queueId).valid := 0.U
145 | io.ssdCmd(ssdId)(queueId).bits := 0.U
146 | }
147 |
148 | when (io.ctlRunning && ~RegNext(io.ctlRunning)) {
149 | cmdId(0) := 0.U
150 | cmdId(1) := 1.U
151 | when (io.ctlModeRandom.asBool) {
152 | cmdLba(0) := Cat("h92918".U(20.W), 0.U(10.W))
153 | cmdLba(1) := Cat(1.U(1.W), "h92918".U(20.W), 0.U(10.W))
154 | }.otherwise {
155 | cmdLba(0) := 0.U
156 | cmdLba(1) := Cat(1.U(1.W), 0.U(30.W))
157 | }
158 | cmdNlb(0) := io.ctlNumNlb
159 | cmdNlb(1) := io.ctlNumNlb
160 | cmdRdCnt := 0.U
161 | cmdWrCnt := 0.U
162 | cmdRdBlk := 0.U
163 | cmdWrBlk := 0.U
164 | }.elsewhen (io.ctlRunning) {
165 | when (io.ssdCmd(ssdId)(0).fire) {
166 | when (ctlModeRandom.asBool) {
167 | val nextRndPart = Wire(UInt(20.W))
168 | nextRndPart := (cmdLba(0)(29, 10) << 5) + (cmdLba(0)(29, 10) >> 5)
169 | cmdLba(0) := Cat(nextRndPart, 0.U(10.W))
170 | }.otherwise {
171 | val nextSeqPart = Wire(UInt(30.W))
172 | nextSeqPart := cmdLba(0) + 1.U(30.W) + ctlNumNlb
173 | cmdLba(0) := nextSeqPart
174 | }
175 |
176 | when (cmdRdCnt + 1.U =/= ctlRdBlkSize) {
177 | cmdRdCnt := cmdRdCnt + 1.U
178 | }.otherwise {
179 | cmdRdBlk := cmdRdBlk + 1.U
180 | cmdRdCnt := 0.U
181 | }
182 |
183 | cmdId(0) := cmdId(0) + 2.U
184 | statSsdIo(ssdId) := statSsdIo(ssdId) + 1.U
185 | }
186 |
187 | when (io.ssdCmd(ssdId)(1).fire) {
188 | when (ctlModeRandom.asBool) {
189 | val nextRndPart = Wire(UInt(20.W))
190 | nextRndPart := (cmdLba(1)(29, 10) << 5) + (cmdLba(1)(29, 10) >> 5)
191 | cmdLba(1) := Cat(1.U(1.W), nextRndPart, 1.U(10.W))
192 | }.otherwise {
193 | val nextSeqPart = Wire(UInt(30.W))
194 | nextSeqPart := cmdLba(1) + 1.U(30.W) + ctlNumNlb
195 | cmdLba(1) := Cat(1.U(1.W), nextSeqPart)
196 | }
197 |
198 | when (cmdWrCnt + 1.U =/= ctlWrBlkSize) {
199 | cmdWrCnt := cmdWrCnt + 1.U
200 | }.otherwise {
201 | cmdWrBlk := cmdWrBlk + 1.U
202 | cmdWrCnt := 0.U
203 | }
204 |
205 | cmdId(1) := cmdId(1) + 2.U
206 | statSsdIo(ssdId) := statSsdIo(ssdId) + 1.U
207 | }
208 | } // elsewhen (io.ctlRunning)
209 | } // for (ssdId <- 0 until SSD_NUM)
210 |
211 | // Generate PRP list
212 |
213 | io.prpInput.readData := Cat(
214 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6) + 1.U((PRP_ADDR_MSB-5).W), 0x0000.U(15.W),
215 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x7000.U(15.W),
216 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x6000.U(15.W),
217 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x5000.U(15.W),
218 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x4000.U(15.W),
219 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x3000.U(15.W),
220 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x2000.U(15.W),
221 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x1000.U(15.W),
222 | )
223 | }
--------------------------------------------------------------------------------
/sw/BandwidthBenchmark.cpp:
--------------------------------------------------------------------------------
1 | /* BandwidthBenchmark.cpp (historically NVMeBenchmark.cpp)
2 |  * A simple NVMe bandwidth benchmark program.
3 |  * Used with the NVMe benchmark top module (see src/NVMeBandwidthBenchmarkTop.scala).
4 |  */
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 |
17 | using namespace std;
18 |
19 | #define SSD_ADMIN_SQ_PHYS_BASE(ssd_id) ((queue_phys_base)+0x2000*(ssd_id))
20 | #define SSD_ADMIN_CQ_PHYS_BASE(ssd_id) ((queue_phys_base)+0x2000*(ssd_id)+0x1000)
21 | #define SSD_ADMIN_SQ_VIRT_BASE(ssd_id) ((huge_virt_base)+0x2000*(ssd_id))
22 | #define SSD_ADMIN_CQ_VIRT_BASE(ssd_id) ((huge_virt_base)+0x2000*(ssd_id)+0x1000)
23 |
24 | // Change this as you like. Zero-based (0x1f means 32 entries).
25 | #define ADMIN_QUEUE_DEPTH 0x1f
26 | // List of devices.
27 | string pci_id[] = {"e3", "36", "38", "39", "3a", "3b", "3c", "3d"};
28 |
29 | // Functions for the CPU to create admin and I/O queues.
30 | uint16_t command_id[32];
31 | uint32_t admin_sq_tl[32], admin_cq_hd[32];
32 | uint64_t ssd_virt_base[32];
33 | uint64_t queue_phys_base, smart_phys_base;
34 | uint64_t huge_virt_base;
35 |
36 | void insert_admin_sq(int ssd_id, uint32_t command[])
37 | {
38 | // Calculate the starting address of command.
39 | uint32_t *command_base = (uint32_t *)(SSD_ADMIN_SQ_VIRT_BASE(ssd_id) + (64 * admin_sq_tl[ssd_id]));
40 |
41 | // Fill in the command.
42 | for (int i=0; i<16; i++)
43 | {
44 | command_base[i] = command[i];
45 | }
46 |
47 | // Ring the doorbell.
48 | command_id[ssd_id]++;
49 | admin_sq_tl[ssd_id] = (admin_sq_tl[ssd_id] + 1) & ADMIN_QUEUE_DEPTH;
50 | uint32_t *nvme_sq0tdbl_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x1000);
51 | *nvme_sq0tdbl_pt = admin_sq_tl[ssd_id];
52 | return;
53 | }
54 |
55 | int wait_for_next_cqe(int ssd_id)
56 | {
57 | // Calculate the starting address of command.
58 | uint32_t *command_base = (uint32_t *)(SSD_ADMIN_CQ_VIRT_BASE(ssd_id) + (16 * (admin_cq_hd[ssd_id] & ADMIN_QUEUE_DEPTH)));
59 |
60 | int unexpected_phase = ((admin_cq_hd[ssd_id] >> 7) & 0x1);
61 | // fprintf(stdout, "command base: %08lx, unexpected phase: %x, sq tail: %x\n", (uint64_t) command_base, unexpected_phase, admin_sq_tl[0]);
62 |
63 | int current_phase = unexpected_phase;
64 | while (current_phase == unexpected_phase)
65 | {
66 | current_phase = command_base[3];
67 | current_phase = ((current_phase >> 16) & 0x1);
68 | }
69 | int status = command_base[3];
70 | status = (status >> 17);
71 |
72 | // Ring the doorbell.
73 | admin_cq_hd[ssd_id] = (admin_cq_hd[ssd_id] + 1) & ((ADMIN_QUEUE_DEPTH << 2) + 1);
74 | uint32_t *nvme_cq0hdbl_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x1004);
75 | *nvme_cq0hdbl_pt = admin_cq_hd[ssd_id];
76 | // fprintf(stdout, "sq tail: %02x, cq head: %02x.\n", admin_sq_tl[0], admin_cq_hd[0]);
77 |
78 | return status;
79 | }
80 |
81 | int nvme_set_num_of_qp(int ssd_id, uint16_t queue_count)
82 | {
83 | uint16_t queue_count_zerobased = (queue_count - 1);
84 | uint32_t command[16];
85 | // Now fill in each dw of command.
86 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode.
87 | command[0] = (command_id[ssd_id] << 16) + (0x09 << 0);
88 | // DW 1: bit 31-0 namespace, all 1's in this case.
89 | command[1] = 0xffffffff;
90 | // DW 2-9 rsvd.
91 | for (int i=2; i<=9; i++)
92 | {
93 | command[i] = 0;
94 | }
95 | // DW 10: bit 31 save, bit 30-8 rsvd, bit 7-0 feature ID.
96 | command[10] = (0x07 << 0);
97 | // DW 11: bit 31-16 number of CQ zerobased, bit 15-0 number of SQ zerobased.
98 | command[11] = (queue_count_zerobased << 16) + (queue_count_zerobased << 0);
99 | // DW 12-15 rsvd.
100 | for (int i=12; i<=15; i++)
101 | {
102 | command[i] = 0;
103 | }
104 | // for (int i=0; i<16; i++)
105 | // {
106 | // fprintf(stdout, "DW %2d: %08x\n", i, command[i]);
107 | // }
108 | insert_admin_sq(ssd_id, command);
109 | return wait_for_next_cqe(ssd_id);
110 | }
111 |
112 | int nvme_create_cq(int ssd_id, uint16_t cq_id, uint16_t cq_depth, uint64_t cq_addr)
113 | {
114 | uint16_t cq_depth_zerobased = cq_depth - 1;
115 | uint32_t command[16];
116 | // Now fill in each dw of command.
117 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode.
118 | command[0] = (command_id[ssd_id] << 16) + (0x05 << 0);
119 | // DW 1-5 rsvd.
120 | for (int i=1; i<=5; i++)
121 | {
122 | command[i] = 0;
123 | }
124 | // DW 6-7: bit 63-0 PRP1
125 | command[6] = (uint32_t)(cq_addr & 0xffffffff);
126 | command[7] = (uint32_t)(cq_addr >> 32);
127 | // DW 8-9 rsvd.
128 | command[8] = 0;
129 | command[9] = 0;
130 | // DW 10: bit 31-16 queue depth, bit 15-0 queue id
131 | command[10] = (cq_depth_zerobased << 16) + (cq_id << 0);
132 | // DW 11: Bit 31-16 interrupt vector, bit 15-2 esvd, bit 1 int enable, bit 0 phys cont
133 | command[11] = 1;
134 | // DW 12-15 rsvd
135 | for (int i=12; i<=15; i++)
136 | {
137 | command[i] = 0;
138 | }
139 | // for (int i=0; i<16; i++)
140 | // {
141 | // fprintf(stdout, "DW %2d: %08x\n", i, command[i]);
142 | // }
143 | insert_admin_sq(ssd_id, command);
144 | return wait_for_next_cqe(ssd_id);
145 | }
146 |
147 | int nvme_create_sq(int ssd_id, uint16_t sq_id, uint16_t cq_id, uint16_t sq_depth, uint64_t sq_addr)
148 | {
149 | uint16_t sq_depth_zerobased = sq_depth - 1;
150 | uint32_t command[16];
151 | // Now fill in each dw of command.
152 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode.
153 | command[0] = (command_id[ssd_id] << 16) + (0x01 << 0);
154 | // DW 1-5 rsvd.
155 | for (int i=1; i<=5; i++)
156 | {
157 | command[i] = 0;
158 | }
159 | // DW 6-7: bit 63-0 PRP1
160 | command[6] = (uint32_t)(sq_addr & 0xffffffff);
161 | command[7] = (uint32_t)(sq_addr >> 32);
162 | // DW 8-9 rsvd.
163 | command[8] = 0;
164 | command[9] = 0;
165 | // DW 10: bit 31-16 queue depth, bit 15-0 queue id
166 | command[10] = (sq_depth_zerobased << 16) + (sq_id << 0);
167 | // DW 11: Bit 31-16 cq_id, bit 15-2 esvd, bit 1 int enable, bit 0 phys cont
168 | command[11] = (cq_id << 16) + (0x1 << 0);
169 | // DW 12-15 rsvd
170 | for (int i=12; i<=15; i++)
171 | {
172 | command[i] = 0;
173 | }
174 | insert_admin_sq(ssd_id, command);
175 | return wait_for_next_cqe(ssd_id);
176 | }
177 |
178 | int get_smart_info(int ssd_id)
179 | {
180 | uint32_t command[16];
181 | // Now fill in each dw of command.
182 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode.
183 | command[0] = (command_id[ssd_id] << 16) + (0x02 << 0);
184 | // DW 1: bit 31-0 namespace
185 | command[1] = 0xffffffff;
186 | // DW 2-5 rsvd.
187 | for (int i=2; i<=5; i++)
188 | {
189 | command[i] = 0;
190 | }
191 | // DW 6-7: bit 63-0 PRP1
192 | command[6] = (uint32_t)(smart_phys_base & 0xffffffff);
193 | command[7] = (uint32_t)(smart_phys_base >> 32);
194 | // DW 8-9: bit 63-0 PRP2, rsvd in this case.
195 | command[8] = 0;
196 | command[9] = 0;
197 | // DW 10: bit 31-16 num of dwords lower, bit 15 retain async event,
198 | // bit 14-8 rsvd, dw 7-0 log id
199 | command[10] = (0x400 << 16) + (0x0 << 15) + 0x02;
200 | // DW 11: bit 31-16 rsvd, bit 15-0 num of dwords upper.
201 | command[11] = 0x0;
202 | // DW 12-13: bit 63-0 log page offset. 0 in this case.
203 | command[12] = 0x0;
204 | command[13] = 0x0;
205 | // DW 14: bit 31-0 UUID. 0 in this case.
206 | command[14] = 0x0;
207 | // DW 15 rsvd.
208 | command[15] = 0x0;
209 | insert_admin_sq(ssd_id, command);
210 | return wait_for_next_cqe(ssd_id);
211 | }
212 |
213 | int get_error_log(int ssd_id)
214 | {
215 | uint32_t command[16];
216 | // Now fill in each dw of command.
217 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode.
218 | command[0] = (command_id[ssd_id] << 16) + (0x02 << 0);
219 | // DW 1: bit 31-0 namespace
220 | command[1] = 0xffffffff;
221 | // DW 2-5 rsvd.
222 | for (int i=2; i<=5; i++)
223 | {
224 | command[i] = 0;
225 | }
226 | // DW 6-7: bit 63-0 PRP1
227 | command[6] = (uint32_t)(smart_phys_base & 0xffffffff);
228 | command[7] = (uint32_t)(smart_phys_base >> 32);
229 | // DW 8-9: bit 63-0 PRP2, rsvd in this case.
230 | command[8] = 0;
231 | command[9] = 0;
232 | // DW 10: bit 31-16 num of dwords lower, bit 15 retain async event,
233 | // bit 14-8 rsvd, dw 7-0 log id
234 | command[10] = (0x400 << 16) + (0x0 << 15) + 0x01;
235 | // DW 11: bit 31-16 rsvd, bit 15-0 num of dwords upper.
236 | command[11] = 0x0;
237 | // DW 12-13: bit 63-0 log page offset. 0 in this case.
238 | command[12] = 0x0;
239 | command[13] = 0x0;
240 | // DW 14: bit 31-0 UUID. 0 in this case.
241 | command[14] = 0x0;
242 | // DW 15 rsvd.
243 | command[15] = 0x0;
244 | insert_admin_sq(ssd_id, command);
245 | return wait_for_next_cqe(ssd_id);
246 | }
247 |
248 | int get_temperature_info(int ssd_id)
249 | {
250 | uint32_t command[16];
251 | // Now fill in each dw of command.
252 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode.
253 | command[0] = (command_id[ssd_id] << 16) + (0x02 << 0);
254 | // DW 1: bit 31-0 namespace
255 | command[1] = 0xffffffff;
256 | // DW 2-5 rsvd.
257 | for (int i=2; i<=5; i++)
258 | {
259 | command[i] = 0;
260 | }
261 | // DW 6-7: bit 63-0 PRP1
262 | command[6] = (uint32_t)(smart_phys_base & 0xffffffff);
263 | command[7] = (uint32_t)(smart_phys_base >> 32);
264 | // DW 8-9: bit 63-0 PRP2, rsvd in this case.
265 | command[8] = 0;
266 | command[9] = 0;
267 | // DW 10: bit 31-16 num of dwords lower, bit 15 retain async event,
268 | // bit 14-8 rsvd, dw 7-0 log id
269 | command[10] = (0x1 << 16) + (0x0 << 15) + 0x02;
270 | // DW 11: bit 31-16 rsvd, bit 15-0 num of dwords upper.
271 | command[11] = 0x0;
272 | // DW 12-13: bit 63-0 log page offset. 200 (0xc8) in this case.
273 | command[12] = 0xc8;
274 | command[13] = 0x0;
275 | // DW 14: bit 31-0 UUID. 0 in this case.
276 | command[14] = 0x0;
277 | // DW 15 rsvd.
278 | command[15] = 0x0;
279 | insert_admin_sq(ssd_id, command);
280 | return wait_for_next_cqe(ssd_id);
281 | }
282 |
283 | int main(int argc, char *argv[])
284 | {
285 |
286 | // Set QDMA regs
287 | // Initialize NVMe device.
288 | init(0xb1, 256*1024*1024); // FPGA device ID is 0000:1c:00.0
289 | // Make queue 0 active
290 | uint32_t pfch_tag;
291 | writeConfig(0x1408/4, 0);
292 | if (readConfig(0x1408/4) != 0) {
293 | fprintf(stderr, "ERROR: Cannot read FPGA BARs.");
294 | exit(1);
295 | }
296 | pfch_tag = readConfig(0x140c/4);
297 | writeReg(210, pfch_tag);
298 | fprintf(stdout, "Prefetch tag: %d\n", pfch_tag);
299 |
300 | // Physical addresses of several BARs.
301 | uint64_t nvme_base[32], bypass_base;
302 |
303 | FILE *fp;
304 |
305 | // Open FPGA card. Assume BAR 4 of 0000:37:00.0
306 |
307 | fp = fopen("/sys/bus/pci/devices/0000:b1:00.0/resource", "rb");
308 | if (fp == NULL)
309 | {
310 | fprintf(stderr, "ERROR: Cannot open fpga device.\n");
311 | exit(1);
312 | }
313 | fseek(fp, 228, SEEK_SET); // 57 * BAR
314 |
315 | fscanf(fp, "0x%lx", &bypass_base);
316 | fclose(fp);
317 |
318 | if (bypass_base == 0)
319 | {
320 | fprintf(stderr, "ERROR: Invalid PCI address for FPGA card.\n");
321 | exit(1);
322 | }
323 | else
324 | {
325 | fprintf(stdout, "BAR 2 of FPGA device is %lx.\n", bypass_base);
326 | }
327 |
328 | writeReg(32, 0);
329 | // Read FPGA configures
330 | uint32_t ssd_low_bit, ssd_num, queue_low_bit, queue_depth, queue_num, ram_type_bit;
331 | ssd_low_bit = readReg(576);
332 | ssd_num = readReg(577);
333 | queue_depth = readReg(578);
334 | queue_low_bit = readReg(579);
335 | queue_num = readReg(580);
336 | ram_type_bit = readReg(581);
337 | fprintf(stdout,
338 | "SSD_LOW_BIT: %u\nSSD_COUNT: %u\n"
339 | "QUEUE_DEPTH: %u\nQUEUE_LOW_BIT: %u\n"
340 | "QUEUE_NUM: %u\nRAM_TYPE_BIT: %u\n",
341 | ssd_low_bit, ssd_num,
342 | queue_depth, queue_low_bit,
343 | queue_num, ram_type_bit
344 | );
345 | if (ssd_low_bit == 0xffffffff)
346 | {
347 | fprintf(stderr, "ERROR: Invalid FPGA config info. \n");
348 | exit(1);
349 | }
350 |
351 | int multi_ssd = 0;
352 |
353 | if (ssd_num > 1) {
354 | multi_ssd = 1;
355 | }
356 |
357 | // Open SSD device, now I just assume it is BAR 0 of target
358 |
359 | uint64_t device_low_addr = 0, device_high_addr = 0;
360 |
361 | for (int i=0; i> 48) & 0xf;
473 | if (nvme_ctl_mpsmin > 0)
474 | {
475 | fprintf(stderr, "ERROR: The nvme device doesn't support 4KB page.\n");
476 | exit(1);
477 | }
478 | uint64_t nvme_ctl_dstrd = (nvme_ctl_cap >> 32) & 0xf;
479 | if (nvme_ctl_dstrd > 0)
480 | {
481 | fprintf(stderr, "ERROR: The nvme device doesn't support 4B doorbell stride.\n");
482 | exit(1);
483 | }
484 | uint64_t nvme_ctl_mqes = nvme_ctl_cap & 0xffff;
485 | if (nvme_ctl_mqes < 32)
486 | {
487 | fprintf(stderr, "ERROR: The nvme device doesn't support 32 queue entries.\n");
488 | exit(1);
489 | }
490 |
491 | // Reset the controller.
492 | uint32_t *nvme_cc_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x14);
493 | *nvme_cc_pt = 0x460000; // Do not enable now.
494 | fprintf(stdout, "CC set to %08x.\n", *nvme_cc_pt);
495 |
496 | // Wait the controller to be completely reset.
497 | // Otherwise it will get stuck :(
498 | uint32_t *nvme_csts_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x1c);
499 | while (*nvme_csts_pt != 0);
500 | fprintf(stdout, "System reset done. Current CSTS is %08x.\n", *nvme_csts_pt);
501 |
502 | // Set admin queue size to 32.
503 | uint32_t *nvme_aqa_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x24);
504 | *nvme_aqa_pt = (ADMIN_QUEUE_DEPTH << 16) + ADMIN_QUEUE_DEPTH;
505 | fprintf(stdout, "AQA set to %08x.\n", *nvme_aqa_pt);
506 |
507 | // Set admin SQ base address.
508 | uint64_t *nvme_asq_pt = (uint64_t *)(ssd_virt_base[ssd_id] + 0x28);
509 | *nvme_asq_pt = SSD_ADMIN_SQ_PHYS_BASE(ssd_id);
510 | fprintf(stdout, "ASQ set to %016lx.\n", *nvme_asq_pt);
511 |
512 | // Set admin CQ base address.
513 | uint64_t *nvme_acq_pt = (uint64_t *)(ssd_virt_base[ssd_id] + 0x30);
514 | *nvme_acq_pt = SSD_ADMIN_CQ_PHYS_BASE(ssd_id);
515 | fprintf(stdout, "ACQ set to %016lx.\n", *nvme_acq_pt);
516 |
517 | // Enable the controller.
518 | *nvme_cc_pt = 0x460001;
519 | fprintf(stdout, "CC set to %08x.\n", *nvme_cc_pt);
520 |
521 | // Wait for the system to be started.
522 | while (*nvme_csts_pt == 0);
523 | fprintf(stdout, "System started. Current CSTS is %08x.\n", *nvme_csts_pt);
524 |
525 | // Reset queue pointers.
526 | admin_sq_tl[ssd_id] = 0;
527 | admin_cq_hd[ssd_id] = 0;
528 |
529 | uint32_t *nvme_cq_base = (uint32_t *)(SSD_ADMIN_CQ_VIRT_BASE(ssd_id));
530 | // Now clear the admin CQ buffer.
531 | for (int i=0; i<128; i++)
532 | {
533 | nvme_cq_base[i] = 0x0;
534 | }
535 |
536 | // Initialize SSD queues. First set feature.
537 | int cmd_ret = nvme_set_num_of_qp(ssd_id, queue_num);
538 | if (cmd_ret != 0)
539 | {
540 | fprintf(stdout, "ERROR: Set number of queue pair returned 0x%x\n", cmd_ret);
541 | exit(1);
542 | }
543 |
544 | for (int qid=1; qid<=queue_num; qid++)
545 | {
546 | // Calculate the address of each CQ.
547 | uint64_t cq_addr = bypass_base + ((qid-1) << queue_low_bit) + (ssd_id << ssd_low_bit) + (0x1 << ram_type_bit);
548 | // Create CQ now.
549 | cmd_ret = nvme_create_cq(ssd_id, qid, queue_depth, cq_addr);
550 | if (cmd_ret != 0)
551 | {
552 | fprintf(stdout, "ERROR: Create CQ %d returned 0x%x\n", qid, cmd_ret);
553 | exit(1);
554 | }
555 |
556 | uint64_t sq_addr = bypass_base + ((qid-1) << queue_low_bit) + (ssd_id << ssd_low_bit) + (0x0 << ram_type_bit);
557 | cmd_ret = nvme_create_sq(ssd_id, qid, qid, queue_depth, sq_addr);
558 | if (cmd_ret != 0)
559 | {
560 | fprintf(stdout, "ERROR: Create SQ %d returned 0x%x\n", qid, cmd_ret);
561 | exit(1);
562 | }
563 | }
564 |
565 | fprintf(stdout, "SSD %d queue initialization done.\n", ssd_id);
566 |
567 | // Try to get SMART page.
568 | cmd_ret = get_smart_info(ssd_id);
569 | if (cmd_ret != 0)
570 | {
571 | fprintf(stdout, "ERROR: Get smart page returned 0x%x\n", cmd_ret);
572 | exit(1);
573 | }
574 |
575 | uint8_t *smart_array = (uint8_t *)(huge_virt_base + 0x200000);
576 |
577 | // Get critical warnings.
578 | uint8_t smart_critical = smart_array[0];
579 | if (smart_critical != 0x00)
580 | {
581 | fprintf(stdout, "WARNING: SSD %d reported critical warning 0x%02x\n", ssd_id, smart_critical);
582 | }
583 |
584 | // Get temperature.
585 | // uint16_t smart_temp_comp;
586 | // smart_temp_comp = ((smart_array[2] << 8) + smart_array[1]) - 273;
587 | // fprintf(stdout, "Current temperature: %d\n", smart_temp_comp);
588 |
589 | // fprintf(stdout, "Creating I/O SQ/CQ...\n");
590 | // writeReg(128, 0);
591 | // writeReg(128, 1);
592 | // writeReg(128, 0);
593 |
594 | // // Wait for FPGA board to finish basic settings.
595 | // while (readReg(672) == 0);
596 | // if (readReg(673) != 0)
597 | // {
598 | // fprintf(stderr, "ERROR: NVMe queue initialization failed.\n");
599 | // fprintf(stdout, "status code: %08x\n", readReg(673));
600 | // exit(1);
601 | // }
602 |
603 | // fprintf(stdout, "NVMe queue initialization done.\n");
604 | }
605 |
606 | char *zero_buffer = NULL;
607 | posix_memalign((void **)&zero_buffer, 64 /*alignment */ , 64);
608 |
609 | for (int i=0; i<64; i++)
610 | {
611 | zero_buffer[i] = 0;
612 | }
613 |
614 | int stop_benchmark = 0;
615 |
616 | while (!stop_benchmark)
617 | {
618 | // Benchmarking
619 | int mode, num_lb, benchmark_time;
620 | int benchmark_stuck = 0;
621 |
622 | fprintf(stdout, "Enter mode. +1 for write, +2 for random, +1024 for record: ");
623 | fscanf(stdin, "%d", &mode);
624 | fprintf(stdout, "Enter number of logical blocks (512 B) for each cmd: ");
625 | fscanf(stdin, "%d", &num_lb);
626 | fprintf(stdout, "Enter time in seconds: ");
627 | fscanf(stdin, "%d", &benchmark_time);
628 |
629 | // Set parameters
630 | writeReg(161, mode & 0x3);
631 | writeReg(162, num_lb-1);
632 | writeReg(163, benchmark_time*3906250); // Time. 3,906,250 = 1s
633 | if (mode >= 1024)
634 | {
635 | writeReg(165, 1);
636 | }
637 | else
638 | {
639 | writeReg(165, 0);
640 | }
641 | fprintf(stdout, "Start benchmark...\n");
642 |
643 | writeReg(160, 0);
644 | writeReg(160, 1);
645 | sleep(benchmark_time);
646 | writeReg(160, 0);
647 |
648 | int diff_time = 0;
649 |
650 | while (readReg(704) == 0)
651 | {
652 | sleep(1);
653 | diff_time += 1;
654 |
655 | if (diff_time > 3)
656 | {
657 |
658 | // For debugging queues
659 | uint32_t *bypass_entry_buffer;
660 | posix_memalign((void **)&bypass_entry_buffer, 64 /*alignment */ , 64);
661 | fprintf(stderr, "ERROR: Benchmark stuck, now print information of SQE and CQE: \n");
662 | for (ssd_id=0; ssd_id 1) {
722 | uint32_t ssd_io[ssd_num];
723 | ssd_io[0] = readReg(712);
724 | fprintf(stdout, " (%.2lf",
725 | (num_lb*0.5*ssd_io[0]/1024.0) / total_time);
726 | for (int i=1; i= 1024)
738 | {
739 | // Get bandwidth curve
740 | fprintf(stdout, "Time (s),Read bandwidth (MB/s),Write bandwidth (MB/s)\n");
741 | uint32_t read_bw, write_bw;
742 | double bw_time = 0;
743 | writeReg(166, 0);
744 | writeReg(166, 1);
745 | writeReg(166, 0);
746 | write_bw = 0;
747 | read_bw = 0;
748 | while ((read_bw != 0xffffffff) && (write_bw != 0xffffffff))
749 | {
750 | fprintf(stdout, "%.1lf,%.2lf,%.2lf\n",
751 | bw_time, read_bw*640.0/(1024*1024), write_bw*640.0/(1024*1024));
752 | bw_time += 0.1;
753 | writeReg(166, 0);
754 | writeReg(166, 1);
755 | writeReg(166, 0);
756 | read_bw = readReg(729);
757 | write_bw = readReg(728);
758 | }
759 | }
760 |
761 | // Find failed entries.
762 | for (ssd_id=0; ssd_id 0, "At least one SSD is required.")
44 | assert(QUEUE_NUM > 0, "At least one queue is required.")
45 | assert(QUEUE_DEPTH >= 4, "Queue depth should be at least 4.")
46 | assert(Set("DMA", "SAXIB") contains QDMA_INTERFACE, "Invalid QDMA interface.")
47 | assert(pow(2, log2Ceil(QUEUE_DEPTH)).toInt == QUEUE_DEPTH, "Queue depth must be exponential of 2.")
48 |
49 | val QUEUE_MAX_ID = QUEUE_NUM - 1
50 |
51 | /* Basically below is how we handle queues using BAR space.
52 | * For SQE, address is splited into: {1'd0, ssd_id, queue_id, padding, entry_id, 6'd0};
53 | * For CQE, address is splited into: {1'd1, ssd_id, queue_id, padding, 2'd0, entry_id, 4'd0};
54 | * Padding is used to ensure starting address of a queue is 4-KiB aligned.
55 | * Length of padding is max(0, 6-bits(entry_id)).
56 | */
57 |
58 | val ENTRY_BIT_LEN = log2Ceil(QUEUE_DEPTH)
59 | val ENTRY_LOW_BIT_SQ = 6
60 | val ENTRY_HIGH_BIT_SQ = 6 + ENTRY_BIT_LEN - 1
61 |
62 | val ENTRY_LOW_BIT_CQ = 4
63 | val ENTRY_HIGH_BIT_CQ = 4 + ENTRY_BIT_LEN - 1
64 |
65 | val QUEUE_BIT_LEN_RAW = log2Ceil(QUEUE_NUM)
66 | val QUEUE_BIT_LEN = max(1, QUEUE_BIT_LEN_RAW)
67 | val QUEUE_LOW_BIT = max(12, ENTRY_HIGH_BIT_SQ+1)
68 | val QUEUE_HIGH_BIT = QUEUE_LOW_BIT + QUEUE_BIT_LEN_RAW - 1
69 |
70 | val SSD_BIT_LEN_RAW = log2Ceil(SSD_NUM)
71 | val SSD_BIT_LEN = max(1, SSD_BIT_LEN_RAW)
72 | val SSD_LOW_BIT = QUEUE_HIGH_BIT + 1
73 | val SSD_HIGH_BIT = SSD_LOW_BIT + SSD_BIT_LEN_RAW - 1
74 |
75 | val RAM_TYPE_BIT = SSD_HIGH_BIT + 1
76 |
77 | // SQ & CQ RAMs.
78 | // XRam use RAMB36 or URAM288 thus depth is 1K ~ 4K, which is too big to just hold one SSD.
79 | // Therefore, in this version of implementation, All SSDs share 1 SQ RAM and 1 CQ RAM.
80 |
81 | val sqRam = XRam(
82 | UInt(512.W),
83 | pow(2, log2Up(SSD_NUM) + log2Up(QUEUE_NUM)).toInt * QUEUE_DEPTH,
84 | latency = 1,
85 | use_musk = 0
86 | )
87 |
88 | // SQ data structures
89 | val sqTail = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U(ENTRY_BIT_LEN.W))))))
90 | val sqHead = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U(ENTRY_BIT_LEN.W))))))
91 |
92 | val cqRam = XRam(
93 | UInt(512.W),
94 | SSD_NUM * QUEUE_NUM * QUEUE_DEPTH / 4,
95 | latency = 1,
96 | use_musk = 1
97 | )
98 |
99 | // CQ data structures. CQ head can be divided into 2 parts,
100 | // One is phase, used to know whether CQE has been updated,
101 | // another is head counter, used to send doorbell.
102 | val cqHeadExt = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U((ENTRY_BIT_LEN+1).W))))))
103 | val cqPhase = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, UInt(1.W))))
104 | val cqHead = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, UInt(ENTRY_BIT_LEN.W))))
105 | for (ssdId <- 0 until SSD_NUM) {
106 | for (queueId <- 0 until QUEUE_NUM) {
107 | cqHead(ssdId)(queueId) := cqHeadExt(ssdId)(queueId)(ENTRY_BIT_LEN-1, 0)
108 | cqPhase(ssdId)(queueId) := cqHeadExt(ssdId)(queueId)(ENTRY_BIT_LEN).asUInt
109 | }
110 | }
111 | val cqHeadChanged = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, Valid(Bool()))))
112 |
113 | // SSD BAR physical address, for sending doorbells.
114 |
115 | val ssdBarAddr = RegInit(VecInit(Seq.fill(SSD_NUM)(0.U(64.W))))
116 |
117 | // Set up SSD physical address.
118 | when (io.control.ssdSetup.valid) {
119 | ssdBarAddr(io.control.ssdSetup.bits.ssdId) := io.control.ssdSetup.bits.ssdBarAddr
120 | }
121 |
122 | // Running counters, getting to know whether all commands has been completed.
123 | // Only when enable is 0 and running is 0, this module is truly stopped.
124 |
125 | val commandStart = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U(32.W))))))
126 | val commandEnd = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U(32.W))))))
127 |
128 | val queueRunning = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, Bool())))
129 | for (ssdId <- 0 until SSD_NUM) {
130 | for (queueId <- 0 until QUEUE_NUM) {
131 | queueRunning(ssdId)(queueId) := (commandStart(ssdId)(queueId) =/= commandEnd(ssdId)(queueId))
132 | }
133 | }
134 | io.status.running := queueRunning.asTypeOf(UInt((SSD_NUM*QUEUE_NUM).W)).orR
135 |
136 | // Parameters used to notify software
137 | io.status.params.ssdNum := SSD_NUM.U
138 | io.status.params.ssdLowBit := SSD_LOW_BIT.U
139 | io.status.params.queueLowBit:= QUEUE_LOW_BIT.U
140 | io.status.params.queueDepth := QUEUE_DEPTH.U
141 | io.status.params.queueNum := QUEUE_NUM.U
142 | io.status.params.ramTypeBit := RAM_TYPE_BIT.U
143 |
144 | // Statistical information
145 | val statExecTime = RegInit(UInt(64.W), 0.U)
146 | val statSuccOp = RegInit(UInt(32.W), 0.U)
147 | val statFailedOp = RegInit(UInt(32.W), 0.U)
148 | val statLatency = RegInit(UInt(64.W), 0.U)
149 |
150 | io.status.stat.executeTime := statExecTime
151 | io.status.stat.successfulOp := statSuccOp
152 | io.status.stat.failedOp := statFailedOp
153 | io.status.stat.totalLatency := statLatency
154 |
155 | val moduleRunning = io.status.running || io.control.enable
156 | val moduleStart = moduleRunning && ~RegNext(moduleRunning)
157 |
158 | when (moduleStart) {
159 | // Clear counters at start
160 | statExecTime := 0.U
161 | statSuccOp := 0.U
162 | statFailedOp := 0.U
163 | statLatency := 0.U
164 | }.elsewhen (moduleRunning) {
165 | statExecTime := statExecTime + 1.U
166 | }
167 |
168 | // Main logic.
169 |
170 | val dbReq = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, Decoupled(new Doorbell))))
171 |
172 | // Part 1: Add command from user to SQ.
173 |
174 | // Each queue has a command FIFO.
175 | val cmdInputFifo = XQueue(SSD_NUM*QUEUE_NUM)(UInt(512.W), 16)
176 | val cmdInputFifoOut = Wire(Vec(SSD_NUM*QUEUE_NUM, Decoupled(UInt(512.W))))
177 |
178 | // Write command to command RAM.
179 | // Since all queues share one RAM, we use a reg to indicate which queue to write.
180 | val sqAllocPtSsd = RegInit(UInt(SSD_BIT_LEN.W), 0.U)
181 | val sqAllocPtQp = RegInit(UInt(QUEUE_BIT_LEN.W), 0.U)
182 | val queueWriteRdy = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, Bool())))
183 |
184 | for (ssdId <- 0 until SSD_NUM) {
185 | for (queueId <- 0 until QUEUE_NUM) {
186 | val fifoId = ssdId*QUEUE_NUM+queueId
187 | val cmdInputFifoIn = Wire(Decoupled(UInt(512.W)))
188 | val cmdInputFifoSlice = RegSlice(2)(cmdInputFifo(fifoId).io.out)
189 | cmdInputFifo(fifoId).io.in <> RegSlice(2)(cmdInputFifoIn)
190 | io.ssdCmd(ssdId)(queueId).ready := io.control.enable && cmdInputFifoIn.ready
191 | cmdInputFifoIn.valid := io.control.enable && io.ssdCmd(ssdId)(queueId).valid
192 | cmdInputFifoIn.bits := io.ssdCmd(ssdId)(queueId).bits
193 | cmdInputFifoOut(fifoId).valid := cmdInputFifoSlice.valid
194 | cmdInputFifoOut(fifoId).bits := cmdInputFifoSlice.bits
195 | cmdInputFifoSlice.ready := cmdInputFifoOut(fifoId).ready
196 | cmdInputFifoOut(fifoId).ready := (queueWriteRdy(ssdId)(queueId)
197 | && (sqAllocPtQp === queueId.U) && (sqAllocPtSsd === ssdId.U))
198 |
199 | when (io.ssdCmd(ssdId)(queueId).fire) {
200 | commandStart(ssdId)(queueId) := commandStart(ssdId)(queueId) + 1.U
201 | }
202 | }
203 | }
204 |
205 | val sqAllocPtFifo = RegInit(UInt((SSD_BIT_LEN + QUEUE_BIT_LEN).W), 0.U)
206 |
207 | when (sqAllocPtQp === QUEUE_MAX_ID.U) {
208 | when (sqAllocPtSsd === (SSD_NUM-1).U) {
209 | sqAllocPtSsd := 0.U
210 | sqAllocPtFifo := 0.U
211 | }.otherwise {
212 | sqAllocPtSsd := sqAllocPtSsd + 1.U
213 | sqAllocPtFifo := sqAllocPtFifo + 1.U
214 | }
215 | sqAllocPtQp := 0.U
216 | }.otherwise {
217 | sqAllocPtQp := sqAllocPtQp + 1.U
218 | sqAllocPtFifo := sqAllocPtFifo + 1.U
219 | }
220 |
221 | if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) { // More than 1 queue
222 | sqRam.io.addr_a := Cat(sqAllocPtSsd, sqAllocPtQp, sqTail(sqAllocPtSsd)(sqAllocPtQp))
223 | } else { // Only 1 queue
224 | sqRam.io.addr_a := Cat(sqAllocPtSsd, sqTail(sqAllocPtSsd)(sqAllocPtQp))
225 | }
226 |
227 | sqRam.io.data_in_a := cmdInputFifoOut(sqAllocPtFifo).bits
228 | sqRam.io.wr_en_a := cmdInputFifoOut(sqAllocPtFifo).fire
229 |
230 | // Part 2: Get to know whether CQ has been changed.
    | // A second round-robin pointer polls one (ssd, queue) CQ per cycle and
    | // compares the phase bit of the CQE at the current head against the
    | // expected phase, producing cqHeadChanged pulses for Part 3.
231 |
232 | val cqDetectPtQp = RegInit(UInt(QUEUE_BIT_LEN.W), 0.U)
233 | val cqDetectPtSsd = RegInit(UInt(SSD_BIT_LEN.W), 0.U)
234 | val cqDetectPtAddr = Wire(UInt((QUEUE_BIT_LEN+SSD_BIT_LEN).W))
235 |
    | // Queue field elided at elaboration when only one queue exists.
236 | if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) { // More than 1 queue
237 |   cqDetectPtAddr := Cat(cqDetectPtSsd, cqDetectPtQp)
238 | } else {
239 |   cqDetectPtAddr := cqDetectPtSsd
240 | }
241 |
    | // Same wrap-around walk as the SQ allocation pointer in Part 1.
242 | when (cqDetectPtQp === QUEUE_MAX_ID.U) {
243 |   when (cqDetectPtSsd === (SSD_NUM-1).U) {
244 |     cqDetectPtSsd := 0.U
245 |   }.otherwise {
246 |     cqDetectPtSsd := cqDetectPtSsd + 1.U
247 |   }
248 |   cqDetectPtQp := 0.U
249 | }.otherwise {
250 |   cqDetectPtQp := cqDetectPtQp + 1.U
251 | }
252 |
    | // The CQ RAM stores four 128-bit CQEs per 512-bit row, so the row index
    | // drops the low two bits of the head pointer. With <=16 entries
    | // (ENTRY_BIT_LEN <= 4) there is a single row per queue.
253 | if (ENTRY_BIT_LEN <= 4) {
254 |   cqRam.io.addr_b := cqDetectPtAddr
255 | } else {
256 |   cqRam.io.addr_b := Cat(cqDetectPtAddr, cqHead(cqDetectPtSsd)(cqDetectPtQp)(ENTRY_BIT_LEN-1, 2))
257 | }
258 |
    | // RAM read data is valid one cycle after the address, hence the RegNext
    | // on the detect pointers and on cqPhase when comparing below.
259 | for (ssdId <- 0 until SSD_NUM) {
260 |   for (queueId <- 0 until QUEUE_NUM) {
261 |     cqHeadChanged(ssdId)(queueId).valid := (
262 |       ssdId.U(SSD_BIT_LEN.W) === RegNext(cqDetectPtSsd)
263 |       && queueId.U(QUEUE_BIT_LEN.W) === RegNext(cqDetectPtQp)
264 |     )
265 |     cqHeadChanged(ssdId)(queueId).bits := 0.U
    |     // Bit 112 of each 128-bit CQE slot is compared against the expected
    |     // phase; a mismatch means the SSD has posted a new entry. (Bit 112 is
    |     // the NVMe CQE phase-tag position — byte 14, bit 0.) The low two head
    |     // bits select which of the four CQEs in the 512-bit row to inspect.
266 |     switch (cqHead(ssdId)(queueId)(1, 0)) {
267 |       is (0.U(2.W)) {
268 |         cqHeadChanged(ssdId)(queueId).bits := (
269 |           cqRam.io.data_out_b(112+128*0) =/= RegNext(cqPhase(ssdId)(queueId))
270 |         )
271 |       }
272 |       is (1.U(2.W)) {
273 |         cqHeadChanged(ssdId)(queueId).bits := (
274 |           cqRam.io.data_out_b(112+128*1) =/= RegNext(cqPhase(ssdId)(queueId))
275 |         )
276 |       }
277 |       is (2.U(2.W)) {
278 |         cqHeadChanged(ssdId)(queueId).bits := (
279 |           cqRam.io.data_out_b(112+128*2) =/= RegNext(cqPhase(ssdId)(queueId))
280 |         )
281 |       }
282 |       is (3.U(2.W)) {
283 |         cqHeadChanged(ssdId)(queueId).bits := (
284 |           cqRam.io.data_out_b(112+128*3) =/= RegNext(cqPhase(ssdId)(queueId))
285 |         )
286 |       }
287 |     }
288 |   }
289 | }
290 |
    | // Part 3: per-(ssd, queue) control FSM. Each queue pair independently
    | // batches SQ insertions, rings the SQ doorbell, then waits for and
    | // acknowledges CQEs before looping.
291 | for (ssdId <- 0 until SSD_NUM) {
292 |
293 |   // Part 3: Queue pair handle logic.
294 |
295 |   // Basically a complex state machine.
    |   // States: SqWait1 (idle, wait for first cmd) -> SqIns (commit cmd) ->
    |   // SqWait2 (coalesce more cmds up to MAX_SQ_INTERVAL cycles) -> SqDb
    |   // (ring SQ doorbell) -> CqWait1/CqWait2 (poll for CQEs) ->
    |   // CqRefresh1/2 (advance head, 2-cycle RAM re-read latency) ->
    |   // CqDb (ring CQ doorbell) -> Loop (decide next phase).
296 |   object QpState extends ChiselEnum {
297 |     val sQpSqWait1, sQpSqWait2, sQpSqIns, sQpSqDb,
298 |         sQpCqWait1, sQpCqWait2, sQpCqRefresh1, sQpCqRefresh2,
299 |         sQpCqDb, sQpLoop = Value
300 |   }
301 |
302 |   import QpState._
303 |
304 |   for (queueId <- 0 until QUEUE_NUM) {
305 |     val qpSt = RegInit(QpState(), sQpSqWait1)
    |     // Cycles spent in SqWait2 since the last insertion (doorbell coalescing timer).
306 |     val sqWaitCnt = RegInit(UInt(32.W), 0.U)
    |     // Pulse from Part 2: the polled CQE's phase bit flipped for this queue.
307 |     val newCqCome = cqHeadChanged(ssdId)(queueId).bits && cqHeadChanged(ssdId)(queueId).valid
308 |
309 |     switch (qpSt) {
310 |       is (sQpSqWait1) { // Wait for new command
311 |         when (cmdInputFifoOut(ssdId*QUEUE_NUM+queueId).fire) { // A new command comes.
312 |           qpSt := sQpSqIns
313 |         }.otherwise {
314 |           qpSt := sQpSqWait1
315 |         }
316 |       }
317 |       is (sQpSqIns) { // Insert the command
    |         // Full test uses cqHead rather than sqHead (see commented line
    |         // below) — NOTE(review): presumably a deliberate conservative
    |         // choice since sqHead only advances when a CQE reports it;
    |         // confirm against the original design intent.
318 |         when (sqTail(ssdId)(queueId) + 2.U === cqHead(ssdId)(queueId)) {
319 |         // when (sqTail(ssdId)(queueId) + 2.U === sqHead(ssdId)(queueId)) {
320 |           qpSt := sQpSqDb // SQ is full, directly ring doorbell.
321 |         }.otherwise {
322 |           qpSt := sQpSqWait2
323 |         }
324 |       }
325 |       is (sQpSqWait2) { // Wait for more command to reduce doorbell signals
326 |         when (cmdInputFifoOut(ssdId*QUEUE_NUM+queueId).fire) { // A new command comes.
327 |           qpSt := sQpSqIns
328 |         }.elsewhen (sqWaitCnt >= MAX_SQ_INTERVAL.U) { // No command comes for a while, ring doorbell.
329 |           qpSt := sQpSqDb
330 |         }.otherwise {
331 |           qpSt := sQpSqWait2
332 |         }
333 |       }
334 |       is (sQpSqDb) { // Ring SQ doorbell
335 |         when (dbReq(ssdId)(queueId).fire) { // Doorbell accepted
336 |           qpSt := sQpCqWait1
337 |         }.otherwise {
338 |           qpSt := sQpSqDb
339 |         }
340 |       }
341 |       is (sQpCqWait1) { // Wait for new CQE
342 |         when (newCqCome) { // We have a first CQE
343 |           qpSt := sQpCqRefresh1
344 |         }.otherwise {
345 |           qpSt := sQpCqWait1
346 |         }
347 |       }
348 |       is (sQpCqWait2) { // Check if more CQE has come in a row
349 |         when (newCqCome) { // We have more CQEs
350 |           qpSt := sQpCqRefresh1
    |         // valid alone (without bits) means our queue was polled and no
    |         // phase change was seen — the burst of CQEs has ended.
351 |         }.elsewhen (cqHeadChanged(ssdId)(queueId).valid) { // No more CQEs, ring doorbell.
352 |           qpSt := sQpCqDb
353 |         }.otherwise { // This CQ is not checked yet
354 |           qpSt := sQpCqWait2
355 |         }
356 |       }
    |       // Two dead cycles so the advanced head propagates through the
    |       // CQ RAM read (addr_b -> data_out_b) before re-checking.
357 |       is (sQpCqRefresh1) { // Wait for more CQEs 1
358 |         qpSt := sQpCqRefresh2
359 |       }
360 |       is (sQpCqRefresh2) { // Wait for more CQEs 2
361 |         qpSt := sQpCqWait2
362 |       }
363 |       is (sQpCqDb) { // Ring CQ doorbell
364 |         when (dbReq(ssdId)(queueId).fire) { // Doorbell accepted
365 |           qpSt := sQpLoop
366 |         }.otherwise {
367 |           qpSt := sQpCqDb
368 |         }
369 |       }
370 |       is (sQpLoop) { // Prepare for next round
371 |         when (sqTail(ssdId)(queueId) + 1.U === sqHead(ssdId)(queueId)) {
372 |           // In case SQ head not moved, skip the SQWAIT phases.
373 |           qpSt := sQpCqWait1
374 |         }.otherwise {
375 |           qpSt := sQpSqWait1
376 |         }
377 |       }
378 |     } // switch (qpSt)
379 |
380 |     // New command requests
    |     // Commands may only be accepted (Part 1) while this queue is in an
    |     // SQ-waiting state.
381 |     queueWriteRdy(ssdId)(queueId) := (qpSt === sQpSqWait1) || (qpSt === sQpSqWait2)
382 |
383 |     // Doorbell requests
    |     // NVMe doorbells with a stride of 8 bytes: SQ tail doorbell for this
    |     // queue at BAR+0x1008+8*queueId, CQ head doorbell at BAR+0x100c+8*queueId.
384 |     dbReq(ssdId)(queueId).valid := (qpSt === sQpSqDb) || (qpSt === sQpCqDb)
385 |     dbReq(ssdId)(queueId).bits.addr := Mux(qpSt === sQpSqDb,
386 |       ssdBarAddr(ssdId) + 0x1008.U(64.W) + Cat(queueId.U(QUEUE_BIT_LEN.W), 0.U(3.W)),
387 |       ssdBarAddr(ssdId) + 0x100c.U(64.W) + Cat(queueId.U(QUEUE_BIT_LEN.W), 0.U(3.W)),
388 |     )
389 |     dbReq(ssdId)(queueId).bits.value := Mux(qpSt === sQpSqDb,
390 |       sqTail(ssdId)(queueId),
391 |       cqHead(ssdId)(queueId)
392 |     )
393 |
394 |     // Update SQ tail and SQ wait timer
395 |     when (qpSt === sQpSqIns) {
396 |       sqTail(ssdId)(queueId) := sqTail(ssdId)(queueId) + 1.U
397 |       sqWaitCnt := 0.U
398 |     }.elsewhen (qpSt === sQpSqWait2) {
399 |       sqWaitCnt := sqWaitCnt + 1.U
400 |     }
401 |
402 |     // Update CQ tail
    |     // cqHeadExt is the extended head counter; presumably cqHead/cqPhase
    |     // are derived from it elsewhere — TODO confirm (derivation not
    |     // visible in this chunk).
403 |     when (qpSt === sQpCqRefresh1) {
404 |       cqHeadExt(ssdId)(queueId) := cqHeadExt(ssdId)(queueId) + 1.U
405 |     }
406 |   } // for (queueId <- 0 until QUEUE_NUM)
407 | } // for (ssdId <- 0 until SSD_NUM)
408 |
409 | // Part 4: Collect and send doorbells.
    | // All per-queue doorbell requests are round-robin arbitrated into a
    | // single stream, then buffered. RegSlice stages (5 + 2) pipeline the
    | // long fan-in paths for timing; depth-64 FIFO absorbs bursts.
410 |
411 | val dbAbt = Module(new RRArbiter(new Doorbell, SSD_NUM*QUEUE_NUM))
412 |
413 | for (ssdId <- 0 until SSD_NUM) {
414 |   for (queueId <- 0 until QUEUE_NUM) {
415 |     dbAbt.io.in(ssdId*QUEUE_NUM + queueId) <> RegSlice(5)(dbReq(ssdId)(queueId))
416 |   }
417 | }
418 |
419 | val dbFifo = XQueue(new Doorbell, 64)
420 |
421 | dbFifo.io.in <> RegSlice(2)(dbAbt.io.out)
422 |
423 | // Another state machine.
    | // Issues one doorbell write at a time over whichever host interface is
    | // configured (slave AXI bridge or QDMA C2H descriptor+data):
    | // sDbWait (pop FIFO, latch addr/value) -> sDbPutDesc (send address /
    | // descriptor) -> sDbPutData (send data beat) -> back to sDbWait.
424 | object DbState extends ChiselEnum {
425 |   val sDbWait, sDbPutDesc, sDbPutData = Value
426 | }
427 |
428 | import DbState._
429 |
430 | val dbSt = RegInit(DbState(), sDbWait)
    | // Doorbell target address / 32-bit value latched when popped from dbFifo.
431 | val dbAddr = RegInit(UInt(64.W), 0.U)
432 | val dbValue = RegInit(UInt(32.W), 0.U)
433 |
    | // QDMA_INTERFACE is an elaboration-time constant: only one of the two
    | // branches below is generated into hardware.
434 | switch (dbSt) {
435 |   is (sDbWait) {
436 |     when (dbFifo.io.out.fire) {
437 |       dbSt := sDbPutDesc
438 |     }.otherwise {
439 |       dbSt := sDbWait
440 |     }
441 |   }
442 |   is (sDbPutDesc) {
443 |     if (QDMA_INTERFACE == "SAXIB") {
444 |       when (io.sAxib.get.aw.fire) {
445 |         dbSt := sDbPutData
446 |       }.otherwise {
447 |         dbSt := sDbPutDesc
448 |       }
449 |     } else {
450 |       when (io.c2hCmd.get.fire) {
451 |         dbSt := sDbPutData
452 |       }.otherwise {
453 |         dbSt := sDbPutDesc
454 |       }
455 |     }
456 |   }
457 |   is (sDbPutData) {
458 |     if (QDMA_INTERFACE == "SAXIB") {
459 |       when (io.sAxib.get.w.fire) {
460 |         dbSt := sDbWait
461 |       }.otherwise {
462 |         dbSt := sDbPutData
463 |       }
464 |     } else {
465 |       when (io.c2hData.get.fire) {
466 |         dbSt := sDbWait
467 |       }.otherwise {
468 |         dbSt := sDbPutData
469 |       }
470 |     }
471 |   }
472 | }
473 |
    | // Only accept a new doorbell while idle.
474 | dbFifo.io.out.ready := (dbSt === sDbWait)
475 |
476 | when (dbFifo.io.out.fire) {
477 |   dbAddr := dbFifo.io.out.bits.addr
478 |   dbValue := dbFifo.io.out.bits.value
479 | }
480 |
    | // Slave-AXI doorbell path: emit a single 64-byte-aligned write whose
    | // 4 active strobe bytes select the doorbell register's byte lane.
481 | if (QDMA_INTERFACE == "SAXIB") {
482 |   ToZero(io.sAxib.get.aw.bits)
483 |   ToZero(io.sAxib.get.ar.bits)
484 |   ToZero(io.sAxib.get.w.bits)
485 |
486 |   when (dbSt === sDbPutDesc) {
    |     // Address is aligned down to the 64-byte beat; size 2 = 4 bytes.
487 |     io.sAxib.get.aw.bits.addr := Cat(dbAddr(63, 6), 0.U(6.W))
488 |     io.sAxib.get.aw.bits.size := 2.U(3.W)
489 |     io.sAxib.get.aw.valid := 1.U
490 |   }.otherwise {
491 |     io.sAxib.get.aw.bits.addr := 0.U
492 |     io.sAxib.get.aw.bits.size := 0.U
493 |     io.sAxib.get.aw.valid := 0.U
494 |   }
495 |   io.sAxib.get.aw.bits.burst := 1.U
496 |
497 |   when (dbSt === sDbPutData) {
    |     // Shift the 32-bit value (and its 4-byte strobe) into the byte lane
    |     // given by the low 6 address bits within the 512-bit beat.
498 |     io.sAxib.get.w.bits.data := ShiftData512(Cat(0.U(480.W), dbValue), dbAddr(5, 0))
499 |     io.sAxib.get.w.bits.strb := ShiftStrb64("hf".U(64.W), dbAddr(5, 0))
500 |     io.sAxib.get.w.bits.last := 1.U
501 |     io.sAxib.get.w.valid := 1.U
502 |   }.otherwise {
503 |     io.sAxib.get.w.bits.data := 0.U
504 |     io.sAxib.get.w.bits.strb := 0.U
505 |     io.sAxib.get.w.bits.last := 0.U
506 |     io.sAxib.get.w.valid := 0.U
507 |   }
    |   // Read channel unused; responses are accepted and discarded.
508 |   io.sAxib.get.ar.bits.size := 6.U
509 |   io.sAxib.get.ar.bits.burst := 1.U
510 |   io.sAxib.get.ar.valid := 0.U
511 |   io.sAxib.get.r.ready := 1.U
512 |   io.sAxib.get.b.ready := 1.U
513 | }
514 |
    | // QDMA C2H doorbell path: one 4-byte command descriptor followed by a
    | // single data beat carrying the value in its low 32 bits.
515 | if (QDMA_INTERFACE == "DMA") {
516 |   ToZero(io.c2hCmd.get.bits)
517 |   when (dbSt === sDbPutDesc) {
518 |     io.c2hCmd.get.bits.addr := dbAddr
519 |     io.c2hCmd.get.bits.len := 4.U(16.W)
520 |     io.c2hCmd.get.valid := 1.U
521 |   }.otherwise {
522 |     io.c2hCmd.get.bits.addr := 0.U
523 |     io.c2hCmd.get.bits.len := 0.U
524 |     io.c2hCmd.get.valid := 0.U
525 |   }
526 |
527 |   ToZero(io.c2hData.get.bits)
528 |   when (dbSt === sDbPutData) {
    |     // mty = 60: of the 64-byte beat, 60 trailing bytes are empty,
    |     // leaving the 4 valid bytes of the doorbell value.
529 |     io.c2hData.get.bits.data := Cat(0.U(480.W), dbValue)
530 |     io.c2hData.get.bits.ctrl_len := 4.U(16.W)
531 |     io.c2hData.get.bits.mty := 60.U(6.W)
532 |     io.c2hData.get.bits.last := 1.U
533 |     io.c2hData.get.valid := 1.U
534 |   }.otherwise {
535 |     io.c2hData.get.bits.data := 0.U
536 |     io.c2hData.get.bits.ctrl_len := 0.U
537 |     io.c2hData.get.bits.mty := 0.U
538 |     io.c2hData.get.bits.last := 0.U
539 |     io.c2hData.get.valid := 0.U
540 |   }
541 | }
542 |
543 | // Part 5 : QDMA read/write SQ/CQ RAM.
544 |
545 | // SQ RAM
    | // Host-side read address for SQ entries: {ssd, queue, entry} bit fields
    | // are sliced out of the incoming address. Each elaboration-time branch
    | // drops fields that would be zero-width for the configured topology.
546 |
547 | if (SSD_HIGH_BIT >= SSD_LOW_BIT) { // More than 1 SSD
548 |   if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) { // More than 1 queue
549 |     sqRam.io.addr_b := Cat(
550 |       io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
551 |       io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
552 |       io.ramIO.readAddr(ENTRY_HIGH_BIT_SQ, ENTRY_LOW_BIT_SQ)
553 |     )
554 |   } else { // Only 1 queue.
555 |     sqRam.io.addr_b := Cat(
556 |       io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
557 |       io.ramIO.readAddr(ENTRY_HIGH_BIT_SQ, ENTRY_LOW_BIT_SQ)
558 |     )
559 |   }
560 | } else {
561 |   if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) { // More than 1 queue
562 |     sqRam.io.addr_b := Cat(
563 |       io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
564 |       io.ramIO.readAddr(ENTRY_HIGH_BIT_SQ, ENTRY_LOW_BIT_SQ)
565 |     )
566 |   } else { // Only 1 queue.
567 |     sqRam.io.addr_b := io.ramIO.readAddr(ENTRY_HIGH_BIT_SQ, ENTRY_LOW_BIT_SQ)
568 |   }
569 | }
570 |
571 | // CQ RAM
    | // Port A is shared: host writes CQEs through it when wr_en_a is set,
    | // otherwise it serves host reads. A write is recognized when the address
    | // decodes to CQ space (RAM_TYPE_BIT set, upper bits zero, entry padding
    | // bits zero) and at least one strobe byte is active.
572 |
573 | cqRam.io.wr_en_a := (
574 |   io.ramIO.writeAddr(63, RAM_TYPE_BIT+1) === 0.U
575 |   && io.ramIO.writeAddr(RAM_TYPE_BIT) === 1.U
576 |   && io.ramIO.writeAddr(QUEUE_LOW_BIT-1, ENTRY_HIGH_BIT_CQ+1) === 0.U
577 |   && io.ramIO.writeMask =/= 0.U
578 | )
579 |
    | // Address = {ssd, queue, row}; bit 6 is the row granularity because one
    | // 512-bit row holds four 128-bit CQEs (64 bytes). Each elaboration-time
    | // branch removes fields that are zero-width for this configuration.
580 | if (SSD_HIGH_BIT >= SSD_LOW_BIT) {// >1 SSD
581 |   if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ >= 6) { // >1 queue, >4 entries
582 |     cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
583 |       Cat(
584 |         io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
585 |         io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
586 |         io.ramIO.writeAddr(ENTRY_HIGH_BIT_CQ, 6)
587 |       ),
588 |       Cat(
589 |         io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
590 |         io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
591 |         io.ramIO.readAddr(ENTRY_HIGH_BIT_CQ, 6)
592 |       ),
593 |     )
594 |   } else if (QUEUE_HIGH_BIT < QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ >= 6) { // 1 queue, >4 entries
595 |     cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
596 |       Cat(
597 |         io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
598 |         io.ramIO.writeAddr(ENTRY_HIGH_BIT_CQ, 6)
599 |       ),
600 |       Cat(
601 |         io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
602 |         io.ramIO.readAddr(ENTRY_HIGH_BIT_CQ, 6)
603 |       ),
604 |     )
605 |   } else if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ < 6) { // >1 queue, 4 entries
606 |     cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
607 |       Cat(
608 |         io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
609 |         io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)
610 |       ),
611 |       Cat(
612 |         io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
613 |         io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)
614 |       )
615 |     )
616 |   } else { // 1 queue, 4 entries
617 |     cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
618 |       io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
619 |       io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT)
620 |     )
621 |   }
622 | } else { // Only 1 SSD
623 |   if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ >= 6) { // >1 queue, >4 entries
624 |     cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
625 |       Cat(
626 |         io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
627 |         io.ramIO.writeAddr(ENTRY_HIGH_BIT_CQ, 6)
628 |       ),
629 |       Cat(
630 |         io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
631 |         io.ramIO.readAddr(ENTRY_HIGH_BIT_CQ, 6)
632 |       ),
633 |     )
634 |   } else if (QUEUE_HIGH_BIT < QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ >= 6) { // 1 queue, >4 entries
635 |     cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
636 |       io.ramIO.writeAddr(ENTRY_HIGH_BIT_CQ, 6),
637 |       io.ramIO.readAddr(ENTRY_HIGH_BIT_CQ, 6),
638 |     )
639 |   } else if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ < 6) { // >1 queue, 4 entries
640 |     cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
641 |       io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
642 |       io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)
643 |     )
644 |   } else { // 1 queue, 4 entries
645 |     cqRam.io.addr_a := 0.U
646 |   }
647 | }
    | // Byte-level write mask (port name is `musk_a` in the RAM wrapper, sic).
648 | cqRam.io.musk_a.get := io.ramIO.writeMask
649 | cqRam.io.data_in_a := io.ramIO.writeData
650 |
    | // Read-data mux: the RAM has a one-cycle read latency, so the address is
    | // delayed one cycle to pick the matching response. RAM_TYPE_BIT selects
    | // SQ (0) vs CQ (1); out-of-range addresses return 0.
651 | val nextReadAddr = RegNext(io.ramIO.readAddr)
652 |
653 | io.ramIO.readData := 0.U
654 |
655 | when (nextReadAddr(63, RAM_TYPE_BIT+1) === 0.U) {
656 |   io.ramIO.readData := Mux(nextReadAddr(RAM_TYPE_BIT) === 0.U,
657 |     sqRam.io.data_out_b,
658 |     cqRam.io.data_out_a
659 |   )
660 | }
661 |
662 | // Update SQ head from CQE.
    | // Each 128-bit CQE carries the current SQ head pointer in DW2 bits
    | // [15:0] (bits 79:64 of the entry). The 16-bit write mask tells which
    | // of the four CQE slots in the 512-bit beat was written, selecting the
    | // corresponding bit range. commandEnd counts completed commands.
663 |
664 | when (
665 |   io.ramIO.writeAddr(63, RAM_TYPE_BIT+1) === 0.U
666 |   && io.ramIO.writeAddr(RAM_TYPE_BIT) === 1.U
667 |   && io.ramIO.writeAddr(QUEUE_LOW_BIT-1, ENTRY_HIGH_BIT_CQ+1) === 0.U
668 | ) {
669 |   val chosenSsd = if (SSD_HIGH_BIT >= SSD_LOW_BIT) {io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT)} else 0.U
670 |   val chosenQp = if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) {io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)} else 0.U
    |   // NOTE(review): only exact single-CQE masks are recognized; a write
    |   // covering several CQEs at once would be ignored here — confirm the
    |   // host interface never merges CQE writes.
671 |   when (io.ramIO.writeMask === "h000000000000ffff".U) {
672 |     sqHead(chosenSsd)(chosenQp) := io.ramIO.writeData(79, 64)
673 |     commandEnd(chosenSsd)(chosenQp) := commandEnd(chosenSsd)(chosenQp) + 1.U
674 |   }.elsewhen (io.ramIO.writeMask === "h00000000ffff0000".U) {
675 |     sqHead(chosenSsd)(chosenQp) := io.ramIO.writeData(207, 192)
676 |     commandEnd(chosenSsd)(chosenQp) := commandEnd(chosenSsd)(chosenQp) + 1.U
677 |   }.elsewhen (io.ramIO.writeMask === "h0000ffff00000000".U) {
678 |     sqHead(chosenSsd)(chosenQp) := io.ramIO.writeData(335, 320)
679 |     commandEnd(chosenSsd)(chosenQp) := commandEnd(chosenSsd)(chosenQp) + 1.U
680 |   }.elsewhen (io.ramIO.writeMask === "hffff000000000000".U) {
681 |     sqHead(chosenSsd)(chosenQp) := io.ramIO.writeData(463, 448)
682 |     commandEnd(chosenSsd)(chosenQp) := commandEnd(chosenSsd)(chosenQp) + 1.U
683 |   }
684 | }
685 |
686 | // Statistical counters
    | // statLatency accumulates (completion time - issue time) per command by
    | // subtracting statExecTime at issue and adding it at completion.
687 |
688 | for (ssdId <- 0 until SSD_NUM) {
689 |   for (queueId <- 0 until QUEUE_NUM) {
690 |     when (io.ssdCmd(ssdId)(queueId).fire) {
    |       // NOTE(review): Chisel last-connect semantics — if several queues
    |       // fire in the same cycle, all connections compute the same RHS, so
    |       // only ONE subtraction takes effect; and the addition block below
    |       // (a later connection) overrides this one entirely when a command
    |       // issues and a CQE arrives in the same cycle. Either case skews
    |       // statLatency; confirm whether this drift is acceptable.
691 |       statLatency := statLatency - statExecTime
692 |     }
693 |   }
694 | }
695 |
    | // Same CQ-space write decode as the SQ-head update block above; the CQE
    | // status field (bits 120:113 of the entry = DW3[24:17]) is 0 on success.
696 | when (
697 |   io.ramIO.writeAddr(63, RAM_TYPE_BIT+1) === 0.U
698 |   && io.ramIO.writeAddr(RAM_TYPE_BIT) === 1.U
699 |   && io.ramIO.writeAddr(QUEUE_LOW_BIT-1, ENTRY_HIGH_BIT_CQ+1) === 0.U
700 | ) {
701 |   when (io.ramIO.writeMask === "h000000000000ffff".U) {
702 |     statLatency := statLatency + statExecTime
703 |     when (io.ramIO.writeData(120, 113) === 0.U) {
704 |       statSuccOp := statSuccOp + 1.U
705 |     }.otherwise (
706 |       statFailedOp := statFailedOp + 1.U
707 |     )
708 |   }.elsewhen (io.ramIO.writeMask === "h00000000ffff0000".U) {
709 |     statLatency := statLatency + statExecTime
710 |     when (io.ramIO.writeData(248, 241) === 0.U) {
711 |       statSuccOp := statSuccOp + 1.U
712 |     }.otherwise (
713 |       statFailedOp := statFailedOp + 1.U
714 |     )
715 |   }.elsewhen (io.ramIO.writeMask === "h0000ffff00000000".U) {
716 |     statLatency := statLatency + statExecTime
717 |     when (io.ramIO.writeData(376, 369) === 0.U) {
718 |       statSuccOp := statSuccOp + 1.U
719 |     }.otherwise (
720 |       statFailedOp := statFailedOp + 1.U
721 |     )
722 |   }.elsewhen (io.ramIO.writeMask === "hffff000000000000".U) {
723 |     statLatency := statLatency + statExecTime
724 |     when (io.ramIO.writeData(504, 497) === 0.U) {
725 |       statSuccOp := statSuccOp + 1.U
726 |     }.otherwise (
727 |       statFailedOp := statFailedOp + 1.U
728 |     )
729 |   }
730 | }
731 |
732 | // SSD completion signals
    | // One-cycle pulse per incoming CQE: default everything to zero, then when
    | // a CQE write is decoded, report the command ID (CQE DW3[15:0]) and
    | // status (DW3[24:17]) of the slot selected by the write mask.
733 | for (ssdId <- 0 until SSD_NUM) {
734 |   for (queueId <- 0 until QUEUE_NUM) {
735 |     io.ssdCmpt(ssdId)(queueId).valid := 0.U
736 |     ToZero(io.ssdCmpt(ssdId)(queueId).bits)
737 |   }
738 | }
739 | when (
740 |   io.ramIO.writeAddr(63, RAM_TYPE_BIT+1) === 0.U
741 |   && io.ramIO.writeAddr(RAM_TYPE_BIT) === 1.U
742 |   && io.ramIO.writeAddr(QUEUE_LOW_BIT-1, ENTRY_HIGH_BIT_CQ+1) === 0.U
743 | ) {
744 |   val ssdId = if (SSD_HIGH_BIT >= SSD_LOW_BIT) {io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT)} else 0.U
745 |   val queueId = if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) {io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)} else 0.U
746 |   when (io.ramIO.writeMask === "h000000000000ffff".U) {
747 |     io.ssdCmpt(ssdId)(queueId).valid := 1.U
748 |     io.ssdCmpt(ssdId)(queueId).bits.cmdId := io.ramIO.writeData(111, 96)
749 |     io.ssdCmpt(ssdId)(queueId).bits.status := io.ramIO.writeData(120, 113)
750 |   }.elsewhen (io.ramIO.writeMask === "h00000000ffff0000".U) {
751 |     io.ssdCmpt(ssdId)(queueId).valid := 1.U
752 |     io.ssdCmpt(ssdId)(queueId).bits.cmdId := io.ramIO.writeData(239, 224)
753 |     io.ssdCmpt(ssdId)(queueId).bits.status := io.ramIO.writeData(248, 241)
754 |   }.elsewhen (io.ramIO.writeMask === "h0000ffff00000000".U) {
755 |     io.ssdCmpt(ssdId)(queueId).valid := 1.U
756 |     io.ssdCmpt(ssdId)(queueId).bits.cmdId := io.ramIO.writeData(367, 352)
757 |     io.ssdCmpt(ssdId)(queueId).bits.status := io.ramIO.writeData(376, 369)
758 |   }.elsewhen (io.ramIO.writeMask === "hffff000000000000".U) {
759 |     io.ssdCmpt(ssdId)(queueId).valid := 1.U
760 |     io.ssdCmpt(ssdId)(queueId).bits.cmdId := io.ramIO.writeData(495, 480)
761 |     io.ssdCmpt(ssdId)(queueId).bits.status := io.ramIO.writeData(504, 497)
762 |   }
763 | }
765 |
766 | object ShiftData512 {
767 | def apply (value : UInt, offset : UInt) = {
768 | assert(value.getWidth == 512)
769 | assert(offset.getWidth == 6)
770 | value << Cat(offset, 0.U(3.W))
771 | }
772 | }
773 |
774 | object ShiftStrb64 {
775 | def apply (value : UInt, offset : UInt) = {
776 | assert(value.getWidth == 64)
777 | assert(offset.getWidth == 6)
778 | value << offset
779 | }
780 | }
--------------------------------------------------------------------------------