├── sw ├── Makefile └── BandwidthBenchmark.cpp ├── src ├── TestAXIRouter.scala ├── Elaborate.scala ├── Interfaces.scala ├── BandwidthProbe.scala ├── Util.scala ├── deprecated │ ├── Interfaces.scala │ ├── NVMeBlackBox.scala │ └── NVMe.scala ├── AXI2NVMeRam.scala ├── NVMeLatencyBenchmarkTop.scala ├── NVMeBandwidthBenchmarkTop.scala ├── NVMeLatencyBenchmark.scala ├── NVMeBandwidthBenchmark.scala └── NVMeCore.scala ├── tb ├── tb_TestLatencyBucket.sv └── tb_TestAXIRouter.sv ├── sv └── NVMeBenchmarkTop.xdc └── README.md /sw/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CFLAGS = -lqdma 3 | 4 | default_target: all 5 | 6 | all: NVMeBenchmark 7 | 8 | NVMeBenchmark: NVMeBenchmark.cpp 9 | $(CC) NVMeBenchmark.cpp $(CFLAGS) -o NVMeBenchmark -------------------------------------------------------------------------------- /src/TestAXIRouter.scala: -------------------------------------------------------------------------------- 1 | package nvme 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.storage._ 7 | import common.axi._ 8 | import qdma._ 9 | 10 | class TestAXIRouter extends Module { 11 | val io = IO(new Bundle { 12 | val axibIn = Flipped(new AXIB) 13 | val ramOut = Vec(3, new NVMeRamIO) 14 | }) 15 | 16 | val axibRt = AXIRouter(3, io.axibIn) 17 | axibRt.io.in <> io.axibIn 18 | axibRt.io.wrIdx := Mux( 19 | axibRt.io.in.aw.bits.addr(27), 20 | 2.U, 21 | Mux(axibRt.io.in.aw.bits.addr(26), 1.U, 0.U) 22 | ) 23 | axibRt.io.rdIdx := Mux( 24 | axibRt.io.in.ar.bits.addr(27), 25 | 2.U, 26 | Mux(axibRt.io.in.ar.bits.addr(26), 1.U, 0.U) 27 | ) 28 | for (idx <- 0 until 3) { 29 | io.ramOut(idx) <> AXI2NVMeRam(axibRt.io.out(idx)) 30 | } 31 | } -------------------------------------------------------------------------------- /src/Elaborate.scala: -------------------------------------------------------------------------------- 1 | package nvme 2 | import chisel3._ 3 | import chisel3.util._ 4 | 
import common._ 5 | import common.storage._ 6 | import qdma._ 7 | import chisel3.stage.{ChiselGeneratorAnnotation, ChiselStage} 8 | import firrtl.options.TargetDirAnnotation 9 | 10 | object elaborate extends App { 11 | println("Generating a %s class".format(args(0))) 12 | val stage = new chisel3.stage.ChiselStage 13 | val arr = Array("-X", "sverilog", "--full-stacktrace") 14 | val dir = TargetDirAnnotation("Verilog") 15 | 16 | args(0) match{ 17 | case "NVMeBandwidthBenchmarkTop" => stage.execute(arr,Seq(ChiselGeneratorAnnotation(() => new NVMeBandwidthBenchmarkTop()),dir)) 18 | case "NVMeLatencyBenchmarkTop" => stage.execute(arr,Seq(ChiselGeneratorAnnotation(() => new NVMeLatencyBenchmarkTop()),dir)) 19 | case "TestAXIRouter" => stage.execute(arr,Seq(ChiselGeneratorAnnotation(() => new TestAXIRouter()),dir)) 20 | case "BandwidthProbe" => stage.execute(arr,Seq(ChiselGeneratorAnnotation(() => new BandwidthProbe(100, 4096)),dir)) 21 | case "LatencyBucket" => stage.execute(arr,Seq(ChiselGeneratorAnnotation(() => new LatencyBucket(32, 1)),dir)) 22 | case _ => println("Module match failed!") 23 | } 24 | } -------------------------------------------------------------------------------- /tb/tb_TestLatencyBucket.sv: -------------------------------------------------------------------------------- 1 | module testbench_LatencyBucket( 2 | 3 | ); 4 | 5 | reg clock =0; 6 | reg reset =0; 7 | reg io_enable =0; 8 | reg io_start =0; 9 | reg io_end =0; 10 | reg [4:0] io_bucketRdId =0; 11 | wire [31:0] io_bucketValue ; 12 | reg io_resetBucket =0; 13 | wire io_resetDone ; 14 | 15 | 16 | LatencyBucket LatencyBucket_inst( 17 | .* 18 | ); 19 | 20 | 21 | initial begin 22 | reset <= 1; 23 | clock = 1; 24 | #100; 25 | reset <= 0; 26 | io_enable <= 1; 27 | #6; 28 | io_start <= 1; 29 | #2; 30 | io_start <= 0; 31 | #2; 32 | io_end <= 1; 33 | #1 34 | io_end <= 0; 35 | #5; 36 | io_end <= 1; 37 | #1; 38 | io_end <= 0; 39 | io_start <= 1; 40 | #1; 41 | io_start <= 0; 42 | #10; 43 | io_end <= 1; 44 | 
#1; 45 | io_end <= 0; 46 | #10; 47 | io_enable <= 0; 48 | io_bucketRdId <= 0; 49 | while (io_bucketRdId < 31) begin 50 | #5; 51 | io_bucketRdId <= io_bucketRdId + 'd1; 52 | end 53 | #5; 54 | io_resetBucket <= 1; 55 | #1; 56 | io_resetBucket <= 0; 57 | #50; 58 | $stop(); 59 | end 60 | always #0.5 clock=~clock; 61 | 62 | endmodule -------------------------------------------------------------------------------- /src/Interfaces.scala: -------------------------------------------------------------------------------- 1 | package nvme 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.storage._ 7 | import common.axi._ 8 | import common.ToZero 9 | 10 | class NVMeRamIO extends Bundle{ 11 | val readEnable = Output(UInt(1.W)) 12 | val readAddr = Output(UInt(64.W)) 13 | val readData = Input(UInt(512.W)) 14 | val writeMask = Output(UInt(64.W)) 15 | val writeAddr = Output(UInt(64.W)) 16 | val writeData = Output(UInt(512.W)) 17 | } 18 | 19 | class SSDSetup extends Bundle { 20 | val ssdId = Output(UInt(32.W)) 21 | val ssdBarAddr = Output(UInt(64.W)) 22 | } 23 | 24 | class NVMeCoreControl extends Bundle { 25 | val enable = Output(Bool()) 26 | val ssdSetup = Valid(new SSDSetup) 27 | } 28 | 29 | class NVMeParameters extends Bundle { 30 | val ssdNum = Output(UInt(32.W)) 31 | val queueNum = Output(UInt(32.W)) 32 | val queueLowBit = Output(UInt(32.W)) 33 | val ssdLowBit = Output(UInt(32.W)) 34 | val queueDepth = Output(UInt(32.W)) 35 | val ramTypeBit = Output(UInt(32.W)) 36 | } 37 | 38 | class NVMeStat extends Bundle { 39 | val executeTime = Output(UInt(64.W)) 40 | val successfulOp = Output(UInt(32.W)) 41 | val failedOp = Output(UInt(32.W)) 42 | val totalLatency = Output(UInt(64.W)) 43 | } 44 | 45 | class NVMeCoreStatus extends Bundle { 46 | val running = Output(Bool()) 47 | val params = new NVMeParameters 48 | val stat = new NVMeStat 49 | } 50 | 51 | class SSDCompletion extends Bundle { 52 | val cmdId = Output(UInt(16.W)) 53 | val status = 
Output(UInt(8.W)) 54 | } -------------------------------------------------------------------------------- /sv/NVMeBenchmarkTop.xdc: -------------------------------------------------------------------------------- 1 | create_clock -name sys_clk -period 10 [get_ports qdma_pin_sys_clk_p] 2 | 3 | set_false_path -from [get_ports qdma_pin_sys_rst_n] 4 | set_property PULLUP true [get_ports qdma_pin_sys_rst_n] 5 | set_property IOSTANDARD LVCMOS18 [get_ports qdma_pin_sys_rst_n] 6 | set_property PACKAGE_PIN AW27 [get_ports qdma_pin_sys_rst_n] 7 | set_property CONFIG_VOLTAGE 1.8 [current_design] 8 | 9 | set_property LOC [get_package_pins -of_objects [get_bels [get_sites -filter {NAME =~ *COMMON*} -of_objects [get_iobanks -of_objects [get_sites GTYE4_CHANNEL_X1Y7]]]/REFCLK0P]] [get_ports qdma_pin_sys_clk_p] 10 | set_property LOC [get_package_pins -of_objects [get_bels [get_sites -filter {NAME =~ *COMMON*} -of_objects [get_iobanks -of_objects [get_sites GTYE4_CHANNEL_X1Y7]]]/REFCLK0N]] [get_ports qdma_pin_sys_clk_n] 11 | 12 | set_property PACKAGE_PIN J18 [get_ports led] 13 | set_property IOSTANDARD LVCMOS18 [get_ports led] 14 | 15 | # create_clock -name sys_100M_clock_0 -period 10 -add [get_ports sys_100M_0_p] 16 | 17 | # set_property PACKAGE_PIN BJ43 [get_ports sys_100M_0_p] 18 | # set_property PACKAGE_PIN BJ44 [get_ports sys_100M_0_n] 19 | # set_property IOSTANDARD DIFF_SSTL12 [get_ports sys_100M_0_p] 20 | # set_property IOSTANDARD DIFF_SSTL12 [get_ports sys_100M_0_n] 21 | 22 | set_false_path -from [get_cells -regexp {qdma/axil2reg/reg_control_[0-9]*_reg\[.*]}] 23 | set_false_path -to [get_cells -regexp {qdma/axil2reg/reg_status_[0-9]*_reg\[.*]}] 24 | #reg_control_0_reg[0] 25 | #set_false_path -from [get_cells qdma/axil2reg/reg_control_[*]] 26 | #set_false_path -to [get_cells qdma/axil_reg_0/reg_status_[*]] 27 | 28 | ### 29 | set_false_path -to [get_pins -hier *sync_reg[0]/D] 30 | ### 31 | set_property C_USER_SCAN_CHAIN 1 [get_debug_cores dbg_hub] 32 | connect_debug_port 
dbg_hub/clk [get_nets dbg_clk_pad_O]
--------------------------------------------------------------------------------
/src/BandwidthProbe.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 | import chisel3._
3 | import chisel3.util._
4 | import chisel3.experimental.ChiselEnum
5 | import common.axi._
6 | import common.storage._
7 | import common._
8 | import math.ceil
9 | 
10 | // Counts the number of io.fire pulses observed in each fixed window of
11 | // CYCLE clock cycles and emits one 32-bit sample per window through a
12 | // FIFO that buffers at least DEPTH samples.
13 | class BandwidthProbe (
14 | 	CYCLE : Int = 25000000, // Window length, in cycles.
15 | 	DEPTH : Int = 4096      // Minimum number of buffered samples.
16 | ) extends Module {
17 | 	val io = IO(new Bundle{
18 | 		val enable = Input(Bool())    // Low resets and holds both counters.
19 | 		val fire = Input(Bool())      // One event counted per high cycle.
20 | 		val count = Decoupled(Output(UInt(32.W))) // One sample per window.
21 | 	})
22 | 
23 | 	val time_cnt = RegInit(UInt(log2Ceil(CYCLE).W), 0.U)
24 | 	val record_valid = time_cnt === (CYCLE-1).U // Last cycle of the window.
25 | 	val band_cnt = RegInit(UInt(32.W), 0.U)
26 | 
27 | 	when (~io.enable) {
28 | 		time_cnt := 0.U
29 | 		band_cnt := 0.U
30 | 	}.otherwise {
31 | 		when (record_valid) {
32 | 			time_cnt := 0.U
33 | 		}.otherwise {
34 | 			time_cnt := time_cnt + 1.U
35 | 		}
36 | 
37 | 		// On a window boundary restart the event count (a fire in the same
38 | 		// cycle counts as 1); otherwise accumulate.
39 | 		when (record_valid) {
40 | 			when (io.fire) {
41 | 				band_cnt := 1.U
42 | 			}.otherwise {
43 | 				band_cnt := 0.U
44 | 			}
45 | 		}.elsewhen (io.fire) {
46 | 			band_cnt := band_cnt + 1.U
47 | 		}
48 | 	}
49 | 
50 | 	// BUG FIX: `DEPTH / 4096` is Int division, so `ceil` was a no-op and
51 | 	// e.g. DEPTH = 5000 built a single 4096-deep FIFO chain (under
52 | 	// capacity). Small depths keep one exact-depth queue; larger depths
53 | 	// get a chain of ceil(DEPTH / 4096) queues of 4096 entries each.
54 | 	if (DEPTH < 4096) {
55 | 		val q = XQueue(UInt(32.W), DEPTH)
56 | 		q.io.out <> io.count
57 | 		q.io.in.valid := record_valid
58 | 		q.io.in.bits := band_cnt
59 | 	} else {
60 | 		val FIFO_CNT = ceil(DEPTH / 4096.0).toInt
61 | 		val q = XQueue(FIFO_CNT)(UInt(32.W), 4096)
62 | 		q(0).io.in.valid := record_valid
63 | 		q(0).io.in.bits := band_cnt
64 | 		q(FIFO_CNT-1).io.out <> io.count
65 | 		for (i <- 1 until FIFO_CNT) {
66 | 			q(i-1).io.out <> q(i).io.in
67 | 		}
68 | 	}
69 | 
70 | }
--------------------------------------------------------------------------------
/src/Util.scala:
--------------------------------------------------------------------------------
1 | package nvme
2 | 
3 | import chisel3._
4 | import chisel3.util._
5 | 
6 | object NVMeCommandSet {
7 | 	def 
nvmWrite (id : UInt, prp1 : UInt, prp2 : UInt, slba : UInt, nlb : UInt) = { 8 | 9 | // Make sure the lengths of input signals are correct. 10 | 11 | val idSig = Wire(UInt(16.W)) 12 | val prp1Sig = Wire(UInt(64.W)) 13 | val prp2Sig = Wire(UInt(64.W)) 14 | val slbaSig = Wire(UInt(64.W)) 15 | val nlbSig = Wire(UInt(16.W)) 16 | 17 | idSig := id 18 | prp1Sig := prp1 19 | prp2Sig := prp2 20 | slbaSig := slba 21 | nlbSig := nlb 22 | 23 | // Generate the NVMe-format command 24 | 25 | Cat( 26 | // DW 14-15: 27 | 0.U(64.W), // End to end protection, not used 28 | // DW 13: 29 | 0.U(24.W), // Rsvd 30 | 0.U(8.W), // Dataset, not used generally 31 | // DW 12: 32 | 1.U(1.W), // Limited retry 33 | 0.U(1.W), // Forced unit access 34 | 0.U(4.W), // Protection information 35 | 0.U(10.W), // Rsvd 36 | nlbSig, // Number of logical blocks, 0's based 37 | // DW 11-10: 38 | slbaSig, // Starting LB address 39 | // DW 9-8: 40 | prp2Sig, // PRP 2 41 | // DW 7-6 42 | prp1Sig, // PRP 1 43 | // DW 5-4: 44 | 0.U(64.W), // Metadata ptr, not used here 45 | // DW 3-2: 46 | 0.U(64.W), // Rsvd 47 | // DW 1: 48 | 1.U(32.W), // Namespace, typically 1 for most cases 49 | // DW 0: 50 | idSig, // Command ID 51 | 0.U(2.W), // Use PRP 52 | 0.U(4.W), // Rsvd 53 | 0.U(2.W), // Fuse command 54 | 0x01.U(8.W) // Opcode 55 | ) 56 | } 57 | 58 | def nvmRead (id : UInt, prp1 : UInt, prp2 : UInt, slba : UInt, nlb : UInt) = { 59 | 60 | // Make sure the lengths of input signals are correct. 
61 | 62 | val idSig = Wire(UInt(16.W)) 63 | val prp1Sig = Wire(UInt(64.W)) 64 | val prp2Sig = Wire(UInt(64.W)) 65 | val slbaSig = Wire(UInt(64.W)) 66 | val nlbSig = Wire(UInt(16.W)) 67 | 68 | idSig := id 69 | prp1Sig := prp1 70 | prp2Sig := prp2 71 | slbaSig := slba 72 | nlbSig := nlb 73 | 74 | // Generate the NVMe-format command 75 | 76 | Cat( 77 | // DW 14-15: 78 | 0.U(64.W), // End to end protection, not used 79 | // DW 13: 80 | 0.U(24.W), // Rsvd 81 | 0.U(8.W), // Dataset, not used generally 82 | // DW 12: 83 | 1.U(1.W), // Limited retry 84 | 0.U(1.W), // Forced unit access 85 | 0.U(4.W), // Protection information 86 | 0.U(10.W), // Rsvd 87 | nlbSig, // Number of logical blocks, 0's based 88 | // DW 11-10: 89 | slbaSig, // Starting LB address 90 | // DW 9-8: 91 | prp2Sig, // PRP 2 92 | // DW 7-6 93 | prp1Sig, // PRP 1 94 | // DW 5-4: 95 | 0.U(64.W), // Metadata ptr, not used here 96 | // DW 3-2: 97 | 0.U(64.W), // Rsvd 98 | // DW 1: 99 | 1.U(32.W), // Namespace, typically 1 for most cases 100 | // DW 0: 101 | idSig, // Command ID 102 | 0.U(2.W), // Use PRP 103 | 0.U(4.W), // Rsvd 104 | 0.U(2.W), // Fuse command 105 | 0x02.U(8.W) // Opcode 106 | ) 107 | } 108 | } -------------------------------------------------------------------------------- /src/deprecated/Interfaces.scala: -------------------------------------------------------------------------------- 1 | package nvme.deprecated 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.storage._ 7 | import common.axi._ 8 | import common.ToZero 9 | 10 | class NVMeCommand extends Bundle{ 11 | val op = Output(UInt(1.W)) 12 | val numLb = Output(UInt(16.W)) 13 | val ssdAddr = Output(UInt(64.W)) 14 | val memAddr = Output(UInt(64.W)) 15 | } 16 | 17 | class NVMeControl extends Bundle { 18 | val init_start = Input(UInt(1.W)) 19 | val init_nsid = Input(UInt(32.W)) 20 | val init_dma_addr = Input(UInt(64.W)) 21 | val init_byp_addr = Input(UInt(64.W)) 22 | val init_ssd_addr = 
Input(UInt(64.W)) 23 | val init_ssdid = Input(UInt(32.W)) 24 | val p2pdma_read = Input(UInt(1.W)) 25 | val p2pdma_write = Input(UInt(1.W)) 26 | val p2pdma_cmd_addr = Input(UInt(64.W)) 27 | val p2pdma_cmd_len = Input(UInt(16.W)) 28 | val p2pdma_c2h_data = Input(UInt(512.W)) 29 | val ssd_init = Input(UInt(1.W)) 30 | val exec_start = Input(UInt(1.W)) 31 | val exec_time = Input(UInt(32.W)) 32 | val band_tr_en = Input(UInt(1.W)) 33 | val band_tr_read = Input(UInt(1.W)) 34 | } 35 | 36 | object Reg2NVMeControl { 37 | def apply(controlReg : Vec[UInt]) : NVMeControl = { 38 | val target = Wire(new NVMeControl) 39 | val source = Cat( 40 | controlReg(32)(0), 41 | controlReg(33), 42 | Cat(controlReg(35), controlReg(34)), 43 | Cat(controlReg(37), controlReg(36)), 44 | Cat(controlReg(39), controlReg(38)), 45 | controlReg(40), 46 | controlReg(64)(0), 47 | controlReg(65)(0), 48 | Cat(controlReg(67), controlReg(66)), 49 | controlReg(68)(15, 0), 50 | Cat(controlReg(84), controlReg(83), controlReg(82), controlReg(81), controlReg(80), controlReg(79), controlReg(78), controlReg(77), controlReg(76), controlReg(75), controlReg(74), controlReg(73), controlReg(72), controlReg(71), controlReg(70), controlReg(69)), 51 | controlReg(128)(0), 52 | controlReg(160)(0), 53 | controlReg(163), 54 | controlReg(165)(0), 55 | controlReg(166)(0), 56 | ) 57 | target := source.asTypeOf(new NVMeControl) 58 | target 59 | } 60 | } 61 | 62 | class NVMeStatus extends Bundle { 63 | val p2pdma_h2c_data = Output(UInt(512.W)) 64 | val p2pdma_h2c_done = Output(UInt(1.W)) 65 | val p2pdma_c2h_done = Output(UInt(1.W)) 66 | val nvme_init_done = Output(UInt(1.W)) 67 | val nvme_exec_done = Output(UInt(1.W)) 68 | val stat_op_succ = Output(UInt(32.W)) 69 | val stat_op_fail = Output(UInt(32.W)) 70 | val stat_exec_time = Output(UInt(64.W)) 71 | val stat_io_ssd0 = Output(UInt(32.W)) 72 | val stat_io_ssd1 = Output(UInt(32.W)) 73 | val stat_io_ssd2 = Output(UInt(32.W)) 74 | val stat_io_ssd3 = Output(UInt(32.W)) 75 | val 
band_tr_rd = Output(UInt(32.W)) 76 | val band_tr_wr = Output(UInt(32.W)) 77 | } 78 | 79 | object NVMeStatus2Reg { 80 | def apply(source : NVMeStatus, statusReg : Vec[UInt])= { 81 | var i = 0; 82 | for (i <- 0 until 16) { 83 | statusReg(96+i) := source.p2pdma_h2c_data(i*32+31, i*32) 84 | } 85 | for (i <- 0 until 2) { 86 | statusReg(196+i) := source.stat_exec_time(i*32+31, i*32) 87 | } 88 | statusReg(128) := source.p2pdma_h2c_done 89 | statusReg(129) := source.p2pdma_c2h_done 90 | statusReg(160) := source.nvme_init_done 91 | statusReg(192) := source.nvme_exec_done 92 | statusReg(193) := source.stat_op_succ 93 | statusReg(194) := source.stat_op_fail 94 | statusReg(200) := source.stat_io_ssd0 95 | statusReg(201) := source.stat_io_ssd1 96 | statusReg(202) := source.stat_io_ssd2 97 | statusReg(203) := source.stat_io_ssd3 98 | statusReg(216) := source.band_tr_rd 99 | statusReg(217) := source.band_tr_wr 100 | } 101 | } -------------------------------------------------------------------------------- /src/AXI2NVMeRam.scala: -------------------------------------------------------------------------------- 1 | package nvme 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.axi._ 7 | import common.storage._ 8 | import common.connection._ 9 | 10 | object AXI2NVMeRam { 11 | def apply(in : AXI) = { 12 | val inst = Module(new AXI2NVMeRam( 13 | in.ar.bits.addr.getWidth, 14 | in.r.bits.data.getWidth, 15 | in.ar.bits.id.getWidth, 16 | in.ar.bits.user.getWidth, 17 | in.ar.bits.len.getWidth 18 | )) 19 | val out = Wire(new NVMeRamIO) 20 | inst.io.in <> in 21 | inst.io.out <> out 22 | 23 | out 24 | } 25 | 26 | class AXI2NVMeRam(ADDR_WIDTH:Int, DATA_WIDTH:Int, ID_WIDTH:Int, USER_WIDTH:Int, LEN_WIDTH:Int) extends Module { 27 | val io = IO(new Bundle{ 28 | val in = Flipped(new AXI(ADDR_WIDTH, DATA_WIDTH, ID_WIDTH, USER_WIDTH, LEN_WIDTH)) 29 | val out = new NVMeRamIO 30 | }) 31 | 32 | // W channels 33 | 34 | val rWid = RegInit(UInt(ID_WIDTH.W), 0.U) 35 | 
val rNextWrAddr = RegInit(UInt(ADDR_WIDTH.W), 0.U) 36 | 37 | val sWrReq :: sWrData :: Nil = Enum(2) 38 | val wrSt = RegInit(sWrReq) 39 | 40 | val wrFirstBeat = (wrSt === sWrReq) && io.in.aw.fire && io.in.w.fire 41 | val wrRemainBeat = (wrSt === sWrData) && io.in.w.fire 42 | 43 | val backFifo = XQueue(UInt(ID_WIDTH.W), 32) 44 | val backFifoIn = Wire(Decoupled(UInt(ID_WIDTH.W))) 45 | 46 | backFifo.io.in <> RegSlice(2)(backFifoIn) 47 | 48 | io.out.writeMask := Mux(io.in.w.fire, io.in.w.bits.strb, 0.U) 49 | io.out.writeAddr := Mux(wrFirstBeat, Cat(io.in.aw.bits.addr(63, 6), 0.U(6.W)), rNextWrAddr) 50 | io.out.writeData := io.in.w.bits.data 51 | 52 | io.in.aw.ready := (wrSt === sWrReq) 53 | io.in.w.ready := backFifoIn.ready 54 | io.in.b.bits.id := backFifo.io.out.bits 55 | io.in.b.bits.resp := 0.U 56 | io.in.b.bits.user := 0.U 57 | io.in.b.valid := backFifo.io.out.valid 58 | backFifo.io.out.ready := io.in.b.ready 59 | 60 | backFifoIn.valid := io.in.w.fire && io.in.w.bits.last.asBool 61 | backFifoIn.bits := rWid 62 | 63 | switch (wrSt) { 64 | is (sWrReq) { 65 | when (io.in.aw.fire) { // Received a request 66 | when (io.in.w.fire && io.in.w.bits.last.asBool) { // 1-beat data, already handled. 
67 | wrSt := sWrReq 68 | }.otherwise { 69 | wrSt := sWrData 70 | } 71 | }.otherwise { 72 | wrSt := sWrReq 73 | } 74 | } 75 | is (sWrData) { 76 | when (io.in.w.fire && io.in.w.bits.last.asBool) { // Last beat ends 77 | wrSt := sWrReq 78 | }.otherwise { 79 | wrSt := sWrData 80 | } 81 | } 82 | } 83 | 84 | when (io.in.aw.fire) { 85 | rWid := io.in.aw.bits.id 86 | rNextWrAddr := Mux( 87 | wrFirstBeat, 88 | Cat(io.in.aw.bits.addr(63, 6), 0.U(6.W)) + "h40".U, 89 | Cat(io.in.aw.bits.addr(63, 6), 0.U(6.W)) 90 | ) 91 | }.elsewhen (wrRemainBeat) { 92 | when (!io.in.w.bits.last.asBool){ 93 | rNextWrAddr := rNextWrAddr + "h40".U 94 | } 95 | } 96 | 97 | // R channels 98 | 99 | val rRid = RegInit(UInt(ID_WIDTH.W), 0.U) 100 | val rLen = RegInit(UInt(LEN_WIDTH.W), 0.U) 101 | val rNextRdAddr = RegInit(UInt(ADDR_WIDTH.W), 0.U) 102 | 103 | val sRdReq :: sRdData :: Nil = Enum(2) 104 | val rdSt = RegInit(sRdReq) 105 | 106 | val rdFirstBeat = (rdSt === sRdReq) && io.in.ar.fire 107 | val rdRemainBeat = (rdSt === sRdData) && io.in.r.fire && (rLen =/= 0.U) 108 | 109 | io.out.readAddr := Mux(rdFirstBeat, Cat(io.in.ar.bits.addr(63, 6), 0.U(6.W)), rNextRdAddr) 110 | io.out.readEnable := rdFirstBeat || rdRemainBeat 111 | 112 | io.in.ar.ready := (rdSt === sRdReq) 113 | io.in.r.valid := (rdSt === sRdData) 114 | io.in.r.bits.id := rRid 115 | io.in.r.bits.user := 0.U 116 | io.in.r.bits.last := (rdSt === sRdData && rLen === 0.U) 117 | io.in.r.bits.data := io.out.readData 118 | io.in.r.bits.resp := 0.U 119 | 120 | switch (rdSt) { 121 | is (sRdReq) { 122 | when (io.in.ar.fire) { 123 | rdSt := sRdData 124 | }.otherwise { 125 | rdSt := sRdReq 126 | } 127 | } 128 | is (sRdData) { 129 | when (io.in.r.fire && io.in.r.bits.last.asBool) { 130 | rdSt := sRdReq 131 | }.otherwise { 132 | rdSt := sRdData 133 | } 134 | } 135 | } 136 | 137 | when (rdFirstBeat) { 138 | rLen := io.in.ar.bits.len 139 | rNextRdAddr := Cat(io.in.ar.bits.addr(63, 6), 0.U(6.W)) + "h40".U 140 | rRid := io.in.ar.bits.id 141 | 
}.elsewhen(rdRemainBeat) { 142 | when (rLen =/= 1.U) { 143 | rNextRdAddr := rNextRdAddr + "h40".U 144 | } 145 | rLen := rLen - 1.U 146 | } 147 | } 148 | } -------------------------------------------------------------------------------- /src/deprecated/NVMeBlackBox.scala: -------------------------------------------------------------------------------- 1 | package nvme.deprecated 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.storage._ 7 | import common.axi._ 8 | import common.ToZero 9 | import qdma._ 10 | 11 | class NVMeBlackBox ( 12 | SSD_MAX_ID : Int = 0, 13 | BUFFER_DATA_SHIFT : Int = 27, 14 | SSD_NUM_SHIFT : Int = 2, 15 | QUEUE_DEPTH_SHIFT : Int = 8, 16 | QUEUE_MAX_ID : Int = 3, 17 | QUEUE_COUNT_SHIFT : Int = 2, 18 | MAX_SQ_INTERVAL : Int = 30 19 | ) extends BlackBox(Map( 20 | "SSD_NUM" -> SSD_MAX_ID, 21 | "BRIDGE_DATA_SHIFT" -> BUFFER_DATA_SHIFT, 22 | "SSD_NUM_SHIFT" -> SSD_NUM_SHIFT, 23 | "QUEUE_DEPTH_SHIFT" -> QUEUE_DEPTH_SHIFT, 24 | "QUEUE_COUNT_SHIFT" -> QUEUE_COUNT_SHIFT, 25 | "MAX_SQ_INTERVAL" -> MAX_SQ_INTERVAL, 26 | "USED_QUEUE_MAX_ID" -> QUEUE_MAX_ID 27 | )) { 28 | val QUEUE_NUM = (SSD_MAX_ID+1) * (QUEUE_MAX_ID+1) 29 | 30 | val io = IO(new Bundle{ 31 | val clk_core = Input(Clock()) 32 | val sys_reset = Input(Bool()) 33 | 34 | val status_p2pdma_h2c_data = Output(UInt(512.W)) // Reg(111:96) 35 | val status_p2pdma_h2c_done = Output(UInt(1.W)) // Reg(128) 36 | val status_p2pdma_c2h_done = Output(UInt(1.W)) // Reg(129) 37 | val status_nvme_init_done = Output(UInt(1.W)) // Reg(160) 38 | val status_nvme_exec_done = Output(UInt(1.W)) // Reg(192) 39 | val status_stat_op_succ = Output(UInt(32.W)) // Reg(193) 40 | val status_stat_op_fail = Output(UInt(32.W)) // Reg(194) 41 | val status_stat_exec_time = Output(UInt(64.W)) // Reg(197:196) 42 | val status_stat_io_ssd0 = Output(UInt(32.W)) // Reg(200) 43 | val status_stat_io_ssd1 = Output(UInt(32.W)) // Reg(201) 44 | val status_stat_io_ssd2 = Output(UInt(32.W)) // Reg(202) 45 
| val status_stat_io_ssd3 = Output(UInt(32.W)) // Reg(203) 46 | val status_band_tr_rd = Output(UInt(32.W)) // Reg(216) 47 | val status_band_tr_wr = Output(UInt(32.W)) // Reg(217) 48 | 49 | val control_init_start = Input(UInt(1.W)) // Reg(32) 50 | val control_init_nsid = Input(UInt(32.W)) // Reg(33) 51 | val control_init_dma_addr = Input(UInt(64.W)) // Reg(35:34) 52 | val control_init_byp_addr = Input(UInt(64.W)) // Reg(37:36) 53 | val control_init_ssd_addr = Input(UInt(64.W)) // Reg(39:38) 54 | val control_init_ssdid = Input(UInt(32.W)) // Reg(40) 55 | val control_p2pdma_read = Input(UInt(1.W)) // Reg(64) 56 | val control_p2pdma_write = Input(UInt(1.W)) // Reg(65) 57 | val control_p2pdma_cmd_addr = Input(UInt(64.W)) // Reg(67:66) 58 | val control_p2pdma_cmd_len = Input(UInt(16.W)) // Reg(68) 59 | val control_p2pdma_c2h_data = Input(UInt(512.W)) // Reg(84:69) 60 | val control_ssd_init = Input(UInt(1.W)) // Reg(128) 61 | val control_exec_start = Input(UInt(1.W)) // Reg(160) 62 | val control_exec_time = Input(UInt(32.W)) // Reg(163) 63 | val control_band_tr_en = Input(UInt(1.W)) // Reg(165) 64 | val control_band_tr_read = Input(UInt(1.W)) // Reg(166) 65 | 66 | val s_axib_awid = Output(UInt(4.W)) 67 | val s_axib_awaddr = Output(UInt(64.W)) 68 | val s_axib_awlen = Output(UInt(8.W)) 69 | val s_axib_awsize = Output(UInt(3.W)) 70 | val s_axib_awburst = Output(UInt(2.W)) 71 | val s_axib_awuser = Output(UInt(12.W)) 72 | val s_axib_awregion = Output(UInt(4.W)) 73 | val s_axib_awvalid = Output(UInt(1.W)) 74 | val s_axib_awready = Input(UInt(1.W)) 75 | val s_axib_wdata = Output(UInt(512.W)) 76 | val s_axib_wstrb = Output(UInt(64.W)) 77 | val s_axib_wlast = Output(UInt(1.W)) 78 | val s_axib_wuser = Output(UInt(64.W)) 79 | val s_axib_wvalid = Output(UInt(1.W)) 80 | val s_axib_wready = Input(UInt(1.W)) 81 | val s_axib_bid = Input(UInt(4.W)) 82 | val s_axib_bresp = Input(UInt(2.W)) 83 | val s_axib_bvalid = Input(UInt(1.W)) 84 | val s_axib_bready = Output(UInt(1.W)) 85 | val 
s_axib_arid = Output(UInt(4.W)) 86 | val s_axib_araddr = Output(UInt(64.W)) 87 | val s_axib_aruser = Output(UInt(12.W)) 88 | val s_axib_arlen = Output(UInt(8.W)) 89 | val s_axib_arsize = Output(UInt(3.W)) 90 | val s_axib_arburst = Output(UInt(2.W)) 91 | val s_axib_arregion = Output(UInt(4.W)) 92 | val s_axib_arvalid = Output(UInt(1.W)) 93 | val s_axib_arready = Input(UInt(1.W)) 94 | val s_axib_rid = Input(UInt(4.W)) 95 | val s_axib_rdata = Input(UInt(512.W)) 96 | val s_axib_rresp = Input(UInt(2.W)) 97 | val s_axib_ruser = Input(UInt(64.W)) 98 | val s_axib_rlast = Input(UInt(1.W)) 99 | val s_axib_rvalid = Input(UInt(1.W)) 100 | val s_axib_rready = Output(UInt(1.W)) 101 | 102 | val axib_read_enable = Input(UInt(1.W)) 103 | val axib_read_addr = Input(UInt(64.W)) 104 | val axib_read_data = Output(UInt(512.W)) 105 | val axib_write_mask = Input(UInt(64.W)) 106 | val axib_write_addr = Input(UInt(64.W)) 107 | val axib_write_data = Input(UInt(512.W)) 108 | 109 | val ssd_cmd_op = Input(UInt(QUEUE_NUM.W)) 110 | val ssd_cmd_nlb = Input(UInt((QUEUE_NUM*16).W)) 111 | val ssd_cmd_lba = Input(UInt((QUEUE_NUM*32).W)) 112 | val ssd_cmd_offset = Input(UInt((QUEUE_NUM*32).W)) 113 | val ssd_cmd_valid = Input(UInt(QUEUE_NUM.W)) 114 | val ssd_cmd_ready = Output(UInt(QUEUE_NUM.W)) 115 | 116 | val pcie_hbm_write_transfer = Input(UInt(2.W)) 117 | }) 118 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chisel NVMe Host Controller 2 | NVMe host controller written in Chisel. 
3 | 4 | ## Table of contents 5 | - [Chisel NVMe Host Controller](#chisel-nvme-host-controller) 6 | - [Table of contents](#table-of-contents) 7 | - [How to Add this Module in Your Chisel Project:](#how-to-add-this-module-in-your-chisel-project) 8 | - [NVMeCore Module](#nvmecore-module) 9 | - [Parameter Description](#parameter-description) 10 | - [Port Description](#port-description) 11 | - [NVMeCoreControl](#nvmecorecontrol) 12 | - [NVMeCoreStatus](#nvmecorestatus) 13 | - [Other Modules and APIs](#other-modules-and-apis) 14 | - [AXI2NVMeRam](#axi2nvmeram) 15 | - [NVMe Command Builders](#nvme-command-builders) 16 | - [BandwidthProbe](#bandwidthprobe) 17 | - [Example Design](#example-design) 18 | 19 | ## How to Add this Module in Your Chisel Project: 20 | [QDMA](https://github.com/carlzhang4/qdma) is this module's dependency. Before using this module, make sure the QDMA module is installed. 21 | To install this module, use the command below: 22 | ```bash 23 | $ git submodule add git@github.com:JewdrunAleph/fpga-nvme-controller nvme 24 | $ git clone git@github.com:JewdrunAleph/fpga-nvme-controller nvme 25 | ``` 26 | 27 | ## NVMeCore Module 28 | `NVMeCore` can offload I/O queue management of SSDs from CPU to FPGA. It takes NVMe commands as inputs, maintains I/O queues and rings the doorbell signal via QDMA. 29 | 30 | ### Parameter Description 31 | |Parameter |Type |Range |Description | 32 | |:--- |:--- |:--- |:--- | 33 | |SSD_NUM |Int |Larger than 0. |Number of SSDs used. | 34 | |QUEUE_NUM |Int |Larger than 0. |Number of I/O queues the FPGA takes care of for each SSD.| 35 | |QUEUE_DEPTH |Int |Larger than 4, a power of 2. |Queue depth of each I/O queue. The depth of each queue should be equal.| 36 | |MAX_SQ_INTERVAL |Int |Larger than 0. |Time window, in cycles, for this module to wait for a new command.
To reduce doorbell traffic, when a new command is inserted, the queue management module does not ring the doorbell immediately; instead, it waits for a period to see whether another command arrives.| 37 | |QDMA_INTERFACE |String |"DMA" or "SAXIB" |Interface of QDMA used to ring the doorbell.
You can choose either DMA C2H engine, or AXI Slave Bridge. | 38 | 39 | 40 | ### Port Description 41 | |Port |Type |Direction |Description | 42 | |:--- |:--- |:--- |:--- | 43 | |ssdCmd |Vec[Vec[DecoupledIO[UInt]]]|Input |Command of each queue of each SSD. Each command should follow NVMe format.| 44 | |control |NVMeCoreControl |Input |Control signals for this module. Refer to [here](#NVMeCoreControl)| 45 | |status |NVMeCoreStatus |Output |Status signals for this module. Refer to [here](#NVMeCoreStatus)| 46 | |ramIO |NVMeRamIO | |SQ/CQ RAM I/O request from the host. This port can be converted from an AXI slave interface, see [here](#AXI2NVMeRam).| 47 | |sAxib |Option[AXIB_SLAVE] |Output |Used when `QDMA_INTERFACE == "SAXIB"`. Connect to QDMA's AXI Slave Bridge| 48 | |c2hCmd |Option[DecoupledIO[C2H_CMD]]|Output |Used when `QDMA_INTERFACE == "DMA"`. Connect to QDMA's C2H command port.| 49 | |c2hData |Option[DecoupledIO[C2H_DATA]]|Output |Used when `QDMA_INTERFACE == "DMA"`. Connect to QDMA's C2H data port.| 50 | 51 | #### NVMeCoreControl 52 | NVMe control signals are listed here. 53 | **enable** 54 | Only when this signal is high will this module work and accept new commands. When this signal is low, it still processes existing commands, but won't accept new commands anymore. Designed for benchmarking. 55 | **ssdSetup** 56 | Initialize an SSD with data required by this module. It has two signals: 57 | - `ssdId`: Index of SSD to be initialized. 58 | - `ssdBarAddr`: **Physical** address of BAR 0 of this SSD. It should be got from the host. 59 | 60 | #### NVMeCoreStatus 61 | NVMe status signals are listed here. This interface includes signals either needed by host or helpful for benchmarking. 62 | - `running`: Whether this module is processing or accepting commands. When `enable` signal is set to low and this module finishes processing all existing commands, this signal will turn to low. 63 | - `params`: Parameters required by the host. 
With these parameters, the host can create I/O queues for all SSDs and assign correct addresses for these queues. 64 | - `stat`: Statistical information **since the module is enabled**, including: 65 | - `executeTime`: Total execution time. 66 | - `successfulOp`: Number of commands the SSDs processed successfully. 67 | - `failedOp`: Number of commands the SSDs failed to process. 68 | - `totalLatency`: Total latency for all commands processed **in cycles**. To get the average latency of each SSD, divide this by the number of commands processed. 69 | 70 | ## Other Modules and APIs 71 | 72 | ### AXI2NVMeRam 73 | For simplicity, the NVMe core module takes `NVMeRamIO` as input, which is similar to simple dual-port BRAM I/O signals. However, in real applications requests come from an AXI interface. Therefore, this repo provides an `AXI2NVMeRam` module for interface conversion. 74 | 75 | ### NVMe Command Builders 76 | The `NVMeCommandSet` object provides a set of functions which help to fill in NVMe-formatted commands with some simple and basic items. Check Util.scala for more details. 77 | 78 | ### BandwidthProbe 79 | `BandwidthProbe` helps to record the actual data transfer bandwidth of a certain interface. 80 | 81 | ## Example Design 82 | This repo provides an example benchmark design that includes a hardware design and corresponding software. The example design is tested on an Alveo U50 card. For a U280 board, please use your own xdc file. 83 | To test this design: 84 | 1. Install [QDMA driver and LibQDMA](https://github.com/carlzhang4/qdma_improve). 85 | 2. Generate the bitstream file, and program it to the FPGA. 86 | 3. Change the list of NVMe devices in NVMeBenchmark.cpp 87 | 4. Use the Makefile to generate the executable. 88 | 5. Reboot your computer and run. 
-------------------------------------------------------------------------------- /src/NVMeLatencyBenchmarkTop.scala: -------------------------------------------------------------------------------- 1 | package nvme 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.storage._ 7 | import common.axi._ 8 | import qdma._ 9 | 10 | class NVMeLatencyBenchmarkTop extends RawModule{ 11 | val qdma_pin = IO(new QDMAPin(PCIE_WIDTH=8)) 12 | val led = IO(Output(UInt(1.W))) 13 | 14 | led := 0.U 15 | 16 | // Global parameters 17 | val DATA_BUFFER_SHIFT = 27 // Upper 128 MiB is used for data buffer. 18 | val SSD_NUM = 4 19 | val QUEUE_DEPTH = 64 20 | val QDMA_INTERFACE = "SAXIB" 21 | 22 | val qdma = Module(new QDMA( 23 | VIVADO_VERSION = "202101", 24 | PCIE_WIDTH = 8, 25 | SLAVE_BRIDGE = (QDMA_INTERFACE == "SAXIB"), 26 | TLB_TYPE = new BypassTLB, // TLB is not used here. 27 | BRIDGE_BAR_SCALE= "Megabytes", 28 | BRIDGE_BAR_SIZE = 256 29 | )) 30 | qdma.getTCL() 31 | 32 | val user_clk = qdma.io.pcie_clk 33 | val user_rstn = qdma.io.pcie_arstn 34 | 35 | ToZero(qdma.io.reg_status) 36 | qdma.io.pin <> qdma_pin 37 | 38 | // TODO: Notify Carl Zhang move AXI clock convertor into QDMA module :/ 39 | qdma.io.user_clk := user_clk 40 | qdma.io.user_arstn := user_rstn 41 | qdma.io.soft_rstn := 1.U 42 | 43 | Collector.connect_to_status_reg(qdma.io.reg_status, 400) 44 | 45 | withClockAndReset(qdma.io.user_clk,!qdma.io.user_arstn) { 46 | val nvmeCore = Module(new NVMeController( 47 | SSD_NUM = SSD_NUM, 48 | QUEUE_NUM = 1, 49 | QUEUE_DEPTH = QUEUE_DEPTH, 50 | QDMA_INTERFACE = QDMA_INTERFACE 51 | )) 52 | 53 | if (QDMA_INTERFACE == "DMA") { 54 | nvmeCore.io.c2hCmd.get <> qdma.io.c2h_cmd 55 | nvmeCore.io.c2hData.get <> qdma.io.c2h_data 56 | } else if (QDMA_INTERFACE == "SAXIB") { 57 | qdma.io.s_axib.get <> AXIRegSlice(3)(nvmeCore.io.sAxib.get) 58 | qdma.io.c2h_cmd <> DontCare 59 | qdma.io.c2h_data <> DontCare 60 | } 61 | 62 | val controlReg = qdma.io.reg_control 63 
| val statusReg = qdma.io.reg_status 64 | 65 | statusReg(65) := nvmeCore.io.status.params.ssdNum 66 | statusReg(67) := nvmeCore.io.status.params.queueLowBit 67 | statusReg(64) := nvmeCore.io.status.params.ssdLowBit 68 | statusReg(66) := nvmeCore.io.status.params.queueDepth 69 | statusReg(68) := nvmeCore.io.status.params.queueNum 70 | statusReg(69) := nvmeCore.io.status.params.ramTypeBit 71 | 72 | // In such scenario, whole BAR space of QDMA is often used by different modules. 73 | // Thus, we must split AXI Bridge into different ways. 74 | // The 2nd input of AXIRouter is to automatically decide data widths, it will NOT 75 | // be connected to the router. 76 | 77 | // In this example design, AXIB space is splited into 3 ways: 78 | // 0 - Lower space is used to store SQ / CQ RAMs 79 | // 1 - Middle space is used to store PRP lists 80 | // 2 - Upper space is used for data buffer. 81 | 82 | val axibRt = AXIRouter(3, qdma.io.axib) 83 | axibRt.io.in <> AXIRegSlice(2)(qdma.io.axib) 84 | axibRt.io.wrIdx := Mux( 85 | axibRt.io.in.aw.bits.addr(63, nvmeCore.RAM_TYPE_BIT+1) === 0.U, 86 | 0.U, Mux(axibRt.io.in.aw.bits.addr(63, DATA_BUFFER_SHIFT) === 0.U, 1.U, 2.U) 87 | ) 88 | axibRt.io.rdIdx := Mux( 89 | axibRt.io.in.ar.bits.addr(63, nvmeCore.RAM_TYPE_BIT+1) === 0.U, 90 | 0.U, Mux(axibRt.io.in.ar.bits.addr(63, DATA_BUFFER_SHIFT) === 0.U, 1.U, 2.U) 91 | ) 92 | 93 | nvmeCore.io.ramIO <> AXI2NVMeRam(AXIRegSlice(5)(axibRt.io.out(0))) 94 | 95 | qdma.io.h2c_cmd <> DontCare 96 | qdma.io.h2c_data <> DontCare 97 | 98 | nvmeCore.io.control.enable := controlReg(160) 99 | nvmeCore.io.control.ssdSetup.valid := controlReg(32) & ~RegNext(controlReg(32)) 100 | nvmeCore.io.control.ssdSetup.bits.ssdId := controlReg(40) 101 | nvmeCore.io.control.ssdSetup.bits.ssdBarAddr := Cat(controlReg(39), controlReg(38)) 102 | 103 | // Benchmark module 104 | 105 | val benchmark = Module(new NVMeLatencyBenchmark( 106 | SSD_NUM = SSD_NUM, 107 | DATA_BUFFER_SHIFT = DATA_BUFFER_SHIFT 108 | )) 109 | 110 | for (i 
<- 0 until SSD_NUM) { 111 | nvmeCore.io.ssdCmd(i)(0)<> RegSlice(3)(benchmark.io.ssdCmd(i)) 112 | benchmark.io.ssdCmpt(i) <> Pipe(nvmeCore.io.ssdCmpt(i)(0), 3) 113 | } 114 | benchmark.io.prpInput <> AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(1))) 115 | 116 | benchmark.io.ctlRunning := controlReg(160) 117 | benchmark.io.ctlEnd := statusReg(192) & ~RegNext(statusReg(192)) 118 | benchmark.io.ctlFpgaBar := Cat(controlReg(37), controlReg(36)) 119 | benchmark.io.ctlTimeTarget := Cat(controlReg(163), 0.U(6.W)) 120 | benchmark.io.ctlNumNlb := controlReg(162)(15, 0) 121 | benchmark.io.ctlMaxParallel := controlReg(170) 122 | benchmark.io.ctlModeWrite := controlReg(161)(0) 123 | benchmark.io.ctlModeRandom := controlReg(161)(1) 124 | benchmark.io.ctlReadLatency.valid := controlReg(167)(0) & ~RegNext(controlReg(167)(0)) 125 | benchmark.io.ctlReadLatency.bits := controlReg(168) 126 | benchmark.io.ctlModeMixed := 0.U 127 | 128 | statusReg(192) := !(nvmeCore.io.status.running || nvmeCore.io.control.enable) 129 | statusReg(193) := nvmeCore.io.status.stat.successfulOp 130 | statusReg(194) := nvmeCore.io.status.stat.failedOp 131 | statusReg(196) := nvmeCore.io.status.stat.executeTime(31, 0) 132 | statusReg(197) := nvmeCore.io.status.stat.executeTime(63, 32) 133 | for (ssdId <- 0 until SSD_NUM) { 134 | statusReg(200+ssdId) := benchmark.io.statSsdIo(ssdId) 135 | } 136 | statusReg(218) := nvmeCore.io.status.stat.totalLatency(31, 0) 137 | statusReg(219) := nvmeCore.io.status.stat.totalLatency(63, 32) 138 | statusReg(220) := benchmark.io.statLatency 139 | 140 | // Bandwidth probe 141 | val dataBufferIo = AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(2))) 142 | dataBufferIo.readData := 0.U 143 | 144 | val readProbe = Module(new BandwidthProbe) 145 | readProbe.io.enable := ~statusReg(192)(0) && controlReg(165)(0) 146 | readProbe.io.fire := dataBufferIo.readEnable 147 | readProbe.io.count.ready := (controlReg(166)(0) === 1.U && RegNext(controlReg(166)(0)) =/= 1.U) 148 | statusReg(216) := 
Mux(readProbe.io.count.valid, readProbe.io.count.bits, -1.S(32.W).asUInt) 149 | 150 | val writeProbe = Module(new BandwidthProbe) 151 | writeProbe.io.enable := ~statusReg(192)(0) && controlReg(165)(0) 152 | writeProbe.io.fire := (dataBufferIo.writeMask =/= 0.U) 153 | writeProbe.io.count.ready := (controlReg(166)(0) === 1.U && RegNext(controlReg(166)(0)) =/= 1.U) 154 | statusReg(217) := Mux(writeProbe.io.count.valid, writeProbe.io.count.bits, -1.S(32.W).asUInt) 155 | 156 | // AXIB Debug 157 | val aw_cnt = RegInit(UInt(32.W), 0.U) 158 | val w_cnt = RegInit(UInt(32.W), 0.U) 159 | 160 | when (qdma.io.axib.aw.fire) { 161 | aw_cnt := aw_cnt + qdma.io.axib.aw.bits.len + 1.U 162 | } 163 | 164 | when (qdma.io.axib.w.fire) { 165 | w_cnt := w_cnt + 1.U 166 | } 167 | 168 | val diff_cnt = aw_cnt - w_cnt 169 | val diff_time = RegInit(UInt(32.W), 0.U) 170 | when (diff_cnt === 0.U) { 171 | diff_time := 0.U 172 | }.otherwise { 173 | diff_time := diff_time + 1.U 174 | } 175 | } 176 | } -------------------------------------------------------------------------------- /src/NVMeBandwidthBenchmarkTop.scala: -------------------------------------------------------------------------------- 1 | package nvme 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.storage._ 7 | import common.axi._ 8 | import qdma._ 9 | 10 | class NVMeBandwidthBenchmarkTop extends RawModule{ 11 | val qdma_pin = IO(new QDMAPin(PCIE_WIDTH=8)) 12 | val led = IO(Output(UInt(1.W))) 13 | 14 | led := 0.U 15 | 16 | // Global parameters 17 | val DATA_BUFFER_SHIFT = 27 // Upper 128 MiB is used for data buffer. 18 | val SSD_NUM = 1 19 | val QUEUE_NUM = 2 20 | val QUEUE_DEPTH = 64 21 | val QDMA_INTERFACE = "SAXIB" 22 | 23 | val qdma = Module(new QDMA( 24 | VIVADO_VERSION = "202101", 25 | PCIE_WIDTH = 8, 26 | SLAVE_BRIDGE = (QDMA_INTERFACE == "SAXIB"), 27 | TLB_TYPE = new BypassTLB, // TLB is not used here. 
28 | BRIDGE_BAR_SCALE= "Megabytes", 29 | BRIDGE_BAR_SIZE = 256 30 | )) 31 | qdma.getTCL() 32 | 33 | val user_clk = qdma.io.pcie_clk 34 | val user_rstn = qdma.io.pcie_arstn 35 | 36 | ToZero(qdma.io.reg_status) 37 | qdma.io.pin <> qdma_pin 38 | 39 | // TODO: Notify Carl Zhang move AXI clock convertor into QDMA module :/ 40 | qdma.io.user_clk := user_clk 41 | qdma.io.user_arstn := user_rstn 42 | qdma.io.soft_rstn := 1.U 43 | 44 | Collector.connect_to_status_reg(qdma.io.reg_status, 400) 45 | 46 | withClockAndReset(qdma.io.user_clk,!qdma.io.user_arstn) { 47 | val nvmeCore = Module(new NVMeController( 48 | SSD_NUM = SSD_NUM, 49 | QUEUE_NUM = QUEUE_NUM, 50 | QUEUE_DEPTH = QUEUE_DEPTH, 51 | QDMA_INTERFACE = QDMA_INTERFACE 52 | )) 53 | 54 | if (QDMA_INTERFACE == "DMA") { 55 | nvmeCore.io.c2hCmd.get <> qdma.io.c2h_cmd 56 | nvmeCore.io.c2hData.get <> qdma.io.c2h_data 57 | } else if (QDMA_INTERFACE == "SAXIB") { 58 | qdma.io.s_axib.get <> AXIRegSlice(3)(nvmeCore.io.sAxib.get) 59 | qdma.io.c2h_cmd <> DontCare 60 | qdma.io.c2h_data <> DontCare 61 | } 62 | 63 | val controlReg = qdma.io.reg_control 64 | val statusReg = qdma.io.reg_status 65 | 66 | statusReg(65) := nvmeCore.io.status.params.ssdNum 67 | statusReg(67) := nvmeCore.io.status.params.queueLowBit 68 | statusReg(64) := nvmeCore.io.status.params.ssdLowBit 69 | statusReg(66) := nvmeCore.io.status.params.queueDepth 70 | statusReg(68) := nvmeCore.io.status.params.queueNum 71 | statusReg(69) := nvmeCore.io.status.params.ramTypeBit 72 | 73 | // In such scenario, whole BAR space of QDMA is often used by different modules. 74 | // Thus, we must split AXI Bridge into different ways. 75 | // The 2nd input of AXIRouter is to automatically decide data widths, it will NOT 76 | // be connected to the router. 77 | 78 | // In this example design, AXIB space is splited into 3 ways: 79 | // 0 - Lower space is used to store SQ / CQ RAMs 80 | // 1 - Middle space is used to store PRP lists 81 | // 2 - Upper space is used for data buffer. 
82 | 83 | val axibRt = AXIRouter(3, qdma.io.axib) 84 | axibRt.io.in <> AXIRegSlice(2)(qdma.io.axib) 85 | axibRt.io.wrIdx := Mux( 86 | axibRt.io.in.aw.bits.addr(63, nvmeCore.RAM_TYPE_BIT+1) === 0.U, 87 | 0.U, Mux(axibRt.io.in.aw.bits.addr(63, DATA_BUFFER_SHIFT) === 0.U, 1.U, 2.U) 88 | ) 89 | axibRt.io.rdIdx := Mux( 90 | axibRt.io.in.ar.bits.addr(63, nvmeCore.RAM_TYPE_BIT+1) === 0.U, 91 | 0.U, Mux(axibRt.io.in.ar.bits.addr(63, DATA_BUFFER_SHIFT) === 0.U, 1.U, 2.U) 92 | ) 93 | 94 | nvmeCore.io.ramIO <> AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(0))) 95 | 96 | qdma.io.h2c_cmd <> DontCare 97 | qdma.io.h2c_data <> DontCare 98 | 99 | nvmeCore.io.control.enable := controlReg(160) 100 | nvmeCore.io.control.ssdSetup.valid := controlReg(32) & ~RegNext(controlReg(32)) 101 | nvmeCore.io.control.ssdSetup.bits.ssdId := controlReg(40) 102 | nvmeCore.io.control.ssdSetup.bits.ssdBarAddr := Cat(controlReg(39), controlReg(38)) 103 | 104 | // Benchmark module 105 | 106 | val benchmark = Module(new NVMeBandwidthBenchmark( 107 | SSD_NUM = SSD_NUM, 108 | QUEUE_NUM = QUEUE_NUM, 109 | DATA_BUFFER_SHIFT = DATA_BUFFER_SHIFT 110 | )) 111 | 112 | benchmark.io.ssdCmd <> nvmeCore.io.ssdCmd 113 | benchmark.io.prpInput <> AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(1))) 114 | 115 | benchmark.io.ctlRunning := controlReg(160) 116 | benchmark.io.ctlEnd := statusReg(192) & ~RegNext(statusReg(192)) 117 | benchmark.io.ctlFpgaBar := Cat(controlReg(37), controlReg(36)) 118 | benchmark.io.ctlTimeTarget := Cat(controlReg(163), 0.U(6.W)) 119 | benchmark.io.ctlNumNlb := controlReg(162)(15, 0) 120 | benchmark.io.ctlMaxParallel := controlReg(170) 121 | benchmark.io.ctlModeWrite := controlReg(161)(0) 122 | benchmark.io.ctlModeRandom := controlReg(161)(1) 123 | benchmark.io.ctlModeMixed := 0.U 124 | benchmark.io.ctlRdBlkSize := controlReg(167) 125 | benchmark.io.ctlWrBlkSize := controlReg(168) 126 | benchmark.io.ctlRdBlkAhead := controlReg(169) 127 | 128 | statusReg(192) := !(nvmeCore.io.status.running || 
nvmeCore.io.control.enable) 129 | statusReg(193) := nvmeCore.io.status.stat.successfulOp 130 | statusReg(194) := nvmeCore.io.status.stat.failedOp 131 | statusReg(196) := nvmeCore.io.status.stat.executeTime(31, 0) 132 | statusReg(197) := nvmeCore.io.status.stat.executeTime(63, 32) 133 | for (ssdId <- 0 until SSD_NUM) { 134 | statusReg(200+ssdId) := benchmark.io.statSsdIo(ssdId) 135 | } 136 | statusReg(218) := nvmeCore.io.status.stat.totalLatency(31, 0) 137 | statusReg(219) := nvmeCore.io.status.stat.totalLatency(63, 32) 138 | 139 | // Bandwidth probe 140 | val dataBufferIo = AXI2NVMeRam(AXIRegSlice(2)(axibRt.io.out(2))) 141 | dataBufferIo.readData := 0.U 142 | 143 | val readProbe = Module(new BandwidthProbe) 144 | readProbe.io.enable := ~statusReg(192)(0) && controlReg(165)(0) 145 | readProbe.io.fire := dataBufferIo.readEnable 146 | readProbe.io.count.ready := (controlReg(166)(0) === 1.U && RegNext(controlReg(166)(0)) =/= 1.U) 147 | statusReg(216) := Mux(readProbe.io.count.valid, readProbe.io.count.bits, -1.S(32.W).asUInt) 148 | 149 | val writeProbe = Module(new BandwidthProbe) 150 | writeProbe.io.enable := ~statusReg(192)(0) && controlReg(165)(0) 151 | writeProbe.io.fire := (dataBufferIo.writeMask =/= 0.U) 152 | writeProbe.io.count.ready := (controlReg(166)(0) === 1.U && RegNext(controlReg(166)(0)) =/= 1.U) 153 | statusReg(217) := Mux(writeProbe.io.count.valid, writeProbe.io.count.bits, -1.S(32.W).asUInt) 154 | 155 | // AXIB Debug 156 | val aw_cnt = RegInit(UInt(32.W), 0.U) 157 | val w_cnt = RegInit(UInt(32.W), 0.U) 158 | 159 | when (qdma.io.axib.aw.fire) { 160 | aw_cnt := aw_cnt + qdma.io.axib.aw.bits.len + 1.U 161 | } 162 | 163 | when (qdma.io.axib.w.fire) { 164 | w_cnt := w_cnt + 1.U 165 | } 166 | 167 | val diff_cnt = aw_cnt - w_cnt 168 | val diff_time = RegInit(UInt(32.W), 0.U) 169 | when (diff_cnt === 0.U) { 170 | diff_time := 0.U 171 | }.otherwise { 172 | diff_time := diff_time + 1.U 173 | } 174 | } 175 | } 176 | 177 | class BypassTLB extends Module with 
BaseTLB { 178 | io.h2c_in <> io.h2c_out 179 | io.c2h_in <> io.c2h_out 180 | io.tlb_miss_count := 0.U 181 | io.wr_tlb.ready := 1.U 182 | } -------------------------------------------------------------------------------- /src/deprecated/NVMe.scala: -------------------------------------------------------------------------------- 1 | package nvme.deprecated 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import nvme.NVMeRamIO 6 | 7 | import qdma._ 8 | 9 | class NVMe ( 10 | DEBUG : Boolean = true, 11 | SSD_MAX_ID : Int = 0, 12 | BUFFER_DATA_SHIFT : Int = 27, 13 | SSD_NUM_SHIFT : Int = 2, 14 | QUEUE_DEPTH_SHIFT : Int = 8, 15 | QUEUE_MAX_ID : Int = 3, 16 | QUEUE_COUNT_SHIFT : Int = 2, 17 | MAX_SQ_INTERVAL : Int = 30 18 | ) extends Module { 19 | val io = IO(new Bundle{ 20 | val ssdCmd = Flipped(Vec(SSD_MAX_ID+1, Vec(QUEUE_MAX_ID+1, Decoupled(new NVMeCommand)))) 21 | 22 | val regControl = new NVMeControl 23 | val regStatus = new NVMeStatus 24 | 25 | val bramReq = Flipped(new NVMeRamIO) 26 | 27 | val s_axib = new AXIB_SLAVE 28 | 29 | val pcie_hbm_write_transfer = if (DEBUG) {Some(Input(UInt(2.W)))} else None 30 | }) 31 | 32 | val QUEUE_NUM = (SSD_MAX_ID+1) * (QUEUE_MAX_ID+1) 33 | 34 | val nvmeCore = Module(new NVMeBlackBox( 35 | SSD_MAX_ID = SSD_MAX_ID, 36 | BUFFER_DATA_SHIFT = BUFFER_DATA_SHIFT, 37 | SSD_NUM_SHIFT = SSD_NUM_SHIFT, 38 | QUEUE_DEPTH_SHIFT = QUEUE_DEPTH_SHIFT, 39 | QUEUE_MAX_ID = QUEUE_MAX_ID, 40 | QUEUE_COUNT_SHIFT = QUEUE_COUNT_SHIFT, 41 | MAX_SQ_INTERVAL = MAX_SQ_INTERVAL 42 | )) 43 | 44 | nvmeCore.io.clk_core := clock 45 | nvmeCore.io.sys_reset := reset 46 | 47 | io.regStatus.p2pdma_h2c_data := nvmeCore.io.status_p2pdma_h2c_data 48 | io.regStatus.p2pdma_h2c_done := nvmeCore.io.status_p2pdma_h2c_done 49 | io.regStatus.p2pdma_c2h_done := nvmeCore.io.status_p2pdma_c2h_done 50 | io.regStatus.nvme_init_done := nvmeCore.io.status_nvme_init_done 51 | io.regStatus.nvme_exec_done := nvmeCore.io.status_nvme_exec_done 52 | io.regStatus.stat_op_succ := 
nvmeCore.io.status_stat_op_succ 53 | io.regStatus.stat_op_fail := nvmeCore.io.status_stat_op_fail 54 | io.regStatus.stat_exec_time := nvmeCore.io.status_stat_exec_time 55 | io.regStatus.stat_io_ssd0 := nvmeCore.io.status_stat_io_ssd0 56 | io.regStatus.stat_io_ssd1 := nvmeCore.io.status_stat_io_ssd1 57 | io.regStatus.stat_io_ssd2 := nvmeCore.io.status_stat_io_ssd2 58 | io.regStatus.stat_io_ssd3 := nvmeCore.io.status_stat_io_ssd3 59 | io.regStatus.band_tr_rd := nvmeCore.io.status_band_tr_rd 60 | io.regStatus.band_tr_wr := nvmeCore.io.status_band_tr_wr 61 | 62 | nvmeCore.io.control_init_start := io.regControl.init_start 63 | nvmeCore.io.control_init_nsid := io.regControl.init_nsid 64 | nvmeCore.io.control_init_dma_addr := io.regControl.init_dma_addr 65 | nvmeCore.io.control_init_byp_addr := io.regControl.init_byp_addr 66 | nvmeCore.io.control_init_ssd_addr := io.regControl.init_ssd_addr 67 | nvmeCore.io.control_init_ssdid := io.regControl.init_ssdid 68 | nvmeCore.io.control_p2pdma_read := io.regControl.p2pdma_read 69 | nvmeCore.io.control_p2pdma_write := io.regControl.p2pdma_write 70 | nvmeCore.io.control_p2pdma_cmd_addr := io.regControl.p2pdma_cmd_addr 71 | nvmeCore.io.control_p2pdma_cmd_len := io.regControl.p2pdma_cmd_len 72 | nvmeCore.io.control_p2pdma_c2h_data := io.regControl.p2pdma_c2h_data 73 | nvmeCore.io.control_ssd_init := io.regControl.ssd_init 74 | nvmeCore.io.control_exec_start := io.regControl.exec_start 75 | nvmeCore.io.control_exec_time := io.regControl.exec_time 76 | nvmeCore.io.control_band_tr_en := io.regControl.band_tr_en 77 | nvmeCore.io.control_band_tr_read := io.regControl.band_tr_read 78 | 79 | nvmeCore.io.axib_read_enable := io.bramReq.readEnable 80 | nvmeCore.io.axib_read_addr := io.bramReq.readAddr 81 | io.bramReq.readData := nvmeCore.io.axib_read_data 82 | nvmeCore.io.axib_write_mask := io.bramReq.writeMask 83 | nvmeCore.io.axib_write_addr := io.bramReq.writeAddr 84 | nvmeCore.io.axib_write_data := io.bramReq.writeData 85 | 86 | 
io.s_axib.qdma_init() 87 | 88 | nvmeCore.io.s_axib_awid <> io.s_axib.aw.bits.id 89 | nvmeCore.io.s_axib_awaddr <> io.s_axib.aw.bits.addr 90 | nvmeCore.io.s_axib_awlen <> io.s_axib.aw.bits.len 91 | nvmeCore.io.s_axib_awsize <> io.s_axib.aw.bits.size 92 | nvmeCore.io.s_axib_awuser <> io.s_axib.aw.bits.user 93 | nvmeCore.io.s_axib_awburst <> io.s_axib.aw.bits.burst 94 | nvmeCore.io.s_axib_awregion <> io.s_axib.aw.bits.region 95 | nvmeCore.io.s_axib_awvalid <> io.s_axib.aw.valid 96 | nvmeCore.io.s_axib_awready <> io.s_axib.aw.ready 97 | nvmeCore.io.s_axib_wdata <> io.s_axib.w.bits.data 98 | nvmeCore.io.s_axib_wstrb <> io.s_axib.w.bits.strb 99 | nvmeCore.io.s_axib_wlast <> io.s_axib.w.bits.last 100 | nvmeCore.io.s_axib_wuser <> io.s_axib.w.bits.user 101 | nvmeCore.io.s_axib_wvalid <> io.s_axib.w.valid 102 | nvmeCore.io.s_axib_wready <> io.s_axib.w.ready 103 | nvmeCore.io.s_axib_bid <> io.s_axib.b.bits.id 104 | nvmeCore.io.s_axib_bresp <> io.s_axib.b.bits.resp 105 | nvmeCore.io.s_axib_bvalid <> io.s_axib.b.valid 106 | nvmeCore.io.s_axib_bready <> io.s_axib.b.ready 107 | nvmeCore.io.s_axib_arid <> io.s_axib.ar.bits.id 108 | nvmeCore.io.s_axib_araddr <> io.s_axib.ar.bits.addr 109 | nvmeCore.io.s_axib_arlen <> io.s_axib.ar.bits.len 110 | nvmeCore.io.s_axib_arsize <> io.s_axib.ar.bits.size 111 | nvmeCore.io.s_axib_aruser <> io.s_axib.ar.bits.user 112 | nvmeCore.io.s_axib_arburst <> io.s_axib.ar.bits.burst 113 | nvmeCore.io.s_axib_arregion <> io.s_axib.ar.bits.region 114 | nvmeCore.io.s_axib_arvalid <> io.s_axib.ar.valid 115 | nvmeCore.io.s_axib_arready <> io.s_axib.ar.ready 116 | nvmeCore.io.s_axib_rid <> io.s_axib.r.bits.id 117 | nvmeCore.io.s_axib_rdata <> io.s_axib.r.bits.data 118 | nvmeCore.io.s_axib_ruser <> io.s_axib.r.bits.user 119 | nvmeCore.io.s_axib_rresp <> io.s_axib.r.bits.resp 120 | nvmeCore.io.s_axib_rlast <> io.s_axib.r.bits.last 121 | nvmeCore.io.s_axib_rvalid <> io.s_axib.r.valid 122 | nvmeCore.io.s_axib_rready <> io.s_axib.r.ready 123 | 124 | if (DEBUG) { 
125 | nvmeCore.io.pcie_hbm_write_transfer := io.pcie_hbm_write_transfer.get 126 | } else { 127 | nvmeCore.io.pcie_hbm_write_transfer := 0.U 128 | } 129 | 130 | // Handle NVMe command, where I need to combine each wire in bundle to a vector. 131 | 132 | val ssdCmdOpVec = Wire(Vec(QUEUE_NUM, UInt(1.W))) 133 | val ssdCmdNlbVec = Wire(Vec(QUEUE_NUM, UInt(16.W))) 134 | val ssdCmdLbaVec = Wire(Vec(QUEUE_NUM, UInt(32.W))) 135 | val ssdCmdOffsetVec = Wire(Vec(QUEUE_NUM, UInt(32.W))) 136 | val ssdCmdValidVec = Wire(Vec(QUEUE_NUM, UInt(1.W))) 137 | val ssdCmdReadyVec = Wire(Vec(QUEUE_NUM, UInt(1.W))) 138 | 139 | var i, j = 0 140 | 141 | for (i <- 0 to SSD_MAX_ID) { 142 | for (j <- 0 to QUEUE_MAX_ID) { 143 | val idx = i*(QUEUE_MAX_ID+1) + j 144 | ssdCmdOpVec(idx) := io.ssdCmd(i)(j).bits.op 145 | ssdCmdNlbVec(idx) := io.ssdCmd(i)(j).bits.numLb 146 | ssdCmdLbaVec(idx) := io.ssdCmd(i)(j).bits.ssdAddr 147 | ssdCmdOffsetVec(idx) := io.ssdCmd(i)(j).bits.memAddr 148 | ssdCmdValidVec(idx) := io.ssdCmd(i)(j).valid 149 | io.ssdCmd(i)(j).ready := ssdCmdReadyVec(idx) 150 | } 151 | } 152 | 153 | nvmeCore.io.ssd_cmd_op := ssdCmdOpVec.asTypeOf(UInt(QUEUE_NUM.W)) 154 | nvmeCore.io.ssd_cmd_nlb := ssdCmdNlbVec.asTypeOf(UInt((QUEUE_NUM*16).W)) 155 | nvmeCore.io.ssd_cmd_lba := ssdCmdLbaVec.asTypeOf(UInt((QUEUE_NUM*32).W)) 156 | nvmeCore.io.ssd_cmd_offset := ssdCmdOffsetVec.asTypeOf(UInt((QUEUE_NUM*32).W)) 157 | nvmeCore.io.ssd_cmd_valid := ssdCmdValidVec.asTypeOf(UInt(QUEUE_NUM.W)) 158 | ssdCmdReadyVec := nvmeCore.io.ssd_cmd_ready.asTypeOf(Vec(QUEUE_NUM, UInt(1.W))) 159 | } -------------------------------------------------------------------------------- /src/NVMeLatencyBenchmark.scala: -------------------------------------------------------------------------------- 1 | package nvme 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.storage._ 7 | import common.axi._ 8 | import math.max 9 | 10 | // Runs a simple NVMe benchmark. 
11 | 12 | class NVMeLatencyBenchmark ( 13 | SSD_NUM : Int, 14 | DATA_BUFFER_SHIFT : Int 15 | ) extends Module { 16 | val io = IO(new Bundle { 17 | // Interfaces 18 | val prpInput = Flipped(new NVMeRamIO) 19 | val ssdCmd = Vec(SSD_NUM, Decoupled(UInt(512.W))) 20 | val ssdCmpt = Vec(SSD_NUM, Flipped(Valid(new SSDCompletion))) 21 | 22 | // Control 23 | val ctlRunning = Input(Bool()) 24 | val ctlEnd = Input(Bool()) 25 | val ctlFpgaBar = Input(UInt(64.W)) 26 | val ctlTimeTarget = Input(UInt(38.W)) 27 | val ctlNumNlb = Input(UInt(16.W)) 28 | val ctlMaxParallel = Input(UInt(32.W)) 29 | val ctlModeWrite = Input(UInt(1.W)) 30 | val ctlModeRandom = Input(UInt(1.W)) 31 | val ctlModeMixed = Input(UInt(1.W)) 32 | val ctlReadLatency = Flipped(Valid(UInt(13.W))) 33 | // Status 34 | val statSsdIo = Output(Vec(SSD_NUM, UInt(32.W))) 35 | val statLatency = Output(UInt(32.W)) 36 | }) 37 | 38 | // Control and status signals. 39 | 40 | val ctlTimeTarget = RegInit(UInt(38.W), 0.U) 41 | val ctlNumNlb = RegInit(UInt(16.W), 0.U) 42 | val ctlMaxParallel = RegInit(UInt(32.W), 0.U) 43 | val ctlModeWrite = RegInit(UInt(1.W), 0.U) 44 | val ctlModeRandom = RegInit(UInt(1.W), 0.U) 45 | val ctlModeMixed = RegInit(UInt(1.W), 0.U) 46 | 47 | val statSsdIo = RegInit(VecInit(Seq.fill(SSD_NUM)(0.U(32.W)))) 48 | for (ssdId <- 0 until SSD_NUM) { 49 | io.statSsdIo(ssdId) := statSsdIo(ssdId) 50 | } 51 | 52 | // Initiate signals 53 | 54 | when (io.ctlRunning && ~RegNext(io.ctlRunning)) { 55 | ctlTimeTarget := io.ctlTimeTarget 56 | ctlNumNlb := io.ctlNumNlb 57 | ctlMaxParallel := io.ctlMaxParallel 58 | ctlModeWrite := io.ctlModeWrite 59 | ctlModeRandom := io.ctlModeRandom 60 | ctlModeMixed := io.ctlModeMixed 61 | for (ssdId <- 0 until SSD_NUM) { 62 | statSsdIo(ssdId) := 0.U 63 | } 64 | } 65 | 66 | // In this example, data transfer is between FPGA and SSDs. 67 | // I give each queue a fixed buffer. 
68 | 69 | val SSD_BIT_LOW = DATA_BUFFER_SHIFT - log2Ceil(SSD_NUM) 70 | val QUEUE_BIT_LOW = SSD_BIT_LOW - 1 71 | val PRP_ADDR_MSB = DATA_BUFFER_SHIFT - 10 72 | 73 | // Global timer 74 | val gblTimer = RegInit(UInt(64.W), 0.U) 75 | gblTimer := gblTimer + 1.U(64.W) 76 | 77 | val startTimeRam = XRam(SSD_NUM)(UInt(64.W), 256, latency=2) 78 | 79 | // Generate commands 80 | 81 | for (ssdId <- 0 until SSD_NUM) { 82 | val cmdOutStd = RegInit(UInt(32.W), 0.U) 83 | 84 | val cmdRdCond = (cmdOutStd =/= ctlMaxParallel) 85 | val cmdWrCond = (cmdOutStd =/= ctlMaxParallel) 86 | 87 | val cmdLba = RegInit(0.U(64.W)) 88 | val cmdPrp1 = Wire(UInt(64.W)) 89 | val cmdPrp2 = Wire(UInt(64.W)) 90 | val cmdNlb = RegInit(0.U(16.W)) 91 | val cmdId = RegInit(0.U(16.W)) 92 | 93 | cmdPrp1 := (io.ctlFpgaBar 94 | + (1.U(64.W) << DATA_BUFFER_SHIFT) 95 | + (ssdId.U(64.W) << SSD_BIT_LOW) 96 | ) 97 | cmdPrp2 := Mux( 98 | cmdNlb < 16.U, 99 | cmdPrp1 + 0x1000.U(64.W), 100 | ( 101 | io.ctlFpgaBar 102 | + (1.U(64.W) << DATA_BUFFER_SHIFT) 103 | - (1.U(64.W) << (DATA_BUFFER_SHIFT-9)) 104 | + (ssdId.U(64.W) << (SSD_BIT_LOW-9)) 105 | ) 106 | ) 107 | 108 | io.ssdCmd(ssdId).valid := Mux(ctlModeWrite.asBool, 109 | ( // Write 110 | io.ctlRunning && RegNext(io.ctlRunning) 111 | && cmdWrCond && ctlModeWrite.asBool 112 | ), 113 | ( // Read 114 | io.ctlRunning && RegNext(io.ctlRunning) 115 | && cmdRdCond && ~ctlModeWrite.asBool 116 | ) 117 | ) 118 | io.ssdCmd(ssdId).bits := Mux(ctlModeWrite.asBool, 119 | NVMeCommandSet.nvmWrite( 120 | cmdId, cmdPrp1, cmdPrp2, cmdLba, cmdNlb 121 | ), 122 | NVMeCommandSet.nvmRead( 123 | cmdId, cmdPrp1, cmdPrp2, cmdLba, cmdNlb 124 | ) 125 | ) 126 | 127 | when (io.ctlRunning && ~RegNext(io.ctlRunning)) { 128 | cmdId := 0.U 129 | when (io.ctlModeRandom.asBool) { 130 | cmdLba := Cat("h92918".U(20.W), 0.U(10.W)) 131 | }.otherwise { 132 | cmdLba := 0.U 133 | } 134 | cmdNlb := io.ctlNumNlb 135 | cmdOutStd := 0.U 136 | }.elsewhen (io.ctlRunning) { 137 | when (io.ssdCmd(ssdId).fire) { 138 | when 
(ctlModeRandom.asBool) { 139 | val nextRndPart = Wire(UInt(20.W)) 140 | nextRndPart := (cmdLba(29, 10) << 5) + (cmdLba(29, 10) >> 5) 141 | cmdLba := Cat(nextRndPart, 0.U(10.W)) 142 | }.otherwise { 143 | val nextSeqPart = Wire(UInt(30.W)) 144 | nextSeqPart := cmdLba + 1.U(30.W) + ctlNumNlb 145 | cmdLba := nextSeqPart 146 | } 147 | 148 | cmdId := cmdId + 1.U 149 | statSsdIo(ssdId) := statSsdIo(ssdId) + 1.U 150 | cmdOutStd := cmdOutStd + 1.U 151 | } 152 | 153 | when (io.ssdCmpt(ssdId).valid) { 154 | cmdOutStd := cmdOutStd - 1.U 155 | } 156 | } // elsewhen (io.ctlRunning) 157 | 158 | // Start time RAM. We track latency for each command. 159 | 160 | startTimeRam(ssdId).io.addr_a := cmdId(7, 0) 161 | startTimeRam(ssdId).io.data_in_a := gblTimer 162 | startTimeRam(ssdId).io.wr_en_a := io.ssdCmd(ssdId).fire 163 | startTimeRam(ssdId).io.addr_b := io.ssdCmpt(ssdId).bits.cmdId(7, 0) 164 | 165 | } // for (ssdId <- 0 until SSD_NUM) 166 | 167 | val cmptId = Wire(UInt(max(1, log2Ceil(SSD_NUM)).W)) 168 | 169 | cmptId := 0.U 170 | for (ssdId <- 0 until SSD_NUM) { 171 | when (io.ssdCmpt(ssdId).valid) { 172 | cmptId := ssdId.U 173 | } 174 | } 175 | 176 | val latencyBits = Wire(UInt(64.W)) 177 | latencyBits := 0.U 178 | for (ssdId <- 0 until SSD_NUM) { 179 | when (RegNext(RegNext(cmptId)) === ssdId.U) { 180 | latencyBits := gblTimer - startTimeRam(ssdId).io.data_out_b 181 | } 182 | } 183 | 184 | // Latency RAM. 
185 | 186 | val latencyValid = RegNext(RegNext(io.ssdCmpt(cmptId).valid)) 187 | 188 | val latencyRam = XRam(UInt(32.W), 8192, latency=2, memory_type="ultra") 189 | latencyRam.io.addr_a := Mux(io.ctlRunning, RegNext(RegNext(latencyRam.io.addr_b)) ,io.ctlReadLatency.bits) 190 | latencyRam.io.wr_en_a := io.ctlRunning && latencyValid 191 | latencyRam.io.addr_b := Mux(latencyBits(63, 16) === 0.U, latencyBits(15, 3), -1.S(13.W).asUInt) 192 | latencyRam.io.data_in_a := latencyRam.io.data_out_b + 1.U 193 | io.statLatency := latencyRam.io.data_out_a 194 | 195 | // Generate PRP list 196 | 197 | io.prpInput.readData := Cat( 198 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6) + 1.U((PRP_ADDR_MSB-5).W), 0x0000.U(15.W), 199 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x7000.U(15.W), 200 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x6000.U(15.W), 201 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x5000.U(15.W), 202 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x4000.U(15.W), 203 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x3000.U(15.W), 204 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x2000.U(15.W), 205 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x1000.U(15.W), 206 | ) 207 | } -------------------------------------------------------------------------------- /src/NVMeBandwidthBenchmark.scala: -------------------------------------------------------------------------------- 1 | package nvme 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import common._ 6 | import common.storage._ 7 | import common.axi._ 8 | import math.max 9 | 10 | // Runs a simple NVMe benchmark. 
11 | 12 | class NVMeBandwidthBenchmark ( 13 | SSD_NUM : Int, 14 | QUEUE_NUM : Int, 15 | DATA_BUFFER_SHIFT : Int 16 | ) extends Module { 17 | val io = IO(new Bundle { 18 | // Interfaces 19 | val prpInput = Flipped(new NVMeRamIO) 20 | val ssdCmd = Vec(SSD_NUM, Vec(QUEUE_NUM, Decoupled(UInt(512.W)))) 21 | // Control 22 | val ctlRunning = Input(Bool()) 23 | val ctlEnd = Input(Bool()) 24 | val ctlFpgaBar = Input(UInt(64.W)) 25 | val ctlTimeTarget = Input(UInt(38.W)) 26 | val ctlNumNlb = Input(UInt(16.W)) 27 | val ctlMaxParallel = Input(UInt(32.W)) 28 | val ctlModeWrite = Input(UInt(1.W)) 29 | val ctlModeRandom = Input(UInt(1.W)) 30 | val ctlModeMixed = Input(UInt(1.W)) 31 | val ctlRdBlkSize = Input(UInt(32.W)) 32 | val ctlWrBlkSize = Input(UInt(32.W)) 33 | val ctlRdBlkAhead = Input(UInt(32.W)) 34 | // Status 35 | val statSsdIo = Output(Vec(SSD_NUM, UInt(32.W))) 36 | }) 37 | 38 | assert (QUEUE_NUM >= 2, "At least 2 queues must be used for benchmarking") 39 | 40 | // Control and status signals. 
41 | 42 | val ctlTimeTarget = RegInit(UInt(38.W), 0.U) 43 | val ctlNumNlb = RegInit(UInt(16.W), 0.U) 44 | val ctlMaxParallel = RegInit(UInt(32.W), 0.U) 45 | val ctlModeWrite = RegInit(UInt(1.W), 0.U) 46 | val ctlModeRandom = RegInit(UInt(1.W), 0.U) 47 | val ctlModeMixed = RegInit(UInt(1.W), 0.U) 48 | val ctlRdBlkSize = RegInit(UInt(32.W), 0.U) 49 | val ctlWrBlkSize = RegInit(UInt(32.W), 0.U) 50 | val ctlRdBlkAhead = RegInit(UInt(32.W), 0.U) 51 | 52 | val statSsdIo = RegInit(VecInit(Seq.fill(SSD_NUM)(0.U(32.W)))) 53 | for (ssdId <- 0 until SSD_NUM) { 54 | io.statSsdIo(ssdId) := statSsdIo(ssdId) 55 | } 56 | 57 | // Initiate signals 58 | 59 | when (io.ctlRunning && ~RegNext(io.ctlRunning)) { 60 | ctlTimeTarget := io.ctlTimeTarget 61 | ctlNumNlb := io.ctlNumNlb 62 | ctlMaxParallel := io.ctlMaxParallel 63 | ctlModeWrite := io.ctlModeWrite 64 | ctlModeRandom := io.ctlModeRandom 65 | ctlModeMixed := io.ctlModeMixed 66 | ctlRdBlkSize := io.ctlRdBlkSize 67 | ctlWrBlkSize := io.ctlWrBlkSize 68 | ctlRdBlkAhead := io.ctlRdBlkAhead 69 | for (ssdId <- 0 until SSD_NUM) { 70 | statSsdIo(ssdId) := 0.U 71 | } 72 | } 73 | 74 | // In this example, data transfer is between FPGA and SSDs. 75 | // I give each queue a fixed buffer. 
76 | 77 | val SSD_BIT_LOW = DATA_BUFFER_SHIFT - log2Ceil(SSD_NUM) 78 | val QUEUE_BIT_LOW = SSD_BIT_LOW - 1 79 | val PRP_ADDR_MSB = DATA_BUFFER_SHIFT - 10 80 | 81 | // Generate commands 82 | 83 | for (ssdId <- 0 until SSD_NUM) { 84 | val cmdRdCnt = RegInit(UInt(32.W), 0.U) 85 | val cmdWrCnt = RegInit(UInt(32.W), 0.U) 86 | val cmdRdBlk = RegInit(UInt(32.W), 0.U) 87 | val cmdWrBlk = RegInit(UInt(32.W), 0.U) 88 | 89 | val cmdRdCond = (cmdWrBlk + ctlRdBlkAhead =/= cmdRdBlk) 90 | val cmdWrCond = (cmdWrBlk =/= cmdRdBlk) 91 | 92 | val cmdLba = RegInit(VecInit(Seq.fill(2)(0.U(64.W)))) 93 | val cmdPrp1 = Wire(Vec(2, UInt(64.W))) 94 | val cmdPrp2 = Wire(Vec(2, UInt(64.W))) 95 | val cmdNlb = RegInit(VecInit(Seq.fill(2)(0.U(16.W)))) 96 | val cmdId = RegInit(VecInit(Seq.fill(2)(0.U(16.W)))) 97 | 98 | cmdPrp1(0) := (io.ctlFpgaBar 99 | + (1.U(64.W) << DATA_BUFFER_SHIFT) 100 | + (ssdId.U(64.W) << SSD_BIT_LOW) 101 | ) 102 | cmdPrp1(1) := (io.ctlFpgaBar 103 | + (1.U(64.W) << DATA_BUFFER_SHIFT) 104 | + (ssdId.U(64.W) << SSD_BIT_LOW) 105 | + (1.U(64.W) << QUEUE_BIT_LOW) 106 | ) 107 | cmdPrp2(0) := Mux( 108 | cmdNlb(0) < 16.U, 109 | cmdPrp1(0) + 0x1000.U(64.W), 110 | ( 111 | io.ctlFpgaBar 112 | + (1.U(64.W) << DATA_BUFFER_SHIFT) 113 | - (1.U(64.W) << (DATA_BUFFER_SHIFT-9)) 114 | + (ssdId.U(64.W) << (SSD_BIT_LOW-9)) 115 | ) 116 | ) 117 | cmdPrp2(1) := Mux( 118 | cmdNlb(0) < 16.U, 119 | cmdPrp2(0) + 0x1000.U(64.W), 120 | ( 121 | io.ctlFpgaBar 122 | + (1.U(64.W) << DATA_BUFFER_SHIFT) 123 | - (1.U(64.W) << (DATA_BUFFER_SHIFT-9)) 124 | + (ssdId.U(64.W) << (SSD_BIT_LOW-9)) 125 | + (1.U(64.W) << (QUEUE_BIT_LOW-9)) 126 | ) 127 | ) 128 | 129 | io.ssdCmd(ssdId)(0).valid := ( 130 | io.ctlRunning && RegNext(io.ctlRunning) 131 | && Mux(ctlModeMixed.asBool, cmdRdCond, ~ctlModeWrite.asBool) 132 | ) 133 | io.ssdCmd(ssdId)(0).bits := NVMeCommandSet.nvmRead( 134 | cmdId(0), cmdPrp1(0), cmdPrp2(0), cmdLba(0), cmdNlb(0) 135 | ) 136 | io.ssdCmd(ssdId)(1).valid := ( 137 | io.ctlRunning && 
RegNext(io.ctlRunning) 138 | && Mux(ctlModeMixed.asBool, cmdWrCond, ctlModeWrite.asBool) 139 | ) 140 | io.ssdCmd(ssdId)(1).bits := NVMeCommandSet.nvmWrite( 141 | cmdId(1), cmdPrp1(1), cmdPrp2(1), cmdLba(1), cmdNlb(1) 142 | ) 143 | for (queueId <- 2 until QUEUE_NUM) { 144 | io.ssdCmd(ssdId)(queueId).valid := 0.U 145 | io.ssdCmd(ssdId)(queueId).bits := 0.U 146 | } 147 | 148 | when (io.ctlRunning && ~RegNext(io.ctlRunning)) { 149 | cmdId(0) := 0.U 150 | cmdId(1) := 1.U 151 | when (io.ctlModeRandom.asBool) { 152 | cmdLba(0) := Cat("h92918".U(20.W), 0.U(10.W)) 153 | cmdLba(1) := Cat(1.U(1.W), "h92918".U(20.W), 0.U(10.W)) 154 | }.otherwise { 155 | cmdLba(0) := 0.U 156 | cmdLba(1) := Cat(1.U(1.W), 0.U(30.W)) 157 | } 158 | cmdNlb(0) := io.ctlNumNlb 159 | cmdNlb(1) := io.ctlNumNlb 160 | cmdRdCnt := 0.U 161 | cmdWrCnt := 0.U 162 | cmdRdBlk := 0.U 163 | cmdWrBlk := 0.U 164 | }.elsewhen (io.ctlRunning) { 165 | when (io.ssdCmd(ssdId)(0).fire) { 166 | when (ctlModeRandom.asBool) { 167 | val nextRndPart = Wire(UInt(20.W)) 168 | nextRndPart := (cmdLba(0)(29, 10) << 5) + (cmdLba(0)(29, 10) >> 5) 169 | cmdLba(0) := Cat(nextRndPart, 0.U(10.W)) 170 | }.otherwise { 171 | val nextSeqPart = Wire(UInt(30.W)) 172 | nextSeqPart := cmdLba(0) + 1.U(30.W) + ctlNumNlb 173 | cmdLba(0) := nextSeqPart 174 | } 175 | 176 | when (cmdRdCnt + 1.U =/= ctlRdBlkSize) { 177 | cmdRdCnt := cmdRdCnt + 1.U 178 | }.otherwise { 179 | cmdRdBlk := cmdRdBlk + 1.U 180 | cmdRdCnt := 0.U 181 | } 182 | 183 | cmdId(0) := cmdId(0) + 2.U 184 | statSsdIo(ssdId) := statSsdIo(ssdId) + 1.U 185 | } 186 | 187 | when (io.ssdCmd(ssdId)(1).fire) { 188 | when (ctlModeRandom.asBool) { 189 | val nextRndPart = Wire(UInt(20.W)) 190 | nextRndPart := (cmdLba(1)(29, 10) << 5) + (cmdLba(1)(29, 10) >> 5) 191 | cmdLba(1) := Cat(1.U(1.W), nextRndPart, 1.U(10.W)) 192 | }.otherwise { 193 | val nextSeqPart = Wire(UInt(30.W)) 194 | nextSeqPart := cmdLba(1) + 1.U(30.W) + ctlNumNlb 195 | cmdLba(1) := Cat(1.U(1.W), nextSeqPart) 196 | } 197 | 198 | 
when (cmdWrCnt + 1.U =/= ctlWrBlkSize) { 199 | cmdWrCnt := cmdWrCnt + 1.U 200 | }.otherwise { 201 | cmdWrBlk := cmdWrBlk + 1.U 202 | cmdWrCnt := 0.U 203 | } 204 | 205 | cmdId(1) := cmdId(1) + 2.U 206 | statSsdIo(ssdId) := statSsdIo(ssdId) + 1.U 207 | } 208 | } // elsewhen (io.ctlRunning) 209 | } // for (ssdId <- 0 until SSD_NUM) 210 | 211 | // Generate PRP list 212 | 213 | io.prpInput.readData := Cat( 214 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6) + 1.U((PRP_ADDR_MSB-5).W), 0x0000.U(15.W), 215 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x7000.U(15.W), 216 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x6000.U(15.W), 217 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x5000.U(15.W), 218 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x4000.U(15.W), 219 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x3000.U(15.W), 220 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x2000.U(15.W), 221 | io.ctlFpgaBar(63, DATA_BUFFER_SHIFT+1), 1.U(1.W), io.prpInput.readAddr(PRP_ADDR_MSB, 6), 0x1000.U(15.W), 222 | ) 223 | } -------------------------------------------------------------------------------- /sw/BandwidthBenchmark.cpp: -------------------------------------------------------------------------------- 1 | /* NVMeBenchmark.cpp 2 | * A simple NVMe benchmark program. 3 | * Used with NVMeBenchmarkTop module. 
4 | */ 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | 19 | #define SSD_ADMIN_SQ_PHYS_BASE(ssd_id) ((queue_phys_base)+0x2000*(ssd_id)) 20 | #define SSD_ADMIN_CQ_PHYS_BASE(ssd_id) ((queue_phys_base)+0x2000*(ssd_id)+0x1000) 21 | #define SSD_ADMIN_SQ_VIRT_BASE(ssd_id) ((huge_virt_base)+0x2000*(ssd_id)) 22 | #define SSD_ADMIN_CQ_VIRT_BASE(ssd_id) ((huge_virt_base)+0x2000*(ssd_id)+0x1000) 23 | 24 | // Change this as you like. Zero's based. 25 | #define ADMIN_QUEUE_DEPTH 0x1f 26 | // List of devices. 27 | string pci_id[] = {"e3", "36", "38", "39", "3a", "3b", "3c", "3d"}; 28 | 29 | // Functions for the CPU to create admin and I/O queues. 30 | uint16_t command_id[32]; 31 | uint32_t admin_sq_tl[32], admin_cq_hd[32]; 32 | uint64_t ssd_virt_base[32]; 33 | uint64_t queue_phys_base, smart_phys_base; 34 | uint64_t huge_virt_base; 35 | 36 | void insert_admin_sq(int ssd_id, uint32_t command[]) 37 | { 38 | // Calculate the starting address of command. 39 | uint32_t *command_base = (uint32_t *)(SSD_ADMIN_SQ_VIRT_BASE(ssd_id) + (64 * admin_sq_tl[ssd_id])); 40 | 41 | // Fill in the command. 42 | for (int i=0; i<16; i++) 43 | { 44 | command_base[i] = command[i]; 45 | } 46 | 47 | // Ring the doorbell. 48 | command_id[ssd_id]++; 49 | admin_sq_tl[ssd_id] = (admin_sq_tl[ssd_id] + 1) & ADMIN_QUEUE_DEPTH; 50 | uint32_t *nvme_sq0tdbl_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x1000); 51 | *nvme_sq0tdbl_pt = admin_sq_tl[ssd_id]; 52 | return; 53 | } 54 | 55 | int wait_for_next_cqe(int ssd_id) 56 | { 57 | // Calculate the starting address of command. 
58 | uint32_t *command_base = (uint32_t *)(SSD_ADMIN_CQ_VIRT_BASE(ssd_id) + (16 * (admin_cq_hd[ssd_id] & ADMIN_QUEUE_DEPTH))); 59 | 60 | int unexpected_phase = ((admin_cq_hd[ssd_id] >> 7) & 0x1); 61 | // fprintf(stdout, "command base: %08lx, unexpected phase: %x, sq tail: %x\n", (uint64_t) command_base, unexpected_phase, admin_sq_tl[0]); 62 | 63 | int current_phase = unexpected_phase; 64 | while (current_phase == unexpected_phase) 65 | { 66 | current_phase = command_base[3]; 67 | current_phase = ((current_phase >> 16) & 0x1); 68 | } 69 | int status = command_base[3]; 70 | status = (status >> 17); 71 | 72 | // Ring the doorbell. 73 | admin_cq_hd[ssd_id] = (admin_cq_hd[ssd_id] + 1) & ((ADMIN_QUEUE_DEPTH << 2) + 1); 74 | uint32_t *nvme_cq0hdbl_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x1004); 75 | *nvme_cq0hdbl_pt = admin_cq_hd[ssd_id]; 76 | // fprintf(stdout, "sq tail: %02x, cq head: %02x.\n", admin_sq_tl[0], admin_cq_hd[0]); 77 | 78 | return status; 79 | } 80 | 81 | int nvme_set_num_of_qp(int ssd_id, uint16_t queue_count) 82 | { 83 | uint16_t queue_count_zerobased = (queue_count - 1); 84 | uint32_t command[16]; 85 | // Now fill in each dw of command. 86 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode. 87 | command[0] = (command_id[ssd_id] << 16) + (0x09 << 0); 88 | // DW 1: bit 31-0 namespace, all 1's in this case. 89 | command[1] = 0xffffffff; 90 | // DW 2-9 rsvd. 91 | for (int i=2; i<=9; i++) 92 | { 93 | command[i] = 0; 94 | } 95 | // DW 10: bit 31 save, bit 30-8 rsvd, bit 7-0 feature ID. 96 | command[10] = (0x07 << 0); 97 | // DW 11: bit 31-16 number of CQ zerobased, bit 15-0 number of SQ zerobased. 98 | command[11] = (queue_count_zerobased << 16) + (queue_count_zerobased << 0); 99 | // DW 12-15 rsvd. 
100 | for (int i=12; i<=15; i++) 101 | { 102 | command[i] = 0; 103 | } 104 | // for (int i=0; i<16; i++) 105 | // { 106 | // fprintf(stdout, "DW %2d: %08x\n", i, command[i]); 107 | // } 108 | insert_admin_sq(ssd_id, command); 109 | return wait_for_next_cqe(ssd_id); 110 | } 111 | 112 | int nvme_create_cq(int ssd_id, uint16_t cq_id, uint16_t cq_depth, uint64_t cq_addr) 113 | { 114 | uint16_t cq_depth_zerobased = cq_depth - 1; 115 | uint32_t command[16]; 116 | // Now fill in each dw of command. 117 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode. 118 | command[0] = (command_id[ssd_id] << 16) + (0x05 << 0); 119 | // DW 1-5 rsvd. 120 | for (int i=1; i<=5; i++) 121 | { 122 | command[i] = 0; 123 | } 124 | // DW 6-7: bit 63-0 PRP1 125 | command[6] = (uint32_t)(cq_addr & 0xffffffff); 126 | command[7] = (uint32_t)(cq_addr >> 32); 127 | // DW 8-9 rsvd. 128 | command[8] = 0; 129 | command[9] = 0; 130 | // DW 10: bit 31-16 queue depth, bit 15-0 queue id 131 | command[10] = (cq_depth_zerobased << 16) + (cq_id << 0); 132 | // DW 11: Bit 31-16 interrupt vector, bit 15-2 esvd, bit 1 int enable, bit 0 phys cont 133 | command[11] = 1; 134 | // DW 12-15 rsvd 135 | for (int i=12; i<=15; i++) 136 | { 137 | command[i] = 0; 138 | } 139 | // for (int i=0; i<16; i++) 140 | // { 141 | // fprintf(stdout, "DW %2d: %08x\n", i, command[i]); 142 | // } 143 | insert_admin_sq(ssd_id, command); 144 | return wait_for_next_cqe(ssd_id); 145 | } 146 | 147 | int nvme_create_sq(int ssd_id, uint16_t sq_id, uint16_t cq_id, uint16_t sq_depth, uint64_t sq_addr) 148 | { 149 | uint16_t sq_depth_zerobased = sq_depth - 1; 150 | uint32_t command[16]; 151 | // Now fill in each dw of command. 152 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode. 153 | command[0] = (command_id[ssd_id] << 16) + (0x01 << 0); 154 | // DW 1-5 rsvd. 
155 | for (int i=1; i<=5; i++) 156 | { 157 | command[i] = 0; 158 | } 159 | // DW 6-7: bit 63-0 PRP1 160 | command[6] = (uint32_t)(sq_addr & 0xffffffff); 161 | command[7] = (uint32_t)(sq_addr >> 32); 162 | // DW 8-9 rsvd. 163 | command[8] = 0; 164 | command[9] = 0; 165 | // DW 10: bit 31-16 queue depth, bit 15-0 queue id 166 | command[10] = (sq_depth_zerobased << 16) + (sq_id << 0); 167 | // DW 11: Bit 31-16 cq_id, bit 15-2 esvd, bit 1 int enable, bit 0 phys cont 168 | command[11] = (cq_id << 16) + (0x1 << 0); 169 | // DW 12-15 rsvd 170 | for (int i=12; i<=15; i++) 171 | { 172 | command[i] = 0; 173 | } 174 | insert_admin_sq(ssd_id, command); 175 | return wait_for_next_cqe(ssd_id); 176 | } 177 | 178 | int get_smart_info(int ssd_id) 179 | { 180 | uint32_t command[16]; 181 | // Now fill in each dw of command. 182 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode. 183 | command[0] = (command_id[ssd_id] << 16) + (0x02 << 0); 184 | // DW 1: bit 31-0 namespace 185 | command[1] = 0xffffffff; 186 | // DW 2-5 rsvd. 187 | for (int i=2; i<=5; i++) 188 | { 189 | command[i] = 0; 190 | } 191 | // DW 6-7: bit 63-0 PRP1 192 | command[6] = (uint32_t)(smart_phys_base & 0xffffffff); 193 | command[7] = (uint32_t)(smart_phys_base >> 32); 194 | // DW 8-9: bit 63-0 PRP2, rsvd in this case. 195 | command[8] = 0; 196 | command[9] = 0; 197 | // DW 10: bit 31-16 num of dwords lower, bit 15 retain async event, 198 | // bit 14-8 rsvd, dw 7-0 log id 199 | command[10] = (0x400 << 16) + (0x0 << 15) + 0x02; 200 | // DW 11: bit 31-16 rsvd, bit 15-0 num of dwords upper. 201 | command[11] = 0x0; 202 | // DW 12-13: bit 63-0 log page offset. 0 in this case. 203 | command[12] = 0x0; 204 | command[13] = 0x0; 205 | // DW 14: bit 31-0 UUID. 0 in this case. 206 | command[14] = 0x0; 207 | // DW 15 rsvd. 
208 | command[15] = 0x0; 209 | insert_admin_sq(ssd_id, command); 210 | return wait_for_next_cqe(ssd_id); 211 | } 212 | 213 | int get_error_log(int ssd_id) 214 | { 215 | uint32_t command[16]; 216 | // Now fill in each dw of command. 217 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode. 218 | command[0] = (command_id[ssd_id] << 16) + (0x02 << 0); 219 | // DW 1: bit 31-0 namespace 220 | command[1] = 0xffffffff; 221 | // DW 2-5 rsvd. 222 | for (int i=2; i<=5; i++) 223 | { 224 | command[i] = 0; 225 | } 226 | // DW 6-7: bit 63-0 PRP1 227 | command[6] = (uint32_t)(smart_phys_base & 0xffffffff); 228 | command[7] = (uint32_t)(smart_phys_base >> 32); 229 | // DW 8-9: bit 63-0 PRP2, rsvd in this case. 230 | command[8] = 0; 231 | command[9] = 0; 232 | // DW 10: bit 31-16 num of dwords lower, bit 15 retain async event, 233 | // bit 14-8 rsvd, dw 7-0 log id 234 | command[10] = (0x400 << 16) + (0x0 << 15) + 0x01; 235 | // DW 11: bit 31-16 rsvd, bit 15-0 num of dwords upper. 236 | command[11] = 0x0; 237 | // DW 12-13: bit 63-0 log page offset. 0 in this case. 238 | command[12] = 0x0; 239 | command[13] = 0x0; 240 | // DW 14: bit 31-0 UUID. 0 in this case. 241 | command[14] = 0x0; 242 | // DW 15 rsvd. 243 | command[15] = 0x0; 244 | insert_admin_sq(ssd_id, command); 245 | return wait_for_next_cqe(ssd_id); 246 | } 247 | 248 | int get_temperature_info(int ssd_id) 249 | { 250 | uint32_t command[16]; 251 | // Now fill in each dw of command. 252 | // DW 0: bit 31-16 cmd_id, bit 15-10 rsvd, bit 9-8 fuse, bit 7-0 opcode. 253 | command[0] = (command_id[ssd_id] << 16) + (0x02 << 0); 254 | // DW 1: bit 31-0 namespace 255 | command[1] = 0xffffffff; 256 | // DW 2-5 rsvd. 257 | for (int i=2; i<=5; i++) 258 | { 259 | command[i] = 0; 260 | } 261 | // DW 6-7: bit 63-0 PRP1 262 | command[6] = (uint32_t)(smart_phys_base & 0xffffffff); 263 | command[7] = (uint32_t)(smart_phys_base >> 32); 264 | // DW 8-9: bit 63-0 PRP2, rsvd in this case. 
265 | command[8] = 0; 266 | command[9] = 0; 267 | // DW 10: bit 31-16 num of dwords lower, bit 15 retain async event, 268 | // bit 14-8 rsvd, dw 7-0 log id 269 | command[10] = (0x1 << 16) + (0x0 << 15) + 0x02; 270 | // DW 11: bit 31-16 rsvd, bit 15-0 num of dwords upper. 271 | command[11] = 0x0; 272 | // DW 12-13: bit 63-0 log page offset. 200 (0xc8) in this case. 273 | command[12] = 0xc8; 274 | command[13] = 0x0; 275 | // DW 14: bit 31-0 UUID. 0 in this case. 276 | command[14] = 0x0; 277 | // DW 15 rsvd. 278 | command[15] = 0x0; 279 | insert_admin_sq(ssd_id, command); 280 | return wait_for_next_cqe(ssd_id); 281 | } 282 | 283 | int main(int argc, char *argv[]) 284 | { 285 | 286 | // Set QDMA regs 287 | // Initialize NVMe device. 288 | init(0xb1, 256*1024*1024); // FPGA device ID is 0000:1c:00.0 289 | // Make queue 0 active 290 | uint32_t pfch_tag; 291 | writeConfig(0x1408/4, 0); 292 | if (readConfig(0x1408/4) != 0) { 293 | fprintf(stderr, "ERROR: Cannot read FPGA BARs."); 294 | exit(1); 295 | } 296 | pfch_tag = readConfig(0x140c/4); 297 | writeReg(210, pfch_tag); 298 | fprintf(stdout, "Prefetch tag: %d\n", pfch_tag); 299 | 300 | // Physical addresses of several BARs. 301 | uint64_t nvme_base[32], bypass_base; 302 | 303 | FILE *fp; 304 | 305 | // Open FPGA card. 
Assume BAR 4 of 0000:37:00.0 306 | 307 | fp = fopen("/sys/bus/pci/devices/0000:b1:00.0/resource", "rb"); 308 | if (fp == NULL) 309 | { 310 | fprintf(stderr, "ERROR: Cannot open fpga device.\n"); 311 | exit(1); 312 | } 313 | fseek(fp, 228, SEEK_SET); // 57 * BAR 314 | 315 | fscanf(fp, "0x%lx", &bypass_base); 316 | fclose(fp); 317 | 318 | if (bypass_base == 0) 319 | { 320 | fprintf(stderr, "ERROR: Invalid PCI address for FPGA card.\n"); 321 | exit(1); 322 | } 323 | else 324 | { 325 | fprintf(stdout, "BAR 2 of FPGA device is %lx.\n", bypass_base); 326 | } 327 | 328 | writeReg(32, 0); 329 | // Read FPGA configures 330 | uint32_t ssd_low_bit, ssd_num, queue_low_bit, queue_depth, queue_num, ram_type_bit; 331 | ssd_low_bit = readReg(576); 332 | ssd_num = readReg(577); 333 | queue_depth = readReg(578); 334 | queue_low_bit = readReg(579); 335 | queue_num = readReg(580); 336 | ram_type_bit = readReg(581); 337 | fprintf(stdout, 338 | "SSD_LOW_BIT: %u\nSSD_COUNT: %u\n" 339 | "QUEUE_DEPTH: %u\nQUEUE_LOW_BIT: %u\n" 340 | "QUEUE_NUM: %u\nRAM_TYPE_BIT: %u\n", 341 | ssd_low_bit, ssd_num, 342 | queue_depth, queue_low_bit, 343 | queue_num, ram_type_bit 344 | ); 345 | if (ssd_low_bit == 0xffffffff) 346 | { 347 | fprintf(stderr, "ERROR: Invalid FPGA config info. 
\n"); 348 | exit(1); 349 | } 350 | 351 | int multi_ssd = 0; 352 | 353 | if (ssd_num > 1) { 354 | multi_ssd = 1; 355 | } 356 | 357 | // Open SSD device, now I just assume it is BAR 0 of target 358 | 359 | uint64_t device_low_addr = 0, device_high_addr = 0; 360 | 361 | for (int i=0; i> 48) & 0xf; 473 | if (nvme_ctl_mpsmin > 0) 474 | { 475 | fprintf(stderr, "ERROR: The nvme device doesn't support 4KB page.\n"); 476 | exit(1); 477 | } 478 | uint64_t nvme_ctl_dstrd = (nvme_ctl_cap >> 32) & 0xf; 479 | if (nvme_ctl_dstrd > 0) 480 | { 481 | fprintf(stderr, "ERROR: The nvme device doesn't support 4B doorbell stride.\n"); 482 | exit(1); 483 | } 484 | uint64_t nvme_ctl_mqes = nvme_ctl_cap & 0xffff; 485 | if (nvme_ctl_mqes < 32) 486 | { 487 | fprintf(stderr, "ERROR: The nvme device doesn't support 32 queue entries.\n"); 488 | exit(1); 489 | } 490 | 491 | // Reset the controller. 492 | uint32_t *nvme_cc_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x14); 493 | *nvme_cc_pt = 0x460000; // Do not enable now. 494 | fprintf(stdout, "CC set to %08x.\n", *nvme_cc_pt); 495 | 496 | // Wait the controller to be completely reset. 497 | // Otherwise it will get stuck :( 498 | uint32_t *nvme_csts_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x1c); 499 | while (*nvme_csts_pt != 0); 500 | fprintf(stdout, "System reset done. Current CSTS is %08x.\n", *nvme_csts_pt); 501 | 502 | // Set admin queue size to 32. 503 | uint32_t *nvme_aqa_pt = (uint32_t *)(ssd_virt_base[ssd_id] + 0x24); 504 | *nvme_aqa_pt = (ADMIN_QUEUE_DEPTH << 16) + ADMIN_QUEUE_DEPTH; 505 | fprintf(stdout, "AQA set to %08x.\n", *nvme_aqa_pt); 506 | 507 | // Set admin SQ base address. 508 | uint64_t *nvme_asq_pt = (uint64_t *)(ssd_virt_base[ssd_id] + 0x28); 509 | *nvme_asq_pt = SSD_ADMIN_SQ_PHYS_BASE(ssd_id); 510 | fprintf(stdout, "ASQ set to %016lx.\n", *nvme_asq_pt); 511 | 512 | // Set admin CQ base address. 
513 | uint64_t *nvme_acq_pt = (uint64_t *)(ssd_virt_base[ssd_id] + 0x30); 514 | *nvme_acq_pt = SSD_ADMIN_CQ_PHYS_BASE(ssd_id); 515 | fprintf(stdout, "ACQ set to %016lx.\n", *nvme_acq_pt); 516 | 517 | // Enable the controller. 518 | *nvme_cc_pt = 0x460001; 519 | fprintf(stdout, "CC set to %08x.\n", *nvme_cc_pt); 520 | 521 | // Wait for the system to be started. 522 | while (*nvme_csts_pt == 0); 523 | fprintf(stdout, "System started. Current CSTS is %08x.\n", *nvme_csts_pt); 524 | 525 | // Reset queue pointers. 526 | admin_sq_tl[ssd_id] = 0; 527 | admin_cq_hd[ssd_id] = 0; 528 | 529 | uint32_t *nvme_cq_base = (uint32_t *)(SSD_ADMIN_CQ_VIRT_BASE(ssd_id)); 530 | // Now clear the admin CQ buffer. 531 | for (int i=0; i<128; i++) 532 | { 533 | nvme_cq_base[i] = 0x0; 534 | } 535 | 536 | // Initialize SSD queues. First set feature. 537 | int cmd_ret = nvme_set_num_of_qp(ssd_id, queue_num); 538 | if (cmd_ret != 0) 539 | { 540 | fprintf(stdout, "ERROR: Set number of queue pair returned 0x%x\n", cmd_ret); 541 | exit(1); 542 | } 543 | 544 | for (int qid=1; qid<=queue_num; qid++) 545 | { 546 | // Calculate the address of each CQ. 547 | uint64_t cq_addr = bypass_base + ((qid-1) << queue_low_bit) + (ssd_id << ssd_low_bit) + (0x1 << ram_type_bit); 548 | // Create CQ now. 549 | cmd_ret = nvme_create_cq(ssd_id, qid, queue_depth, cq_addr); 550 | if (cmd_ret != 0) 551 | { 552 | fprintf(stdout, "ERROR: Create CQ %d returned 0x%x\n", qid, cmd_ret); 553 | exit(1); 554 | } 555 | 556 | uint64_t sq_addr = bypass_base + ((qid-1) << queue_low_bit) + (ssd_id << ssd_low_bit) + (0x0 << ram_type_bit); 557 | cmd_ret = nvme_create_sq(ssd_id, qid, qid, queue_depth, sq_addr); 558 | if (cmd_ret != 0) 559 | { 560 | fprintf(stdout, "ERROR: Create SQ %d returned 0x%x\n", qid, cmd_ret); 561 | exit(1); 562 | } 563 | } 564 | 565 | fprintf(stdout, "SSD %d queue initialization done.\n", ssd_id); 566 | 567 | // Try to get SMART page. 
568 | cmd_ret = get_smart_info(ssd_id); 569 | if (cmd_ret != 0) 570 | { 571 | fprintf(stdout, "ERROR: Get smart page returned 0x%x\n", cmd_ret); 572 | exit(1); 573 | } 574 | 575 | uint8_t *smart_array = (uint8_t *)(huge_virt_base + 0x200000); 576 | 577 | // Get critical warnings. 578 | uint8_t smart_critical = smart_array[0]; 579 | if (smart_critical != 0x00) 580 | { 581 | fprintf(stdout, "WARNING: SSD %d reported critical warning 0x%02x\n", ssd_id, smart_critical); 582 | } 583 | 584 | // Get temperature. 585 | // uint16_t smart_temp_comp; 586 | // smart_temp_comp = ((smart_array[2] << 8) + smart_array[1]) - 273; 587 | // fprintf(stdout, "Current temperature: %d\n", smart_temp_comp); 588 | 589 | // fprintf(stdout, "Creating I/O SQ/CQ...\n"); 590 | // writeReg(128, 0); 591 | // writeReg(128, 1); 592 | // writeReg(128, 0); 593 | 594 | // // Wait for FPGA board to finish basic settings. 595 | // while (readReg(672) == 0); 596 | // if (readReg(673) != 0) 597 | // { 598 | // fprintf(stderr, "ERROR: NVMe queue initialization failed.\n"); 599 | // fprintf(stdout, "status code: %08x\n", readReg(673)); 600 | // exit(1); 601 | // } 602 | 603 | // fprintf(stdout, "NVMe queue initialization done.\n"); 604 | } 605 | 606 | char *zero_buffer = NULL; 607 | posix_memalign((void **)&zero_buffer, 64 /*alignment */ , 64); 608 | 609 | for (int i=0; i<64; i++) 610 | { 611 | zero_buffer[i] = 0; 612 | } 613 | 614 | int stop_benchmark = 0; 615 | 616 | while (!stop_benchmark) 617 | { 618 | // Benchmarking 619 | int mode, num_lb, benchmark_time; 620 | int benchmark_stuck = 0; 621 | 622 | fprintf(stdout, "Enter mode. 
+1 for write, +2 for random, +1024 for record: "); 623 | fscanf(stdin, "%d", &mode); 624 | fprintf(stdout, "Enter number of logical blocks (512 B) for each cmd: "); 625 | fscanf(stdin, "%d", &num_lb); 626 | fprintf(stdout, "Enter time in seconds: "); 627 | fscanf(stdin, "%d", &benchmark_time); 628 | 629 | // Set parameters 630 | writeReg(161, mode & 0x3); 631 | writeReg(162, num_lb-1); 632 | writeReg(163, benchmark_time*3906250); // Time. 3,906,250 = 1s 633 | if (mode >= 1024) 634 | { 635 | writeReg(165, 1); 636 | } 637 | else 638 | { 639 | writeReg(165, 0); 640 | } 641 | fprintf(stdout, "Start benchmark...\n"); 642 | 643 | writeReg(160, 0); 644 | writeReg(160, 1); 645 | sleep(benchmark_time); 646 | writeReg(160, 0); 647 | 648 | int diff_time = 0; 649 | 650 | while (readReg(704) == 0) 651 | { 652 | sleep(1); 653 | diff_time += 1; 654 | 655 | if (diff_time > 3) 656 | { 657 | 658 | // For debugging queues 659 | uint32_t *bypass_entry_buffer; 660 | posix_memalign((void **)&bypass_entry_buffer, 64 /*alignment */ , 64); 661 | fprintf(stderr, "ERROR: Benchmark stuck, now print information of SQE and CQE: \n"); 662 | for (ssd_id=0; ssd_id 1) { 722 | uint32_t ssd_io[ssd_num]; 723 | ssd_io[0] = readReg(712); 724 | fprintf(stdout, " (%.2lf", 725 | (num_lb*0.5*ssd_io[0]/1024.0) / total_time); 726 | for (int i=1; i= 1024) 738 | { 739 | // Get bandwidth curve 740 | fprintf(stdout, "Time (s),Read bandwidth (MB/s),Write bandwidth (MB/s)\n"); 741 | uint32_t read_bw, write_bw; 742 | double bw_time = 0; 743 | writeReg(166, 0); 744 | writeReg(166, 1); 745 | writeReg(166, 0); 746 | write_bw = 0; 747 | read_bw = 0; 748 | while ((read_bw != 0xffffffff) && (write_bw != 0xffffffff)) 749 | { 750 | fprintf(stdout, "%.1lf,%.2lf,%.2lf\n", 751 | bw_time, read_bw*640.0/(1024*1024), write_bw*640.0/(1024*1024)); 752 | bw_time += 0.1; 753 | writeReg(166, 0); 754 | writeReg(166, 1); 755 | writeReg(166, 0); 756 | read_bw = readReg(729); 757 | write_bw = readReg(728); 758 | } 759 | } 760 | 761 | // 
Find failed entries. 762 | for (ssd_id=0; ssd_id 0, "At least one SSD is required.") 44 | assert(QUEUE_NUM > 0, "At least one queue is required.") 45 | assert(QUEUE_DEPTH >= 4, "Queue depth should be at least 4.") 46 | assert(Set("DMA", "SAXIB") contains QDMA_INTERFACE, "Invalid QDMA interface.") 47 | assert(pow(2, log2Ceil(QUEUE_DEPTH)).toInt == QUEUE_DEPTH, "Queue depth must be exponential of 2.") 48 | 49 | val QUEUE_MAX_ID = QUEUE_NUM - 1 50 | 51 | /* Basically below is how we handle queues using BAR space. 52 | * For SQE, address is splited into: {1'd0, ssd_id, queue_id, padding, entry_id, 6'd0}; 53 | * For CQE, address is splited into: {1'd1, ssd_id, queue_id, padding, 2'd0, entry_id, 4'd0}; 54 | * Padding is used to ensure starting address of a queue is 4-KiB aligned. 55 | * Length of padding is max(0, 6-bits(entry_id)). 56 | */ 57 | 58 | val ENTRY_BIT_LEN = log2Ceil(QUEUE_DEPTH) 59 | val ENTRY_LOW_BIT_SQ = 6 60 | val ENTRY_HIGH_BIT_SQ = 6 + ENTRY_BIT_LEN - 1 61 | 62 | val ENTRY_LOW_BIT_CQ = 4 63 | val ENTRY_HIGH_BIT_CQ = 4 + ENTRY_BIT_LEN - 1 64 | 65 | val QUEUE_BIT_LEN_RAW = log2Ceil(QUEUE_NUM) 66 | val QUEUE_BIT_LEN = max(1, QUEUE_BIT_LEN_RAW) 67 | val QUEUE_LOW_BIT = max(12, ENTRY_HIGH_BIT_SQ+1) 68 | val QUEUE_HIGH_BIT = QUEUE_LOW_BIT + QUEUE_BIT_LEN_RAW - 1 69 | 70 | val SSD_BIT_LEN_RAW = log2Ceil(SSD_NUM) 71 | val SSD_BIT_LEN = max(1, SSD_BIT_LEN_RAW) 72 | val SSD_LOW_BIT = QUEUE_HIGH_BIT + 1 73 | val SSD_HIGH_BIT = SSD_LOW_BIT + SSD_BIT_LEN_RAW - 1 74 | 75 | val RAM_TYPE_BIT = SSD_HIGH_BIT + 1 76 | 77 | // SQ & CQ RAMs. 78 | // XRam use RAMB36 or URAM288 thus depth is 1K ~ 4K, which is too big to just hold one SSD. 79 | // Therefore, in this version of implementation, All SSDs share 1 SQ RAM and 1 CQ RAM. 
80 | 81 | val sqRam = XRam( 82 | UInt(512.W), 83 | pow(2, log2Up(SSD_NUM) + log2Up(QUEUE_NUM)).toInt * QUEUE_DEPTH, 84 | latency = 1, 85 | use_musk = 0 86 | ) 87 | 88 | // SQ data structures 89 | val sqTail = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U(ENTRY_BIT_LEN.W)))))) 90 | val sqHead = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U(ENTRY_BIT_LEN.W)))))) 91 | 92 | val cqRam = XRam( 93 | UInt(512.W), 94 | SSD_NUM * QUEUE_NUM * QUEUE_DEPTH / 4, 95 | latency = 1, 96 | use_musk = 1 97 | ) 98 | 99 | // CQ data structures. CQ head can be divided into 2 parts, 100 | // One is phase, used to know whether CQE has been updated, 101 | // another is head counter, used to send doorbell. 102 | val cqHeadExt = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U((ENTRY_BIT_LEN+1).W)))))) 103 | val cqPhase = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, UInt(1.W)))) 104 | val cqHead = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, UInt(ENTRY_BIT_LEN.W)))) 105 | for (ssdId <- 0 until SSD_NUM) { 106 | for (queueId <- 0 until QUEUE_NUM) { 107 | cqHead(ssdId)(queueId) := cqHeadExt(ssdId)(queueId)(ENTRY_BIT_LEN-1, 0) 108 | cqPhase(ssdId)(queueId) := cqHeadExt(ssdId)(queueId)(ENTRY_BIT_LEN).asUInt 109 | } 110 | } 111 | val cqHeadChanged = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, Valid(Bool())))) 112 | 113 | // SSD BAR physical address, for sending doorbells. 114 | 115 | val ssdBarAddr = RegInit(VecInit(Seq.fill(SSD_NUM)(0.U(64.W)))) 116 | 117 | // Set up SSD physical address. 118 | when (io.control.ssdSetup.valid) { 119 | ssdBarAddr(io.control.ssdSetup.bits.ssdId) := io.control.ssdSetup.bits.ssdBarAddr 120 | } 121 | 122 | // Running counters, getting to know whether all commands has been completed. 123 | // Only when enable is 0 and running is 0, this module is truly stopped. 
124 | 125 | val commandStart = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U(32.W)))))) 126 | val commandEnd = RegInit(VecInit(Seq.fill(SSD_NUM)(VecInit(Seq.fill(QUEUE_NUM)(0.U(32.W)))))) 127 | 128 | val queueRunning = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, Bool()))) 129 | for (ssdId <- 0 until SSD_NUM) { 130 | for (queueId <- 0 until QUEUE_NUM) { 131 | queueRunning(ssdId)(queueId) := (commandStart(ssdId)(queueId) =/= commandEnd(ssdId)(queueId)) 132 | } 133 | } 134 | io.status.running := queueRunning.asTypeOf(UInt((SSD_NUM*QUEUE_NUM).W)).orR 135 | 136 | // Parameters used to notify software 137 | io.status.params.ssdNum := SSD_NUM.U 138 | io.status.params.ssdLowBit := SSD_LOW_BIT.U 139 | io.status.params.queueLowBit:= QUEUE_LOW_BIT.U 140 | io.status.params.queueDepth := QUEUE_DEPTH.U 141 | io.status.params.queueNum := QUEUE_NUM.U 142 | io.status.params.ramTypeBit := RAM_TYPE_BIT.U 143 | 144 | // Statistical information 145 | val statExecTime = RegInit(UInt(64.W), 0.U) 146 | val statSuccOp = RegInit(UInt(32.W), 0.U) 147 | val statFailedOp = RegInit(UInt(32.W), 0.U) 148 | val statLatency = RegInit(UInt(64.W), 0.U) 149 | 150 | io.status.stat.executeTime := statExecTime 151 | io.status.stat.successfulOp := statSuccOp 152 | io.status.stat.failedOp := statFailedOp 153 | io.status.stat.totalLatency := statLatency 154 | 155 | val moduleRunning = io.status.running || io.control.enable 156 | val moduleStart = moduleRunning && ~RegNext(moduleRunning) 157 | 158 | when (moduleStart) { 159 | // Clear counters at start 160 | statExecTime := 0.U 161 | statSuccOp := 0.U 162 | statFailedOp := 0.U 163 | statLatency := 0.U 164 | }.elsewhen (moduleRunning) { 165 | statExecTime := statExecTime + 1.U 166 | } 167 | 168 | // Main logic. 169 | 170 | val dbReq = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, Decoupled(new Doorbell)))) 171 | 172 | // Part 1: Add command from user to SQ. 173 | 174 | // Each queue has a command FIFO. 
// ------------------------------------------------------------------
// Part 1: Command input path and SQ allocation.
//
// One 512-bit-wide, 16-deep FIFO per (SSD, queue) pair buffers
// incoming NVMe submission-queue entries. cmdInputFifoOut is the
// post-RegSlice read side of each FIFO, consumed by the round-robin
// allocator below that serializes writes into the shared SQ RAM.
val cmdInputFifo = XQueue(SSD_NUM*QUEUE_NUM)(UInt(512.W), 16)
val cmdInputFifoOut = Wire(Vec(SSD_NUM*QUEUE_NUM, Decoupled(UInt(512.W))))

// Write command to command RAM.
// Since all queues share one RAM, registered pointers
// (sqAllocPtSsd, sqAllocPtQp) indicate which queue may write
// in the current cycle.
val sqAllocPtSsd = RegInit(UInt(SSD_BIT_LEN.W), 0.U)
val sqAllocPtQp = RegInit(UInt(QUEUE_BIT_LEN.W), 0.U)
// Per-queue "ready to accept a new command" flag, driven by the QP
// state machine in Part 3 (asserted in the SQWAIT states).
val queueWriteRdy = Wire(Vec(SSD_NUM, Vec(QUEUE_NUM, Bool())))

for (ssdId <- 0 until SSD_NUM) {
    for (queueId <- 0 until QUEUE_NUM) {
        val fifoId = ssdId*QUEUE_NUM+queueId
        val cmdInputFifoIn = Wire(Decoupled(UInt(512.W)))
        // RegSlice(2) on both sides of the FIFO — presumably for timing
        // closure across the die; TODO confirm against RegSlice impl.
        val cmdInputFifoSlice = RegSlice(2)(cmdInputFifo(fifoId).io.out)
        cmdInputFifo(fifoId).io.in <> RegSlice(2)(cmdInputFifoIn)
        // Commands are only accepted while the core is enabled.
        io.ssdCmd(ssdId)(queueId).ready := io.control.enable && cmdInputFifoIn.ready
        cmdInputFifoIn.valid := io.control.enable && io.ssdCmd(ssdId)(queueId).valid
        cmdInputFifoIn.bits := io.ssdCmd(ssdId)(queueId).bits
        cmdInputFifoOut(fifoId).valid := cmdInputFifoSlice.valid
        cmdInputFifoOut(fifoId).bits := cmdInputFifoSlice.bits
        cmdInputFifoSlice.ready := cmdInputFifoOut(fifoId).ready
        // Last-connect override: dequeue only when the allocator points
        // at this queue AND the queue's state machine can take a command.
        cmdInputFifoOut(fifoId).ready := (queueWriteRdy(ssdId)(queueId)
            && (sqAllocPtQp === queueId.U) && (sqAllocPtSsd === ssdId.U))

        // Statistics: count commands issued per queue.
        when (io.ssdCmd(ssdId)(queueId).fire) {
            commandStart(ssdId)(queueId) := commandStart(ssdId)(queueId) + 1.U
        }
    }
}

// Flattened (ssd, queue) index kept in lock-step with the two pointers
// above so the FIFO mux below needs no multiplication.
val sqAllocPtFifo = RegInit(UInt((SSD_BIT_LEN + QUEUE_BIT_LEN).W), 0.U)

// Advance the allocation pointer round-robin: queue-minor, SSD-major,
// wrapping both at the end of the scan.
when (sqAllocPtQp === QUEUE_MAX_ID.U) {
    when (sqAllocPtSsd === (SSD_NUM-1).U) {
        sqAllocPtSsd := 0.U
        sqAllocPtFifo := 0.U
    }.otherwise {
        sqAllocPtSsd := sqAllocPtSsd + 1.U
        sqAllocPtFifo := sqAllocPtFifo + 1.U
    }
    sqAllocPtQp := 0.U
}.otherwise {
    sqAllocPtQp := sqAllocPtQp + 1.U
    sqAllocPtFifo := sqAllocPtFifo + 1.U
}

// SQ RAM write address: {ssd, queue, tail entry}; the queue field is
// omitted when the design is elaborated with a single queue.
if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) { // More than 1 queue
    sqRam.io.addr_a := Cat(sqAllocPtSsd, sqAllocPtQp, sqTail(sqAllocPtSsd)(sqAllocPtQp))
} else { // Only 1 queue
    sqRam.io.addr_a := Cat(sqAllocPtSsd, sqTail(sqAllocPtSsd)(sqAllocPtQp))
}

// Write the currently selected FIFO's head entry into the SQ RAM at
// the queue's tail slot whenever that FIFO fires.
sqRam.io.data_in_a := cmdInputFifoOut(sqAllocPtFifo).bits
sqRam.io.wr_en_a := cmdInputFifoOut(sqAllocPtFifo).fire

// ------------------------------------------------------------------
// Part 2: Get to know whether CQ has been changed.
//
// A second round-robin pointer scans every (SSD, queue) pair and reads
// the CQ RAM word holding that queue's current head entry, looking for
// a flipped phase bit (i.e. a newly arrived CQE).

val cqDetectPtQp = RegInit(UInt(QUEUE_BIT_LEN.W), 0.U)
val cqDetectPtSsd = RegInit(UInt(SSD_BIT_LEN.W), 0.U)
val cqDetectPtAddr = Wire(UInt((QUEUE_BIT_LEN+SSD_BIT_LEN).W))

if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) { // More than 1 queue
    cqDetectPtAddr := Cat(cqDetectPtSsd, cqDetectPtQp)
} else {
    cqDetectPtAddr := cqDetectPtSsd
}

// Advance the detection pointer round-robin over all queue pairs.
when (cqDetectPtQp === QUEUE_MAX_ID.U) {
    when (cqDetectPtSsd === (SSD_NUM-1).U) {
        cqDetectPtSsd := 0.U
    }.otherwise {
        cqDetectPtSsd := cqDetectPtSsd + 1.U
    }
    cqDetectPtQp := 0.U
}.otherwise {
    cqDetectPtQp := cqDetectPtQp + 1.U
}

// CQ RAM read address. Each 512-bit word holds four 128-bit CQEs, so
// the entry index is divided by 4 (bits ENTRY_BIT_LEN-1 downto 2);
// with <= 16 entries (ENTRY_BIT_LEN <= 4)... NOTE(review): this branch
// drops the entry bits entirely, which only covers one RAM word per
// queue — presumably ENTRY_BIT_LEN <= 4 implies exactly 4 entries here;
// confirm against the constants defined above this chunk.
if (ENTRY_BIT_LEN <= 4) {
    cqRam.io.addr_b := cqDetectPtAddr
} else {
    cqRam.io.addr_b := Cat(cqDetectPtAddr, cqHead(cqDetectPtSsd)(cqDetectPtQp)(ENTRY_BIT_LEN-1, 2))
}

// Publish the detection result. `valid` pulses for the queue the
// pointer examined one cycle ago (RegNext aligns with the synchronous
// CQ RAM read latency — TODO confirm XRam read latency is 1 cycle).
// `bits` is true when the phase bit of the head CQE (bit 112 of the
// selected 128-bit lane) differs from the expected phase, i.e. a new
// completion has landed.
for (ssdId <- 0 until SSD_NUM) {
    for (queueId <- 0 until QUEUE_NUM) {
        cqHeadChanged(ssdId)(queueId).valid := (
            ssdId.U(SSD_BIT_LEN.W) === RegNext(cqDetectPtSsd)
            && queueId.U(QUEUE_BIT_LEN.W) === RegNext(cqDetectPtQp)
        )
        cqHeadChanged(ssdId)(queueId).bits := 0.U
        // cqHead(1,0) selects which of the four CQE lanes in the
        // 512-bit RAM word is the current head entry.
        switch (cqHead(ssdId)(queueId)(1, 0)) {
            is (0.U(2.W)) {
                cqHeadChanged(ssdId)(queueId).bits := (
                    cqRam.io.data_out_b(112+128*0) =/= RegNext(cqPhase(ssdId)(queueId))
                )
            }
            is (1.U(2.W)) {
                cqHeadChanged(ssdId)(queueId).bits := (
                    cqRam.io.data_out_b(112+128*1) =/= RegNext(cqPhase(ssdId)(queueId))
                )
            }
            is (2.U(2.W)) {
                cqHeadChanged(ssdId)(queueId).bits := (
                    cqRam.io.data_out_b(112+128*2) =/= RegNext(cqPhase(ssdId)(queueId))
                )
            }
            is (3.U(2.W)) {
                cqHeadChanged(ssdId)(queueId).bits := (
                    cqRam.io.data_out_b(112+128*3) =/= RegNext(cqPhase(ssdId)(queueId))
                )
            }
        }
    }
}

for (ssdId <- 0 until SSD_NUM) {

    // --------------------------------------------------------------
    // Part 3: Queue pair handle logic.
    //
    // One instance of this state machine per (SSD, queue) pair. It
    // alternates between an SQ phase (accept commands, batch them,
    // ring the SQ tail doorbell) and a CQ phase (wait for new CQEs,
    // advance the head, ring the CQ head doorbell).
    object QpState extends ChiselEnum {
        val sQpSqWait1, sQpSqWait2, sQpSqIns, sQpSqDb,
            sQpCqWait1, sQpCqWait2, sQpCqRefresh1, sQpCqRefresh2,
            sQpCqDb, sQpLoop = Value
    }

    import QpState._

    for (queueId <- 0 until QUEUE_NUM) {
        val qpSt = RegInit(QpState(), sQpSqWait1)
        // Cycles spent waiting for a follow-up command before giving up
        // and ringing the doorbell (see MAX_SQ_INTERVAL below).
        val sqWaitCnt = RegInit(UInt(32.W), 0.U)
        // True when Part 2 reports a fresh CQE for this queue this cycle.
        val newCqCome = cqHeadChanged(ssdId)(queueId).bits && cqHeadChanged(ssdId)(queueId).valid

        switch (qpSt) {
            is (sQpSqWait1) { // Wait for the first new command
                when (cmdInputFifoOut(ssdId*QUEUE_NUM+queueId).fire) { // A new command comes.
                    qpSt := sQpSqIns
                }.otherwise {
                    qpSt := sQpSqWait1
                }
            }
            is (sQpSqIns) { // Insert the command (SQ tail advances below)
                // NOTE(review): fullness is judged against cqHead rather
                // than sqHead; the commented-out alternative suggests this
                // is deliberate — confirm intended SQ-full condition.
                when (sqTail(ssdId)(queueId) + 2.U === cqHead(ssdId)(queueId)) {
                // when (sqTail(ssdId)(queueId) + 2.U === sqHead(ssdId)(queueId)) {
                    qpSt := sQpSqDb // SQ is full, directly ring doorbell.
                }.otherwise {
                    qpSt := sQpSqWait2
                }
            }
            is (sQpSqWait2) { // Wait for more commands to batch doorbell rings
                when (cmdInputFifoOut(ssdId*QUEUE_NUM+queueId).fire) { // A new command comes.
                    qpSt := sQpSqIns
                }.elsewhen (sqWaitCnt >= MAX_SQ_INTERVAL.U) { // Timed out, ring doorbell.
                    qpSt := sQpSqDb
                }.otherwise {
                    qpSt := sQpSqWait2
                }
            }
            is (sQpSqDb) { // Ring SQ tail doorbell
                when (dbReq(ssdId)(queueId).fire) { // Doorbell accepted by arbiter
                    qpSt := sQpCqWait1
                }.otherwise {
                    qpSt := sQpSqDb
                }
            }
            is (sQpCqWait1) { // Wait for the first new CQE
                when (newCqCome) {
                    qpSt := sQpCqRefresh1
                }.otherwise {
                    qpSt := sQpCqWait1
                }
            }
            is (sQpCqWait2) { // Check whether further CQEs arrived in a row
                when (newCqCome) { // We have more CQEs
                    qpSt := sQpCqRefresh1
                }.elsewhen (cqHeadChanged(ssdId)(queueId).valid) { // Scanned, none new: ring doorbell.
                    qpSt := sQpCqDb
                }.otherwise { // Detection pointer has not reached this queue yet
                    qpSt := sQpCqWait2
                }
            }
            is (sQpCqRefresh1) { // Head advance cycle (cqHeadExt += 1 below)
                qpSt := sQpCqRefresh2
            }
            is (sQpCqRefresh2) { // Extra cycle before re-checking for CQEs
                qpSt := sQpCqWait2
            }
            is (sQpCqDb) { // Ring CQ head doorbell
                when (dbReq(ssdId)(queueId).fire) { // Doorbell accepted
                    qpSt := sQpLoop
                }.otherwise {
                    qpSt := sQpCqDb
                }
            }
            is (sQpLoop) { // Prepare for next round
                when (sqTail(ssdId)(queueId) + 1.U === sqHead(ssdId)(queueId)) {
                    // SQ head has not caught up: outstanding commands remain,
                    // so skip the SQWAIT phases and keep draining the CQ.
                    qpSt := sQpCqWait1
                }.otherwise {
                    qpSt := sQpSqWait1
                }
            }
        } // switch (qpSt)

        // Allow the Part 1 allocator to hand this queue a command only
        // while we are in an SQ wait state.
        queueWriteRdy(ssdId)(queueId) := (qpSt === sQpSqWait1) || (qpSt === sQpSqWait2)

        // Doorbell requests. Register offsets follow the NVMe BAR0
        // doorbell layout with a 4-byte stride (DSTRD = 0): SQ tail for
        // host queue (queueId+1) at 0x1000 + 8*(queueId+1) = 0x1008 +
        // 8*queueId, matching CQ head at 0x100c + 8*queueId —
        // NOTE(review): hardware queue index appears offset by 1 from
        // queueId (admin queue skipped); confirm against setup code.
        dbReq(ssdId)(queueId).valid := (qpSt === sQpSqDb) || (qpSt === sQpCqDb)
        dbReq(ssdId)(queueId).bits.addr := Mux(qpSt === sQpSqDb,
            ssdBarAddr(ssdId) + 0x1008.U(64.W) + Cat(queueId.U(QUEUE_BIT_LEN.W), 0.U(3.W)),
            ssdBarAddr(ssdId) + 0x100c.U(64.W) + Cat(queueId.U(QUEUE_BIT_LEN.W), 0.U(3.W)),
        )
        dbReq(ssdId)(queueId).bits.value := Mux(qpSt === sQpSqDb,
            sqTail(ssdId)(queueId),
            cqHead(ssdId)(queueId)
        )

        // Update SQ tail and the SQ batching timer.
        when (qpSt === sQpSqIns) {
            sqTail(ssdId)(queueId) := sqTail(ssdId)(queueId) + 1.U
            sqWaitCnt := 0.U
        }.elsewhen (qpSt === sQpSqWait2) {
            sqWaitCnt := sqWaitCnt + 1.U
        }

        // Advance the (extended) CQ head once per consumed CQE.
        when (qpSt === sQpCqRefresh1) {
            cqHeadExt(ssdId)(queueId) := cqHeadExt(ssdId)(queueId) + 1.U
        }
    } // for (queueId <- 0 until QUEUE_NUM)
} // for (ssdId <- 0 until SSD_NUM)

// ------------------------------------------------------------------
// Part 4: Collect and send doorbells.
//
// Round-robin arbitration over every queue's doorbell request, then a
// small FIFO feeding the host-write state machine in the next section.

val dbAbt = Module(new RRArbiter(new Doorbell, SSD_NUM*QUEUE_NUM))

for (ssdId <- 0 until SSD_NUM) {
    for (queueId <- 0 until QUEUE_NUM) {
        dbAbt.io.in(ssdId*QUEUE_NUM + queueId) <> RegSlice(5)(dbReq(ssdId)(queueId))
    }
}

val dbFifo = XQueue(new Doorbell, 64)

dbFifo.io.in <> RegSlice(2)(dbAbt.io.out)

// The doorbell write-out state machine follows below.
// Doorbell write-out state machine: pop one doorbell from dbFifo, then
// issue an address/descriptor phase followed by a data phase on
// whichever host interface the design was elaborated with
// (QDMA_INTERFACE is a compile-time constant: "SAXIB" = slave AXI
// bridge, otherwise the streaming C2H DMA path).
object DbState extends ChiselEnum {
    val sDbWait, sDbPutDesc, sDbPutData = Value
}

import DbState._

val dbSt = RegInit(DbState(), sDbWait)
val dbAddr = RegInit(UInt(64.W), 0.U)   // host address of the doorbell register
val dbValue = RegInit(UInt(32.W), 0.U)  // value to write (new SQ tail / CQ head)

switch (dbSt) {
    is (sDbWait) { // Idle until a doorbell is available
        when (dbFifo.io.out.fire) {
            dbSt := sDbPutDesc
        }.otherwise {
            dbSt := sDbWait
        }
    }
    is (sDbPutDesc) { // Address (AW) / command phase
        if (QDMA_INTERFACE == "SAXIB") {
            when (io.sAxib.get.aw.fire) {
                dbSt := sDbPutData
            }.otherwise {
                dbSt := sDbPutDesc
            }
        } else {
            when (io.c2hCmd.get.fire) {
                dbSt := sDbPutData
            }.otherwise {
                dbSt := sDbPutDesc
            }
        }
    }
    is (sDbPutData) { // Data (W) phase, then back to idle
        if (QDMA_INTERFACE == "SAXIB") {
            when (io.sAxib.get.w.fire) {
                dbSt := sDbWait
            }.otherwise {
                dbSt := sDbPutData
            }
        } else {
            when (io.c2hData.get.fire) {
                dbSt := sDbWait
            }.otherwise {
                dbSt := sDbPutData
            }
        }
    }
}

// Pop exactly one doorbell per round and latch its payload.
dbFifo.io.out.ready := (dbSt === sDbWait)

when (dbFifo.io.out.fire) {
    dbAddr := dbFifo.io.out.bits.addr
    dbValue := dbFifo.io.out.bits.value
}

if (QDMA_INTERFACE == "SAXIB") {
    // Default every AXI output field to zero, then override per state
    // (Chisel last-connect semantics).
    ToZero(io.sAxib.get.aw.bits)
    ToZero(io.sAxib.get.ar.bits)
    ToZero(io.sAxib.get.w.bits)

    when (dbSt === sDbPutDesc) {
        // Align the write address down to the 64-byte bus beat; the
        // byte offset within the beat is handled by the W strobes.
        io.sAxib.get.aw.bits.addr := Cat(dbAddr(63, 6), 0.U(6.W))
        io.sAxib.get.aw.bits.size := 2.U(3.W) // 4-byte transfer
        io.sAxib.get.aw.valid := 1.U
    }.otherwise {
        io.sAxib.get.aw.bits.addr := 0.U
        io.sAxib.get.aw.bits.size := 0.U
        io.sAxib.get.aw.valid := 0.U
    }
    io.sAxib.get.aw.bits.burst := 1.U // INCR

    when (dbSt === sDbPutData) {
        // Place the 32-bit doorbell value at its byte offset within the
        // 512-bit beat, with a matching 4-bit strobe window.
        io.sAxib.get.w.bits.data := ShiftData512(Cat(0.U(480.W), dbValue), dbAddr(5, 0))
        io.sAxib.get.w.bits.strb := ShiftStrb64("hf".U(64.W), dbAddr(5, 0))
        io.sAxib.get.w.bits.last := 1.U
        io.sAxib.get.w.valid := 1.U
    }.otherwise {
        io.sAxib.get.w.bits.data := 0.U
        io.sAxib.get.w.bits.strb := 0.U
        io.sAxib.get.w.bits.last := 0.U
        io.sAxib.get.w.valid := 0.U
    }
    // Read channel unused: tie off AR and sink any R/B responses.
    io.sAxib.get.ar.bits.size := 6.U
    io.sAxib.get.ar.bits.burst := 1.U
    io.sAxib.get.ar.valid := 0.U
    io.sAxib.get.r.ready := 1.U
    io.sAxib.get.b.ready := 1.U
}

if (QDMA_INTERFACE == "DMA") {
    // C2H command: a 4-byte write to the (unaligned) doorbell address.
    ToZero(io.c2hCmd.get.bits)
    when (dbSt === sDbPutDesc) {
        io.c2hCmd.get.bits.addr := dbAddr
        io.c2hCmd.get.bits.len := 4.U(16.W)
        io.c2hCmd.get.valid := 1.U
    }.otherwise {
        io.c2hCmd.get.bits.addr := 0.U
        io.c2hCmd.get.bits.len := 0.U
        io.c2hCmd.get.valid := 0.U
    }

    // C2H data beat: 4 valid bytes, 60 empty (mty) out of 64.
    ToZero(io.c2hData.get.bits)
    when (dbSt === sDbPutData) {
        io.c2hData.get.bits.data := Cat(0.U(480.W), dbValue)
        io.c2hData.get.bits.ctrl_len := 4.U(16.W)
        io.c2hData.get.bits.mty := 60.U(6.W)
        io.c2hData.get.bits.last := 1.U
        io.c2hData.get.valid := 1.U
    }.otherwise {
        io.c2hData.get.bits.data := 0.U
        io.c2hData.get.bits.ctrl_len := 0.U
        io.c2hData.get.bits.mty := 0.U
        io.c2hData.get.bits.last := 0.U
        io.c2hData.get.valid := 0.U
    }
}

// ------------------------------------------------------------------
// Part 5 : QDMA read/write SQ/CQ RAM.
//
// The host reads SQ entries (to fetch commands) and writes CQ entries
// (to deliver completions) through io.ramIO. The address field layout
// {ssd | queue | entry} is configuration-dependent, hence the
// elaboration-time branches below that drop degenerate fields.

// SQ RAM host-read address.

if (SSD_HIGH_BIT >= SSD_LOW_BIT) { // More than 1 SSD
    if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) { // More than 1 queue
        sqRam.io.addr_b := Cat(
            io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
            io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
            io.ramIO.readAddr(ENTRY_HIGH_BIT_SQ, ENTRY_LOW_BIT_SQ)
        )
    } else { // Only 1 queue.
        sqRam.io.addr_b := Cat(
            io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
            io.ramIO.readAddr(ENTRY_HIGH_BIT_SQ, ENTRY_LOW_BIT_SQ)
        )
    }
} else {
    if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) { // More than 1 queue
        sqRam.io.addr_b := Cat(
            io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
            io.ramIO.readAddr(ENTRY_HIGH_BIT_SQ, ENTRY_LOW_BIT_SQ)
        )
    } else { // Only 1 queue.
        sqRam.io.addr_b := io.ramIO.readAddr(ENTRY_HIGH_BIT_SQ, ENTRY_LOW_BIT_SQ)
    }
}

// CQ RAM host-write enable: address must decode to the CQ region
// (RAM_TYPE_BIT set, all higher bits zero, pad bits between the entry
// and queue fields zero) and at least one byte lane enabled.

cqRam.io.wr_en_a := (
    io.ramIO.writeAddr(63, RAM_TYPE_BIT+1) === 0.U
    && io.ramIO.writeAddr(RAM_TYPE_BIT) === 1.U
    && io.ramIO.writeAddr(QUEUE_LOW_BIT-1, ENTRY_HIGH_BIT_CQ+1) === 0.U
    && io.ramIO.writeMask =/= 0.U
)

// CQ RAM port A is shared: host writes take priority over local reads
// (Mux on wr_en_a). Entry bits start at 6 because four 16-byte CQEs
// pack into each 64-byte RAM word.
if (SSD_HIGH_BIT >= SSD_LOW_BIT) {// >1 SSD
    if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ >= 6) { // >1 queue, >4 entries
        cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
            Cat(
                io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
                io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
                io.ramIO.writeAddr(ENTRY_HIGH_BIT_CQ, 6)
            ),
            Cat(
                io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
                io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
                io.ramIO.readAddr(ENTRY_HIGH_BIT_CQ, 6)
            ),
        )
    } else if (QUEUE_HIGH_BIT < QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ >= 6) { // 1 queue, >4 entries
        cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
            Cat(
                io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
                io.ramIO.writeAddr(ENTRY_HIGH_BIT_CQ, 6)
            ),
            Cat(
                io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
                io.ramIO.readAddr(ENTRY_HIGH_BIT_CQ, 6)
            ),
        )
    } else if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ < 6) { // >1 queue, 4 entries
        cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
            Cat(
                io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
                io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)
            ),
            Cat(
                io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
                io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)
            )
        )
    } else { // 1 queue, 4 entries
        cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
            io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT),
            io.ramIO.readAddr(SSD_HIGH_BIT, SSD_LOW_BIT)
        )
    }
} else { // Only 1 SSD
    if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ >= 6) { // >1 queue, >4 entries
        cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
            Cat(
                io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
                io.ramIO.writeAddr(ENTRY_HIGH_BIT_CQ, 6)
            ),
            Cat(
                io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
                io.ramIO.readAddr(ENTRY_HIGH_BIT_CQ, 6)
            ),
        )
    } else if (QUEUE_HIGH_BIT < QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ >= 6) { // 1 queue, >4 entries
        cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
            io.ramIO.writeAddr(ENTRY_HIGH_BIT_CQ, 6),
            io.ramIO.readAddr(ENTRY_HIGH_BIT_CQ, 6),
        )
    } else if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT && ENTRY_HIGH_BIT_CQ < 6) { // >1 queue, 4 entries
        cqRam.io.addr_a := Mux(cqRam.io.wr_en_a,
            io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT),
            io.ramIO.readAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)
        )
    } else { // 1 queue, 4 entries: single RAM word, address is constant
        cqRam.io.addr_a := 0.U
    }
}
// Byte-enable mask for partial CQE writes ("musk" is the port's
// declared name — sic).
cqRam.io.musk_a.get := io.ramIO.writeMask
cqRam.io.data_in_a := io.ramIO.writeData

// Read-data return path: the address is delayed one cycle to line up
// with the synchronous RAM read, RAM_TYPE_BIT selects SQ vs CQ data,
// and out-of-range addresses return zero.
val nextReadAddr = RegNext(io.ramIO.readAddr)

io.ramIO.readData := 0.U

when (nextReadAddr(63, RAM_TYPE_BIT+1) === 0.U) {
    io.ramIO.readData := Mux(nextReadAddr(RAM_TYPE_BIT) === 0.U,
        sqRam.io.data_out_b,
        cqRam.io.data_out_a
    )
}

// Update SQ head from CQE.
// ------------------------------------------------------------------
// Part 6: Decode host-written CQEs.
//
// When the host writes a 16-byte CQE into the CQ RAM, the one-hot
// 16-byte write mask identifies which of the four lanes of the 512-bit
// word it landed in. From that lane we extract, per the NVMe CQE
// layout: SQ head pointer (lane bits 79:64), command identifier
// (111:96) and the low 8 status bits (120:113).

when (
    io.ramIO.writeAddr(63, RAM_TYPE_BIT+1) === 0.U
    && io.ramIO.writeAddr(RAM_TYPE_BIT) === 1.U
    && io.ramIO.writeAddr(QUEUE_LOW_BIT-1, ENTRY_HIGH_BIT_CQ+1) === 0.U
) {
    // Degenerate (single-SSD / single-queue) configs have no address
    // field for the index, so it is elaboration-time zero.
    val chosenSsd = if (SSD_HIGH_BIT >= SSD_LOW_BIT) {io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT)} else 0.U
    val chosenQp = if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) {io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)} else 0.U
    when (io.ramIO.writeMask === "h000000000000ffff".U) { // CQE in lane 0
        sqHead(chosenSsd)(chosenQp) := io.ramIO.writeData(79, 64)
        commandEnd(chosenSsd)(chosenQp) := commandEnd(chosenSsd)(chosenQp) + 1.U
    }.elsewhen (io.ramIO.writeMask === "h00000000ffff0000".U) { // lane 1
        sqHead(chosenSsd)(chosenQp) := io.ramIO.writeData(207, 192)
        commandEnd(chosenSsd)(chosenQp) := commandEnd(chosenSsd)(chosenQp) + 1.U
    }.elsewhen (io.ramIO.writeMask === "h0000ffff00000000".U) { // lane 2
        sqHead(chosenSsd)(chosenQp) := io.ramIO.writeData(335, 320)
        commandEnd(chosenSsd)(chosenQp) := commandEnd(chosenSsd)(chosenQp) + 1.U
    }.elsewhen (io.ramIO.writeMask === "hffff000000000000".U) { // lane 3
        sqHead(chosenSsd)(chosenQp) := io.ramIO.writeData(463, 448)
        commandEnd(chosenSsd)(chosenQp) := commandEnd(chosenSsd)(chosenQp) + 1.U
    }
}

// Statistical counters.
//
// statLatency accumulates per-command latency as sum of
// (completion_time - issue_time): statExecTime is subtracted at issue
// and added back at completion — NOTE(review): this presumes
// statExecTime is a free-running timestamp counter declared above this
// chunk; confirm.

for (ssdId <- 0 until SSD_NUM) {
    for (queueId <- 0 until QUEUE_NUM) {
        when (io.ssdCmd(ssdId)(queueId).fire) {
            statLatency := statLatency - statExecTime
        }
    }
}

when (
    io.ramIO.writeAddr(63, RAM_TYPE_BIT+1) === 0.U
    && io.ramIO.writeAddr(RAM_TYPE_BIT) === 1.U
    && io.ramIO.writeAddr(QUEUE_LOW_BIT-1, ENTRY_HIGH_BIT_CQ+1) === 0.U
) {
    // A zero status code means the command succeeded.
    when (io.ramIO.writeMask === "h000000000000ffff".U) { // lane 0
        statLatency := statLatency + statExecTime
        when (io.ramIO.writeData(120, 113) === 0.U) {
            statSuccOp := statSuccOp + 1.U
        }.otherwise (
            statFailedOp := statFailedOp + 1.U
        )
    }.elsewhen (io.ramIO.writeMask === "h00000000ffff0000".U) { // lane 1
        statLatency := statLatency + statExecTime
        when (io.ramIO.writeData(248, 241) === 0.U) {
            statSuccOp := statSuccOp + 1.U
        }.otherwise (
            statFailedOp := statFailedOp + 1.U
        )
    }.elsewhen (io.ramIO.writeMask === "h0000ffff00000000".U) { // lane 2
        statLatency := statLatency + statExecTime
        when (io.ramIO.writeData(376, 369) === 0.U) {
            statSuccOp := statSuccOp + 1.U
        }.otherwise (
            statFailedOp := statFailedOp + 1.U
        )
    }.elsewhen (io.ramIO.writeMask === "hffff000000000000".U) { // lane 3
        statLatency := statLatency + statExecTime
        when (io.ramIO.writeData(504, 497) === 0.U) {
            statSuccOp := statSuccOp + 1.U
        }.otherwise (
            statFailedOp := statFailedOp + 1.U
        )
    }
}

// SSD completion notifications to the user: default to invalid/zero,
// then pulse valid with the CQE's command ID and status for one cycle
// when a CQE write is observed (last-connect override below).
for (ssdId <- 0 until SSD_NUM) {
    for (queueId <- 0 until QUEUE_NUM) {
        io.ssdCmpt(ssdId)(queueId).valid := 0.U
        ToZero(io.ssdCmpt(ssdId)(queueId).bits)
    }
}
when (
    io.ramIO.writeAddr(63, RAM_TYPE_BIT+1) === 0.U
    && io.ramIO.writeAddr(RAM_TYPE_BIT) === 1.U
    && io.ramIO.writeAddr(QUEUE_LOW_BIT-1, ENTRY_HIGH_BIT_CQ+1) === 0.U
) {
    val ssdId = if (SSD_HIGH_BIT >= SSD_LOW_BIT) {io.ramIO.writeAddr(SSD_HIGH_BIT, SSD_LOW_BIT)} else 0.U
    val queueId = if (QUEUE_HIGH_BIT >= QUEUE_LOW_BIT) {io.ramIO.writeAddr(QUEUE_HIGH_BIT, QUEUE_LOW_BIT)} else 0.U
    when (io.ramIO.writeMask === "h000000000000ffff".U) { // lane 0
        io.ssdCmpt(ssdId)(queueId).valid := 1.U
        io.ssdCmpt(ssdId)(queueId).bits.cmdId := io.ramIO.writeData(111, 96)
        io.ssdCmpt(ssdId)(queueId).bits.status := io.ramIO.writeData(120, 113)
    }.elsewhen (io.ramIO.writeMask === "h00000000ffff0000".U) { // lane 1
        io.ssdCmpt(ssdId)(queueId).valid := 1.U
        io.ssdCmpt(ssdId)(queueId).bits.cmdId := io.ramIO.writeData(239, 224)
        io.ssdCmpt(ssdId)(queueId).bits.status := io.ramIO.writeData(248, 241)
    }.elsewhen (io.ramIO.writeMask === "h0000ffff00000000".U) { // lane 2
        io.ssdCmpt(ssdId)(queueId).valid := 1.U
        io.ssdCmpt(ssdId)(queueId).bits.cmdId := io.ramIO.writeData(367, 352)
        io.ssdCmpt(ssdId)(queueId).bits.status := io.ramIO.writeData(376, 369)
    }.elsewhen (io.ramIO.writeMask === "hffff000000000000".U) { // lane 3
        io.ssdCmpt(ssdId)(queueId).valid := 1.U
        io.ssdCmpt(ssdId)(queueId).bits.cmdId := io.ramIO.writeData(495, 480)
        io.ssdCmpt(ssdId)(queueId).bits.status := io.ramIO.writeData(504, 497)
    }
}
} // end of enclosing module (its definition begins above this chunk)

/** Places a 512-bit word's payload at a byte offset within the beat:
  * shifts `value` left by `offset` bytes (offset*8 bits). Used to align
  * the 32-bit doorbell value inside a 64-byte AXI data beat.
  */
object ShiftData512 {
    def apply (value : UInt, offset : UInt) = {
        assert(value.getWidth == 512)
        assert(offset.getWidth == 6)
        value << Cat(offset, 0.U(3.W))
    }
}

/** Shifts a 64-bit AXI write-strobe mask left by `offset` byte lanes,
  * mirroring ShiftData512 so the strobes track the shifted data.
  */
object ShiftStrb64 {
    def apply (value : UInt, offset : UInt) = {
        assert(value.getWidth == 64)
        assert(offset.getWidth == 6)
        value << offset
    }
}