├── ddr3_rdcal.v ├── ddr3_x16_phy_cust.v ├── ddr3_x16_phy_params.vh └── docs ├── README.md ├── vivado-clkgen.png ├── vivado-fifogen.png └── vivado-util-cust-graph.png /ddr3_rdcal.v: -------------------------------------------------------------------------------- 1 | `timescale 1ns / 1ps 2 | /** To use the RDCAL module: 3 | * - set p_IDELAY_INIT_DQ and p_IDELAY_INIT_DQS to '0' 4 | * - set p_IDELAY_TYPE to "VAR_LOAD" 5 | * Operation: writes p_RDCAL_WORD once, then reads it back for every DQ/DQS IDELAY tap pair (DQS kept at least 2 taps above DQ) and finally loads the DQ tap that had the most passing DQS taps. TODO: Complete description 6 | */ 7 | module ddr3_rdcal #( 8 | parameter p_RDCAL_BANK = 3'b0, 9 | parameter p_RDCAL_ROW = 14'b0, 10 | parameter p_RDCAL_COL = 10'b0, 11 | 12 | parameter p_RDCAL_WORD = 128'h0000_ffff_0000_ffff_0000_ffff_0000_ffff 13 | )( 14 | input i_clk_div, 15 | input i_rdcal_start, 16 | 17 | output o_rdcal_done, 18 | output o_rdcal_err, 19 | 20 | output o_dqs_delay_ld, 21 | output o_dq_delay_ld, 22 | 23 | output [4:0] o5_dqs_idelay_cnt, 24 | output [4:0] o5_dq_idelay_cnt, 25 | 26 | input i_phy_init_done, 27 | input i_phy_rddata_valid, 28 | input [127:0] in_phy_rddata, 29 | 30 | 31 | input i_phy_cmd_full, 32 | 33 | input i_rdc_cmd_en, 34 | input i_rdc_cmd_sel, 35 | input [2:0] i3_rdc_bank, 36 | input [13:0] i14_rdc_row, 37 | input [9:0] i10_rdc_col, 38 | input [127:0] i128_rdc_wrdata, 39 | input [7:0] i8_rdc_wrdm, 40 | 41 | output o_phy_cmd_en, 42 | output o_phy_cmd_sel, 43 | output [2:0] o3_phy_bank, 44 | output [13:0] o14_phy_row, 45 | output [9:0] o10_phy_col, 46 | output [127:0] o128_phy_wrdata, 47 | output [7:0] o8_phy_wrdm 48 | ); 49 | 50 | reg r_phy_cmd_en = 1'b0; // calibration command strobe; explicit power-up value avoids X in simulation 51 | reg r_phy_cmd_sel = 1'b0; // calibration command select: 0 = write, 1 = read (as driven by the FSM below) 52 | 53 | reg [2:0] r3_calib_state = 3'd0; // calibration FSM state; initializer width now matches the 3-bit register 54 | 55 | wire [2:0] w3_cal_bank = p_RDCAL_BANK; 56 | wire [13:0] w14_cal_row = p_RDCAL_ROW; 57 | wire [9:0] w10_cal_col = p_RDCAL_COL; 58 | 59 | wire [127:0] w128_cal_wrdata = p_RDCAL_WORD; 60 | wire [7:0] w8_cal_wrdm = 8'b0; 61 | 62 | reg [4:0] r5_dqs_delay_cnt, r5_dq_delay_cnt; 63 | reg r_dqs_delay_ld, r_dq_delay_ld; 64 | 65 | reg [4:0] r5_calib_width_best, // largest number of successful read attempts 
per DQ tap (as a function of DQS taps) 66 | r5_calib_width, // number of successful read attempts per DQ tap (as a function of DQS taps) for the currently tested DQ tap value 67 | r5_calib_dq_best, // DQ tap value corresponding to the widest range of workable DQS tap settings (denoted by r5_calib_width_best) 68 | r5_calib_dqs_min, // minimum workable value of DQS tap setting for the currently tested DQ tap value 69 | r5_calib_dqs_min_best; // r5_calib_dqs_min as captured for the DQ tap stored in r5_calib_dq_best 70 | reg r_rd_cal_done = 1'b0; // selects the output command mux; explicit 0 keeps the select from being X in simulation before the first run 71 | reg r_rd_cal_err = 1'b0; // explicit power-up value so o_rdcal_err is never X in simulation 72 | 73 | always @(posedge i_clk_div) begin: rd_calibration 74 | r_dqs_delay_ld <= 1'b0; 75 | r_dq_delay_ld <= 1'b0; 76 | 77 | r_phy_cmd_en <= 1'b0; 78 | 79 | case (r3_calib_state) 80 | 'd0: begin // init IDELAY, write calibration word to DRAM 81 | if (i_rdcal_start && !i_phy_cmd_full && i_phy_init_done) begin 82 | 83 | r_phy_cmd_en <= 1'b1; // write calibration word to SDRAM 84 | r_phy_cmd_sel <= 1'b0; 85 | 86 | // Reset tap and counter values 87 | r_rd_cal_done <= 1'b0; 88 | r_rd_cal_err <= 1'b0; // drop a stale error flag left over from a previous calibration run 89 | r5_calib_width_best <= 5'b0; 90 | r5_calib_width <= 5'b0; 91 | r5_calib_dq_best <= 5'b0; 92 | r5_calib_dqs_min <= 5'b0; 93 | r5_calib_dqs_min_best <= 5'b0; 94 | 95 | r5_dqs_delay_cnt <= 5'd2; // keep DQS tap value minimum 2 taps larger than DQ tap value 96 | r5_dq_delay_cnt <= 5'd0; 97 | r_dqs_delay_ld <= 1'b1; 98 | r_dq_delay_ld <= 1'b1; 99 | 100 | r3_calib_state <= 'd1; 101 | end 102 | end 103 | 'd1: begin // repeat IDELAY tick (IDELAY tap settings can be buggy) 104 | r_dqs_delay_ld <= 1'b1; 105 | r_dq_delay_ld <= 1'b1; 106 | r3_calib_state <= 'd2; 107 | end 108 | 'd2: begin // read calibration word from SDRAM 109 | if (!i_phy_cmd_full) begin 110 | r_phy_cmd_en <= 1'b1; 111 | r_phy_cmd_sel <= 1'b1; 112 | 113 | r3_calib_state <= 'd3; 114 | end 115 | end 116 | 'd3: begin // log whether current IDELAY tap counts are okay 117 | if (i_phy_rddata_valid) begin 118 | if (in_phy_rddata == p_RDCAL_WORD) begin 119 | r5_calib_width <= r5_calib_width + 1'b1; // if ok, increase current calib_width 120 | if 
(r5_calib_width == 5'd0) 121 | r5_calib_dqs_min <= r5_dqs_delay_cnt; // remember first valid DQS tap count 122 | end 123 | r3_calib_state <= 'd4; 124 | end 125 | end 126 | 'd4: begin // save best attempt, decide next state (increment DQS, DQ, or finish calibration?) 127 | if (r5_calib_width > r5_calib_width_best) begin 128 | r5_calib_width_best <= r5_calib_width; 129 | r5_calib_dq_best <= r5_dq_delay_cnt; 130 | r5_calib_dqs_min_best <= r5_calib_dqs_min; 131 | end 132 | if (r5_dqs_delay_cnt == 'd31) begin 133 | if (r5_dq_delay_cnt == 'd29) begin // tap testing complete 134 | r3_calib_state <= 'd5; 135 | end else begin // increment DQ tap, reset DQS tap to DQ+2 136 | r5_dq_delay_cnt <= r5_dq_delay_cnt + 1; 137 | r5_dqs_delay_cnt <= r5_dq_delay_cnt + 3; // new DQ tap (old DQ + 1) plus the 2-tap minimum offset 138 | r_dqs_delay_ld <= 1'b1; 139 | r_dq_delay_ld <= 1'b1; 140 | 141 | r5_calib_width <= 5'b0; 142 | r3_calib_state <= 'd1; 143 | end 144 | end else begin // only increment DQS tap 145 | r5_dqs_delay_cnt <= r5_dqs_delay_cnt + 1; 146 | r_dqs_delay_ld <= 1'b1; 147 | r_dq_delay_ld <= 1'b1; 148 | r3_calib_state <= 'd1; 149 | end 150 | end 151 | 'd5: begin // set measured best tap values for DQ and DQS ISERDESE 152 | r5_dq_delay_cnt <= r5_calib_dq_best; // recall best DQ value 153 | r5_dqs_delay_cnt <= (r5_calib_width_best/2) + r5_calib_dqs_min_best; // move data strobe into center of valid window 154 | r_dqs_delay_ld <= 1'b1; 155 | r_dq_delay_ld <= 1'b1; 156 | 157 | r3_calib_state <= 'd6; 158 | end 159 | 'd6: begin // raise cal_done flag, repeat IDELAY tick (IDELAY tap settings can be buggy) 160 | r_dqs_delay_ld <= 1'b1; 161 | r_dq_delay_ld <= 1'b1; 162 | 163 | r_rd_cal_err <= (r5_dqs_delay_cnt == 5'b0); // DQS tap 0 is only reachable when no tap pair ever read back correctly (tested DQS taps start at 2) 164 | 165 | r_rd_cal_done <= 1'b1; 166 | r3_calib_state <= 'd7; 167 | end 168 | 'd7: begin // extend rd_cal_done for 1 cycle (make it safer to disable rdcal) 169 | r_rd_cal_done <= 1'b1; 170 | r3_calib_state <= 'd0; 171 | end 172 | default ; // should not be reached 173 | endcase 174 | end 175 | 176 | assign o_dqs_delay_ld 
= r_dqs_delay_ld; 177 | assign o_dq_delay_ld = r_dq_delay_ld; 178 | 179 | assign o5_dqs_idelay_cnt = r5_dqs_delay_cnt; 180 | assign o5_dq_idelay_cnt = r5_dq_delay_cnt; 181 | 182 | assign {o_phy_cmd_en, o_phy_cmd_sel, o3_phy_bank, o14_phy_row, o10_phy_col, o128_phy_wrdata, o8_phy_wrdm} = (r_rd_cal_done) 183 | ? {i_rdc_cmd_en, i_rdc_cmd_sel, i3_rdc_bank, i14_rdc_row, i10_rdc_col, i128_rdc_wrdata, i8_rdc_wrdm} 184 | : {r_phy_cmd_en, r_phy_cmd_sel, w3_cal_bank, w14_cal_row, w10_cal_col, w128_cal_wrdata, w8_cal_wrdm}; // user (i_rdc_*) commands pass through once calibration is done; calibration commands are driven otherwise 185 | 186 | assign o_rdcal_done = r_rd_cal_done; 187 | assign o_rdcal_err = r_rd_cal_err; 188 | 189 | endmodule 190 | 191 | -------------------------------------------------------------------------------- /ddr3_x16_phy_cust.v: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/someone755/ddr3-controller/6dcf675e1958dc1ee19af367303639333f25e840/ddr3_x16_phy_cust.v -------------------------------------------------------------------------------- /ddr3_x16_phy_params.vh: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////// 2 | // Memory configuration (MR) and timing parametrization 3 | ///////////////////////////////////////////////// 4 | // #################### DLL OFF, <125 MHZ #################### 5 | if (p_DDR_CK_PS > 7999) begin: DLL 6 | localparam lp_CL = 6; 7 | localparam lp_CWL = 6; 8 | localparam lp_AL = 0; 9 | localparam lp_WL = lp_CWL + lp_AL; 10 | localparam lp_RL = lp_CL + lp_AL - 1; // DLL OFF: -1 11 | localparam lpdiv_WL = lp_WL/2; 12 | localparam lpdiv_RL = lp_RL/2; 13 | // Modified parameters: 14 | localparam lpdiv_RTW = (lp_RL + lp_CCD + 2 - lp_WL + 1)/2; // READ-to-WRITE = RL + CCD + 2 CK - WL (+1 to round off correctly) 15 | localparam lpdiv_WTR = (lp_WL + lp_BL/2 + (p_WTR/p_DDR_CK_PS+1))/2; // WRITE-to-READ = WL + BL/2 + WTR (WTR after WRITE DQS postamble) 16 | localparam lpdiv_WR = 5; // 
WRITE-to-PRECHARGE = WL + BL/2 + WR (WR after WRITE DQS postamble) 17 | // WR (min, DLL OFF) = 4 CK, but minimum allowed in MR0 is 5 CK 18 | 19 | // MRx is {BA[2:0], A[13:0]} 20 | /* 16 15 14 13 12 11 10 ~9 ~8 ~7 ~6 ~5 ~4 ~3 ~2 ~1 ~0 */ 21 | localparam lp_MR0 = 17'b__0__0__0__0__0__0__0__1__1__0__0__1__0__0__0__0__0; 22 | localparam lp_MR1 = 17'b__0__0__1__0__0__0__0__0__0__0__0__0__0__0__0__0__0; // NOTE(review): A0 is 0 here; in JEDEC MR1, A0 = 1 selects "DLL disable" - confirm DLL-off is applied elsewhere in the PHY 23 | localparam lp_MR2 = 17'b__0__1__0__0__0__0__0__0__0__0__0__0__0__1__0__0__0; 24 | localparam lp_MR3 = 17'b__0__1__1__0__0__0__0__0__0__0__0__0__0__0__0__0__0; 25 | 26 | // #################### 300-333 MHz #################### 27 | end else if ((p_DDR_CK_PS > 2999) & (p_DDR_CK_PS < 3334)) begin: DLL 28 | localparam lp_CL = 5; 29 | localparam lp_CWL = 5; 30 | localparam lp_AL = 0; 31 | localparam lp_WL = lp_CWL + lp_AL; 32 | localparam lp_RL = lp_CL + lp_AL; 33 | localparam lpdiv_WL = lp_WL/2; 34 | localparam lpdiv_RL = lp_RL/2; 35 | // Modified parameters: 36 | localparam lpdiv_RTW = (lp_RL + lp_CCD + 2 - lp_WL + 1)/2; // READ-to-WRITE = RL + CCD + 2 CK - WL (+1 to round off correctly) 37 | localparam lpdiv_WTR = (lp_WL + lp_BL/2 + (p_WTR/p_DDR_CK_PS+1))/2; // WRITE-to-READ = WL + BL/2 + WTR (WTR after WRITE DQS postamble) 38 | localparam lpdiv_WR = (lp_WL + lp_BL/2 + (p_WR/p_DDR_CK_PS+1))/2; // WRITE-to-PRECHARGE = WL + BL/2 + WR (WR after WRITE DQS postamble) 39 | // WR = roundup(tWR/tCK) = roundup(15 ns / (3.3~3.0 ns)) = 5 CK 40 | /* 16 15 14 13 12 11 10 ~9 ~8 ~7 ~6 ~5 ~4 ~3 ~2 ~1 ~0 */ 41 | localparam lp_MR0 = 17'b__0__0__0__0__0__0__0__1__1__0__0__0__1__0__0__0__0; 42 | localparam lp_MR1 = 17'b__0__0__1__0__0__0__0__0__0__0__1__0__0__0__1__0__0; 43 | localparam lp_MR2 = 17'b__0__1__0__0__0__0__0__0__0__0__0__0__0__0__0__0__0; 44 | localparam lp_MR3 = 17'b__0__1__1__0__0__0__0__0__0__0__0__0__0__0__0__0__0; 45 | 46 | // #################### 333-400 MHz #################### 47 | end else if ((p_DDR_CK_PS > 2499) & (p_DDR_CK_PS < 3000)) begin: DLL 48 | localparam 
lp_CL = 6; 49 | localparam lp_CWL = 5; 50 | localparam lp_AL = 0; 51 | localparam lp_WL = lp_CWL + lp_AL; 52 | localparam lp_RL = lp_CL + lp_AL; 53 | localparam lpdiv_WL = lp_WL/2; 54 | localparam lpdiv_RL = lp_RL/2; 55 | // Modified parameters: 56 | localparam lpdiv_RTW = (lp_RL + lp_CCD + 2 - lp_WL + 1)/2; // READ-to-WRITE = RL + CCD + 2 CK - WL (+1 to round off correctly) 57 | localparam lpdiv_WTR = (lp_WL + lp_BL/2 + (p_WTR/p_DDR_CK_PS+1))/2; // WRITE-to-READ = WL + BL/2 + WTR (WTR after WRITE DQS postamble) 58 | localparam lpdiv_WR = (lp_WL + lp_BL/2 + (p_WR/p_DDR_CK_PS+1))/2; // WRITE-to-PRECHARGE = WL + BL/2 + WR (WR after WRITE DQS postamble) 59 | // WR = roundup(tWR/tCK) = roundup(15 ns / (3.0~2.5 ns)) = 6 CK 60 | /* 16 15 14 13 12 11 10 ~9 ~8 ~7 ~6 ~5 ~4 ~3 ~2 ~1 ~0 */ 61 | localparam lp_MR0 = 17'b__0__0__0__0__0__0__1__0__1__0__0__1__0__0__0__0__0; 62 | localparam lp_MR1 = 17'b__0__0__1__0__0__0__0__0__0__0__1__0__0__0__1__0__0; 63 | localparam lp_MR2 = 17'b__0__1__0__0__0__0__0__0__0__0__0__0__0__0__0__0__0; 64 | localparam lp_MR3 = 17'b__0__1__1__0__0__0__0__0__0__0__0__0__0__0__0__0__0; 65 | 66 | // #################### TODO 400-466 MHz #################### 67 | end else if ((p_DDR_CK_PS > 2141) & (p_DDR_CK_PS < 2500)) begin: DLL 68 | localparam lp_CL = 7; 69 | localparam lp_CWL = 6; 70 | localparam lp_AL = 0; 71 | localparam lp_WL = lp_CWL + lp_AL; 72 | localparam lp_RL = lp_CL + lp_AL; 73 | localparam lpdiv_WL = lp_WL/2; 74 | localparam lpdiv_RL = lp_RL/2; 75 | // Modified parameters: 76 | localparam lpdiv_RTW = (lp_RL + lp_CCD + 2 - lp_WL + 1)/2; // READ-to-WRITE = RL + CCD + 2 CK - WL (+1 to round off correctly) 77 | localparam lpdiv_WTR = (lp_WL + lp_BL/2 + (p_WTR/p_DDR_CK_PS+1))/2; // WRITE-to-READ = WL + BL/2 + WTR (WTR after WRITE DQS postamble) 78 | localparam lpdiv_WR = (lp_WL + lp_BL/2 + (p_WR/p_DDR_CK_PS+1))/2; // WRITE-to-PRECHARGE = WL + BL/2 + WR (WR after WRITE DQS postamble) 79 | // WR = roundup(tWR/tCK) = roundup(15 ns / (2.500~2.143 
ns)) = 7 (466 MHz) 80 | // WR = roundup(tWR/tCK) = roundup(15 ns / (2.143~1.875 ns)) = 8 (533 MHz) 81 | /* 16 15 14 13 12 11 10 ~9 ~8 ~7 ~6 ~5 ~4 ~3 ~2 ~1 ~0 */ 82 | localparam lp_MR0 = 17'b__0__0__0__0__0__0__1__1__1__0__0__1__1__0__0__0__0; 83 | localparam lp_MR1 = 17'b__0__0__1__0__0__0__0__0__0__0__1__0__0__0__1__0__0; 84 | localparam lp_MR2 = 17'b__0__1__0__0__0__0__0__0__0__0__0__0__0__1__0__0__0; 85 | localparam lp_MR3 = 17'b__0__1__1__0__0__0__0__0__0__0__0__0__0__0__0__0__0; 86 | 87 | // #################### 125-300, >466 MHz ERROR #################### 88 | end else begin: DLL 89 | localparam lp_CL = 0; 90 | localparam lp_CWL = 0; 91 | localparam lp_AL = 0; 92 | unsupported_frequency_error_generation BAD_FREQ(); 93 | end 94 | 95 | localparam lpdiv_WL_MAX = 6/2; // only CWL 5, 6 supported (no way this does > 533 MHz) 96 | // if CWL = 5, OSERDES must be triggered one CLKDIV period earlier 97 | localparam lp_MR3_MPR = 14'b__0__0__0__0__0__0__0__0__0__0__0__1__0__0; // to be used when reading 1010 from MPR 98 | 99 | ///////////////////////////////////////////////// 100 | // Timing parameter to DIV CK conversion 101 | ///////////////////////////////////////////////// 102 | localparam lp_CCD = 4; // 4 CK between RD-to-RD or WR-to-WR 103 | localparam lp_BL = 8; // 8 transfers per RD/WR command 104 | 105 | localparam lpdiv_MRD = lp_CCD/2 - 1; // MRS cycle time (4 CK) 106 | localparam lpdiv_CCD = lp_BL/2/2 - 1; // WRITE-to-WRITE (actually just BL/2 in DDR CK)(CCD = CAS#-to-CAS# delay) 107 | 108 | localparam lp_DIV_FREQ_MHZ = p_DDR_FREQ_MHZ/2; 109 | localparam lp_DIV_CK_PS = `ck2ps(lp_DIV_FREQ_MHZ); 110 | 111 | // Parameters directly following from DDR nCKs: 112 | localparam lpdiv_RAS = p_RAS/lp_DIV_CK_PS; // ACTIVATE-to-PRECHARGE 113 | localparam lpdiv_RCD = p_RCD/lp_DIV_CK_PS; // ACTIVATE-to-READ or ACTIVATE-to-WRITE delay 114 | localparam lpdiv_REFI = p_REFI/lp_DIV_CK_PS-10; // Average periodic refresh interval // (-1) because it needs to be rounded DOWN 115 | 
localparam lpdiv_RFC_MIN = p_RFC_MIN/lp_DIV_CK_PS; // REFRESH-to-ACTIVATE or REFRESH-to-REFRESH 116 | localparam lpdiv_RFC_MAX = p_RFC_MAX/lp_DIV_CK_PS-1; 117 | localparam lpdiv_RP = p_RP/lp_DIV_CK_PS; // PRECHARGE-to-ACTIVATE or PRECHARGE-to-REFRESH 118 | localparam lpdiv_RRD = p_RRD/lp_DIV_CK_PS; // ACTIVATE-to-ACTIVATE in different banks 119 | localparam lpdiv_RTP = p_RTP/lp_DIV_CK_PS; // READ-to-PRECHARGE 120 | //localparam lpdiv_WTR = p_WTR/lp_DIV_CK_PS; // WRITE-to-READ 121 | //localparam lpdiv_WR = p_WR/lp_DIV_CK_PS; // WRITE-to-PRECHARGE (WRITE recovery time) 122 | localparam lpdiv_XPR = p_XPR/lp_DIV_CK_PS; // Exit reset from CKE HIGH to valid command 123 | localparam lpdiv_MOD = p_MOD/lp_DIV_CK_PS; // MRS-to-non-MRS (MRS update delay) 124 | localparam lpdiv_ZQINIT = p_ZQINIT/lp_DIV_CK_PS; // Long calibration time 125 | 126 | //localparam lpdiv_WRAP = (DLL.lp_WL + DLL.lp_BL/2 + ((p_WR + p_RP)/p_DDR_CK_PS+1))/2; // WRITE with AP exit time 127 | //localparam lpdiv_RDAP = (p_RTP + p_RP)/p_DDR_CK_PS; // READ with AP exit time 128 | 129 | 130 | // Initialization timing parameters: 131 | `ifdef SIMULATION // multipliers 2 and 7 instead of 250 and 501 - presumably to keep simulation runs short 132 | localparam lpdiv_NRST_LO = 2 * p_DDR_FREQ_MHZ/2 - 1; // RESET#: After power stable, RESET# held LOW for >200 us. 133 | localparam lpdiv_CKE_LO = 7 * p_DDR_FREQ_MHZ/2 - 1; // CKE: After RESET# transitions HIGH wait >500 us with CKE LOW. 134 | `else 135 | localparam lpdiv_NRST_LO = 250 * p_DDR_FREQ_MHZ/2 - 1; // RESET#: After power stable, RESET# held LOW for >200 us. 136 | localparam lpdiv_CKE_LO = 501 * p_DDR_FREQ_MHZ/2 - 1; // CKE: After RESET# transitions HIGH wait >500 us with CKE LOW. 
137 | `endif 138 | localparam lp_STATE_TMR_WIDTH = $clog2(lpdiv_CKE_LO); // NOTE(review): $clog2(N) bits cannot hold N itself when N is a power of two - confirm the timer never has to store lpdiv_CKE_LO in that case 139 | //localparam lp_IDLE_TMR_WIDTH = $clog2(lpdiv_RFC_MAX); 140 | //localparam lpdiv_INIT_CTR_PLAY = 50/2; 141 | //localparam lpdiv_INIT_CTR_START = lpdiv_NRST_LO + lpdiv_CKE_LO + lpdiv_XPR + 3 * lpdiv_MRD + lpdiv_MOD + lpdiv_ZQINIT + lpdiv_INIT_CTR_PLAY; 142 | 143 | ///////////////////////////////////////////////// 144 | // Command defines; {nRAS, nCAS, nWE} (3 bits, nCS not included); See Micron datasheet Table 87 145 | ///////////////////////////////////////////////// 146 | localparam lp_CMD_MRS = 3'b000; // MODE REGISTER SET 147 | localparam lp_CMD_REF = 3'b001; // REFRESH 148 | localparam lp_CMD_PRE = 3'b010; // & A10 LOW: Single-bank PRECHARGE 149 | // & A10 HIGH: PRECHARGE all banks 150 | localparam lp_CMD_ACT = 3'b011; // Bank ACTIVATE 151 | localparam lp_CMD_WR = 3'b100; // & A10 LOW: normal WRITE (assuming BL8MRS) 152 | // & A10 HIGH: WRITE with auto precharge 153 | localparam lp_CMD_RD = 3'b101; // & A10 LOW: normal READ 154 | // & A10 HIGH: READ with auto precharge 155 | localparam lp_CMD_NOP = 3'b111; // NO OPERATION 156 | localparam lp_CMD_ZQCL = 3'b110; // ZQ CALIBRATION LONG 157 | 158 | ///////////////////////////////////////////////// 159 | // State machine state definitions (one per cmd + init)) 160 | ///////////////////////////////////////////////// 161 | localparam STATE_INIT = 4'd0; 162 | localparam STATE_MRS = 4'd1; 163 | localparam STATE_REF = 4'd2; 164 | localparam STATE_PRE = 4'd3; 165 | localparam STATE_ACT = 4'd4; 166 | localparam STATE_WR = 4'd5; 167 | localparam STATE_RD = 4'd6; 168 | localparam STATE_IDLE = 4'd7; 169 | localparam STATE_ZQCL = 4'd8; 170 | 171 | ///////////////////////////////////////////////// 172 | // Input FIFO parameters 173 | ///////////////////////////////////////////////// 174 | localparam lp_CMDFIFO_OP_WR = 1'b0; 175 | localparam lp_CMDFIFO_OP_RD = 1'b1; 176 | localparam lp_CMDFIFO_OP_WIDTH = 1; 177 | // CMD FIFO stores OP (r/w), address 
(bank+row+col), wr data for writes (8*DQ), and data mask (1 bit per burst word) 178 | localparam lp_CMDFIFO_WIDTH = lp_CMDFIFO_OP_WIDTH + p_BANK_W + p_ROW_W 179 | + p_COL_W + 8 * p_DQ_W + lp_BL; 180 | 181 | 182 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # ddr3-controller 4 | A DDR3(L) PHY and controller, written in Verilog, for Xilinx 7-Series FPGAs. 5 | 6 | Originally written for the Digilent Arty S7-50 development board and its supplied 2 Gbit x16 DDR3L SDRAM. It is adaptable, with parametrized timing values and bus widths. The core supports nominal frequencies of 300 MHz and up, as well as the optional "DLL disable" mode as specified by JEDEC, for operation at 125 MHz and below, for more robust operation owing to improved timing margin. 7 | 8 | #### Features 9 | * Read and write commands are issued into a FIFO: easy operation 10 | * Maximized sequential read/write speeds 11 | * Optional read calibration module using FPGA's IDELAYE2 primitives' variable taps 12 | * Very small footprint compared to Xilinx's MIG IP 13 | * Tested working up to 464 MHz: 40% faster than Xilinx's MIG 14 | #### Quirks 15 | * Only one open bank at a time 16 | * No command re-arranging or intelligent look-ahead 17 | * Write-to-read or read-to-write time is long even within same row 18 | * Write leveling is not implemented: Only one SDRAM chip supported 19 | * Incomplete timing constraints 20 | 21 | ### Usage 22 | Externally, the core requires the user to instantiate a MMCM. 
Four of the MMCM outputs are to be used and wired to the core's inputs, **all generated by the same MMCM**, consisting of: 23 | - `i_clk_ddr`: A "fast" clock at which the SDRAM is to be run 24 | - `i_clk_ddr_90`: The fast clock with a 90° phase shift 25 | - `i_clk_div`: A "slow" clock that is (a) in phase with i_clk_ddr, and (b) exactly 1/2 of the memory frequency 26 | - `i_clk_ref`: A 200 or 300 MHz clock that drives the IDELAYCTRL primitive, which controls the IDELAY taps used for read calibration[^0] 27 | 28 | ![Screen capture of the clock generator as used in my own application](./vivado-clkgen.png) 29 | [^0]: 200 or 300 MHz is the nominal frequency that you should aim for. Supported ranges, as per Xilinx documentation, are 190-210 MHz, and 290-300 MHz. 30 | 31 | All of the user-facing signals (input and output) are synchronous to the slow `i_clk_div` clock. 32 | 33 | Also required is a FIFO module.[^10] The FIFO needs to be wide enough to fit the controller input signals: 34 | 35 | - `[0:0] i_phy_cmd_sel`: The command select bit (`'b1` to read, `'b0` to write) 36 | - The read or write address, consisting of: 37 | - `[p_BANK_W-1:0] in_phy_bank` 38 | - `[p_ROW_W-1:0] in_phy_row` 39 | - `[p_COL_W-1:0] in_phy_col` 40 | - `[(8*p_DQ_W)-1:0] in_phy_wrdata`: The write data[^20] 41 | - `[7:0] in_phy_wrdm`: The write data mask[^20] 42 | 43 | ![Screen capture of the FIFO generator as used in my own application](./vivado-fifogen.png) 44 | 45 | [^10]: For ease of use and peace of mind, I opted for a FIFO generated using Xilinx's FIFO generator. The depth and architecture are up to the user, so long as it is a common clock non-FWFT FIFO with sufficient depth for the application, although Xilinx's minimum depth of 16 should be good enough for any application I can think of. 46 | 47 | [^20]: `in_phy_wrdata` and `i8_phy_wrdm` are "do not care" if a read command is issued (i.e. `i_phy_cmd_sel` is 1'b1). 
48 | 49 | #### Other signals 50 | 51 | - `o_phy_rddata_valid` goes high for one clock cycle to indicate that the read data (8 times the SDRAM width -- 128 bits for a x16 chip, or as defined by `on_phy_rddata`) is valid. 52 | - `i_phy_rst` is a reset signal for IDELAYCTRL, SERDES, and is wired directly to the SDRAM chip. To comply with JEDEC specifications, this signal should not be set to logic low (1'b0) until 500 us after power to the SDRAM is stable. To comply with IDELAYCTRL and SERDES specifications, the signal should be held high until all clocks are stable. 53 | - `o_phy_init_done` goes high once the core completes the reset and initialization sequence. 54 | - wire `i2_iserdes_ce` to logic high (2'b11). 55 | - `*dqs_delay*` and `*dq_delay*` signals control the IDELAY tap values -- unless you are writing your own read calibration module, wire the `*_ce` and `*_inc` inputs to logic low. Connect the input `*_idelay_cnt` and `*_delay_ld` signals to the `rdcal` module. The output `*_idelay_cnt` signals can be ignored and are only exposed for debugging purposes. 56 | 57 | #### `rdcal` read calibration module 58 | Nominally, in DDR3, the read strobe is synchronous to the read data. The controller is required to delay the read strobe so that data is read correctly. The read calibration module attempts to find the optimal delay for the DQ and DQS lines in order to center the read data strobe in the data valid window. The calibration is done on the entire bus simultaneously (not per-byte or per-bit). Though an improvement can be made here, I haven't encountered any failed reads due to failed calibration. 59 | 60 | Wiring and control of the module is again made to be simple: 61 | * The `o_phy_*` signals are to be connected from the rdcal module to the controller module: 62 | * `o_phy_cmd_en` --> `i_phy_cmd_en` 63 | * `o_phy_cmd_sel` --> `i_phy_cmd_sel` 64 | * `o3_phy_bank` --> `in_phy_bank` 65 | * etc. 
66 | * The `i_rdc_*` signals are used as the inputs of the controller's command FIFO. The controller's FIFO signals are used exclusively by the `rdcal` logic (hijacked) until the read calibration cycle is complete. 67 | * `o_rdcal_done`: Goes high once the read calibration cycle is complete. No user commands are registered into the command FIFO while this signal is logic low. 68 | * `o_rdcal_err`: Goes high in case that there is no combination of DQ and DQS IDELAY taps that produce valid read data. 69 | * `i_rdcal_start`: Is used to start the calibration loop. Read calibration can be re-done at any time, but to run the calibration, 128 bits of the SDRAM's memory will be overwritten with the read calibration word.[^25] 70 | * The `i_phy_*` signals are to be connected from the controller module to the rdcal module: 71 | * `o_phy_init_done` --> `i_phy_init_done` 72 | * `o_phy_rddata_valid` --> `i_phy_rddata_valid` 73 | * `on_phy_rddata` --> `in_phy_rddata` 74 | * etc. 75 | * The `*delay_*` signals are to be connected from the rdcal module to the controller module:[^26] 76 | * `o_dq_delay_ld` --> `in_dq_delay_ld` 77 | * `o5_dq_idelay_cnt` --> `in_dq_idelay_cnt` 78 | * etc. 79 | 80 | [^25]: By default, the read calibration word is `'h0000_FFFF_0000_FFFF_0000_FFFF_0000_FFFF`. The default address used for calibration is `27'b0`. These values can be changed by editing the `p_RDCAL_WORD` and `p_RDCAL_BANK`, `p_RDCAL_ROW`, and `p_RDCAL_COL` parameters, respectively. 81 | 82 | [^26]: If the signal widths do not match, concatenate the signal output from the read calibration module with itself. E.g. if `in_dqs_idelay_cnt` is defined as `[9:0]`, then connect it with `{o5_dqs_idelay_cnt, o5_dqs_idelay_cnt}`. 
83 | 84 | #### Calibrating read data output 85 | While the read calibration module will aim to center the DQS edges into the center of the DQ line data eye, it is up to the user of this interface to find the correct delay between a read command being issued to the memory chip, and the time that the `o_phy_rddata_valid` flag signals that data is available on the `on_phy_rddata` parallel output. Once determined, this value will not change for a given application unless the values of CL or CWL change. 86 | 87 | To simplify: 88 | * A ±1 change of `p_RD_DELAY` will appear to shift[^27] the output data by 64 bits. 89 | * Toggling `p_ISERDES_32B_SHIFT` between `"TRUE"` and `"FALSE"` will shift the output data by 32 bits. 90 | 91 | [^27]: "Appear to shift": In actuality, this parameter determines the length of a shift register whose input equals 1'b1 whenever a READ command is given to the memory. The parameter does not in any way manipulate the ISERDES parallel data output. 92 | 93 | For example, on an Arty S7-50 development board, at 300-333 MHz, correct read data can be obtained with the combination of `p_RD_DELAY = 10` and `p_ISERDES_32B_SHIFT = "FALSE"`. 94 | 95 | #### Example project 96 | An example project/top module for the Arty S7-50 board is currently available [in another repository](https://github.com/someone755/arty_s7_playground/blob/master/ddr3/ddr3.srcs/sources_1/ddr3_x16_cust_top.v).[^28] The Python script in that repository can also be used to test the functionality of the core. 97 | 98 | [^28]: I admit the code is far from pretty, but should be good enough to see how the core and read calibration module are meant to be connected in a functional application. 99 | 100 | ### Discussion 101 | #### Core operation 102 | The logic part of this memory controller is relatively simple. Once the initialization is complete, the core loops and periodically requests a refresh of the SDRAM. 
Outside that, if the input command FIFO is not empty, the command is read out, and the proper sequence of commands is executed, e.g. ACT -> RD -> PRE for one issued read command in the command FIFO. A simplified state diagram is drawn below: 103 | 104 | ```mermaid 105 | graph TD; 106 | A[Reset and initialization sequence]-->ID[Idle state]; 107 | ID-->REF[Refresh]; 108 | REF-->ID 109 | ID--Command in FIFO-->ACT[Activate] 110 | ACT--Write command-->WR[Write] 111 | WR-->WR 112 | WR-->PRE[Precharge] 113 | ACT--Read command-->RD[Read] 114 | RD-->RD 115 | RD-->PRE 116 | PRE-->ID 117 | ``` 118 | 119 | Note that there are no bank machines as in Xilinx's MIG, and only one bank may be active at a time. 120 | 121 | Refresh requests, raised by a free running refresh timer, take precedence over user commands. User commands are run in the order that they are pushed into the command FIFO. There is no intelligent look-ahead or command re-shuffling to improve access times or data throughput. 122 | 123 | To make use of the high burst data rate of DDR SDRAM, the core supports sequential access of one operation in the same bank and row. To this end, the read or write state is exited once either (a) the refresh timer requests an SDRAM refresh, (b) the user command is different than the previous one (e.g. previous user request was a read, current request is a write, or vice-versa), (c) the user specified bank or row are different than that of the previous read or write command, or (d) the command FIFO becomes empty. 124 | 125 | The 2:1 ratio between the DRAM and the core clock frequencies is owed to the supported data widths of the SERDES primitives in "MEMORY" mode.[^30] Because of this, the core effectively employs a 2T command rate such that the controller's valid commands are interleaved with the "DESELECT" command. 126 | 127 | [^30]: See UG471, Chapter 3, section "Input Serial-to-Parallel Logic Resources (ISERDESE2)", Table 3-3: Supported Data Widths. 
With DDR data signaling, only a 2:1 mode is supported. 128 | 129 | The controller and PHY are not generic and not replaceable, as the DFI protocol is not employed. 130 | 131 | #### FPGA area used 132 | One of the aims of the project was to try and build a smaller alternative to Xilinx's supplied MIG IP. The same project using the Xilinx MIG DDR3 controller utilizes nearly 14% of the FPGA LUTs, versus just over 3% with this core. 133 | 134 | ![Graphical representation of Post-Implementation FPGA utilization](./vivado-util-cust-graph.png) 135 | 136 | For designs that prioritize low FPGA utilization, this core (once/if properly constrained) could be a possible solution. Testing has showed favorable results up to the limits of my Spartan 7 device at 464 MHz (928 MT/s). 137 | 138 | #### The issue of Xilinx's undocumented primitives 139 | Xilinx refuses to document or allow third parties to access the PHASER_IN and PHASER_OUT primitives beyond admitting that they exist and are used by the MIG IP in documentation. UG953, for example, includes six (!) consecutive pages, explaining that phaser elements' "only intended use is by the Memory Interface Generator (MIG), and [they are] not intended to be instantiated, used, or modified outside of Xilinx-generated IP." 140 | 141 | This core works around the issue of the PHASER_IN primitive by instead instantiating the IDELAYE2 primitives to calibrate the read data strobe to the valid read data window, as demonstrated in the optional `rdcal` module. This method has worked well in testing. 142 | 143 | The lack of access to the PHASER_OUT primitive is more apparent: DDR3 memory requires that the write data strobe is properly synced to the output clock. For this purpose, JEDEC has defined a new feature in DDR3 (as compared to DDR2), called write leveling. Yet, the lack of access to primitives that could delay the output signals[^40] makes write leveling impossible outside of Xilinx's own MIG. 
This fact limits the interface to single-chip DDR3 applications.[^42] 144 | 145 | [^40]: There exists an ODELAYE2 primitive that is well documented, but only available in HP banks. The Spartan FPGA only has HR banks. 146 | 147 | [^42]: While not ideal, note here that MIG on the Spartan 7 also does not implement write leveling and is thus limited to single-chip DDR3 applications just the same. I personally do not count this as a loss. 148 | 149 | #### Timing constraints 150 | The lack of proper timing constraints means that the Vivado timing analysis tools cannot help in meeting the SDRAM's timing. Thus, a bitstream may not work across PVT (process, voltage, and temperature), though it may appear to work during testing.[^50] 151 | 152 | Routing all external signals (command, address, data bus) through OSERDES means that the IOLOGIC primitives are responsible for meeting timing. Relying on this solution has worked well in testing up to 464 MHz. 153 | 154 | [^50]: In my testing of the core between 100 and 464 MHz, operation was flawless, but of course I cannot make guarantees about functionality across PVT. The results of my testing should be taken as anecdotal, and more thorough analysis should be undertaken. 155 | 156 | I haven't delved into setting up timing constraints for this project, nor do I plan to in the near future. If you are an XDC guru and wish to contribute to this controller, feel free to contact me, open a bug, or a pull request. 157 | 158 | #### Further reading 159 | This core is heavily influenced by Xilinx's own XAPP721 application note, which details a similar PHY as employed here, used for DDR2 SDRAM on a Virtex-4 FPGA. Various DDR SDRAM manufacturers' application notes such as Micron's TN-04-54 ("High-Speed DRAM Controller Design") can also be of great help in regards to memory controller and PHY design. 160 | 161 | To demystify the workings of DDR3 memory, there is of course the complete DDR3 specification in JEDEC file JESD79-3F. 
Also applicable here are various DDR3 SDRAM datasheets and application notes by silicon manufacturers (e.g. Micron, Samsung, SK hynix, TI, Philips, even MIT lecture notes), which act as abridged and slightly re-worded versions of the JEDEC spec. Further and more verbose instructions can be found in files that DDR SDRAM manufacturers publish alongside part datasheets, such as Micron's excellent "TN-41" series of application notes. 162 | 163 | For better understanding of the FPGA primitives this PHY is built from, the following Xilinx documents are invaluable (albeit not faultless) resources: 164 | - UG471: "7 Series FPGAs SelectIO Resources" 165 | - UG768: "Xilinx 7 Series FPGA Libraries Guide for HDL Designs" 166 | - UG953: "Vivado Design Suite 7 Series FPGA and Zynq-7000 SoC Libraries Guide" 167 | 168 | A similar DDR3 core focusing only on "DLL disabled" mode was written by @ultraembedded, and is available here: https://github.com/ultraembedded/core_ddr3_controller 169 | 170 | Of course I would be remiss if I didn't mention 171 | - reddit's FPGA community at https://www.reddit.com/r/FPGA/ 172 | - Digilent's own forums at https://forum.digilent.com/ 173 | - the Xilinx support website at https://support.xilinx.com/ 174 | 175 | as treasure troves both of specific knowledge as well as general wisdoms. 
176 | -------------------------------------------------------------------------------- /docs/vivado-clkgen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/someone755/ddr3-controller/6dcf675e1958dc1ee19af367303639333f25e840/docs/vivado-clkgen.png -------------------------------------------------------------------------------- /docs/vivado-fifogen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/someone755/ddr3-controller/6dcf675e1958dc1ee19af367303639333f25e840/docs/vivado-fifogen.png -------------------------------------------------------------------------------- /docs/vivado-util-cust-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/someone755/ddr3-controller/6dcf675e1958dc1ee19af367303639333f25e840/docs/vivado-util-cust-graph.png --------------------------------------------------------------------------------